In [None]:
#No need to generate real hypothesis testing- just focus on 
# statistical hypotehesis in EDA, such as median, mode, etc


In [None]:
# model selection & tuning
from sklearn.model_selection import train_test_split, GridSearchCV , RandomizedSearchCV

from scipy.stats import uniform, randint

# Feature selection 
from sklearn.feature_selection import RFE , SelectKBest, f_classif



In [1]:
# Basic Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
# Modelling imports
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import warnings

In [4]:
# define evaluate models function
# calculates mean absolute error, mean squared error, root mean squared error
# and r2 score
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [5]:
# additional evaluate models
def evaluate_model2(true, predicted, model, X_test):
    precision = precision_score(true, predicted , zero_division = 0)
    recall = recall_score(true, predicted , zero_division = 0)
    f1 = f1_score(true , predicted, zero_division = 0)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(true, y_pred_proba)
    pr_auc = average_precision_score(true, y_pred_proba)
    return precision, recall, f1, roc_auc, pr_auc

In [6]:
#Function to train and evaluate ML models:
def classification(X_train, y_train, X_test, y_test):

    # Dictionary of all models to see which one has best performance
    models = {
        "Logistic Regression": LogisticRegression(),
        # Logisitc regression with L1 regularization
        "Lasso": LogisticRegression(penalty='l1', solver='liblinear'),
        # Logisitic regression with L2 regularization
        "Ridge": LogisticRegression(penalty='l2', solver='liblinear'),

        "K-Neighbors Classifier": KNeighborsClassifier(),

        "Decision Tree": DecisionTreeClassifier(),

        "Random Forest Classifier": RandomForestClassifier(),
        # extreme gradient boosting
        "XGBClassifier": XGBClassifier(), 
        #
        "CatBoosting Classifier": CatBoostClassifier(verbose=False),

        "AdaBoost Classifier": AdaBoostClassifier(),

        "GradientBoosting Classifier": GradientBoostingClassifier()
    }
    # define empty list
    model_list = []
    r2_list =[]

    # for every model in the list
    for i in range(len(list(models))):
        # set model to model at index i
        model = list(models.values())[i]

        # Train model
        model.fit(X_train, y_train) 

        # Make predictions
        y_train_pred = model.predict(X_train)
        accuracy_train = accuracy_score(y_train , y_train_pred)

        y_test_pred = model.predict(X_test)
        accuracy_test = accuracy_score(y_test , y_test_pred)

        # Compute additional metrics like Precision , recall , F1-score , and ROC AUC
        precision = precision_score(y_test, y_test_pred , zero_division = 0)
        recall = recall_score(y_test , y_test_pred , zero_division = 0)
        f1 = f1_score(y_test , y_test_pred, zero_division = 0)
        roc_auc = roc_auc_score(y_test , model.predict_proba(X_test)[:, 1])
        
        # Evaluate Train and Test dataset
        # y_train is actual value, y_train_pred is predicted values from X_train
        model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
        # y_test is actual value, y_test_pred is predicted value from X_test
        model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)


        # Evaluate Train and Test dataset with additional metrics
        model_train_precision, model_train_recall, model_train_f1, model_train_roc_auc = evaluate_model2(y_train, y_train_pred, model, X_train)

        model_test_precision, model_test_recall, model_test_f1, model_test_roc_auc = evaluate_model2(y_test, y_test_pred, model, X_test)


        print(list(models.keys())[i])

        # append each model to model_list
        model_list.append(list(models.keys())[i])
        
        print('Model performance for Training set')
        print("- Training Accuracy: {:.4f}".format(accuracy_train))
        print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
        print("- R2 Score: {:.4f}".format(model_train_r2))
        print("- Precision: {:.4f}".format(model_train_precision))
        print("- Recall: {:.4f}".format(model_train_recall))
        print("- F1 Score: {:.4f}".format(model_train_f1))
        print("- ROC_AUC: {:.4f}".format(model_train_roc_auc))

        print('----------------------------------')
        
        print('Model performance for Test set')
        print("- Testing Accuracy: {:.4f}".format(accuracy_test))
        print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
        print("- R2 Score: {:.4f}".format(model_test_r2))
        print("- Precision: {:.4f}".format(model_test_precision))
        print("- Recall: {:.4f}".format(model_test_recall))
        print("- F1 Score: {:.4f}".format(model_test_f1))
        print("- ROC_AUC: {:.4f}".format(model_test_roc_auc))
        #r2_list.append(model_test_r2)
    
        print('='*35)
        print('\n')
    

In [6]:
X_train=pd.read_csv('with_outlier_scaled_unbalanced_X_train.csv')
y_train=pd.read_csv('y_train.csv')

In [7]:
X_test=pd.read_csv('X_test.csv')
y_test=pd.read_csv('y_test.csv')

In [8]:
# Run the model on with outliers, scaled, unbalanced data:
classification(X_train, y_train, X_test, y_test)

  y = column_or_1d(y, warn=True)


Logistic Regression
Model performance for Training set
- Training Accuracy: 0.8475
- Root Mean Squared Error: 0.3906
- Mean Absolute Error: 0.1525
- R2 Score: -0.1474
- Precision: 0.5512
- Recall: 0.1815
- F1 Score: 0.2730
- ROC_AUC: 0.8142
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.1565
- Root Mean Squared Error: 0.9184
- Mean Absolute Error: 0.8435
- R2 Score: -5.3891
- Precision: 0.1565
- Recall: 1.0000
- F1 Score: 0.2707
- ROC_AUC: 0.5000




  y = column_or_1d(y, warn=True)


Lasso
Model performance for Training set
- Training Accuracy: 0.8475
- Root Mean Squared Error: 0.3905
- Mean Absolute Error: 0.1525
- R2 Score: -0.1470
- Precision: 0.5517
- Recall: 0.1817
- F1 Score: 0.2734
- ROC_AUC: 0.8143
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.1565
- Root Mean Squared Error: 0.9184
- Mean Absolute Error: 0.8435
- R2 Score: -5.3891
- Precision: 0.1565
- Recall: 1.0000
- F1 Score: 0.2707
- ROC_AUC: 0.5000




  y = column_or_1d(y, warn=True)


Ridge
Model performance for Training set
- Training Accuracy: 0.8475
- Root Mean Squared Error: 0.3905
- Mean Absolute Error: 0.1525
- R2 Score: -0.1470
- Precision: 0.5517
- Recall: 0.1815
- F1 Score: 0.2731
- ROC_AUC: 0.8142
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.1565
- Root Mean Squared Error: 0.9184
- Mean Absolute Error: 0.8435
- R2 Score: -5.3891
- Precision: 0.1565
- Recall: 1.0000
- F1 Score: 0.2707
- ROC_AUC: 0.5000




  return self._fit(X, y)


K-Neighbors Classifier
Model performance for Training set
- Training Accuracy: 0.8733
- Root Mean Squared Error: 0.3559
- Mean Absolute Error: 0.1267
- R2 Score: 0.0473
- Precision: 0.6799
- Recall: 0.3736
- F1 Score: 0.4822
- ROC_AUC: 0.8957
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.7028
- Root Mean Squared Error: 0.5452
- Mean Absolute Error: 0.2972
- R2 Score: -1.2515
- Precision: 0.2625
- Recall: 0.4969
- F1 Score: 0.3435
- ROC_AUC: 0.6398


Decision Tree
Model performance for Training set
- Training Accuracy: 0.9871
- Root Mean Squared Error: 0.1138
- Mean Absolute Error: 0.0129
- R2 Score: 0.9027
- Precision: 0.9969
- Recall: 0.9209
- F1 Score: 0.9574
- ROC_AUC: 0.9992
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.7977
- Root Mean Squared Error: 0.4497
- Mean Absolute Error: 0.2023
- R2 Score: -0.5321
- Precision: 0.3364
- Recall: 0.3006
- F1 Score: 0.3175
- ROC_AUC: 0.5953




  return fit_method(estimator, *args, **kwargs)


Random Forest Classifier
Model performance for Training set
- Training Accuracy: 0.9870
- Root Mean Squared Error: 0.1139
- Mean Absolute Error: 0.0130
- R2 Score: 0.9024
- Precision: 0.9890
- Recall: 0.9281
- F1 Score: 0.9576
- ROC_AUC: 0.9972
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.7014
- Root Mean Squared Error: 0.5464
- Mean Absolute Error: 0.2986
- R2 Score: -1.2618
- Precision: 0.2793
- Recall: 0.5746
- F1 Score: 0.3759
- ROC_AUC: 0.6534


XGBClassifier
Model performance for Training set
- Training Accuracy: 0.8598
- Root Mean Squared Error: 0.3744
- Mean Absolute Error: 0.1402
- R2 Score: -0.0545
- Precision: 0.6556
- Recall: 0.2358
- F1 Score: 0.3468
- ROC_AUC: 0.8432
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.3555
- Root Mean Squared Error: 0.8028
- Mean Absolute Error: 0.6445
- R2 Score: -3.8820
- Precision: 0.1705
- Recall: 0.8066
- F1 Score: 0.2815
- ROC_AUC: 0.5983


CatBoosting Cl

  y = column_or_1d(y, warn=True)


AdaBoost Classifier
Model performance for Training set
- Training Accuracy: 0.8493
- Root Mean Squared Error: 0.3882
- Mean Absolute Error: 0.1507
- R2 Score: -0.1337
- Precision: 0.5580
- Recall: 0.2179
- F1 Score: 0.3134
- ROC_AUC: 0.8184
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.8318
- Root Mean Squared Error: 0.4102
- Mean Absolute Error: 0.1682
- R2 Score: -0.2742
- Precision: 0.4231
- Recall: 0.2059
- F1 Score: 0.2770
- ROC_AUC: 0.7614




  y = column_or_1d(y, warn=True)


GradientBoosting Classifier
Model performance for Training set
- Training Accuracy: 0.8512
- Root Mean Squared Error: 0.3857
- Mean Absolute Error: 0.1488
- R2 Score: -0.1192
- Precision: 0.5819
- Recall: 0.2041
- F1 Score: 0.3022
- ROC_AUC: 0.8233
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.3130
- Root Mean Squared Error: 0.8289
- Mean Absolute Error: 0.6870
- R2 Score: -4.2042
- Precision: 0.1675
- Recall: 0.8539
- F1 Score: 0.2801
- ROC_AUC: 0.6027




In [10]:
X_train=pd.read_csv('with_outlier_scaled_balanced_X_train.csv')
y_train=pd.read_csv('balanced_y_train.csv')

In [11]:
# Run the model on with outliers, scaled, balanced data:

classification(X_train, y_train, X_test, y_test)

  y = column_or_1d(y, warn=True)


Logistic Regression
Model performance for Training set
- Training Accuracy: 0.7428
- Root Mean Squared Error: 0.5071
- Mean Absolute Error: 0.2572
- R2 Score: -0.0287
- Precision: 0.7337
- Recall: 0.7624
- F1 Score: 0.7478
- ROC_AUC: 0.8172
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.1565
- Root Mean Squared Error: 0.9184
- Mean Absolute Error: 0.8435
- R2 Score: -5.3891
- Precision: 0.1565
- Recall: 1.0000
- F1 Score: 0.2707
- ROC_AUC: 0.5000




  y = column_or_1d(y, warn=True)


Lasso
Model performance for Training set
- Training Accuracy: 0.7429
- Root Mean Squared Error: 0.5070
- Mean Absolute Error: 0.2571
- R2 Score: -0.0284
- Precision: 0.7338
- Recall: 0.7623
- F1 Score: 0.7478
- ROC_AUC: 0.8172
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.1565
- Root Mean Squared Error: 0.9184
- Mean Absolute Error: 0.8435
- R2 Score: -5.3891
- Precision: 0.1565
- Recall: 1.0000
- F1 Score: 0.2707
- ROC_AUC: 0.5000




  y = column_or_1d(y, warn=True)


Ridge
Model performance for Training set
- Training Accuracy: 0.7428
- Root Mean Squared Error: 0.5071
- Mean Absolute Error: 0.2572
- R2 Score: -0.0287
- Precision: 0.7337
- Recall: 0.7622
- F1 Score: 0.7477
- ROC_AUC: 0.8172
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.1565
- Root Mean Squared Error: 0.9184
- Mean Absolute Error: 0.8435
- R2 Score: -5.3891
- Precision: 0.1565
- Recall: 1.0000
- F1 Score: 0.2707
- ROC_AUC: 0.5000




  return self._fit(X, y)


K-Neighbors Classifier
Model performance for Training set
- Training Accuracy: 0.8562
- Root Mean Squared Error: 0.3792
- Mean Absolute Error: 0.1438
- R2 Score: 0.4247
- Precision: 0.8098
- Recall: 0.9311
- F1 Score: 0.8662
- ROC_AUC: 0.9475
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.6361
- Root Mean Squared Error: 0.6033
- Mean Absolute Error: 0.3639
- R2 Score: -1.7567
- Precision: 0.2519
- Recall: 0.6728
- F1 Score: 0.3666
- ROC_AUC: 0.6668


Decision Tree
Model performance for Training set
- Training Accuracy: 0.9898
- Root Mean Squared Error: 0.1012
- Mean Absolute Error: 0.0102
- R2 Score: 0.9590
- Precision: 0.9981
- Recall: 0.9814
- F1 Score: 0.9897
- ROC_AUC: 0.9998
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.7244
- Root Mean Squared Error: 0.5250
- Mean Absolute Error: 0.2756
- R2 Score: -1.0876
- Precision: 0.2378
- Recall: 0.3449
- F1 Score: 0.2815
- ROC_AUC: 0.5699




  return fit_method(estimator, *args, **kwargs)


Random Forest Classifier
Model performance for Training set
- Training Accuracy: 0.9897
- Root Mean Squared Error: 0.1013
- Mean Absolute Error: 0.0103
- R2 Score: 0.9590
- Precision: 0.9937
- Recall: 0.9858
- F1 Score: 0.9897
- ROC_AUC: 0.9991
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.7365
- Root Mean Squared Error: 0.5133
- Mean Absolute Error: 0.2635
- R2 Score: -0.9959
- Precision: 0.3161
- Recall: 0.5876
- F1 Score: 0.4111
- ROC_AUC: 0.7158


XGBClassifier
Model performance for Training set
- Training Accuracy: 0.8662
- Root Mean Squared Error: 0.3657
- Mean Absolute Error: 0.1338
- R2 Score: 0.4650
- Precision: 0.9089
- Recall: 0.8141
- F1 Score: 0.8589
- ROC_AUC: 0.9447
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.6160
- Root Mean Squared Error: 0.6197
- Mean Absolute Error: 0.3840
- R2 Score: -1.9086
- Precision: 0.2090
- Recall: 0.5220
- F1 Score: 0.2985
- ROC_AUC: 0.6340


CatBoosting Cla

  y = column_or_1d(y, warn=True)


AdaBoost Classifier
Model performance for Training set
- Training Accuracy: 0.7892
- Root Mean Squared Error: 0.4591
- Mean Absolute Error: 0.2108
- R2 Score: 0.1568
- Precision: 0.7756
- Recall: 0.8139
- F1 Score: 0.7943
- ROC_AUC: 0.8755
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.1789
- Root Mean Squared Error: 0.9061
- Mean Absolute Error: 0.8211
- R2 Score: -5.2194
- Precision: 0.1598
- Recall: 0.9976
- F1 Score: 0.2755
- ROC_AUC: 0.7242




  y = column_or_1d(y, warn=True)


GradientBoosting Classifier
Model performance for Training set
- Training Accuracy: 0.8328
- Root Mean Squared Error: 0.4088
- Mean Absolute Error: 0.1672
- R2 Score: 0.3314
- Precision: 0.8305
- Recall: 0.8364
- F1 Score: 0.8334
- ROC_AUC: 0.9205
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.1789
- Root Mean Squared Error: 0.9061
- Mean Absolute Error: 0.8211
- R2 Score: -5.2194
- Precision: 0.1598
- Recall: 0.9976
- F1 Score: 0.2755
- ROC_AUC: 0.7139




In [16]:
# sort values by greatest R2_score
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
4,Decision Tree,-0.443963
1,Lasso,-1.029521
5,Random Forest Regressor,-1.22119
8,AdaBoost Regressor,-1.686442
3,K-Neighbors Regressor,-2.243427
7,CatBoosting Regressor,-2.79566
6,XGBRegressor,-5.117326
2,Ridge,-6458.140573
0,Linear Regression,-6495.55068


In [8]:
#Function to train and evaluate ML models that don't require feature scaling:
def no_scale_classification(X_train, y_train, X_test, y_test):

    # Dictionary of all models to see which one has best performance
    models = {
        "Decision Tree": DecisionTreeClassifier(),

        "Random Forest Classifier": RandomForestClassifier(),
        # extreme gradient boosting
        "XGBClassifier": XGBClassifier(), 
        #
        "CatBoosting Classifier": CatBoostClassifier(verbose=False),

        "AdaBoost Classifier": AdaBoostClassifier(),

        "GradientBoosting Classifier": GradientBoostingClassifier()
    }
    # define empty list
    model_list = []
    r2_list =[]

    # for every model in the list
    for i in range(len(list(models))):
        # set model to model at index i
        model = list(models.values())[i]

        # Train model
        model.fit(X_train, y_train) 

        # Get predicted probabilities for the positive class
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Make predictions
        y_train_pred = model.predict(X_train)
        accuracy_train = accuracy_score(y_train , y_train_pred)

        y_test_pred = model.predict(X_test)
        accuracy_test = accuracy_score(y_test , y_test_pred)

        # Compute additional metrics like Precision , recall , F1-score , and ROC AUC, and PR AUC
        # precision = precision_score(y_test, y_test_pred , zero_division = 0)
        # recall = recall_score(y_test , y_test_pred , zero_division = 0)
        # f1 = f1_score(y_test , y_test_pred, zero_division = 0)
        # roc_auc = roc_auc_score(y_test , y_pred_proba)
        # pr_auc_score = average_precision_score(y_test, y_pred_proba)
        
        # Evaluate Train and Test dataset
        # y_train is actual value, y_train_pred is predicted values from X_train
        model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
        # y_test is actual value, y_test_pred is predicted value from X_test
        model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)


        # Evaluate Train and Test dataset with additional metrics
        model_train_precision, model_train_recall, model_train_f1, model_train_roc_auc, model_train_pr_auc = evaluate_model2(y_train, y_train_pred, model, X_train)

        model_test_precision, model_test_recall, model_test_f1, model_test_roc_auc, model_test_pr_auc = evaluate_model2(y_test, y_test_pred, model, X_test)


        print(list(models.keys())[i])

        # append each model to model_list
        model_list.append(list(models.keys())[i])
        
        print('Model performance for Training set')
        print("- Training Accuracy: {:.4f}".format(accuracy_train))
        print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
        print("- R2 Score: {:.4f}".format(model_train_r2))
        print("- Precision: {:.4f}".format(model_train_precision))
        print("- Recall: {:.4f}".format(model_train_recall))
        print("- F1 Score: {:.4f}".format(model_train_f1))
        print("- ROC_AUC: {:.4f}".format(model_train_roc_auc))
        print("- PR_AUC: {:.4f}".format(model_train_pr_auc))

        print('----------------------------------')
        
        print('Model performance for Test set')
        print("- Testing Accuracy: {:.4f}".format(accuracy_test))
        print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
        print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
        print("- R2 Score: {:.4f}".format(model_test_r2))
        print("- Precision: {:.4f}".format(model_test_precision))
        print("- Recall: {:.4f}".format(model_test_recall))
        print("- F1 Score: {:.4f}".format(model_test_f1))
        print("- ROC_AUC: {:.4f}".format(model_test_roc_auc))
        print("- PR_AUC: {:.4f}".format(model_test_pr_auc))
        #r2_list.append(model_test_r2)
    
        print('='*35)
        print('\n')
    

In [22]:
X_train=pd.read_csv('with_outlier_unscaled_unbalanced_X_train.csv')
y_train=pd.read_csv('y_train.csv')

In [23]:
no_scale_classification(X_train, y_train, X_test, y_test)

Decision Tree
Model performance for Training set
- Training Accuracy: 0.9871
- Root Mean Squared Error: 0.1138
- Mean Absolute Error: 0.0129
- R2 Score: 0.9027
- Precision: 0.9969
- Recall: 0.9209
- F1 Score: 0.9574
- ROC_AUC: 0.9992
- PR_AUC: 0.9947
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.7845
- Root Mean Squared Error: 0.4643
- Mean Absolute Error: 0.2155
- R2 Score: -0.6327
- Precision: 0.3198
- Recall: 0.3347
- F1 Score: 0.3271
- ROC_AUC: 0.5963
- PR_AUC: 0.2144




  return fit_method(estimator, *args, **kwargs)


Random Forest Classifier
Model performance for Training set
- Training Accuracy: 0.9870
- Root Mean Squared Error: 0.1139
- Mean Absolute Error: 0.0130
- R2 Score: 0.9024
- Precision: 0.9876
- Recall: 0.9295
- F1 Score: 0.9577
- ROC_AUC: 0.9972
- PR_AUC: 0.9910
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.8385
- Root Mean Squared Error: 0.4018
- Mean Absolute Error: 0.1615
- R2 Score: -0.2230
- Precision: 0.4674
- Recall: 0.2268
- F1 Score: 0.3054
- ROC_AUC: 0.7858
- PR_AUC: 0.3798


XGBClassifier
Model performance for Training set
- Training Accuracy: 0.8598
- Root Mean Squared Error: 0.3744
- Mean Absolute Error: 0.1402
- R2 Score: -0.0545
- Precision: 0.6556
- Recall: 0.2358
- F1 Score: 0.3468
- ROC_AUC: 0.8432
- PR_AUC: 0.5225
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.8506
- Root Mean Squared Error: 0.3866
- Mean Absolute Error: 0.1494
- R2 Score: -0.1320
- Precision: 0.5616
- Recall: 0.2061
- 

  y = column_or_1d(y, warn=True)


AdaBoost Classifier
Model performance for Training set
- Training Accuracy: 0.8493
- Root Mean Squared Error: 0.3882
- Mean Absolute Error: 0.1507
- R2 Score: -0.1337
- Precision: 0.5580
- Recall: 0.2179
- F1 Score: 0.3134
- ROC_AUC: 0.8184
- PR_AUC: 0.4477
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.8515
- Root Mean Squared Error: 0.3854
- Mean Absolute Error: 0.1485
- R2 Score: -0.1251
- Precision: 0.5628
- Recall: 0.2286
- F1 Score: 0.3251
- ROC_AUC: 0.8231
- PR_AUC: 0.4496




  y = column_or_1d(y, warn=True)


GradientBoosting Classifier
Model performance for Training set
- Training Accuracy: 0.8512
- Root Mean Squared Error: 0.3857
- Mean Absolute Error: 0.1488
- R2 Score: -0.1192
- Precision: 0.5819
- Recall: 0.2041
- F1 Score: 0.3022
- ROC_AUC: 0.8233
- PR_AUC: 0.4602
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.8522
- Root Mean Squared Error: 0.3845
- Mean Absolute Error: 0.1478
- R2 Score: -0.1199
- Precision: 0.5758
- Recall: 0.2104
- F1 Score: 0.3082
- ROC_AUC: 0.8262
- PR_AUC: 0.4564




In [9]:
# No scale, balanced data
X_train=pd.read_csv('with_outlier_unscaled_balanced_X_train.csv')
y_train=pd.read_csv('balanced_y_train.csv')

In [25]:
no_scale_classification(X_train, y_train, X_test, y_test)

Decision Tree
Model performance for Training set
- Training Accuracy: 0.9898
- Root Mean Squared Error: 0.1012
- Mean Absolute Error: 0.0102
- R2 Score: 0.9590
- Precision: 0.9981
- Recall: 0.9814
- F1 Score: 0.9897
- ROC_AUC: 0.9998
- PR_AUC: 0.9997
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.7518
- Root Mean Squared Error: 0.4982
- Mean Absolute Error: 0.2482
- R2 Score: -0.8799
- Precision: 0.2974
- Recall: 0.4299
- F1 Score: 0.3516
- ROC_AUC: 0.6171
- PR_AUC: 0.2203




  return fit_method(estimator, *args, **kwargs)


Random Forest Classifier
Model performance for Training set
- Training Accuracy: 0.9898
- Root Mean Squared Error: 0.1012
- Mean Absolute Error: 0.0102
- R2 Score: 0.9590
- Precision: 0.9940
- Recall: 0.9855
- F1 Score: 0.9897
- ROC_AUC: 0.9992
- PR_AUC: 0.9993
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.8136
- Root Mean Squared Error: 0.4318
- Mean Absolute Error: 0.1864
- R2 Score: -0.4120
- Precision: 0.4094
- Recall: 0.4318
- F1 Score: 0.4203
- ROC_AUC: 0.7886
- PR_AUC: 0.3776


XGBClassifier
Model performance for Training set
- Training Accuracy: 0.8663
- Root Mean Squared Error: 0.3656
- Mean Absolute Error: 0.1337
- R2 Score: 0.4654
- Precision: 0.9062
- Recall: 0.8173
- F1 Score: 0.8595
- ROC_AUC: 0.9449
- PR_AUC: 0.9548
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.8344
- Root Mean Squared Error: 0.4070
- Mean Absolute Error: 0.1656
- R2 Score: -0.2547
- Precision: 0.4696
- Recall: 0.4496
- F

  y = column_or_1d(y, warn=True)


AdaBoost Classifier
Model performance for Training set
- Training Accuracy: 0.8165
- Root Mean Squared Error: 0.4283
- Mean Absolute Error: 0.1835
- R2 Score: 0.2662
- Precision: 0.8107
- Recall: 0.8259
- F1 Score: 0.8182
- ROC_AUC: 0.9063
- PR_AUC: 0.9162
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.7802
- Root Mean Squared Error: 0.4688
- Mean Absolute Error: 0.2198
- R2 Score: -0.6649
- Precision: 0.3807
- Recall: 0.6451
- F1 Score: 0.4788
- ROC_AUC: 0.8175
- PR_AUC: 0.4395




  y = column_or_1d(y, warn=True)


GradientBoosting Classifier
Model performance for Training set
- Training Accuracy: 0.8455
- Root Mean Squared Error: 0.3930
- Mean Absolute Error: 0.1545
- R2 Score: 0.3821
- Precision: 0.8592
- Recall: 0.8265
- F1 Score: 0.8425
- ROC_AUC: 0.9298
- PR_AUC: 0.9404
----------------------------------
Model performance for Test set
- Testing Accuracy: 0.8136
- Root Mean Squared Error: 0.4317
- Mean Absolute Error: 0.1864
- R2 Score: -0.4116
- Precision: 0.4261
- Recall: 0.5501
- F1 Score: 0.4802
- ROC_AUC: 0.8229
- PR_AUC: 0.4490




In [None]:
# Initialize Gradient Boosting Classifier (baseline) and run it
gbrt = GradientBoostingClassifier(random_state=52)
classification(gbrt)
