In [45]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier, DMatrix

import numpy as np

import pandas as pd
pd.set_option('display.max_colwidth', 1000, 'display.max_rows', None, 'display.max_columns', None)

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import warnings
warnings.filterwarnings('ignore')

import dask.dataframe as dd 

StatementMeta(Spark11, 19, 45, Finished, Available)

In [2]:
# Loading cleaned data

df = dd.read_csv('***/Cleaned_data.csv')
data = df.compute() 

data_copy = df.compute()

StatementMeta(Spark11, 19, 2, Finished, Available)

In [3]:
# Removing extra-added column

data = data.loc[:, ~df.columns.str.contains('^Unnamed')]
data.drop(['annual_inc_lg', 'tot_cur_bal_lg'], inplace=True, axis=1)

StatementMeta(Spark11, 19, 3, Finished, Available)

In [4]:
data.head()

StatementMeta(Spark11, 19, 4, Finished, Available)

Unnamed: 0,installment,addr_state,annual_inc,earliest_cr_line,emp_length,home_ownership,application_type,int_rate,loan_amnt,num_actv_bc_tl,loan_status,mort_acc,tot_cur_bal,open_acc,pub_rec,purpose,revol_util,sub_grade,term,total_acc,verification_status,fico
0,123.03,PA,55000.0,2003,10+ years,MORTGAGE,Individual,13.99,3600.0,2.0,0,1.0,144904.0,7.0,0.0,debt_consolidation,29.7,C4,36,13.0,Not Verified,677.0
1,820.28,SD,65000.0,1999,10+ years,MORTGAGE,Individual,11.99,24700.0,5.0,0,4.0,204396.0,22.0,0.0,small_business,19.2,C1,36,38.0,Not Verified,717.0
2,432.66,IL,63000.0,2000,10+ years,MORTGAGE,Joint App,10.78,20000.0,2.0,0,5.0,189699.0,6.0,0.0,home_improvement,56.2,B4,60,18.0,Not Verified,697.0
3,289.91,PA,104433.0,1998,3 years,MORTGAGE,Individual,22.45,10400.0,4.0,0,6.0,331730.0,12.0,0.0,major_purchase,64.5,F1,60,35.0,Source Verified,697.0
4,405.18,GA,34000.0,1987,4 years,RENT,Individual,13.44,11950.0,2.0,0,0.0,12798.0,5.0,0.0,debt_consolidation,68.4,C3,36,6.0,Source Verified,692.0


In [5]:
# Getting non-categorical (numerical) columns - will be used to check correlation with the target value

non_categorical = data.select_dtypes(exclude=['object']).columns
check_correlation = list(non_categorical.copy())
check_correlation.insert(0, check_correlation.pop(check_correlation.index('loan_status')))

StatementMeta(Spark11, 19, 5, Finished, Available)

In [6]:
def loan_status_lin_corr():
    linear_corr = pd.DataFrame()

    for col in data[check_correlation].columns:
        linear_corr.loc[col, 'pearson_corr'] = data[col].corr(data['loan_status'])
        linear_corr['abs_pearson_corr'] = abs(linear_corr['pearson_corr'])
    
    linear_corr.sort_values('abs_pearson_corr', ascending=False, inplace=True)
    linear_corr.drop('abs_pearson_corr', axis=1, inplace=True)
    linear_corr.drop('loan_status', axis=0, inplace=True)

    return linear_corr

StatementMeta(Spark11, 19, 6, Finished, Available)

In [7]:
lin_corr = loan_status_lin_corr()
lin_corr

StatementMeta(Spark11, 19, 7, Finished, Available)

Unnamed: 0,pearson_corr
int_rate,0.260228
term,0.17752
fico,-0.128228
mort_acc,-0.076709
tot_cur_bal,-0.071234
loan_amnt,0.06397
revol_util,0.05819
installment,0.050133
earliest_cr_line,0.043544
annual_inc,-0.04243


In [8]:
num_cols = list(data.select_dtypes(exclude=['object']).columns)
cat_cols = list(data.select_dtypes(include=['object']).columns)

StatementMeta(Spark11, 19, 8, Finished, Available)

In [9]:
# Get info on the missing data

def list_missing_data():
    for column in data.columns:
        if data[column].isna().sum() != 0:
            missing = data[column].isna().sum()
            portion = (missing / data.shape[0]) * 100
            print(f"'{column}': number of missing values '{missing}' ==> '{portion:.3f}%'")

StatementMeta(Spark11, 19, 9, Finished, Available)

In [10]:
list_missing_data()

StatementMeta(Spark11, 19, 10, Finished, Available)

'emp_length': number of missing values '76024' ==> '5.985%'
'revol_util': number of missing values '776' ==> '0.061%'


In [11]:
# Transforming the nominal values into the numerical, for easier comparison

data['emp_length'].replace('10+ years', '10 years', inplace=True)
data['emp_length'].replace('< 1 year', '0 years', inplace=True)

data['emp_length'].value_counts(dropna=False).sort_index()

StatementMeta(Spark11, 19, 11, Finished, Available)

0 years     100736
1 year       82998
10 years    422068
2 years     114353
3 years     101018
4 years      74582
5 years      78013
6 years      58286
7 years      56048
8 years      57688
9 years      48403
NaN          76024
Name: emp_length, dtype: int64

In [12]:
data['emp_length'].map( lambda x: str(x).split()[0]).value_counts(dropna=True).sort_index()
data['emp_length'] = data['emp_length'].map( lambda x: float(str(x).split()[0]))

StatementMeta(Spark11, 19, 12, Finished, Available)

In [13]:
data['emp_length'][0:5]

StatementMeta(Spark11, 19, 13, Finished, Available)

0    10.0
1    10.0
2    10.0
3     3.0
4     4.0
Name: emp_length, dtype: float64

In [14]:
# Filling in the missing data with the mean value, using the imputer

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer = imputer.fit(data[['emp_length','revol_util']])
data[['emp_length','revol_util']] = imputer.transform(data[['emp_length','revol_util']])
# data

StatementMeta(Spark11, 19, 14, Finished, Available)

In [15]:
data.head()

StatementMeta(Spark11, 19, 15, Finished, Available)

Unnamed: 0,installment,addr_state,annual_inc,earliest_cr_line,emp_length,home_ownership,application_type,int_rate,loan_amnt,num_actv_bc_tl,loan_status,mort_acc,tot_cur_bal,open_acc,pub_rec,purpose,revol_util,sub_grade,term,total_acc,verification_status,fico
0,123.03,PA,55000.0,2003,10.0,MORTGAGE,Individual,13.99,3600.0,2.0,0,1.0,144904.0,7.0,0.0,debt_consolidation,29.7,C4,36,13.0,Not Verified,677.0
1,820.28,SD,65000.0,1999,10.0,MORTGAGE,Individual,11.99,24700.0,5.0,0,4.0,204396.0,22.0,0.0,small_business,19.2,C1,36,38.0,Not Verified,717.0
2,432.66,IL,63000.0,2000,10.0,MORTGAGE,Joint App,10.78,20000.0,2.0,0,5.0,189699.0,6.0,0.0,home_improvement,56.2,B4,60,18.0,Not Verified,697.0
3,289.91,PA,104433.0,1998,3.0,MORTGAGE,Individual,22.45,10400.0,4.0,0,6.0,331730.0,12.0,0.0,major_purchase,64.5,F1,60,35.0,Source Verified,697.0
4,405.18,GA,34000.0,1987,4.0,RENT,Individual,13.44,11950.0,2.0,0,0.0,12798.0,5.0,0.0,debt_consolidation,68.4,C3,36,6.0,Source Verified,692.0


In [16]:
# Defining the encoder function, which will transorm nominal values into numerical
# 'to make it easier for the classifier'

def encode_cat(data):
    d2 = pd.get_dummies(data, columns=cat_cols)
    return d2

StatementMeta(Spark11, 19, 16, Finished, Available)

In [17]:
data_encoded = encode_cat(data)

StatementMeta(Spark11, 19, 17, Finished, Available)

In [18]:
data_encoded.head()

StatementMeta(Spark11, 19, 18, Finished, Available)

Unnamed: 0,installment,annual_inc,earliest_cr_line,int_rate,loan_amnt,num_actv_bc_tl,loan_status,mort_acc,tot_cur_bal,open_acc,pub_rec,revol_util,term,total_acc,fico,addr_state_AK,addr_state_AL,addr_state_AR,addr_state_AZ,addr_state_CA,addr_state_CO,addr_state_CT,addr_state_DC,addr_state_DE,addr_state_FL,addr_state_GA,addr_state_HI,addr_state_IA,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,emp_length_0.0,emp_length_1.0,emp_length_2.0,emp_length_3.0,emp_length_4.0,emp_length_5.0,emp_length_5.998204645312776,emp_length_6.0,emp_length_7.0,emp_length_8.0,emp_length_9.0,emp_length_10.0,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,application_type_Individual,application_type_Joint App,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,sub_grade_A1,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,sub_grade_F3,sub_grade_F4,sub_grade_F5,sub_grade_G1,sub_grade_G2,sub_grade_G3,sub_grade_G4,sub_grade_G5,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified
0,123.03,55000.0,2003,13.99,3600.0,2.0,0,1.0,144904.0,7.0,0.0,29.7,36,13.0,677.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,820.28,65000.0,1999,11.99,24700.0,5.0,0,4.0,204396.0,22.0,0.0,19.2,36,38.0,717.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,432.66,63000.0,2000,10.78,20000.0,2.0,0,5.0,189699.0,6.0,0.0,56.2,60,18.0,697.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,289.91,104433.0,1998,22.45,10400.0,4.0,0,6.0,331730.0,12.0,0.0,64.5,60,35.0,697.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
4,405.18,34000.0,1987,13.44,11950.0,2.0,0,0.0,12798.0,5.0,0.0,68.4,36,6.0,692.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [19]:
# Let's check percentege of the positive/negative class after we have dropped some values in the cleaning part

loan_data = data['loan_status'].value_counts()
loan_data_norm = data['loan_status'].value_counts(normalize=True)

print('Positive class: {}({} %)'.format(loan_data[0], round(loan_data_norm[0], 4)*100.0))
print('Negative class: {}({} %)'.format(loan_data[1], round(loan_data_norm[1], 4)*100.0))
print('Proportion:', round(loan_data[0] / loan_data[1], 2), ': 1')

StatementMeta(Spark11, 19, 19, Finished, Available)

Positive class: 1013752(79.81 %)
Negative class: 256465(20.19 %)
Proportion: 3.95 : 1


In [20]:
#     REMINDER

#     Confusion Matrix: a table showing correct predictions and types of incorrect predictions.
#     Precision: the number of true positives divided by all positive predictions. Precision is also called Positive Predictive Value. It is a measure of a classifier’s exactness. Low precision indicates a high number of false positives.
#     Recall: the number of true positives divided by the number of positive values in the test data. Recall is also called Sensitivity or the True Positive Rate. It is a measure of a classifier’s completeness. Low recall indicates a high number of false negatives.
#     F1: Score: the weighted average of precision and recall.

StatementMeta(Spark11, 19, 20, Finished, Available)

In [21]:
X = data_encoded.loc[:, data_encoded.columns != 'loan_status']
y = data_encoded['loan_status']
X_col = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

StatementMeta(Spark11, 19, 21, Finished, Available)

In [22]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

StatementMeta(Spark11, 19, 22, Finished, Available)

(1016173, 135)
(1016173,)
(254044, 135)
(254044,)


In [23]:
# Let's save encoded data to csv

data.to_csv('***/Encoded_data.csv')  

StatementMeta(Spark11, 19, 23, Finished, Available)

In [24]:
# Since the features are varying in different ranges, we can have a problem
# It is important that we rescale the feature so that the model evaluation time ddecreases

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

StatementMeta(Spark11, 19, 24, Finished, Available)

In [25]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

StatementMeta(Spark11, 19, 25, Finished, Available)

(1016173, 135)
(1016173,)
(254044, 135)
(254044,)


In [26]:
# Leaving this here just in case I need it for something

# def return_scores(test, predicted):
#     roc_auc_sc = roc_auc_score(test, predicted)
#     accuracy_sc = accuracy_score(test, predicted)
#     recall_sc = recall_score(test, predicted)
#     f1_sc = f1_score(test, predicted)
#     precision_sc = precision_score(test, predicted)

#     return roc_auc_sc, accuracy_sc, recall_sc, f1_sc, precision_sc



# def get_model_score(model, model_params):
# cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)
#     rep = 1

#     roc = []
#     acc = []
#     rec = []
#     f1 = []
#     prec = []

#     model = model(**model_params)

#     for train_index, test_index in cv.split(X, y):
#         print(rep)
#         X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#         y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#         model.fit(X_train, y_train)
#         y_predict = model.predict(X_test)
#         roc_auc_sc, accuracy_sc, recall_sc, f1_sc, precision_sc = return_scores(y_test, y_predict)
#         roc.append(roc_auc_sc)
#         acc.append(accuracy_sc)
#         rec.append(recall_sc)
#         f1.append(f1_sc)
#         prec.append(precision_sc)
#         rep += 1


#     print('Mean ROC AUC score: {}'.format(np.mean(roc)))
#     print('Mean accuracy score: {}'.format(np.mean(acc)))
#     print('Mean recall score: {}'.format(np.mean(rec)))
#     print('Mean F1 score: {}'.format(np.mean(f1)))
#     print('Mean precision score: {}'.format(np.mean(prec)))


StatementMeta(Spark11, 19, 26, Finished, Available)

In [27]:
# accuracy was pretty high for training algorithms, but that value is misleading because our class is imbalanced
# this explains why accuracy cannot be relied on in this problem
# we can add penalties (add class weights) to make up for the low amount values in the class 1

StatementMeta(Spark11, 19, 27, Finished, Available)

In [28]:
# we have two classes - class_1 and class_2
# If the model makes a mistake where the true label is class_2 
# it is going to be penalized 10 times more than if it makes a mistake on a sample where the true class is class_1
# we want to have something like this because given the class distribution in the data, 
# the model will have a tendency of overfitting on the class_1 since it is overpopulated by default 
# by setting the class weights we are imposing an implicit constraint on the model that it is equally 
# bad to make a wrong prediction on 4 instances of the class_1 and 1 wrong prediction on an instance of the class_2.

StatementMeta(Spark11, 19, 28, Finished, Available)

In [29]:
# Scoring is set to roc_auc
# roc_auc_score is defined as the area under the ROC curve, 
# which is the curve having False Positive Rate on the x-axis and True Positive Rate on the y-axis at all classification thresholds.
# When 0.5 < AUC < 1, there is a high chance that the classifier will be able to distinguish the positive class values from the negative class values. 
# This is so because the classifier is able to detect more numbers of True positives and True negatives than False negatives and False positives.
# https://www.analyticsvidhya.com/blog/2020/06/auc-roc-curve-machine-learning/


def algorithm_pipeline(X_train, X_test, y_train, y_test, 
                       model, param_grid, cv=5, scoring_fit='roc_auc',
                       do_probabilities = False):
    gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        n_jobs=-1, 
        scoring=scoring_fit,
        verbose=2
    )
    fitted_model = gs.fit(X_train, y_train)
    
    if do_probabilities:
      pred = fitted_model.predict_proba(X_test)
    else:
      pred = fitted_model.predict(X_test)
    
    return fitted_model, pred

StatementMeta(Spark11, 19, 29, Finished, Available)

In [30]:
# About XGBClassifier and how it works can be found on this link: https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost-HowItWorks.html
# The reason why it takes so long to execute is because it is an accurate algorithm, 
# but it is not very scalable as during each split find procedure it iterates over all entries of input data. 
# In practice, this means long training time

model = XGBClassifier()
param_grid = {
    'n_estimators': [50],
    'scale_pos_weight': [4],
    'use_label_encoder': [False]    
    }

xgb_model, xgb_y_pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, 
                                 param_grid, cv=2)

StatementMeta(Spark11, 19, 30, Finished, Available)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


[CV]  n_estimators=50, scale_pos_weight=4, use_label_encoder=False, total= 1.7min
[CV]  n_estimators=50, scale_pos_weight=4, use_label_encoder=False, total= 1.7min
Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] n_estimators=50, scale_pos_weight=4, use_label_encoder=False ....
[CV] n_estimators=50, scale_pos_weight=4, use_label_encoder=False ....
[CV]  n_estimators=50, scale_pos_weight=4, use_label_encoder=False, total= 2.7min
[CV]  n_estimators=50, scale_pos_weight=4, use_label_encoder=False, total= 2.7min


In [31]:
# ROC_AUC score is 0.719

xgb_estimator = xgb_model.best_estimator_
xbg_best_score = xgb_model.best_score_
xgb_best_params = xgb_model.best_params_

print(xbg_best_score)
print(xgb_best_params)

StatementMeta(Spark11, 19, 31, Finished, Available)

0.7192166023516142
{'n_estimators': 50, 'scale_pos_weight': 4, 'use_label_encoder': False}


In [32]:
# As the name suggests, "Random Forest is a classifier that contains a number of decision trees on various subsets of the given dataset 
# and takes the average to improve the predictive accuracy of that dataset." 
# Instead of relying on one decision tree, the random forest takes the prediction from each tree and based on the majority votes of predictions, 
# and it predicts the final output.
# The greater number of trees in the forest leads to higher accuracy and prevents the problem of overfitting. 

model = RandomForestClassifier()
param_grid = {
    'n_estimators': [50],
    'class_weight': [{0:1, 1:4}]
}

rf_model, rf_y_pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, 
                                 param_grid, cv=2)

StatementMeta(Spark11, 19, 32, Finished, Available)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] class_weight={0: 1, 1: 4}, n_estimators=50 ......................
[CV] class_weight={0: 1, 1: 4}, n_estimators=50 ......................
[CV] ....... class_weight={0: 1, 1: 4}, n_estimators=50, total= 1.7min
[CV] ....... class_weight={0: 1, 1: 4}, n_estimators=50, total= 1.7min


In [33]:
# ROC_AUC score is 0.697

rf_estimator = rf_model.best_estimator_
rf_best_score = rf_model.best_score_
rf_best_params = rf_model.best_params_

print(rf_best_score)
print(rf_best_params)

StatementMeta(Spark11, 19, 33, Finished, Available)

0.6975254334454482
{'class_weight': {0: 1, 1: 4}, 'n_estimators': 50}


In [34]:
# DecisionTreeClassifier
# We can fit a DecisionTreeClassifier model on this dataset since it is a good model to 
# test due to it's sensitivity to the class distribution in the training dataset.
# Decision trees frequently perform well on imbalanced data. 
# They work by learning a hierarchy of if/else questions and this can force both classes to be addressed.
# Also, let's try evluating the model using repeated stratified k-fold cv

model = DecisionTreeClassifier()
param_grid = {
    'class_weight': [{0:1, 1:4}]
}

dtc_model, dtc_y_pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, 
                                 param_grid, cv=2)

StatementMeta(Spark11, 19, 34, Finished, Available)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] class_weight={0: 1, 1: 4} .......................................
[CV] class_weight={0: 1, 1: 4} .......................................
[CV] ........................ class_weight={0: 1, 1: 4}, total=  23.0s
[CV] ........................ class_weight={0: 1, 1: 4}, total=  23.6s


In [35]:
rf_estimator = dtc_model.best_estimator_
rf_best_score = dtc_model.best_score_
rf_best_params = dtc_model.best_params_

print(rf_best_score)
print(rf_best_params)

StatementMeta(Spark11, 19, 35, Finished, Available)

0.5509641263020226
{'class_weight': {0: 1, 1: 4}}


In [36]:
# TODO in the future: try LogisticRegression

StatementMeta(Spark11, 19, 36, Finished, Available)

In [37]:
# XGBClassifier had the best performance out of all these three classiififers with the roc_auc_score ~ 0.719
# Let's try to find best hyperparameters and check if the classifier can perform even better on the test set

model = XGBClassifier()
param_grid = {
    'n_estimators': [50],
    'scale_pos_weight': [4],
    'use_label_encoder': [False],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]

    }
xgb_1_model, xgb_y_1_pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, 
                                 param_grid, cv=2)

StatementMeta(Spark11, 19, 37, Submitted, Running)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] learning_rate=0.001, n_estimators=50, scale_pos_weight=4, use_label_encoder=False 
[CV] learning_rate=0.001, n_estimators=50, scale_pos_weight=4, use_label_encoder=False 
[CV] learning_rate=0.0001, n_estimators=50, scale_pos_weight=4, use_label_encoder=False 
[CV] learning_rate=0.0001, n_estimators=50, scale_pos_weight=4, use_label_encoder=False 
[CV]  learning_rate=0.001, n_estimators=50, scale_pos_weight=4, use_label_encoder=False, total= 5.3min
[CV] learning_rate=0.01, n_estimators=50, scale_pos_weight=4, use_label_encoder=False 
[CV]  learning_rate=0.001, n_estimators=50, scale_pos_weight=4, use_label_encoder=False, total= 5.3min
[CV] learning_rate=0.01, n_estimators=50, scale_pos_weight=4, use_label_encoder=False 
[CV]  learning_rate=0.0001, n_estimators=50, scale_pos_weight=4, use_label_encoder=False, total= 5.4min
[CV] learning_rate=0.1, n_estimators=50, scale_pos_weight=4, use_label_encoder=False 
[CV]  learning_r

In [41]:
# We got the same score as with the previous XGBClassifier

xgb_1_estimator = xgb_1_model.best_estimator_
xgb_1_best_score = xgb_1_model.best_score_
xgb_1_best_params = xgb_1_model.best_params_

print(xgb_1_best_score)
print(xgb_1_best_params)

StatementMeta(Spark11, 19, 41, Finished, Available)

0.7192166023516142
{'learning_rate': 0.3, 'n_estimators': 50, 'scale_pos_weight': 4, 'use_label_encoder': False}


In [53]:
# Classifier did a bit better than the last time abs
# Precision and recall values are not great, so maybe we should try to tweak the hyperparameters better, add/remove feattures or even use different
# scoring methog in the cross validation

y_pred = xgb_1_estimator.predict(X_test)
y_prob = xgb_1_estimator.predict_proba(X_test)[:,1]

score = roc_auc_score(y_test, y_prob)
matrix = confusion_matrix(y_test, y_pred)
print('score:', (score))
print('Classification report:\n {}\n'.format(classification_report(y_test, y_pred)))
print('Confusion matrix:\n {}\n'.format(matrix))

StatementMeta(Spark11, 19, 53, Finished, Available)

score: 0.7218408941085838
Classification report:
               precision    recall  f1-score   support

           0       0.89      0.64      0.74    202464
           1       0.32      0.68      0.44     51580

    accuracy                           0.65    254044
   macro avg       0.61      0.66      0.59    254044
weighted avg       0.77      0.65      0.68    254044


Confusion matrix:
 [[129276  73188]
 [ 16350  35230]]

