In [2]:
pip install sklearn

Note: you may need to restart the kernel to use updated packages.


In [8]:
import sklearn.externals as extjoblib
import joblib

In [None]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split

import lightgbm as lgb
import gc

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.mixture import GaussianMixture

pd.options.display.max_rows=999

In [None]:
# Load the raw competition training data; low_memory=False makes pandas read the
# file in one pass so column dtypes are inferred from all rows at once.
df1 = pd.read_csv(
    filepath_or_buffer='2020_Competition_Training.csv',
    low_memory=False,
)
df1.shape

#### Drop Categorical Variables (Indicator Variables) and Continuous Variables (High Multicollinearity)

In [None]:
# The columns to discard are listed in the 'names' column of drop_columns.csv.
drop_data = pd.read_csv('drop_columns.csv')
dropped_columns = drop_data.loc[:, 'names'].tolist()

# Remove those columns from the raw frame and show the resulting shape.
df2 = df1.drop(columns=dropped_columns)
df2.shape

#### Update Variable DataType

In [None]:
# Optional dump of the current dtypes, kept disabled:
#pd.DataFrame(df2.dtypes).to_csv('data_type.csv')

# Each helper CSV carries the relevant column names in its 'names' column.
interval_data = pd.read_csv('Interval_Variables.csv')
categorical_data = pd.read_csv('Categorical_Variables.csv')

interval_columns = interval_data.loc[:, 'names'].tolist()
categorical_columns = categorical_data.loc[:, 'names'].tolist()

#### Convert to Categorical Data (Using Label Encoding)

In [None]:
# Label-encode every categorical column of df2.
# Missing values are imputed with the column's most frequent value BEFORE
# encoding: `cat.codes` maps NaN to -1, so a fillna() issued after encoding
# never finds anything to fill.
for i in categorical_columns:
    column = df2[i]
    modes = column.mode()  # empty Series when the whole column is missing
    if not modes.empty:
        # mode() returns a Series (ties are possible); use its first entry as a scalar fill value.
        column = column.fillna(modes.iloc[0])
    # Assign the result back explicitly instead of relying on a chained inplace operation.
    df2[i] = column.astype('category').cat.codes

In [None]:
# Coerce every interval (continuous) column to a float dtype and replace
# missing values with the column mean.
for i in interval_columns:
    numeric = pd.to_numeric(df2[i]) * 1.0  # multiplying by 1.0 forces a float dtype
    # Write the filled column back explicitly: a chained `fillna(..., inplace=True)`
    # on `df2[i]` operates on a temporary under pandas Copy-on-Write.
    df2[i] = numeric.fillna(numeric.mean())
#pd.DataFrame(df2.dtypes).to_csv('data_type_updated.csv')

Clustering Analysis:

In [None]:
# Feature frame for clustering: remove the target and the identifier/geography columns.
# `columns=` is used because pandas 2.0 no longer accepts `axis` positionally in drop().
df3 = df2.drop(columns=['transportation_issues','person_id_syn','zip_cd','cnty_cd','state_cd'])

In [None]:
# Fit a two-component Gaussian mixture on the clustering features.
# A fixed random_state keeps the component numbering stable between runs, which
# matters because a specific cluster id is selected later in the notebook.
gmm = GaussianMixture(n_components=2, random_state=501)
gmm.fit(df3)

In [None]:
# Assign each row of df3 to its most likely mixture component.
labels = gmm.predict(X=df3)

In [None]:
# Tabulate how many rows fall in each cluster: one (cluster id, count) pair per row.
unique, counts = np.unique(labels, return_counts=True)
np.column_stack((unique, counts))

#For 4 clusters
array([[    0,  4192],
       [    1, 19981],
       [    2, 42457],
       [    3,  2942]], dtype=int64)
       
#For 5 clusters
array([[    0, 19586],
       [    1,  3000],
       [    2, 38343],
       [    3,  1798],
       [    4,  6845]], dtype=int64)
       
#For 6 clusters
array([[    0,  1423],
       [    1,  2098],
       [    2, 20661],
       [    3,  4958],
       [    4, 37689],
       [    5,  2743]], dtype=int64)

#### Perform stratified train and test split

In [None]:
# Attach the GMM cluster assignment to the feature frame.
df3 = df3.assign(Cluster=labels)

In [None]:
# Bring the target column back from df2 (rows are aligned on the index).
df3 = df3.assign(transportation_issues=df2['transportation_issues'])

In [None]:
# Cross-tabulate cluster membership against the target, with row and column totals.
my_crosstab = pd.crosstab(
    df3['Cluster'],
    df3['transportation_issues'],
    margins=True,
)
my_crosstab

In [None]:
# Keep only the rows assigned to cluster 1; modelling continues on this subset.
in_cluster_one = df3['Cluster'].eq(1)
df3_Cluster = df3.loc[in_cluster_one]

In [None]:
# Feature matrix: everything in the cluster subset except the target.
# `columns=` is used because pandas 2.0 no longer accepts `axis` positionally in drop().
X = df3_Cluster.drop(columns=['transportation_issues'])

In [None]:
# Target as a one-column frame with integer category codes.
# An explicit copy is taken so the assignment below writes to an independent
# frame rather than to a slice of df3_Cluster (avoids SettingWithCopyWarning).
Y = df3_Cluster[['transportation_issues']].copy()
Y['transportation_issues'] = Y['transportation_issues'].astype('category').cat.codes

In [None]:
# 80/20 split stratified on the target so both parts keep the same class balance.
# random_state pins the shuffle so the split (and every metric below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, stratify=Y, random_state=501
)

In [None]:
# Report the number of rows in each part of the split.
for caption, split_part in (
    ("Length of Training (X):", X_train),
    ("Length of Training (Y):", y_train),
    ("Length of Test (X):", X_test),
    ("Length of Test (Y):", y_test),
):
    print(caption, len(split_part))

In [None]:
# Compare the class proportions of the target in the full, train and test sets.
for caption, target_frame in (
    ("Distribution of Y in Overall Data: ", Y),
    ("Distribution of Y in Train Data: ", y_train),
    ("Distribution of Y in Test Data: ", y_test),
):
    print(caption, target_frame['transportation_issues'].value_counts(normalize=True))

#### Light GBM as a Feature Importance Algorithm

In [None]:
# Wrap the training split in a LightGBM Dataset for use with lgb.train.
# Disabled alternative, kept for reference:
#X_train = X_train.values.astype(np.float32, copy=False)
train_data = lgb.Dataset(data=X_train, label=y_train)

#Max Depth: 30, AUC Train: 0.8356306010875826, AUC Test:  0.7512941064532952
#Max Depth: 25, AUC Train: 0.8356306010875826, AUC Test:  0.7512941064532952
#Max Depth: 20, AUC Train: 0.8356306010875826, AUC Test:  0.7512941064532952
#Max Depth: 15, AUC Train: 0.8356306010875826, AUC Test:  0.7512941064532952
#Max Depth: 12, AUC Train: 0.8347654095204213, AUC Test:  0.7516753138653118
#Max Depth: 10, AUC Train: 0.8346912618193673, AUC Test:  0.7519381243918067
#Max Depth: 9,  AUC Train: 0.8344044285140264, AUC Test:  0.751128879407961
#Max Depth: 8,  AUC Train: 0.8328881168017364, AUC Test:  0.7509371813463662

#Max Depth: <=0, AUC Train:  0.932791181302066, AUC Test:  0.7378888286695064,  CV = 5
#Max Depth: -1 , AUC Train:  0.8064933442735, AUC Test:  0.7515543566006821,  CV = 5

In [None]:
# Baseline LightGBM parameters for the first training round; selected entries
# are overwritten later with the grid-search winners.
params = dict(
    boosting_type='gbdt',
    max_depth=-1,
    objective='binary',
    nthread=5,
    num_leaves=64,
    learning_rate=0.02,
    max_bin=512,
    subsample_for_bin=200,
    subsample=1,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=1.2,
    reg_lambda=1.2,
    min_split_gain=0.5,
    min_child_weight=1,
    min_child_samples=5,
    scale_pos_weight=1,
    num_class=1,
    metric='binary_error',
    extra_trees='True',
    is_unbalance='True',
)

In [None]:
# Candidate values explored by the grid search; single-element lists pin a setting.
gridParams = dict(
    learning_rate=[0.02, 0.03],
    n_estimators=[8, 16],
    num_leaves=[20, 24, 27],
    boosting_type=['gbdt'],
    objective=['binary'],
    random_state=[501],
    colsample_bytree=[0.64, 0.65],
    subsample=[0.7, 0.75],
    reg_alpha=[1, 1.2],
    reg_lambda=[1.2, 1.4],
)

In [None]:
# Base estimator handed to the grid search. Fixed settings are taken from
# `params`; the tunable ones are supplied by the parameter grid.
# `verbose=-1` replaces the former `silent=True`, which was removed from the
# scikit-learn wrapper in LightGBM 4.0.
mdl = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    n_jobs=5,
    verbose=-1,
    max_depth=params['max_depth'],
    max_bin=params['max_bin'],
    subsample_for_bin=params['subsample_for_bin'],
    subsample=params['subsample'],
    subsample_freq=params['subsample_freq'],
    min_split_gain=params['min_split_gain'],
    min_child_weight=params['min_child_weight'],
    min_child_samples=params['min_child_samples'],
    scale_pos_weight=params['scale_pos_weight'],
)

In [None]:
# Show the names of all parameters the estimator exposes (useful when choosing keys for the search grid).
mdl.get_params().keys()

In [None]:
# Exhaustive search over gridParams with 8-fold cross-validation, using all
# available cores and per-fit progress output.
grid = GridSearchCV(
    estimator=mdl,
    param_grid=gridParams,
    cv=8,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search.
# y_train is a one-column DataFrame; flatten it to 1-D so the estimators receive
# the expected label shape instead of a column vector (avoids DataConversionWarning).
grid.fit(X_train, np.ravel(y_train))

In [None]:
# Show the winning parameter combination followed by its mean cross-validated score.
for search_result in (grid.best_params_, grid.best_score_):
    print(search_result)

In [None]:
# Overwrite the baseline params with the values the grid search selected.
# num_leaves is not copied over (that override is disabled):
#params['num_leaves'] = grid.best_params_['num_leaves']
for tuned_key in ('colsample_bytree', 'learning_rate', 'reg_alpha', 'reg_lambda', 'subsample'):
    params[tuned_key] = grid.best_params_[tuned_key]

In [None]:
print('Fitting with params: ')
print(params)

# Train the final booster on the selected parameters for 450 boosting rounds.
# `verbose_eval` was dropped: no validation sets are passed, so it produced no
# output, and LightGBM 4.0 removed the argument from lgb.train altogether.
# Early stopping stays disabled (it would also require a validation set):
#early_stopping_rounds= 40
lgbm = lgb.train(
    params,
    train_data,
    num_boost_round=450,
)

In [None]:
# Score the training rows, then threshold the predicted probabilities at 0.5
# to obtain hard 0/1 class predictions.
predictions_lgbm_prob = lgbm.predict(X_train)
predictions_lgbm_01 = (predictions_lgbm_prob > 0.5).astype(int)

In [None]:
# Actual target (rows) versus predicted class (columns) on the training set, with totals.
my_crosstab = pd.crosstab(
    y_train['transportation_issues'],
    predictions_lgbm_01,
    margins=True,
)
my_crosstab

In [None]:
# Evaluate the LightGBM model on the TRAINING split.
# Assumes predictions_lgbm_prob / predictions_lgbm_01 currently hold the
# training-set predictions.

# Plot variable importances (top 15 by split count)
lgb.plot_importance(lgbm, max_num_features=15, importance_type='split')

# Print accuracy
acc_lgbm = accuracy_score(y_train, predictions_lgbm_01)
print('Overall accuracy of Light GBM model:', acc_lgbm)

# Plot the ROC curve and report the area under it
plt.figure()
false_positive_rate, recall, thresholds = roc_curve(y_train, predictions_lgbm_prob)
roc_auc = auc(false_positive_rate, recall)
plt.title('Receiver Operating Characteristic (ROC)')
plt.plot(false_positive_rate, recall, 'b', label = 'AUC = %0.3f' %roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1], [0,1], 'r--')  # chance-level diagonal
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.ylabel('Recall')
plt.xlabel('Fall-out (1-Specificity)')
plt.show()

print('AUC score:', roc_auc)

# Plot the confusion matrix.
# The tick labels are kept in `class_labels` so the GMM cluster assignments
# stored in `labels` are not overwritten, and they name the actual target
# (assumes code 1 means a transportation issue was reported — confirm).
# Only one figure is opened here; the former extra plt.figure() rendered blank.
cm = confusion_matrix(y_train, predictions_lgbm_01)
class_labels = ['No Transportation Issue', 'Transportation Issue']
plt.figure(figsize=(8,6))
sns.heatmap(cm, xticklabels = class_labels, yticklabels = class_labels, annot = True, fmt='d', cmap="Blues", vmin = 0.2);
plt.title('Confusion Matrix')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()

In [None]:
# Score the held-out TEST rows, then threshold the predicted probabilities at
# 0.5 to obtain hard 0/1 class predictions.
predictions_lgbm_prob = lgbm.predict(X_test)
predictions_lgbm_01 = (predictions_lgbm_prob > 0.5).astype(int)

In [None]:
# Evaluate the LightGBM model on the held-out TEST split.
# Assumes predictions_lgbm_prob / predictions_lgbm_01 currently hold the
# test-set predictions.

# Plot variable importances (top 10 by split count)
lgb.plot_importance(lgbm, max_num_features=10, importance_type='split')

# Print accuracy
acc_lgbm = accuracy_score(y_test, predictions_lgbm_01)
print('Overall accuracy of Light GBM model:', acc_lgbm)

# Plot the ROC curve and report the area under it
plt.figure()
false_positive_rate, recall, thresholds = roc_curve(y_test, predictions_lgbm_prob)
roc_auc = auc(false_positive_rate, recall)
plt.title('Receiver Operating Characteristic (ROC)')
plt.plot(false_positive_rate, recall, 'b', label = 'AUC = %0.3f' %roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1], [0,1], 'r--')  # chance-level diagonal
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.ylabel('Recall')
plt.xlabel('Fall-out (1-Specificity)')
plt.show()

print('AUC score:', roc_auc)

# Plot the confusion matrix.
# The tick labels are kept in `class_labels` so the GMM cluster assignments
# stored in `labels` are not overwritten, and they name the actual target
# (assumes code 1 means a transportation issue was reported — confirm).
# Only one figure is opened here; the former extra plt.figure() rendered blank.
cm = confusion_matrix(y_test, predictions_lgbm_01)
class_labels = ['No Transportation Issue', 'Transportation Issue']
plt.figure(figsize=(8,6))
sns.heatmap(cm, xticklabels = class_labels, yticklabels = class_labels, annot = True, fmt='d', cmap="Blues", vmin = 0.2);
plt.title('Confusion Matrix')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()

In [None]:
# Single-feature baseline: logistic regression on 'est_age' only.
# Inputs come from the existing split: X_train, X_test, y_train, y_test.
X_train_trim = X_train[['est_age']]
X_test_trim = X_test[['est_age']]

classifier = LogisticRegression(random_state = 0)
# y_train is a one-column DataFrame; flatten it so fit() receives a 1-D label
# array instead of a column vector (avoids DataConversionWarning).
classifier.fit(X_train_trim, np.ravel(y_train))

# Hard class predictions on the training rows
y_pred = classifier.predict(X_train_trim)

In [None]:
# Confusion matrix of the baseline on the training split.
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print("Confusion Matrix : \n", cm)

In [None]:
# Accuracy of the baseline on the training split.
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_pred)
print("Accuracy : ", train_accuracy)

In [None]:
# Re-predict on the held-out test rows and report the same two metrics.
y_pred = classifier.predict(X_test_trim)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Confusion Matrix : \n", cm)
print("Accuracy : ", test_accuracy)

In [None]:
# Evaluate the logistic-regression baseline on the held-out TEST split.

# Plot the ROC curve and report the area under it.
# The ROC needs a continuous score: hard 0/1 predictions collapse the curve to
# a single operating point, so the predicted probability of class 1 is used.
y_score = classifier.predict_proba(X_test_trim)[:, 1]
plt.figure()
false_positive_rate, recall, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(false_positive_rate, recall)
plt.title('Receiver Operating Characteristic (ROC)')
plt.plot(false_positive_rate, recall, 'b', label = 'AUC = %0.3f' %roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1], [0,1], 'r--')  # chance-level diagonal
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.ylabel('Recall')
plt.xlabel('Fall-out (1-Specificity)')
plt.show()

print('AUC score:', roc_auc)

# Plot the confusion matrix (built from the hard predictions in y_pred).
# The tick labels are kept in `class_labels` so the GMM cluster assignments
# stored in `labels` are not overwritten, and they name the actual target
# (assumes code 1 means a transportation issue was reported — confirm).
# Only one figure is opened here; the former extra plt.figure() rendered blank.
cm = confusion_matrix(y_test, y_pred)
class_labels = ['No Transportation Issue', 'Transportation Issue']
plt.figure(figsize=(8,6))
sns.heatmap(cm, xticklabels = class_labels, yticklabels = class_labels, annot = True, fmt='d', cmap="Blues", vmin = 0.2);
plt.title('Confusion Matrix')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()