In [62]:
import warnings
import numpy as np
import pandas as pd
%matplotlib inline
from scipy.io import arff
import missingno as msno

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from collections import Counter
from collections import OrderedDict
from imblearn.over_sampling import SMOTE 

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

import random

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
# from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


In [63]:
def load_arff_raw_data():
    N=5
    return [arff.loadarff('E:/Study material/BDBA/study material/Sem-IV/Master Thesis/data/' + str(i+1) + 'year.arff') for i in range(N)]

def load_dataframes():
    return [pd.DataFrame(data_i_year[0]) for data_i_year in load_arff_raw_data()]

def set_new_headers(dataframes):
    cols = ['X' + str(i+1) for i in range(len(dataframes[0].columns)-1)]
    cols.append('Y')
    for df in dataframes:
        df.columns = cols

dataframes = load_dataframes()

set_new_headers(dataframes)    

dataframes[0].head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X56,X57,X58,X59,X60,X61,X62,X63,X64,Y
0,0.20055,0.37951,0.39641,2.0472,32.351,0.38825,0.24976,1.3305,1.1389,0.50494,...,0.12196,0.39718,0.87804,0.001924,8.416,5.1372,82.658,4.4158,7.4277,b'0'
1,0.20912,0.49988,0.47225,1.9447,14.786,0.0,0.25834,0.99601,1.6996,0.49788,...,0.1213,0.42002,0.853,0.0,4.1486,3.2732,107.35,3.4,60.987,b'0'
2,0.24866,0.69592,0.26713,1.5548,-1.1523,0.0,0.30906,0.43695,1.309,0.30408,...,0.24114,0.81774,0.76599,0.69484,4.9909,3.951,134.27,2.7185,5.2078,b'0'
3,0.081483,0.30734,0.45879,2.4928,51.952,0.14988,0.092704,1.8661,1.0571,0.57353,...,0.054015,0.14207,0.94598,0.0,4.5746,3.6147,86.435,4.2228,5.5497,b'0'
4,0.18732,0.61323,0.2296,1.4063,-7.3128,0.18732,0.18732,0.6307,1.1559,0.38677,...,0.13485,0.48431,0.86515,0.12444,6.3985,4.3158,127.21,2.8692,7.898,b'0'


In [64]:
def convert_columns_type_float(dfs):
    for i in range(5):
        index = 1
        while(index<=63):
            colname = dfs[i].columns[index]
            col = getattr(dfs[i], colname)
            dfs[i][colname] = col.astype(float)
            index+=1
            
convert_columns_type_float(dataframes)

In [65]:
def convert_class_label_type_int(dfs):
    for i in range(len(dfs)):
        col = getattr(dfs[i], 'Y')
        dfs[i]['Y'] = col.astype(int)
        
convert_class_label_type_int(dataframes)

In [66]:

def drop_nan_rows(dataframes, verbose=False):
    clean_dataframes = [df.dropna(axis=0, how='any') for df in dataframes]
    if verbose:
        for i in range(len(dataframes)):
            print(str(i+1)+'year:','Original Length=', len(dataframes[i]), '\tCleaned Length=', len(clean_dataframes[i]), '\tMissing Data=', len(dataframes[i])-len(clean_dataframes[i]))
    return clean_dataframes

nan_dropped_dataframes = drop_nan_rows(dataframes, verbose=True)

1year: Original Length= 7027 	Cleaned Length= 3194 	Missing Data= 3833
2year: Original Length= 10173 	Cleaned Length= 4088 	Missing Data= 6085
3year: Original Length= 10503 	Cleaned Length= 4885 	Missing Data= 5618
4year: Original Length= 9792 	Cleaned Length= 4769 	Missing Data= 5023
5year: Original Length= 5910 	Cleaned Length= 3031 	Missing Data= 2879


In [67]:
from fancyimpute import IterativeImputer as MICE
# Obtaining the completed features for all the 5 dataframes by doing MICE (Multiple Imputation from Chained Equations)
def perform_MICE_imputation(dfs):
    mice_imputed_datasets = [MICE(verbose=False).fit_transform(dfs[i]) for i in range(len(dfs))]
    return [pd.DataFrame(data=mice_imputed_datasets[i]) for i in range(len(dfs))]
    
mice_imputed_dataframes = perform_MICE_imputation(dataframes)
set_new_headers(mice_imputed_dataframes)

In [71]:
imputed_dataframes_dictionary = OrderedDict()

imputed_dataframes_dictionary['MICE'] = mice_imputed_dataframes
# imputed_dataframes_dictionary['NONE'] = nan_dropped_dataframes

In [72]:
def check_data_imbalance(dfs):
    for i in range(len(dfs)):
        print('Dataset: '+str(i+1)+'year')
        print(dfs[i].groupby('Y').size())
        minority_percent = (dfs[i]['Y'].tolist().count(1) / len(dfs[i]['Y'].tolist()))*100
        print('Minority (label 1) percentage: '+  str(minority_percent) + '%')
        print('-'*64)
        print(dfs)
        
check_data_imbalance(dataframes)

Dataset: 1year
Y
0    6756
1     271
dtype: int64
Minority (label 1) percentage: 3.856553294435748%
----------------------------------------------------------------
[            X1       X2       X3       X4        X5        X6        X7  \
0     0.200550  0.37951  0.39641  2.04720   32.3510  0.388250  0.249760   
1     0.209120  0.49988  0.47225  1.94470   14.7860  0.000000  0.258340   
2     0.248660  0.69592  0.26713  1.55480   -1.1523  0.000000  0.309060   
3     0.081483  0.30734  0.45879  2.49280   51.9520  0.149880  0.092704   
4     0.187320  0.61323  0.22960  1.40630   -7.3128  0.187320  0.187320   
...        ...      ...      ...      ...       ...       ...       ...   
7022  0.018371  0.47410 -0.13619  0.60839  -18.4490  0.018371  0.018371   
7023 -0.013359  0.58354 -0.02265  0.92896  -42.2320 -0.013359 -0.015036   
7024  0.006338  0.50276  0.43923  1.87360    9.7417  0.006338  0.012022   
7025 -0.041643  0.84810 -0.12852  0.57485 -121.9200  0.000000 -0.036795   
7026  0.0

In [73]:
def split_dataframes_features_labels(dfs):
    feature_dfs = [dfs[i].iloc[:,0:64] for i in range(len(dfs))]
    label_dfs = [dfs[i].iloc[:,64] for i in range(len(dfs))]
    return feature_dfs, label_dfs


def oversample_data_SMOTE(dfs, verbose=False):
    smote = SMOTE(random_state=42, k_neighbors=10)
    feature_dfs, label_dfs = split_dataframes_features_labels(dfs)
    resampled_feature_arrays = []
    resampled_label_arrays = []
    for i in range(len(dfs)):
        if verbose: print('Dataset: ' + str(i+1) + 'year:')
        if verbose: print('Original dataset shape {}'.format(Counter(label_dfs[i])))
        dfi_features_res, dfi_label_res = smote.fit_sample(feature_dfs[i], label_dfs[i])
        if verbose: print('Resampled dataset shape {}\n'.format(Counter(dfi_label_res)))
        resampled_feature_arrays.append(dfi_features_res)
        resampled_label_arrays.append(dfi_label_res)        
    return resampled_feature_arrays, resampled_label_arrays

def restructure_arrays_to_dataframes(feature_arrays, label_arrays):
    resampled_dfs = []
    for i in range(len(feature_arrays)):
        feature_df = pd.DataFrame(data=feature_arrays[i])
        label_df = pd.DataFrame(data=label_arrays[i])
        label_df.columns=['Y'] 
        resampled_dfs.append(feature_df.join(label_df))
    set_new_headers(resampled_dfs)    
    return resampled_dfs

def perform_oversampling_on_imputed_dataframes(df_dict):
    imputed_oversampled_dataframes_dictionary = OrderedDict()
    for key,dfs in df_dict.items():
        print('SMOTE Oversampling for ' + key + ' imputed dataframes\n')
        smote_feature_arrays, smote_label_arrays = oversample_data_SMOTE(dfs, verbose=True)
        oversampled_dataframes = restructure_arrays_to_dataframes(smote_feature_arrays, smote_label_arrays)
        imputed_oversampled_dataframes_dictionary[key] = oversampled_dataframes
        print('-'*100)
    return imputed_oversampled_dataframes_dictionary


imputed_oversampled_dataframes_dictionary = perform_oversampling_on_imputed_dataframes(imputed_dataframes_dictionary)

SMOTE Oversampling for MICE imputed dataframes

Dataset: 1year:
Original dataset shape Counter({0.0: 6756, 1.0: 271})
Resampled dataset shape Counter({0.0: 6756, 1.0: 6756})

Dataset: 2year:
Original dataset shape Counter({0.0: 9773, 1.0: 400})
Resampled dataset shape Counter({0.0: 9773, 1.0: 9773})

Dataset: 3year:
Original dataset shape Counter({0.0: 10008, 1.0: 495})
Resampled dataset shape Counter({0.0: 10008, 1.0: 10008})

Dataset: 4year:
Original dataset shape Counter({0.0: 9277, 1.0: 515})
Resampled dataset shape Counter({0.0: 9277, 1.0: 9277})

Dataset: 5year:
Original dataset shape Counter({0.0: 5500, 1.0: 410})
Resampled dataset shape Counter({0.0: 5500, 1.0: 5500})

----------------------------------------------------------------------------------------------------


In [74]:
def prepare_kfold_cv_data(k, X, y, verbose=False):
    X = X.values
    y = y.values
    kf = KFold(n_splits=k, shuffle=False, random_state=42)
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    
    for train_index, test_index in kf.split(X):
        X_train.append(X[train_index])
        y_train.append(y[train_index])
        X_test.append(X[test_index])
        y_test.append(y[test_index])
    return X_train, y_train, X_test, y_test

In [81]:
# Gaussian Naive Bayes classifier
gnb_classifier = GaussianNB()

In [82]:
# Logistic Regression classifier
# lr_classifier = LogisticRegression(penalty = 'l1', random_state = 0)
lr_classifier = LogisticRegression(C=1, penalty='l1', solver='liblinear')

In [83]:
# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

In [84]:
# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators = 5, criterion = 'entropy')

In [85]:
# eXtreme Gradient Boosting Classifier (XGBClassifier)
xgb_classifier = XGBClassifier()

In [86]:
# import lightgbm as lgb

# params = {}
# params['learning_rate'] = 0.003
# # params['boosting_type'] = 'gbdt'
# params['metric'] = 'binary_logloss'
# params['sub_feature'] = 0.5
# params['num_leaves'] = 10
# params['min_data'] = 50
# params['max_depth'] = 10
# param['metric'] = ['auc', 'binary_logloss']
# lgbm_classifier = lgb.LGBMClassifier(num_leaves = 150 ,max_depth= 42 ,learning_rate =.05 ,max_bin= 200 , colsample_bytree=1.0, min_child_samples=20)

lgb_classifier = LGBMClassifier()


In [87]:
models_dictionary = OrderedDict()

models_dictionary['Gaussian Naive Bayes'] = gnb_classifier
models_dictionary['Logistic Regression'] = lr_classifier
models_dictionary['Decision Tree'] = dt_classifier
models_dictionary['Extreme Gradient Boosting'] = xgb_classifier
models_dictionary['Light Gradient Boosting'] = lgb_classifier
models_dictionary['Random Forest'] = rf_classifier

In [88]:
def perform_data_modeling(_models_, _imputers_, verbose=False, k_folds=5):
    
    model_results = OrderedDict()
    
    for model_name, clf in _models_.items():
        if verbose: print("-"*120, "\n", "Model: " + '\033[1m' + model_name + '\033[0m' + " Classifier")
        imputer_results = OrderedDict()
        
        for imputer_name, dataframes_list in _imputers_.items():
            if verbose: print('\tImputer Technique: ' + '\033[1m' + imputer_name + '\033[0m')
            
            feature_dfs, label_dfs = split_dataframes_features_labels(dataframes_list)            
            
            year_results = OrderedDict()
            
            for df_index in range(len(dataframes_list)):
                if verbose: print('\t\tDataset: ' + '\033[1m' + str(df_index+1) + 'year' + '\033[0m')
               
                X_train_list, y_train_list, X_test_list, y_test_list = prepare_kfold_cv_data(k_folds, feature_dfs[df_index], label_dfs[df_index], verbose)
                
                metrics_results = OrderedDict()
                accuracy_list = np.zeros([k_folds])
                precision_list = np.zeros([k_folds,2])
                recall_list = np.zeros([k_folds,2])
                confusion_list = np.zeros([k_folds])
                TN_list = np.zeros([k_folds])
                FP_list = np.zeros([k_folds])
                FN_list = np.zeros([k_folds])
                TP_list = np.zeros([k_folds])                
                
                for k_index in range(k_folds):
                    X_train = X_train_list[k_index]
                    y_train = y_train_list[k_index]
                    X_test = X_test_list[k_index]
                    y_test = y_test_list[k_index]
                    
                    clf = clf.fit(X_train, y_train)
                    y_test_predicted = clf.predict(X_test)
                    


                    _accuracy_ = accuracy_score(y_test, y_test_predicted.round(), normalize=True)
                    accuracy_list[k_index] = _accuracy_
                    
                    _recalls_ = recall_score(y_test, y_test_predicted, average=None)
                    recall_list[k_index] = _recalls_

                    
                    _precisions_ = precision_score(y_test, y_test_predicted, average=None)
                    precision_list[k_index] = _precisions_
                                     
                    _confusion_matrix_ = confusion_matrix(y_test, y_test_predicted)                    
                    TN_list[k_index] = _confusion_matrix_[0][0]
                    FP_list[k_index] = _confusion_matrix_[0][1]
                    FN_list[k_index] = _confusion_matrix_[1][0]
                    TP_list[k_index] = _confusion_matrix_[1][1]


                
                
                metrics_results['Accuracy'] = np.mean(accuracy_list)
                metrics_results['Precisions'] = np.mean(precision_list, axis=0)
                metrics_results['Recalls'] = np.mean(recall_list, axis=0)
                metrics_results['Confusion'] = np.mean(confusion_list, axis=0)
                metrics_results['Co'] = np.mean(_confusion_matrix_)
                metrics_results['TN'] = np.mean(TN_list)
                metrics_results['FP'] = np.mean(FP_list)
                metrics_results['FN'] = np.mean(FN_list)
                metrics_results['TP'] = np.mean(TP_list)
                
                if verbose:
                    print('\t\t\tAccuracy:', metrics_results['Accuracy'])
                    print('\t\t\tPrecision:', metrics_results['Precisions'])
                    print('\t\t\tRecall:', metrics_results['Recalls'])
                    print('\t\t\Matrix:', metrics_results['Co'])
                    print('\t\t\TN:', metrics_results['TN'])
                    print('\t\t\FP:', metrics_results['FP'])
                    print('\t\t\FN:', metrics_results['FN'])
                    print('\t\t\TP:', metrics_results['TP'])
                    
#                     print('RF train roc-auc: {}'.format(roc_auc_score(y_train, y_train_predicted[:,1])))
#                     print('RF test roc-auc: {}'.format(roc_auc_score(y_test, y_test_predicted[:,1])))


#                     print('Recall: %f', _recalls_)
#                     print('Precision: %f', _precision_)
                        
                year_results[str(df_index+1)+'year'] = metrics_results   
                
            imputer_results[imputer_name] = year_results
            
        model_results[model_name] = imputer_results  
                    
    return model_results

In [89]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [90]:
results = perform_data_modeling(models_dictionary, imputed_oversampled_dataframes_dictionary, verbose=True, k_folds=5)

------------------------------------------------------------------------------------------------------------------------ 
 Model: [1mGaussian Naive Bayes[0m Classifier
	Imputer Technique: [1mMICE[0m
		Dataset: [1m1year[0m
			Accuracy: 0.5116151064981668
			Precision: [0.53333333 0.50091047]
			Recall: [0.02250076 0.59067598]
		\Matrix: 675.5
		\TN: 52.0
		\FP: 1299.2
		\FN: 20.8
		\TP: 1330.4
		Dataset: [1m2year[0m
			Accuracy: 0.5088750663267075
			Precision: [0.50638298 0.50018349]
			Recall: [0.01908359 0.59007534]
		\Matrix: 977.25
		\TN: 64.6
		\FP: 1890.0
		\FN: 30.0
		\TP: 1924.6
		Dataset: [1m3year[0m
			Accuracy: 0.5180095447894598
			Precision: [0.53666667 0.5011589 ]
			Recall: [0.03062713 0.58955878]
		\Matrix: 1000.75
		\TN: 106.2
		\FP: 1895.4
		\FN: 34.2
		\TP: 1967.4
		Dataset: [1m4year[0m
			Accuracy: 0.5140375847720152
			Precision: [0.51555556 0.50061521]
			Recall: [0.03239246 0.58275387]
		\Matrix: 927.5
		\TN: 104.6
		\FP: 1750.8
		\FN: 52.6
		\TP: 180