In [88]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_curve,auc,roc_auc_score


pd.set_option('display.max_columns',None)

In [27]:
data_train = pd.read_csv('train_HK6lq50.csv')
data_test = pd.read_csv('test_wF0Ps6O.csv')

In [28]:
df_train = data_train.copy(deep = True)
df_test = data_train.copy(deep = True)

In [29]:
df_train.head()

Unnamed: 0,id,program_id,program_type,program_duration,test_id,test_type,difficulty_level,trainee_id,gender,education,city_tier,age,total_programs_enrolled,is_handicapped,trainee_engagement_rating,is_pass
0,9389_150,Y_1,Y,136,150,offline,intermediate,9389,M,Matriculation,3,24.0,5,N,1.0,0
1,16523_44,T_1,T,131,44,offline,easy,16523,F,High School Diploma,4,26.0,2,N,3.0,1
2,13987_178,Z_2,Z,120,178,online,easy,13987,M,Matriculation,1,40.0,1,N,2.0,1
3,13158_32,T_2,T,117,32,offline,easy,13158,F,Matriculation,3,,4,N,1.0,1
4,10591_84,V_3,V,131,84,offline,intermediate,10591,F,High School Diploma,1,42.0,2,N,4.0,1


## Eda

In [30]:
df_train.is_pass.value_counts()

1    50867
0    22280
Name: is_pass, dtype: int64

In [31]:
# for col in df_train.columns:
#     print(col)
#     print()
#     print(df_train[col].value_counts())
#     print('='*80)

In [32]:
# df_train.groupby('gender')['is_pass'].value_counts()    # drop gender

In [33]:
# df_train.groupby('is_handicapped')['is_pass'].value_counts(normalize = True)    # drop is_handicapped

In [34]:
# drop program_type as program id is related 

In [35]:
# df_train.groupby('city_tier')['is_pass'].value_counts(normalize = True)    

In [36]:
# df_train.groupby('trainee_engagement_rating')['is_pass'].value_counts(normalize = True) # cannot drop

In [37]:
def type_casting(data):
    df = data
    df['program_id'] = df['program_id'].astype('object')
    df['education'] = df['education'].astype('object')
    df['test_type'] = df['test_type'].astype('object')
    df['difficulty_level'] = df['difficulty_level'].astype('object')
    
    return(df)

In [38]:
def drop(data):
#     df = data.copy()
    df = data.drop(columns = ['is_handicapped','gender','program_type','id'])
    
    return df
    

In [39]:
def label_enc(dataset):
    from sklearn.preprocessing import LabelEncoder
    df1 = dataset.copy()
    df_categorical = df1.select_dtypes(include=['object'])

    # apply Label encoder to df_categorical

    le = LabelEncoder()
    df_categorical = df_categorical.apply(le.fit_transform)
    df_categorical = df_categorical.astype('object')

    # concat df_categorical with original df AFTER LABEL ENCODING
    df1 = df1.drop(df_categorical.columns, axis=1)
    df1 = pd.concat([df1, df_categorical], axis=1)
    
    df_categorical =0
    return df1

In [40]:
def impute(data):
    from sklearn.impute import SimpleImputer
    
    si = SimpleImputer()
    array = si.fit_transform(data)
    
    return array
    

In [41]:
def standardize(dataset, X_test = None ,test = False):
    from sklearn.preprocessing import StandardScaler
    df1 = dataset.copy()
    df_num = df1.select_dtypes(include=['int64','float64','int32','float32'])
    df_num = pd.DataFrame(impute(df_num), columns = df_num.columns,index = df1.index)

    se = StandardScaler()
    df_scaled = pd.DataFrame(se.fit_transform(df_num),columns = df_num.columns, index = df_num.index)
    
    df_new = df1.drop(df_num.columns,axis =1)
    df_new = pd.concat([df_scaled,df_new],axis =1)
    
    if test:
        df1 = X_test.copy()
        df_test_num = X_test.select_dtypes(include=['int64','float64','int32','float32'])
        df_test_num = pd.DataFrame(impute(df_test_num), columns = df_test_num.columns,index = df_test_num.index)
        
        df_scaled = pd.DataFrame(se.transform(df_test_num),columns = df_test_num.columns, index = df_test_num.index)
        
        df_new_ = df1.drop(df_test_num.columns,axis =1)
        df_new_ = pd.concat([df_new_,df_scaled],axis =1)
    else:
        df_new_ = 0
    
#     df_num ,df1,df_scaled = 0    
    return df_new,df_new_

In [42]:
# df = drop(df_train)
# df = type_casting(df)
# df = label_enc(df)
# df_tr,_ = standardize(df)
# # df_tr

## Split

In [43]:
df = drop(df_train)
df = type_casting(df)
df = label_enc(df)

In [44]:
X = df.drop('is_pass',axis = 1)
y = df.is_pass

In [45]:
X1,_ = standardize(X)

In [46]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(X1,y, random_state = 6, stratify = y)

## ML models

In [47]:
def cls(model):
    from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
    
    model = model()
    model.fit(X_train, y_train)

    y_predict = model.predict(X_test)
    print(confusion_matrix(y_test,y_predict)) 
    print(accuracy_score(y_test,y_predict))

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

### w/o dummies

In [49]:
cls(DecisionTreeClassifier)

[[2825 2745]
 [2772 9945]]
0.698310275058785


In [50]:
cls(AdaBoostClassifier)

[[ 1251  4319]
 [  905 11812]]
0.7143325859900476


In [51]:
cls(RandomForestClassifier)

[[ 2610  2960]
 [ 1697 11020]]
0.7453382184065183


In [52]:
cls(GradientBoostingClassifier)

[[ 1374  4196]
 [  852 11865]]
0.7239569092798163


In [53]:
cls(GaussianNB)

[[ 2001  3569]
 [ 2284 10433]]
0.6799365669601356


### dummies

In [54]:
X_train_dum = pd.get_dummies(X_train, drop_first= True)
X_test_dum = pd.get_dummies(X_test, drop_first = True)

In [55]:
def cls1(model):
    from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,auc,roc_curve
    
    model = model()
    model.fit(X_train_dum, y_train)

    y_predict = model.predict(X_test_dum)
    print(confusion_matrix(y_test,y_predict)) 
    print(accuracy_score(y_test,y_predict))
    fpr, tpr, thresholds = roc_curve(y_test, y_predict, pos_label=None)
    
    print('auc :',auc(fpr, tpr))

In [56]:
cls1(DecisionTreeClassifier)

[[2976 2594]
 [2773 9944]]
0.7065128233171105
auc : 0.6581181355933878


In [57]:
cls1(RandomForestClassifier)

[[ 2536  3034]
 [ 1743 10974]]
0.7387761797998578
auc : 0.6591178011480131


In [58]:
cls1(AdaBoostClassifier)

[[ 1287  4283]
 [  906 11811]]
0.7162465139169902
auc : 0.5799080141102348


In [59]:
cls1(GaussianNB)

[[2469 3101]
 [3472 9245]]
0.6405643353201728
auc : 0.5851235690248524


In [60]:
cls1(GradientBoostingClassifier)

[[ 1349  4221]
 [  824 11893]]
0.7241209602449827
auc : 0.5886975745580951


In [61]:
cls1(LogisticRegression)

[[ 1263  4307]
 [  912 11805]]
0.7146060042653251
auc : 0.5775177108519971


In [62]:

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = RandomForestClassifier()

rfe = RFE(logreg, 20)
rfe = rfe.fit(X_train_dum, y_train.values.ravel())
# print(rfe.support_)
# print(rfe.ranking_)

y_predict = rfe.predict(X_test_dum)
print(confusion_matrix(y_test,y_predict)) 
print(accuracy_score(y_test,y_predict))

[[ 2553  3017]
 [ 1703 11014]]
0.7418931481380215


### oversampling

In [63]:
from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0)
columns = X_train_dum.columns

X_train_dum_os,y_train_os=os.fit_sample(X_train_dum, y_train)

X_train_dum_os = pd.DataFrame(data=X_train_dum_os,columns=columns )


In [64]:
X_train_dum_os.shape

(76300, 36)

In [65]:
X_train_dum.shape

(54860, 36)

In [66]:
y_train_os.value_counts()

1    38150
0    38150
Name: is_pass, dtype: int64

In [67]:
def cls_os(model):
    from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_curve,auc
    
    model = model()
    model.fit(X_train_dum_os, y_train_os)

    y_predict = model.predict(X_test_dum)
    print(confusion_matrix(y_test,y_predict)) 
    print(accuracy_score(y_test,y_predict))
    
    fpr, tpr, thresholds = roc_curve(y_test, y_predict, pos_label=None)
    print('auc :',auc(fpr, tpr))
    

In [68]:
cls_os(RandomForestClassifier)

[[ 2904  2666]
 [ 2203 10514]]
0.7337452835347514
auc : 0.6740658858800099


In [69]:
cls_os(DecisionTreeClassifier)

[[2963 2607]
 [2996 9721]]
0.6936074807240116
auc : 0.648183378558988


In [70]:
cls_os(LogisticRegression)

[[3570 2000]
 [4325 8392]]
0.6541258817739378
auc : 0.6504188190675935


In [71]:
cls_os(AdaBoostClassifier)

[[3331 2239]
 [3886 8831]]
0.6650626127850385
auc : 0.6462249601849064


In [72]:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=60, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=60, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

In [73]:

model = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=60, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

model.fit(X_train_dum_os, y_train_os)

y_predict = model.predict(X_test_dum)
print(confusion_matrix(y_test,y_predict)) 
print(accuracy_score(y_test,y_predict))
    
fpr, tpr, thresholds = roc_curve(y_test, y_predict, pos_label=None)
print('auc :',auc(fpr, tpr))

[[ 3025  2545]
 [ 2230 10487]]
0.7388855471099688
auc : 0.6838660741802383


In [77]:

model = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=60, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

model.fit(X_train_dum, y_train)

y_predict = model.predict(X_test_dum)
print(confusion_matrix(y_test,y_predict)) 
print(accuracy_score(y_test,y_predict))
    
fpr, tpr, thresholds = roc_curve(y_test, y_predict, pos_label=None)
print('auc :',auc(fpr, tpr))

[[ 2149  3421]
 [ 1223 11494]]
0.7460491059222398
auc : 0.6448231978314274


In [79]:

model = LogisticRegression(solver='newton-cg',
                          n_jobs = -1
                          )

model.fit(X_train_dum, y_train)

y_predict = model.predict(X_test_dum)
print(confusion_matrix(y_test,y_predict)) 
print(accuracy_score(y_test,y_predict))
    
fpr, tpr, thresholds = roc_curve(y_test, y_predict, pos_label=None)
print('auc :',auc(fpr, tpr))

[[ 1264  4306]
 [  908 11809]]
0.7148794225406027
auc : 0.577764747255155


In [83]:
from sklearn.linear_model import LogisticRegressionCV,SGDClassifier

In [81]:

model = LogisticRegressionCV(solver='newton-cg',
                          n_jobs = -1,cv = 10,
                            random_state = 6,
                             scoring='accuracy'
                          )

model.fit(X_train_dum, y_train)

y_predict = model.predict(X_test_dum)
print(confusion_matrix(y_test,y_predict)) 
print(accuracy_score(y_test,y_predict))
    
fpr, tpr, thresholds = roc_curve(y_test, y_predict, pos_label=None)
print('auc :',auc(fpr, tpr))

[[ 1383  4187]
 [ 1000 11717]]
0.7163558812271013
auc : 0.5848297681512851


In [82]:

model = LogisticRegressionCV(solver='newton-cg',
                          n_jobs = -1,cv = 10,
                            random_state = 6,
                             scoring='accuracy'
                          )

model.fit(X_train_dum_os, y_train_os)

y_predict = model.predict(X_test_dum)
print(confusion_matrix(y_test,y_predict)) 
print(accuracy_score(y_test,y_predict))
    
fpr, tpr, thresholds = roc_curve(y_test, y_predict, pos_label=None)
print('auc :',auc(fpr, tpr))

[[3566 2004]
 [4303 8414]]
0.6551101875649369
auc : 0.6509247365201501


In [84]:

model = SGDClassifier()

model.fit(X_train_dum, y_train)

y_predict = model.predict(X_test_dum)
print(confusion_matrix(y_test,y_predict)) 
print(accuracy_score(y_test,y_predict))
    
fpr, tpr, thresholds = roc_curve(y_test, y_predict, pos_label=None)
print('auc :',auc(fpr, tpr))

[[  289  5281]
 [  230 12487]]
0.6986383769891179
auc : 0.516899536082336


In [89]:
model = SGDClassifier(loss = 'modified_huber')

model.fit(X_train_dum_os, y_train_os)

y_predict = model.predict(X_test_dum)
print(confusion_matrix(y_test,y_predict)) 
print(accuracy_score(y_test,y_predict))
    
fpr, tpr, thresholds = roc_curve(y_test, y_predict, pos_label=None)
print('auc :',auc(fpr, tpr))
roc_auc_score(y_test,y_predict)

[[3645 1925]
 [4770 7947]]
0.6338929294034014
auc : 0.639655049736926


0.639655049736926

In [90]:
from sklearn.model_selection import GridSearchCV

In [94]:
param_grid = {
    'loss': ['log','hinge','perceptron'],
    'penalty': ['elasticnet','l2'],
    'alpha': [10 ** x for x in range(-6, 1)],
    'l1_ratio': [0, 0.05, 0.1, 0.2, 0.5, 0.8, 0.9, 0.95, 1],
    'learning_rate': ['invscaling','optimal', 'adaptive'],
    'warm_start' : [True , False]
}
clf = SGDClassifier(eta0=1,random_state=0, class_weight='balanced')

clf_grid = GridSearchCV(estimator=clf, param_grid=param_grid,
                                    n_jobs=-1, scoring='roc_auc',
                       verbose = 1,
                       return_train_score=True,
                       cv = 10
                       )

clf_grid.fit(X=X_train_dum, y=y_train)
# clf_grid.score(X)

Fitting 10 folds for each of 2268 candidates, totalling 22680 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 35.3min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 40.9min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 48.2min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 54.6min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 60.7min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed: 66.4min
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed: 72.8min
[Parallel(n_jobs=-1)]: Done 11242 tasks      |

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

In [None]:
# setup(remove_outliers= True, feature_selection=True, pca =True, remove_multicollinearity= True,)