In [0]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns',None)

In [0]:
data_train = pd.read_csv('train_HK6lq50.csv')
data_test = pd.read_csv('test_wF0Ps6O.csv')

In [0]:
df_train = data_train.copy(deep = True)
df_test = data_test.copy(deep = True)

In [4]:
df_train.head()

Unnamed: 0,id,program_id,program_type,program_duration,test_id,test_type,difficulty_level,trainee_id,gender,education,city_tier,age,total_programs_enrolled,is_handicapped,trainee_engagement_rating,is_pass
0,9389_150,Y_1,Y,136,150,offline,intermediate,9389,M,Matriculation,3,24.0,5,N,1.0,0
1,16523_44,T_1,T,131,44,offline,easy,16523,F,High School Diploma,4,26.0,2,N,3.0,1
2,13987_178,Z_2,Z,120,178,online,easy,13987,M,Matriculation,1,40.0,1,N,2.0,1
3,13158_32,T_2,T,117,32,offline,easy,13158,F,Matriculation,3,,4,N,1.0,1
4,10591_84,V_3,V,131,84,offline,intermediate,10591,F,High School Diploma,1,42.0,2,N,4.0,1


## Eda

In [5]:
df_train.is_pass.value_counts()

1    50867
0    22280
Name: is_pass, dtype: int64

In [0]:
# for col in df_train.columns:
#     print(col)
#     print()
#     print(df_train[col].value_counts())
#     print('='*80)

In [0]:
# df_train.groupby('gender')['is_pass'].value_counts()    # drop gender

In [0]:
# df_train.groupby('is_handicapped')['is_pass'].value_counts(normalize = True)    # drop is_handicapped

In [0]:
# drop program_type as program id is related 

In [0]:
# df_train.groupby('city_tier')['is_pass'].value_counts(normalize = True)    

In [0]:
# df_train.groupby('trainee_engagement_rating')['is_pass'].value_counts(normalize = True) # cannot drop

In [0]:
def type_casting(data):
    df = data
    df['program_id'] = df['program_id'].astype('object')
    df['education'] = df['education'].astype('object')
    df['test_type'] = df['test_type'].astype('object')
    df['difficulty_level'] = df['difficulty_level'].astype('object')
    
    return(df)

In [0]:
def drop(data):
#     df = data.copy()
    df = data.drop(columns = ['is_handicapped','gender','program_type','id'])
    
    return df
    

In [0]:
def label_enc(dataset):
    from sklearn.preprocessing import LabelEncoder
    df1 = dataset.copy()
    df_categorical = df1.select_dtypes(include=['object'])

    # apply Label encoder to df_categorical

    le = LabelEncoder()
    df_categorical = df_categorical.apply(le.fit_transform)
    df_categorical = df_categorical.astype('object')

    # concat df_categorical with original df AFTER LABEL ENCODING
    df1 = df1.drop(df_categorical.columns, axis=1)
    df1 = pd.concat([df1, df_categorical], axis=1)
    
    df_categorical =0
    return df1

In [0]:
def impute(data):
    from sklearn.impute import SimpleImputer
    
    si = SimpleImputer()
    array = si.fit_transform(data)
    
    return array
    

In [0]:
def standardize(dataset, X_test = None ,test = False):
    from sklearn.preprocessing import StandardScaler
    df1 = dataset.copy()
    df_num = df1.select_dtypes(include=['int64','float64','int32','float32'])
    df_num = pd.DataFrame(impute(df_num), columns = df_num.columns,index = df1.index)

    se = StandardScaler()
    df_scaled = pd.DataFrame(se.fit_transform(df_num),columns = df_num.columns, index = df_num.index)
    
    df_new = df1.drop(df_num.columns,axis =1)
    df_new = pd.concat([df_scaled,df_new],axis =1)
    
    if test:
        df1 = X_test.copy()
        df_test_num = X_test.select_dtypes(include=['int64','float64','int32','float32'])
        
        df_scaled = pd.DataFrame(se.transform(df_test_num),columns = df_test_num.columns, index = df_test_num.index)
        
        df_new_ = df1.drop(df_test_num.columns,axis =1)
        df_new_ = pd.concat([df_new_,df_scaled],axis =1)
    else:
        df_new_ = 0
    
#     df_num ,df1,df_scaled = 0    
    return df_new,df_new_

In [0]:
# df = drop(df_train)
# df = type_casting(df)
# df = label_enc(df)
# df_tr,_ = standardize(df)
# # df_tr

## Split

In [0]:
df = drop(df_train)
df = type_casting(df)
df = label_enc(df)

In [0]:
X = df.drop('is_pass',axis = 1)
y = df.is_pass

In [0]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(X,y, random_state = 6, stratify = y)

In [0]:
X_train1,_ = standardize(X_train)

In [0]:
x_train,x_test = standardize(X_train,X_test, test = True)

####test

In [0]:
df_test_after = drop(df_test)
df_test_after = type_casting(df_test_after)
df_test_final = label_enc(df_test_after)

In [71]:
df_test_final.shape

(31349, 11)

In [72]:
df_test_final.isnull().sum()

program_duration                 0
test_id                          0
trainee_id                       0
city_tier                        0
age                          11791
total_programs_enrolled          0
trainee_engagement_rating       31
program_id                       0
test_type                        0
difficulty_level                 0
education                        0
dtype: int64

## ML models

In [0]:
def cls(model):
    from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
    
    model = model()
    model.fit(X_train1, y_train)

    y_predict = model.predict(X_test1)
    print(confusion_matrix(y_test,y_predict)) 
    print(accuracy_score(y_test,y_predict))

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [0]:
X_test1,_ = standardize(X_test)

In [0]:
cls(DecisionTreeClassifier)

In [0]:
cls(LogisticRegression)

In [0]:
cls(AdaBoostClassifier)

In [0]:
cls(RandomForestClassifier)

In [0]:
cls(GradientBoostingClassifier)

In [0]:
cls(XGBClassifier)

## PyCaret

In [0]:
df.info()

In [34]:
df.columns

Index(['program_duration', 'test_id', 'trainee_id', 'city_tier', 'age',
       'total_programs_enrolled', 'trainee_engagement_rating', 'is_pass',
       'program_id', 'test_type', 'difficulty_level', 'education'],
      dtype='object')

In [13]:
pip install pycaret

Collecting pycaret
[?25l  Downloading https://files.pythonhosted.org/packages/c7/41/f7fa05b6ce3cb3096a35fb5ac6dc0f2bb23e8304f068618fb2501be0a562/pycaret-1.0.0-py3-none-any.whl (188kB)
[K     |█▊                              | 10kB 9.9MB/s eta 0:00:01[K     |███▌                            | 20kB 1.5MB/s eta 0:00:01[K     |█████▏                          | 30kB 1.8MB/s eta 0:00:01[K     |███████                         | 40kB 1.6MB/s eta 0:00:01[K     |████████▊                       | 51kB 1.8MB/s eta 0:00:01[K     |██████████▍                     | 61kB 2.2MB/s eta 0:00:01[K     |████████████▏                   | 71kB 2.0MB/s eta 0:00:01[K     |██████████████                  | 81kB 2.1MB/s eta 0:00:01[K     |███████████████▋                | 92kB 2.3MB/s eta 0:00:01[K     |█████████████████▍              | 102kB 2.4MB/s eta 0:00:01[K     |███████████████████▏            | 112kB 2.4MB/s eta 0:00:01[K     |████████████████████▉           | 122kB 2.4MB/s eta 0:0

In [0]:
from pycaret.classification import *

###rgd

In [0]:
setup_data = setup(df, target='is_pass', session_id= 5,normalize=True, profile = True,
              numeric_features = ['program_duration','test_id', 'trainee_id', 'city_tier', 'age','total_programs_enrolled', 'trainee_engagement_rating'],
              categorical_features = ['program_id', 'test_type', 'difficulty_level', 'education'])

In [0]:
s

In [38]:
lr = create_model('lr',fold = 10)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7159,0.6987,0.9253,0.7348,0.8191,0.1956
1,0.7151,0.7083,0.9245,0.7345,0.8186,0.1938
2,0.7211,0.7013,0.9298,0.7376,0.8226,0.2092
3,0.7158,0.7014,0.9264,0.7344,0.8193,0.1937
4,0.7162,0.7115,0.9228,0.7361,0.8189,0.1997
5,0.7203,0.7085,0.9323,0.736,0.8226,0.2032
6,0.7086,0.6933,0.9281,0.7278,0.8158,0.1649
7,0.717,0.7079,0.927,0.7351,0.82,0.1979
8,0.7119,0.7039,0.9219,0.7328,0.8165,0.1853
9,0.7242,0.7175,0.9404,0.7361,0.8259,0.2083


In [37]:
dt = create_model('dt')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.6891,0.6324,0.7776,0.7758,0.7767,0.2652
1,0.686,0.6325,0.7694,0.7769,0.7731,0.2633
2,0.6979,0.647,0.777,0.7861,0.7815,0.2919
3,0.6996,0.6477,0.7804,0.7861,0.7833,0.2941
4,0.685,0.6329,0.7661,0.7777,0.7718,0.2632
5,0.6855,0.6293,0.7731,0.7744,0.7737,0.2584
6,0.692,0.6361,0.7792,0.7781,0.7787,0.2724
7,0.6818,0.6302,0.7624,0.7761,0.7692,0.2575
8,0.7082,0.6598,0.7837,0.794,0.7888,0.317
9,0.6961,0.641,0.782,0.7811,0.7816,0.2822


In [39]:
tuned_dt = tune_model('dt', fold=5)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7131,0.6968,0.9179,0.7353,0.8165,0.1944
1,0.7159,0.6918,0.8968,0.746,0.8145,0.2298
2,0.7146,0.7058,0.8825,0.7509,0.8114,0.2411
3,0.7116,0.6931,0.9054,0.7388,0.8137,0.2042
4,0.7178,0.6995,0.9086,0.7429,0.8174,0.2226
Mean,0.7146,0.6974,0.9022,0.7428,0.8147,0.2184
SD,0.0021,0.005,0.012,0.0054,0.0021,0.017


In [40]:
rf = create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.703,0.7009,0.7967,0.7807,0.7886,0.2896
1,0.7153,0.7239,0.8048,0.7897,0.7972,0.3196
2,0.716,0.7232,0.8068,0.7895,0.7981,0.32
3,0.7119,0.7244,0.8048,0.7861,0.7953,0.3092
4,0.7057,0.7142,0.806,0.7786,0.7921,0.2891
5,0.7178,0.7263,0.8135,0.7877,0.8004,0.3193
6,0.7129,0.7124,0.8124,0.7829,0.7974,0.3056
7,0.7053,0.7067,0.8081,0.7769,0.7922,0.286
8,0.7105,0.7258,0.8062,0.7837,0.7948,0.3041
9,0.7182,0.7228,0.8208,0.784,0.802,0.3142


In [41]:
rf.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)>

In [42]:
rf.n_features_

39

In [43]:
rf.feature_importances_

array([0.01440632, 0.17070111, 0.31141709, 0.06900554, 0.16038247,
       0.07150908, 0.07472877, 0.00105689, 0.00124127, 0.00255476,
       0.00524251, 0.00176464, 0.00114779, 0.00111598, 0.00219379,
       0.00203071, 0.00270674, 0.00205182, 0.00081734, 0.00472491,
       0.00130411, 0.00113707, 0.00171488, 0.00319881, 0.00194843,
       0.00388892, 0.00413858, 0.00196831, 0.00215838, 0.01758395,
       0.00873493, 0.00470559, 0.00521354, 0.00603349, 0.00981989,
       0.01033015, 0.00174547, 0.01152827, 0.00204768])

In [44]:
tuned_rf = tune_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7405,0.7433,0.8961,0.7689,0.8276,0.3147
1,0.7393,0.7608,0.8936,0.769,0.8266,0.3133
2,0.7414,0.754,0.8978,0.7691,0.8285,0.316
3,0.7367,0.7579,0.8983,0.7644,0.826,0.2996
4,0.733,0.7573,0.8947,0.7626,0.8234,0.2907
5,0.743,0.76,0.9099,0.7651,0.8312,0.3092
6,0.734,0.7509,0.9003,0.7609,0.8248,0.2887
7,0.7369,0.7531,0.8992,0.7641,0.8262,0.2998
8,0.741,0.7699,0.8958,0.7695,0.8279,0.3169
9,0.7412,0.7669,0.9076,0.7644,0.8298,0.3059


In [60]:
print(tuned_rf)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=60, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)


In [51]:
lbgm = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7333,0.7364,0.9101,0.756,0.8259,0.2757
1,0.7327,0.7462,0.9155,0.7532,0.8265,0.2679
2,0.7365,0.7437,0.9233,0.7534,0.8298,0.2726
3,0.7312,0.7436,0.9118,0.7535,0.8252,0.2666
4,0.724,0.7516,0.9037,0.7505,0.82,0.2501
5,0.7428,0.7515,0.9275,0.7572,0.8338,0.2902
6,0.7289,0.7388,0.9228,0.7469,0.8256,0.2465
7,0.7297,0.7441,0.9166,0.7501,0.825,0.2562
8,0.7354,0.7525,0.9149,0.7559,0.8278,0.278
9,0.7426,0.7542,0.9244,0.7583,0.8332,0.2932


In [54]:
lgbm_tuned = tune_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7356,0.7524,0.8978,0.7636,0.8252,0.2966
1,0.7397,0.7615,0.9012,0.7659,0.828,0.3071
2,0.7406,0.7571,0.9045,0.7653,0.8291,0.3066
3,0.7383,0.7627,0.8992,0.7655,0.827,0.304
4,0.741,0.7711,0.8995,0.7679,0.8285,0.313
5,0.7396,0.7582,0.9076,0.763,0.829,0.3001
6,0.7387,0.7604,0.9065,0.7625,0.8283,0.2983
7,0.7346,0.7539,0.9003,0.7615,0.8251,0.2907
8,0.7361,0.7645,0.8952,0.7652,0.8251,0.3011
9,0.7482,0.7655,0.9059,0.7717,0.8334,0.3313


In [55]:
nb_tuned = tune_model('nb')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.6973,0.6665,0.8644,0.7426,0.7989,0.2016
1,0.6999,0.6755,0.8646,0.7448,0.8003,0.21
2,0.698,0.6675,0.8632,0.7438,0.7991,0.2051
3,0.6975,0.6716,0.8691,0.7408,0.7998,0.1963
4,0.6984,0.6762,0.8585,0.7462,0.7984,0.2118
5,0.7102,0.6821,0.8773,0.749,0.8081,0.2311
6,0.6947,0.6594,0.8697,0.738,0.7985,0.1865
7,0.7078,0.6796,0.8719,0.749,0.8058,0.2294
8,0.6941,0.6769,0.8542,0.7439,0.7952,0.2023
9,0.7078,0.688,0.8787,0.7462,0.807,0.2218


In [56]:
mlp = create_model('mlp')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7135,0.7,0.886,0.7483,0.8114,0.2334
1,0.7141,0.7167,0.8809,0.751,0.8108,0.241
2,0.7145,0.7106,0.8677,0.7572,0.8087,0.2561
3,0.7199,0.7164,0.863,0.7646,0.8108,0.2792
4,0.7197,0.7246,0.8655,0.7632,0.8112,0.276
5,0.7258,0.7232,0.884,0.7606,0.8177,0.2773
6,0.7176,0.7108,0.8899,0.7504,0.8142,0.2432
7,0.7199,0.713,0.8607,0.7656,0.8104,0.2818
8,0.7129,0.7151,0.8573,0.7603,0.8059,0.2621
9,0.7244,0.7268,0.8657,0.7676,0.8137,0.2915


In [57]:
cat = tune_model('catboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7325,0.7312,0.9124,0.7544,0.8259,0.2705
1,0.7338,0.7491,0.9141,0.7549,0.8269,0.2735
2,0.734,0.7443,0.9166,0.754,0.8274,0.271
3,0.7271,0.7403,0.9104,0.7505,0.8227,0.2537
4,0.7318,0.7521,0.9135,0.7534,0.8257,0.2668
5,0.7377,0.7531,0.9239,0.7542,0.8305,0.2762
6,0.7283,0.7383,0.9202,0.7474,0.8249,0.2472
7,0.7303,0.7431,0.9126,0.7523,0.8247,0.2626
8,0.7283,0.7495,0.9093,0.7519,0.8231,0.2595
9,0.7414,0.7539,0.9258,0.7567,0.8327,0.2876


In [59]:
svm = tune_model('svm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,0.7003,0.0,0.9832,0.7036,0.8202,0.0505
1,0.6962,0.0,0.9823,0.7009,0.8181,0.034
2,0.7018,0.0,0.9834,0.7046,0.821,0.056
3,0.6982,0.0,0.9809,0.7028,0.8189,0.0449
4,0.7002,0.0,0.9815,0.7041,0.8199,0.0525
5,0.7043,0.0,0.9834,0.7065,0.8223,0.067
6,0.6945,0.0,0.9817,0.6998,0.8172,0.0281
7,0.7002,0.0,0.9829,0.7036,0.8201,0.051
8,0.6998,0.0,0.982,0.7036,0.8198,0.0506
9,0.7014,0.0,0.9823,0.7046,0.8206,0.0569


In [61]:
compare_models(fold = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,CatBoost Classifier,0.734,0.7502,0.9113,0.7562,0.8265,0.2769
1,Light Gradient Boosting Machine,0.7305,0.7426,0.9159,0.7512,0.8254,0.2597
2,Gradient Boosting Classifier,0.7229,0.7211,0.9312,0.7385,0.8237,0.2143
3,Extreme Gradient Boosting,0.7222,0.7199,0.9333,0.7372,0.8237,0.2094
4,Ada Boost Classifier,0.7167,0.7058,0.9267,0.7351,0.8198,0.197
5,Linear Discriminant Analysis,0.7167,0.7107,0.9196,0.7377,0.8187,0.2057
6,Ridge Classifier,0.7164,0.0,0.9401,0.7299,0.8217,0.1787
7,Logistic Regression,0.7162,0.7043,0.9277,0.7342,0.8197,0.1939
8,Extra Trees Classifier,0.7065,0.708,0.8246,0.7697,0.7962,0.2741
9,Random Forest Classifier,0.7035,0.7066,0.8016,0.7786,0.7899,0.2866


In [0]:
df

In [63]:
df.is_pass.value_counts()

1    50867
0    22280
Name: is_pass, dtype: int64

In [0]:
predict = predict_model(cat, data = df_test_final)

In [74]:
predict

Unnamed: 0,program_duration,test_id,trainee_id,city_tier,age,total_programs_enrolled,trainee_engagement_rating,program_id,test_type,difficulty_level,education,Label,Score
0,131,45,1626,3,46.0,2,4.0,2,0,2,3,1,0.7552
1,135,130,11020,3,,4,4.0,17,1,0,0,1,0.8645
2,120,146,12652,3,,2,3.0,16,1,0,3,1,0.6458
3,122,72,7038,1,,2,2.0,11,0,3,1,0,0.4105
4,122,71,888,3,,2,2.0,11,0,2,3,1,0.5212
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31344,134,22,18820,2,22.0,2,1.0,4,0,2,4,0,0.3393
31345,133,96,1641,1,41.0,2,3.0,8,0,0,3,1,0.8524
31346,136,157,7010,2,,3,4.0,15,1,0,0,1,0.8024
31347,136,179,14895,3,,1,1.0,19,0,0,3,0,0.4104


In [0]:
predict.Label.value_counts()

In [0]:
predict_rgd_cat_py = predict.Label

In [0]:
predict_rgd_cat_py.to_csv('test_predict_rgd_cat_py.csv')

###py

In [0]:
setup_data = setup(df, target='is_pass', session_id= 5,normalize=True)

In [85]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,CatBoost Classifier,0.7369,0.7554,0.9136,0.7578,0.8284,0.2845
1,Light Gradient Boosting Machine,0.7325,0.7468,0.9175,0.7523,0.8267,0.2649
2,Gradient Boosting Classifier,0.7225,0.72,0.9306,0.7384,0.8234,0.2134
3,Extreme Gradient Boosting,0.7215,0.7191,0.9318,0.7372,0.8231,0.2086
4,Ada Boost Classifier,0.7166,0.7049,0.9318,0.7331,0.8206,0.1902
5,Linear Discriminant Analysis,0.7162,0.6942,0.9289,0.7338,0.8199,0.1923
6,Logistic Regression,0.7157,0.6952,0.9334,0.7317,0.8203,0.1847
7,Ridge Classifier,0.715,0.0,0.9426,0.7279,0.8214,0.1702
8,Random Forest Classifier,0.7108,0.7182,0.8022,0.7863,0.7942,0.3083
9,Extra Trees Classifier,0.709,0.7162,0.8132,0.7783,0.7953,0.2926


In [0]:
setup_data = setup(df, target='is_pass', session_id= 5,normalize=False,silent = True)

In [87]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,CatBoost Classifier,0.7369,0.7554,0.9136,0.7578,0.8284,0.2845
1,Light Gradient Boosting Machine,0.7316,0.7465,0.9169,0.7517,0.8261,0.2625
2,Gradient Boosting Classifier,0.7225,0.72,0.9306,0.7384,0.8234,0.2134
3,Extreme Gradient Boosting,0.7215,0.7191,0.9318,0.7372,0.8231,0.2086
4,Ada Boost Classifier,0.7166,0.7049,0.9318,0.7331,0.8206,0.1902
5,Linear Discriminant Analysis,0.7162,0.6942,0.9289,0.7338,0.8199,0.1923
6,Ridge Classifier,0.715,0.0,0.9426,0.7279,0.8214,0.1702
7,Random Forest Classifier,0.7093,0.7165,0.8031,0.7841,0.7935,0.303
8,Extra Trees Classifier,0.7087,0.7163,0.8141,0.7775,0.7954,0.2911
9,Logistic Regression,0.7063,0.6336,0.9629,0.7149,0.8201,0.1041


In [97]:
setup_data1 = setup(df, target = 'is_pass', session_id = 5, silent = True, normalize = True,transformation=True,
                    remove_outliers= True, feature_selection=True, pca =True, remove_multicollinearity= True)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,5
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(73147, 12)"
4,Missing Values,True
5,Numeric Features,4
6,Categorical Features,7
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [98]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,CatBoost Classifier,0.7317,0.7361,0.91,0.7556,0.8256,0.2649
1,Light Gradient Boosting Machine,0.7273,0.7284,0.9185,0.7482,0.8246,0.2393
2,Gradient Boosting Classifier,0.7206,0.7086,0.9333,0.7367,0.8234,0.1963
3,Extreme Gradient Boosting,0.7203,0.7092,0.9346,0.736,0.8235,0.1935
4,Logistic Regression,0.7159,0.6864,0.9456,0.7284,0.8229,0.162
5,Linear Discriminant Analysis,0.7156,0.6849,0.9405,0.73,0.822,0.1679
6,Ridge Classifier,0.7147,0.0,0.9571,0.7235,0.824,0.1413
7,Ada Boost Classifier,0.7146,0.6903,0.92,0.7367,0.8182,0.1907
8,Random Forest Classifier,0.7143,0.7161,0.8075,0.7884,0.7978,0.3113
9,Extra Trees Classifier,0.7099,0.7134,0.7989,0.7884,0.7936,0.3057


In [0]:
setup_data1 = setup(df_train, target = 'is_pass', session_id = 5, silent = True, ignore_features = ['id'])

In [89]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,CatBoost Classifier,0.7385,0.7587,0.9143,0.759,0.8294,0.2895
1,Light Gradient Boosting Machine,0.7328,0.748,0.9183,0.7522,0.827,0.2653
2,Gradient Boosting Classifier,0.7231,0.7211,0.9315,0.7386,0.8239,0.2147
3,Extreme Gradient Boosting,0.723,0.7197,0.9346,0.7373,0.8243,0.2105
4,Linear Discriminant Analysis,0.7199,0.7139,0.9167,0.7415,0.8199,0.2206
5,Ridge Classifier,0.7196,0.0,0.9334,0.7349,0.8223,0.1993
6,Ada Boost Classifier,0.7182,0.7068,0.9303,0.735,0.8212,0.1981
7,Random Forest Classifier,0.7166,0.7266,0.8105,0.788,0.7991,0.3186
8,Extra Trees Classifier,0.7151,0.725,0.8195,0.7815,0.8,0.3058
9,Logistic Regression,0.714,0.6736,0.9399,0.7285,0.8205,0.1675


In [92]:
setup_data1 = setup(df_train, target = 'is_pass', session_id = 5, silent = True, ignore_features = ['id'], normalize = True,transformation=True)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,5
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(73147, 16)"
4,Missing Values,True
5,Numeric Features,4
6,Categorical Features,11
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [93]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,CatBoost Classifier,0.7385,0.7586,0.9143,0.759,0.8294,0.2895
1,Light Gradient Boosting Machine,0.7323,0.7477,0.9182,0.7518,0.8267,0.2634
2,Gradient Boosting Classifier,0.7231,0.7211,0.9315,0.7386,0.8239,0.2147
3,Extreme Gradient Boosting,0.723,0.7197,0.9346,0.7373,0.8243,0.2105
4,Logistic Regression,0.7203,0.7112,0.9259,0.7383,0.8216,0.2112
5,Linear Discriminant Analysis,0.72,0.7135,0.9167,0.7417,0.82,0.2212
6,Ridge Classifier,0.7193,0.0,0.9333,0.7347,0.8222,0.1984
7,Ada Boost Classifier,0.7182,0.7068,0.9303,0.735,0.8212,0.1981
8,Random Forest Classifier,0.7156,0.7263,0.8073,0.7887,0.7979,0.3182
9,Extra Trees Classifier,0.7155,0.7256,0.8206,0.7813,0.8005,0.3061


In [99]:
setup_data1 = setup(df_train, target = 'is_pass', session_id = 5, silent = True, ignore_features = ['id'], normalize = True,transformation=True,
                    remove_outliers= True, feature_selection=True, pca =True, remove_multicollinearity= True)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,5
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(73147, 16)"
4,Missing Values,True
5,Numeric Features,4
6,Categorical Features,11
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [100]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,CatBoost Classifier,0.7331,0.7431,0.9044,0.7595,0.8256,0.2746
1,Light Gradient Boosting Machine,0.7282,0.7345,0.9139,0.751,0.8245,0.2464
2,Gradient Boosting Classifier,0.7223,0.7194,0.9377,0.7368,0.8252,0.1957
3,Extreme Gradient Boosting,0.7221,0.7187,0.9407,0.7354,0.8255,0.1906
4,Logistic Regression,0.7194,0.7024,0.9299,0.7372,0.8224,0.1946
5,Linear Discriminant Analysis,0.7192,0.7018,0.9245,0.7391,0.8215,0.2006
6,Ridge Classifier,0.7186,0.0,0.9397,0.7329,0.8235,0.1786
7,Ada Boost Classifier,0.7174,0.6997,0.9162,0.7408,0.8192,0.2045
8,Extra Trees Classifier,0.7163,0.723,0.8172,0.7855,0.801,0.3077
9,K Neighbors Classifier,0.7118,0.6956,0.8496,0.7643,0.8047,0.2609


In [15]:
df.columns

Index(['program_duration', 'test_id', 'trainee_id', 'city_tier', 'age',
       'total_programs_enrolled', 'trainee_engagement_rating', 'is_pass',
       'program_id', 'test_type', 'difficulty_level', 'education'],
      dtype='object')

In [0]:
from imblearn.over_sampling import SMOTE

In [0]:
X_os = df.drop(columns='is_pass')
y_os = df.is_pass

In [0]:
def impute(data):
    from sklearn.impute import SimpleImputer
    
    si = SimpleImputer()
    array = si.fit_transform(data)
    
    return array

In [0]:
X_os = pd.DataFrame(impute(X_os), columns = X_os.columns,index = X_os.index)


In [0]:
os = SMOTE(random_state=0)
columns = X_os.columns

X_train_os,y_train_os=os.fit_sample(X_os, y_os)

X_train_os = pd.DataFrame(data=X_train_os,columns=columns )


In [0]:
df_os = X_train_os.copy(deep = True)
df_os['is_pass'] = y_train_os

In [121]:
setup_data1 = setup(df_os, target = 'is_pass', session_id = 5, silent = True, normalize = True,transformation=True,
                    remove_outliers= True, feature_selection=True, pca =True, remove_multicollinearity= True)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,5
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(101734, 12)"
4,Missing Values,False
5,Numeric Features,11
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [123]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Extra Trees Classifier,0.776,0.819,0.7543,0.7849,0.7693,0.5518
1,Random Forest Classifier,0.763,0.8298,0.7481,0.7672,0.7575,0.5258
2,CatBoost Classifier,0.7367,0.8185,0.7521,0.726,0.7388,0.4736
3,Decision Tree Classifier,0.7346,0.7498,0.7259,0.7347,0.7302,0.469
4,K Neighbors Classifier,0.72,0.7917,0.7454,0.7055,0.7249,0.4403
5,Light Gradient Boosting Machine,0.7177,0.7973,0.7308,0.7083,0.7193,0.4356
6,Gradient Boosting Classifier,0.6842,0.7491,0.6857,0.6793,0.6825,0.3684
7,Extreme Gradient Boosting,0.6836,0.7479,0.6841,0.6791,0.6816,0.3672
8,Ada Boost Classifier,0.6507,0.7105,0.6347,0.651,0.6427,0.3012
9,Quadratic Discriminant Analysis,0.6412,0.6921,0.6664,0.63,0.6477,0.2827


In [2]:
setup_data1 = setup(df_os, target = 'is_pass', session_id = 5, silent = False, normalize = True,transformation=True,
                    remove_outliers= True, feature_selection=True, pca =True, remove_multicollinearity= True,
                    numeric_features = ['program_duration','test_id', 'trainee_id', 'city_tier', 'age','total_programs_enrolled', 'trainee_engagement_rating'],
              categorical_features = ['program_id', 'test_type', 'difficulty_level', 'education'])

NameError: ignored

In [0]:
compare_models()

In [3]:
df

NameError: ignored