In [7]:
import pandas as pd
import numpy as np

def read_data():
    """Load the CSV tables, aggregate them per SK_ID_CURR and join them onto application_train.

    Each auxiliary table (bureau balance, bureau, previous applications,
    POS cash balance, credit card balance, installment payments) is read from
    the current working directory, encoded (one-hot via ``pd.get_dummies`` or
    integer codes via ``pd.factorize``), averaged per key with
    ``groupby(...).mean()`` and left-merged onto the application table.
    Missing values in the merged frame are replaced with 0.

    Returns:
        tuple: ``(app_train_final, y)`` where ``app_train_final`` is the merged
        feature DataFrame and ``y`` is the ``TARGET`` column taken from
        ``application_train.csv`` before it is removed from the features.

    NOTE(review): every file is read with ``dtype='unicode'``, which presumably
    loads all columns as strings; what the later ``.mean()`` calls do with
    columns that were never factorized depends on the pandas version - confirm.
    NOTE(review): ``error_bad_lines`` looks deprecated in newer pandas
    releases - confirm against the installed version.
    """
    nvals = 10000   # Row cap passed as nrows to every read_csv call; set to None to read all data
    # --- Bureau balance: one-hot encode STATUS, then average per SK_ID_BUREAU ---
    bureau_bal = pd.read_csv('bureau_balance.csv',nrows=nvals, sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
    bureau_bal = pd.concat([bureau_bal, pd.get_dummies(bureau_bal.STATUS, prefix='bureau_bal_status')], axis=1).drop('STATUS', axis=1)
    # Number of balance rows per SK_ID_BUREAU, attached to every row as 'bureau_count'
    bureau_counts = bureau_bal[['SK_ID_BUREAU', 'MONTHS_BALANCE']].groupby('SK_ID_BUREAU').count()
    bureau_bal['bureau_count'] = bureau_bal['SK_ID_BUREAU'].map(bureau_counts['MONTHS_BALANCE'])
    avg_bureau_bal = bureau_bal.groupby('SK_ID_BUREAU').mean()
    avg_bureau_bal.columns = ['avg_' + val for val in avg_bureau_bal.columns]

    # --- Bureau: one-hot encode three categorical columns, merge in the bureau-balance averages ---
    bureau = pd.read_csv('bureau.csv',nrows=nvals, sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
    credit_act = pd.get_dummies(bureau.CREDIT_ACTIVE, prefix='ca_')
    credit_curr = pd.get_dummies(bureau.CREDIT_CURRENCY, prefix='cu_')
    credit_type = pd.get_dummies(bureau.CREDIT_TYPE, prefix='ty_')
    bureau_ct = pd.concat([bureau, credit_act, credit_curr, credit_type], axis=1)
    bureau_merged = bureau_ct.merge(right=avg_bureau_bal.reset_index(), how='left', on='SK_ID_BUREAU', suffixes=('', '_bureau_bal'))
    # SK_ID_BUREAU is overwritten with the number of bureau rows for the same SK_ID_CURR
    bureau_per_count = bureau_merged[['SK_ID_CURR', 'SK_ID_BUREAU']].groupby('SK_ID_CURR').count()
    bureau_merged['SK_ID_BUREAU'] = bureau_merged['SK_ID_CURR'].map(bureau_per_count['SK_ID_BUREAU'])
    bureau_avg = bureau_merged.groupby('SK_ID_CURR').mean()

    # --- Previous applications: factorize object columns, then average per SK_ID_CURR ---
    prev_app = pd.read_csv('previous_application.csv',nrows=nvals, sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
    categorical_feats = [ cat for cat in prev_app.columns if prev_app[cat].dtype == 'object']
    # Skip the first two object columns - presumably the SK_ID_PREV / SK_ID_CURR keys; TODO confirm column order
    categorical_feats = categorical_feats[2:]
    # Replace each remaining object column with integer codes from pd.factorize
    for cat in categorical_feats:
        prev_app[cat],indexer = pd.factorize(prev_app[cat])
    # SK_ID_PREV is overwritten with the number of previous-application rows per SK_ID_CURR
    prev_app_count = prev_app[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
    prev_app['SK_ID_PREV'] = prev_app['SK_ID_CURR'].map(prev_app_count['SK_ID_PREV'])
    prev_apps_avg = prev_app.groupby('SK_ID_CURR').mean()
    prev_apps_avg.columns = ['prev_' + col for col in prev_apps_avg.columns]

    # --- POS cash balance: one-hot encode NAME_CONTRACT_STATUS (no prefix), average per SK_ID_CURR ---
    pos_cash_bal = pd.read_csv('POS_CASH_balance.csv',nrows=nvals, sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
    pos_bal = pd.concat([pos_cash_bal, pd.get_dummies(pos_cash_bal['NAME_CONTRACT_STATUS'])], axis=1)
    # SK_ID_PREV is overwritten with the number of POS rows per SK_ID_CURR
    pos_bal_count = pos_bal[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
    pos_bal['SK_ID_PREV'] = pos_bal['SK_ID_CURR'].map(pos_bal_count['SK_ID_PREV'])
    avg_pos_bal = pos_bal.groupby('SK_ID_CURR').mean()


    # --- Credit card balance: one-hot encode NAME_CONTRACT_STATUS, average per SK_ID_CURR ---
    credit_bal = pd.read_csv('credit_card_balance.csv',nrows=nvals, sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
    credit_bal = pd.concat([credit_bal, pd.get_dummies(credit_bal['NAME_CONTRACT_STATUS'], prefix='credit_status_')], axis=1)

    # SK_ID_PREV is overwritten with the number of credit-card rows per SK_ID_CURR
    credit_bal_count = credit_bal[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
    credit_bal['SK_ID_PREV'] = credit_bal['SK_ID_CURR'].map(credit_bal_count['SK_ID_PREV'])
    avg_credit_bal = credit_bal.groupby('SK_ID_CURR').mean()
    avg_credit_bal.columns = ['credit_bal_' + f_ for f_ in avg_credit_bal.columns]

    # --- Installment payments: factorize object columns, then average per SK_ID_CURR ---
    inst_pay = pd.read_csv('installments_payments.csv',nrows=nvals, sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
    # Collect the object columns and factorize all but the first two
    # (presumably the ID keys - TODO confirm column order)
    categorical_feats = [ cat for cat in inst_pay.columns if inst_pay[cat].dtype == 'object']
    categorical_feats = categorical_feats[2:]
    for cat in categorical_feats:
        inst_pay[cat],indexer = pd.factorize(inst_pay[cat])


    # SK_ID_PREV is overwritten with the number of installment rows per SK_ID_CURR
    inst_pay_counts = inst_pay[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
    inst_pay['SK_ID_PREV'] = inst_pay['SK_ID_CURR'].map(inst_pay_counts['SK_ID_PREV'])
    avg_inst_pay = inst_pay.groupby('SK_ID_CURR').mean()
    avg_inst_pay.columns = ['inst_' + f_ for f_ in avg_inst_pay.columns]


    # --- Application training data: split off TARGET and factorize the object columns ---

    app_train = pd.read_csv('application_train.csv',nrows=nvals, sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
    # Keep the label separately and remove it from the feature frame
    y = app_train['TARGET']
    del app_train['TARGET']
    categorical_feats = [ cat for cat in app_train.columns if app_train[cat].dtype == 'object']
    # Skip the first object column - presumably SK_ID_CURR, the merge key; TODO confirm
    categorical_feats = categorical_feats[1:]
    for cat in categorical_feats:
        app_train[cat],indexer = pd.factorize(app_train[cat])
    # Left-merge every per-SK_ID_CURR average onto the application rows
    app_train_final = app_train.merge(right=bureau_avg.reset_index(), how='left', on='SK_ID_CURR')
    app_train_final = app_train_final.merge(right=avg_credit_bal.reset_index(), how='left', on='SK_ID_CURR')
    app_train_final = app_train_final.merge(right=prev_apps_avg.reset_index(), how='left', on='SK_ID_CURR')
    app_train_final = app_train_final.merge(right=avg_pos_bal.reset_index(), how='left', on='SK_ID_CURR')
    app_train_final = app_train_final.merge(right=avg_inst_pay.reset_index(), how='left', on='SK_ID_CURR')
    # Rows with no match in an auxiliary table get NaN from the left merges; fill those with 0
    app_train_final = app_train_final.fillna(0)
    
    print("app_train_final", app_train_final.shape)
    return (app_train_final , y)

In [8]:
from sklearn.ensemble import RandomForestClassifier

def feature_engineering(X, y, threshold=0.05, random_state=None):
    """Drop features that a random forest rates as unimportant.

    A ``RandomForestClassifier`` is fitted on ``(X, y)`` and its
    ``feature_importances_`` are scaled by 100. Every column whose scaled
    importance is strictly below ``threshold`` is left out of the result.

    The input frame is NOT modified: a pruned copy is returned, so the caller
    can still evaluate models on the full feature set afterwards.

    Parameters:
        X: pandas DataFrame of features (one column per feature).
        y: labels aligned with the rows of ``X``; returned unchanged.
        threshold: cut-off applied to ``importance * 100``; columns below it
            are removed. Defaults to 0.05.
        random_state: forwarded to ``RandomForestClassifier`` so the selection
            can be made reproducible. Defaults to None (unseeded).

    Returns:
        tuple: ``(X_reduced, y)`` where ``X_reduced`` is a new DataFrame
        holding only the retained columns.
    """
    print(X.shape)
    print(y.shape)

    rfc = RandomForestClassifier(random_state=random_state)
    rfc.fit(X, y)

    # Scale the importances by 100 before comparing them with the threshold.
    importance_pct = [value * 100 for value in rfc.feature_importances_]

    # Positional boolean mask, so columns are matched to importances by order.
    keep_mask = [pct >= threshold for pct in importance_pct]
    dropped_count = len(keep_mask) - sum(keep_mask)

    print("Unimportant features : ", dropped_count)
    print("Sum : ", float(sum(importance_pct)))

    # .loc with a mask builds a new frame; .copy() detaches it from the input.
    X_reduced = X.loc[:, keep_mask].copy()
    return (X_reduced, y)
    

In [9]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report,accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn import svm
from sklearn.model_selection import cross_val_score

def run_RF(X, y, split, random_state=None):
    """Train a random forest on a hold-out split and print its evaluation.

    The data is split with ``train_test_split``, a
    ``RandomForestClassifier(n_estimators=100, max_features='log2')`` is fitted
    on the training part, 10-fold and 5-fold cross-validation accuracy on the
    training part are printed, and a classification report, accuracy and
    weighted F1 on the test part are printed.

    Parameters:
        X: feature matrix.
        y: labels aligned with ``X``.
        split: passed as ``test_size`` to ``train_test_split``.
        random_state: seed forwarded to both ``train_test_split`` and the
            forest so a run can be reproduced. Defaults to None (unseeded).

    Returns:
        None. All results are written to stdout.
    """
    print("\n ---------------------------------------------")
    print("Random forest algorithm")
    print("\n ---------------------------------------------")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split, random_state=random_state)
    clf_rf = RandomForestClassifier(
        n_estimators=100, max_features='log2', random_state=random_state)
    clf_rf = clf_rf.fit(X_train, y_train)

    cross_val_10 = cross_val_score(clf_rf, X_train, y_train, scoring='accuracy', cv=10)
    cross_val_5 = cross_val_score(clf_rf, X_train, y_train, scoring='accuracy', cv=5)

    print("Cross validation score with cv = 10 : ", cross_val_10.mean())
    print("Cross validation score with cv = 5 : ", cross_val_5.mean())

    # Hold-out evaluation uses the estimator fitted on the training part above.
    y_pred = clf_rf.predict(X_test)
    print("\nClassification Report")
    print(classification_report(y_test, y_pred))
    print("\nAccuracy score for Random Forest:", accuracy_score(y_test, y_pred))
    print("\nF1 score for Random Forest:", f1_score(y_test, y_pred, average='weighted'))
    print("\n---------------------------------------------------------------------------------------------------------------")

def run_KNN(X, y, split, k, random_state=None):
    """Train a k-nearest-neighbours classifier on a hold-out split and print its evaluation.

    The data is split with ``train_test_split``, a
    ``KNeighborsClassifier(n_neighbors=k)`` is fitted on the training part,
    10-fold and 5-fold cross-validation accuracy on the training part are
    printed, and a classification report, accuracy and weighted F1 on the
    test part are printed.

    Parameters:
        X: feature matrix.
        y: labels aligned with ``X``.
        split: passed as ``test_size`` to ``train_test_split``.
        k: number of neighbours, passed as ``n_neighbors``.
        random_state: seed forwarded to ``train_test_split`` so the split can
            be reproduced. Defaults to None (unseeded).

    Returns:
        None. All results are written to stdout.
    """
    print("\n ---------------------------------------------")
    print("K-nearest neighbor algorithm")
    print(" --------------------------------------------- \n")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split, random_state=random_state)
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train)

    cross_val_10 = cross_val_score(neigh, X_train, y_train, scoring='accuracy', cv=10)
    cross_val_5 = cross_val_score(neigh, X_train, y_train, scoring='accuracy', cv=5)

    print("Cross validation score with cv = 10 : ", cross_val_10.mean())
    print("Cross validation score with cv = 5 : ", cross_val_5.mean())

    # Hold-out evaluation uses the estimator fitted on the training part above.
    y_pred = neigh.predict(X_test)
    print("\nClassification Report")
    print(classification_report(y_test, y_pred))
    print("\nAccuracy score for KNN:", accuracy_score(y_test, y_pred))
    print("\nF1 score for KNN:", f1_score(y_test, y_pred, average='weighted'))
    print("\n---------------------------------------------------------------------------------------------------------------")

def run_SVM(X, y, split, random_state=None):
    """Train a support-vector classifier on a hold-out split and print its evaluation.

    The data is split with ``train_test_split``, an ``svm.SVC(gamma=0.1)`` is
    fitted on the training part, 10-fold and 5-fold cross-validation accuracy
    on the training part are printed, and a classification report, accuracy
    and weighted F1 on the test part are printed.

    Parameters:
        X: feature matrix.
        y: labels aligned with ``X``.
        split: passed as ``test_size`` to ``train_test_split``.
        random_state: seed forwarded to ``train_test_split`` so the split can
            be reproduced. Defaults to None (unseeded).

    Returns:
        None. All results are written to stdout.
    """
    print("\n ---------------------------------------------")
    print("SVM algorithm")
    print(" --------------------------------------------- \n")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split, random_state=random_state)
    model = svm.SVC(gamma=0.1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    cross_val_10 = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=10)
    cross_val_5 = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5)

    print("Cross validation score with cv = 10 : ", cross_val_10.mean())
    print("Cross validation score with cv = 5 : ", cross_val_5.mean())

    report = classification_report(y_test, y_pred)
    print("Classification report:\n%s" % report)
    print("Accuracy score for SVM is:", accuracy_score(y_test, y_pred))
    print("\nF1 score for SVM:", f1_score(y_test, y_pred, average='weighted'))
    print("\n---------------------------------------------------------------------------------------------------------------")

In [10]:
print("Running ......")

X, y = read_data()
# Hand feature_engineering a copy so X is guaranteed to keep every column for
# the baseline runs below, whether or not the function prunes its argument
# in place.
new_x, new_y = feature_engineering(X.copy(), y)
print("After feature_engineering")
print("new_x : ", new_x.shape)
print("new_y : ", new_y.shape)
# Baseline: evaluate each model on the full, unpruned feature set.
run_RF(X, y, 0.3)
run_KNN(X, y, 0.3, 3)
# The SVM run is limited to the first 1000 rows.
run_SVM(X[:1000], y[:1000], 0.3)

Running ......
app_train_final (10000, 202)
(10000, 202)
(10000,)




Unimportant features :  74
Sum :  99.99999999999997
After feature_engineering
new_x :  (10000, 128)
new_y :  (10000,)

 ---------------------------------------------
Random forest algorithm

 ---------------------------------------------
Cross validation score with cv = 10 :  0.9250008673487089
Cross validation score with cv = 5 :  0.925

Classification Report
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2750
           1       0.00      0.00      0.00       250

   micro avg       0.92      0.92      0.92      3000
   macro avg       0.46      0.50      0.48      3000
weighted avg       0.84      0.92      0.88      3000


Accuracy score for Random Forest: 0.9166666666666666

F1 score for Random Forest: 0.8768115942028986

---------------------------------------------------------------------------------------------------------------


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



 ---------------------------------------------
K-nearest neighbor algorithm
 --------------------------------------------- 

Cross validation score with cv = 10 :  0.9068561743420467
Cross validation score with cv = 5 :  0.9055713067783635

Classification Report
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      2771
           1       0.11      0.02      0.04       229

   micro avg       0.91      0.91      0.91      3000
   macro avg       0.52      0.50      0.49      3000
weighted avg       0.86      0.91      0.88      3000


Accuracy score for KNN: 0.9113333333333333

F1 score for KNN: 0.8835086541285613

---------------------------------------------------------------------------------------------------------------

 ---------------------------------------------
SVM algorithm
 --------------------------------------------- 

Cross validation score with cv = 10 :  0.9343568658326772
Cross validation score with cv = 5 :  0.93429

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [11]:
# Repeat the three evaluations on the feature set returned by feature_engineering.
run_RF(X=new_x, y=new_y, split=0.3)
run_KNN(X=new_x, y=new_y, split=0.3, k=3)
# The SVM run is limited to the first 1000 rows (positional slice).
run_SVM(X=new_x.iloc[:1000], y=new_y.iloc[:1000], split=0.3)


 ---------------------------------------------
Random forest algorithm

 ---------------------------------------------
Cross validation score with cv = 10 :  0.9244287448983133
Cross validation score with cv = 5 :  0.9244286580904232

Classification Report
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2754
           1       0.00      0.00      0.00       246

   micro avg       0.92      0.92      0.92      3000
   macro avg       0.46      0.50      0.48      3000
weighted avg       0.84      0.92      0.88      3000


Accuracy score for Random Forest: 0.918

F1 score for Random Forest: 0.8787528675703858

---------------------------------------------------------------------------------------------------------------

 ---------------------------------------------
K-nearest neighbor algorithm
 --------------------------------------------- 



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Cross validation score with cv = 10 :  0.9062855976674007
Cross validation score with cv = 5 :  0.90842935014617

Classification Report
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      2758
           1       0.14      0.03      0.05       242

   micro avg       0.91      0.91      0.91      3000
   macro avg       0.53      0.51      0.50      3000
weighted avg       0.86      0.91      0.88      3000


Accuracy score for KNN: 0.905

F1 score for KNN: 0.8776465300492772

---------------------------------------------------------------------------------------------------------------

 ---------------------------------------------
SVM algorithm
 --------------------------------------------- 

Cross validation score with cv = 10 :  0.9372140086898201
Cross validation score with cv = 5 :  0.9371518226148565
Classification report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       2

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
