In [1]:
import os
from os.path import join as pjoin

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import roc_auc_score
import xgboost as xgb


from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA



# feature selection
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel,RFECV
from sklearn.preprocessing import Binarizer, scale,normalize

import warnings
warnings.filterwarnings('ignore')

In [8]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
def split(x,n=None):
    if n==None:
        n=int(len(x)*.7)
    
    return x[:n],x[n:]

# Provide a place holder for train
def split_data(df):
    y = df['TARGET'].copy()
    x = df.drop('TARGET',axis=1)
    global X_train,X_valid,y_train,y_valid
    
    X_train ,X_valid = split(x)
    y_train ,y_valid = split(y)

In [4]:

def eval(clf):
    train_pred = clf.predict(X_train)
    valid_pred = clf.predict(X_valid)
    if train_pred.sum()==0:
        print("All prediciton on train is 0")
        
    if valid_pred.sum()==0:
        print("All prediciton on valid is 0")
    print('ROC on training set : ',roc_auc_score(y_train,train_pred))
    print('ROC on valid set : ',roc_auc_score(y_valid,valid_pred))
    
    

In [5]:
def plot_fi(clf,n_top=30):
    fi = pd.DataFrame({'cols':X_train.columns,'fi':clf.feature_importances_}).sort_values('fi',ascending=False)
    fi.iloc[:n_top].plot(x='cols',y='fi',kind='barh')
    return list(fi.cols.values)
    

In [6]:
def base_model():
    clf = RFC(criterion='gini', class_weight='balanced', max_depth=10,random_state=1)
    clf.fit(X_train,y_train)
    eval(clf)

146 columns with skewed value


In [17]:
skew_cols = []
for col in train.columns:
    train_cnt = train[col].value_counts().sort_values(ascending=False)/len(train)
    if train_cnt.values[0]>0.999:
        skew_cols.append(col)
        
train.drop(skew_cols,axis=1,inplace=True)        
test.drop(skew_cols,axis=1,inplace=True)
print(f'{len(skew_cols)} columns with skewed value')

##################
dup_cols = []
for col_1 in train.columns:
    v = train[col_1].values
    for col_2 in train.columns:
        #if col_1 != col_2 and train[col_1].sum() == train[col_2].sum():
        if col_1 != col_2:
            if np.array_equal(train[col_1],train[col_2]):
                dup_cols.append(col_2)

##################
train.drop(dup_cols,axis=1,inplace=True)
test.drop(dup_cols,axis=1,inplace=True)

print(f'{len(dup_cols)} columns with duplicate value')

##################
train['var3'].replace(-999999,0, inplace=True)
test['var3'].replace(-999999,0, inplace=True)

##################
train.drop('ID',axis=1,inplace=True)
test_id = test.ID
test.drop('ID',axis=1,inplace=True)

0 columns with skewed value
0 columns with duplicate value


KeyError: "['ID'] not found in axis"

In [13]:
pca = PCA(n_components=3)
x_train_projected = pca.fit_transform(normalize(train.drop('TARGET',axis=1), axis=0))
x_test_projected = pca.transform(normalize(test, axis=0))
print(pca.explained_variance_ratio_)
train.insert(1, 'PCA1', x_train_projected[:, 0])
train.insert(1, 'PCA2', x_train_projected[:, 1])
train.insert(1, 'PCA3', x_train_projected[:, 2])

test.insert(1, 'PCA1', x_test_projected[:, 0])
test.insert(1, 'PCA2', x_test_projected[:, 1])
test.insert(1, 'PCA3', x_test_projected[:, 2])



[0.11167207 0.08418294 0.0609069 ]


In [14]:
split_data(train)

In [15]:
# classifier
clf = xgb.XGBClassifier(missing=np.nan, max_depth=5, n_estimators=350, 
                        learning_rate=0.03, nthread=4, subsample=0.95, 
                        colsample_bytree=0.85, seed=4242)



In [16]:
clf.fit(X_train,y_train,early_stopping_rounds=20, eval_metric="auc",eval_set=[(X_valid,y_valid)])

[0]	validation_0-auc:0.779954
Will train until validation_0-auc hasn't improved in 20 rounds.
[1]	validation_0-auc:0.814072
[2]	validation_0-auc:0.824869
[3]	validation_0-auc:0.821514
[4]	validation_0-auc:0.828019
[5]	validation_0-auc:0.83093
[6]	validation_0-auc:0.831745
[7]	validation_0-auc:0.829176
[8]	validation_0-auc:0.831217
[9]	validation_0-auc:0.832172
[10]	validation_0-auc:0.833506
[11]	validation_0-auc:0.833584
[12]	validation_0-auc:0.833731
[13]	validation_0-auc:0.83452
[14]	validation_0-auc:0.834913
[15]	validation_0-auc:0.835423
[16]	validation_0-auc:0.835702
[17]	validation_0-auc:0.835924
[18]	validation_0-auc:0.835827
[19]	validation_0-auc:0.836253
[20]	validation_0-auc:0.837041
[21]	validation_0-auc:0.837081
[22]	validation_0-auc:0.837896
[23]	validation_0-auc:0.837912
[24]	validation_0-auc:0.838
[25]	validation_0-auc:0.838513
[26]	validation_0-auc:0.83849
[27]	validation_0-auc:0.838247
[28]	validation_0-auc:0.838543
[29]	validation_0-auc:0.838763
[30]	validation_0-auc:

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.85, gamma=0, learning_rate=0.03,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=350, n_jobs=1, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=4242, silent=True, subsample=0.95)

In [24]:
clf.predict(test).sum()

3