In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import time
import gc
import xgboost as xgb
from category_encoders import OneHotEncoder
from sklearn.metrics import f1_score, cohen_kappa_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from subprocess import check_output
print(check_output(['ls', 'ds_data']).decode('utf-8'))

data_test.csv
data_train.csv
readme



In [2]:
np.random.seed(13)

In [3]:
train = pd.read_csv("ds_data/data_train.csv")
test = pd.read_csv("ds_data/data_test.csv")

In [4]:
train_id = train.id.values
target = train.target.values
train.drop(['id', 'target'], inplace=True, axis=1)
#print("train {0} and target {1}".format(train.shape, target.shape))

test_id = test.id.values
test.drop(['id'], inplace=True, axis=1)
#print("test {0} and id {1}".format(test.shape, test_id.shape))

#dropping colunms with 99% with one unique value
train.drop(['num7', 'num8', 'num9'], inplace=True, axis=1)
test.drop(['num7', 'num8', 'num9'], inplace=True, axis=1)
print("train {0} and test {1}".format(train.shape, test.shape))

In [5]:
#filling null with -1
null_cols =  [c for c in train.columns if train[c].isnull().sum() != 0]
for col in null_cols:
    train[col] = train[col].fillna(-1.0)
    test[col]  = test[col].fillna(-1.0)

In [6]:
#one hot encodeing columns with less than 5 categories
cat_with_less_5_nunique = [c for c in train.columns if train[c].nunique() < 5]
encoder = OneHotEncoder(return_df=True, drop_invariant=True, cols=cat_with_less_5_nunique)

train = encoder.fit_transform(train)
test = encoder.fit_transform(test)
print("train {0} and test {1}".format(train.shape, test.shape))

train (596000, 84) and test (892816, 84)


In [7]:
#random split train set to train and validation 
train['target'] = target
mask = np.random.rand(train.shape[0]) < 0.65
X_train = train[mask].copy()
X_test = train[~mask].copy()

y_test = X_test.target.values
X_test.drop(['target'], inplace=True, axis=1)
print(X_train.shape, X_test.shape)

(387553, 85) (208447, 84)


In [8]:
#del train, test
#gc.collect()
X_train.target.value_counts()/X_train.shape[0]

0    0.96351
1    0.03649
Name: target, dtype: float64

In [9]:
#down sampling Majority class to balance  
train_majority = X_train[X_train['target'] == 0]
train_minority = X_train[X_train['target'] == 1]

train_majority_downsampled = resample(train_majority, replace=False, n_samples=30000, random_state=13)
train_downsampled = pd.concat([train_majority_downsampled, train_minority])
del train_majority,train_minority, train_majority_downsampled
gc.collect()

760

In [10]:
train_downsampled.target.value_counts() /train_downsampled.shape[0]

0    0.679625
1    0.320375
Name: target, dtype: float64

In [11]:
train_downsampled.shape

(44142, 85)

In [12]:
y = train_downsampled.target.values
X = train_downsampled.drop(['target'], axis=1)

del train_downsampled; gc.collect()

0

In [13]:
print(X.shape, X_test.shape)

(44142, 84) (208447, 84)


In [14]:
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.4, random_state=13)
print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape)
#in training set over sampling Minority and balanceing by SMOTE-Tomex
#sm = SMOTETomek(random_state=13)
#X_res, y_res = sm.fit_sample(X_tr, y_tr)
print("0 : {0}, 1 : {1}".format((y_tr==0).sum(), (y_tr==1).sum()))
#print(X_res.shape, y_res.shape)

(26485, 84) (17657, 84) (26485,) (17657,)
0 : 18021, 1 : 8464


In [15]:
dtrain = xgb.DMatrix(data = X_tr, label= y_tr)
dvalid = xgb.DMatrix(data = X_val, label=y_val)
dXtest = xgb.DMatrix(data = X_test)

In [16]:
xgb_params = {
    'booster': 'gbtree',
    'learning_rate': 0.02,
    'max_depth': 4,
    'min_child_weight':10,
    'gamma':10,
    'subsample':0.8,
    'tree_method': 'hist',
    'colsample_bytree':0.7,
    'grow_ploicy':'lossguide',
    'objective':'binary:logistic',
    'eval_metric':'auc',
    #'rate_drop':0.8, #dart booster
    #'one_drop':0, #dart booster    
    #'skip_drop':0.9, #dart booster 
    'silent': 1
}

In [17]:
model = xgb.train(xgb_params,
                 dtrain=dtrain,
                 num_boost_round=1000,
                 evals = [(dtrain, 'train'), (dvalid, 'valid')],
                 maximize = True,
                 verbose_eval = 20,
                 early_stopping_rounds = 50)


[16:29:49] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.591511	valid-auc:0.586705
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[20]	train-auc:0.626259	valid-auc:0.622423
[40]	train-auc:0.633154	valid-auc:0.628253
[60]	train-auc:0.637098	valid-auc:0.631609
[80]	train-auc:0.640298	valid-auc:0.63388
[100]	train-auc:0.643342	valid-auc:0.636067
[120]	train-auc:0.645676	valid-auc:0.63776
[140]	train-auc:0.647824	valid-auc:0.639094
[160]	train-auc:0.649247	valid-auc:0.639564
[180]	train-auc:0.650838	valid-auc:0.639888
[200]	train-auc:0.652204	valid-auc:0.64034
[220]	train-auc:0.653502	valid-auc:0.640397
[240]	train-auc:0.654512	valid-auc:0.640755
[260]	train-auc:0.655391	valid-auc:0.640942
[280]	train-auc:0.656311	valid-auc:0.641138
[300]	train-auc:0.657337	valid-auc:0.641134
[320]	train-auc:0.658136	valid-auc:0.641515
[340]	train-auc:0.65

In [18]:
test_pred = model.predict(dXtest, )

test_pred = np.where(test_pred >= 0.5, 1, 0)

In [19]:
confusion_matrix(y_test, test_pred)

array([[194267,   6606],
       [  6884,    690]])

In [20]:
sum(y_test == 1)

7574

In [21]:
f1_score(y_test, test_pred)

0.09280430396772024

In [22]:
cohen_kappa_score(y_test, test_pred)

0.0592612550723941

In [23]:
#xgb.plot_importance(model)

In [24]:
fscore = model.get_fscore()
xgb_imp = pd.DataFrame({"feat_name": list(fscore.keys()), "score": list(fscore.values())})
imp_feat = xgb_imp.sort_values(by='score', ascending=False).head(15)['feat_name'].tolist()

In [25]:
dtrain = xgb.DMatrix(data = X_tr[imp_feat], label= y_tr)
dvalid = xgb.DMatrix(data = X_val[imp_feat], label=y_val)
dXtest = xgb.DMatrix(data = X_test[imp_feat])

In [26]:
model = xgb.train(xgb_params,
                 dtrain=dtrain,
                 num_boost_round=1000,
                 evals = [(dtrain, 'train'), (dvalid, 'valid')],
                 maximize = True,
                 verbose_eval = 20,
                 early_stopping_rounds = 50)

[16:29:54] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.59518	valid-auc:0.588159
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[20]	train-auc:0.623404	valid-auc:0.617979
[40]	train-auc:0.627963	valid-auc:0.621834
[60]	train-auc:0.630687	valid-auc:0.623862
[80]	train-auc:0.63356	valid-auc:0.626066
[100]	train-auc:0.635376	valid-auc:0.62808
[120]	train-auc:0.637507	valid-auc:0.629557
[140]	train-auc:0.638678	valid-auc:0.630803
[160]	train-auc:0.639845	valid-auc:0.631325
[180]	train-auc:0.641244	valid-auc:0.6321
[200]	train-auc:0.642021	valid-auc:0.632673
[220]	train-auc:0.642608	valid-auc:0.633017
[240]	train-auc:0.643262	valid-auc:0.633179
[260]	train-auc:0.643785	valid-auc:0.63363
[280]	train-auc:0.644465	valid-auc:0.633891
[300]	train-auc:0.645101	valid-auc:0.634045
[320]	train-auc:0.645436	valid-auc:0.634171
[340]	train-auc:0.64587

In [27]:
test_pred = model.predict(dXtest)

test_pred = np.where(test_pred >= 0.5, 1, 0)

In [28]:
confusion_matrix(y_test, test_pred)

array([[194987,   5886],
       [  6961,    613]])

In [29]:
f1_score(y_test, test_pred)

0.08711717473175584

In [30]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, max_depth=4, random_state=13)

In [31]:
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=13, verbose=0, warm_start=False)

In [32]:
rf.score(X_val, y_val)

0.67961714900606

In [33]:
rf_pred_test = rf.predict(X_test)

In [34]:
test_pred = (test_pred + rf_pred_test)/2

In [35]:
confusion_matrix(y_test, rf_pred_test)

array([[200712,    161],
       [  7540,     34]])

In [36]:
test_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [37]:
rf_pred_test

array([0, 0, 0, ..., 0, 0, 0])

In [38]:
rf_imp = pd.DataFrame({"feat_name": X.columns, "score":rf.feature_importances_})
rf_imp.sort_values(by = 'score', ascending=False).head(15)

Unnamed: 0,feat_name,score
60,num21,0.142689
57,num18,0.081843
19,num14_1,0.058751
18,num14_0,0.058684
78,cat3,0.053752
2,num4_0,0.045511
56,num17,0.04288
3,num4_1,0.040213
1,num3_1,0.039262
0,num3_0,0.03893


In [39]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X, y, cv=5)

In [40]:
scores

array([0.68014498, 0.68025824, 0.6803353 , 0.67976892, 0.68067512])

In [41]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1e1, solver='lbfgs', max_iter=10000, random_state=13)

In [42]:
lr.fit(X, y)

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=13, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [43]:
lr.score(X_test, y_test)

0.9310664101666131

In [44]:
lr_pred_test = lr.predict(X_test)

In [45]:
lr_pred_test

array([0, 0, 0, ..., 0, 0, 0])

In [46]:
confusion_matrix(y_test, lr_pred_test)

array([[193393,   7480],
       [  6889,    685]])