This kernel shows some experiments I have done.

1. basedline model logistic regression (result: overfitting)
2. feature binning (result: overfitting, but slightly improve the result)
3. feature selection (result: better, but still overfitting)

### 1. import packages and data

In [27]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import statsmodels.api as sm

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import KBinsDiscretizer

%matplotlib inline

In [28]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

### 2. Baseline model

In [29]:
train_x = train.drop(['id', 'target'], axis = 1)
train_y = train['target']
test_x = test.drop(["id"], axis = 1)

In [30]:
def baseline_model(train_x, train_y, run_num = 10, fold = 5):
    train_result, test_result = [], []
    for i in range(run_num):
        # result list
        train_fold, test_fold = [], []
        # split dataset
        skf = StratifiedKFold(n_splits = fold, shuffle = True)
        fold_num = 1
        for train_index, valid_index in skf.split(train_x, train_y):
            # dataset
            X_train, X_valid = train_x.iloc[train_index], train_x.iloc[valid_index]
            y_train, y_valid = train_y.iloc[train_index], train_y.iloc[valid_index]
            # model
            reg = LogisticRegression(solver = "liblinear", penalty = "l2")
            reg.fit(X_train, y_train)
            y_train_pred = reg.predict(X_train)
            y_valid_pred = reg.predict(X_valid)
            # result AUC
            train_auc = roc_auc_score(y_train, y_train_pred)
            test_auc = roc_auc_score(y_valid, y_valid_pred)
            if i == 1:
                print("TRAIN Fold {0}, AUC score: {1}".format(fold_num, round(train_auc, 4)))
                print("TEST Fold {0}, AUC score: {1}".format(fold_num, round(test_auc, 4)))
            fold_num += 1
            train_fold.append(train_auc)
            test_fold.append(test_auc)
        train_result.append(train_fold)
        test_result.append(test_fold)
    return train_result, test_result

In [31]:
train_result, test_result = baseline_model(train_x = train_x, train_y = train_y, run_num = 10, fold = 5)

TRAIN Fold 1, AUC score: 1.0
TEST Fold 1, AUC score: 0.6424
TRAIN Fold 2, AUC score: 1.0
TEST Fold 2, AUC score: 0.5938
TRAIN Fold 3, AUC score: 1.0
TEST Fold 3, AUC score: 0.7951
TRAIN Fold 4, AUC score: 1.0
TEST Fold 4, AUC score: 0.6684
TRAIN Fold 5, AUC score: 1.0
TEST Fold 5, AUC score: 0.6337


In [32]:
def model_result(train_result, test_result):
    base_test_re = pd.DataFrame(test_result).T
    base_test_re.index = ['fold {0}'.format(i) for i in range(5)]
    base_test_re.columns = ['run {0}'.format(i) for i in range(10)]
    base_train_re = pd.DataFrame(train_result).T
    base_train_re.index = ['fold {0}'.format(i) for i in range(5)]
    base_train_re.columns = ['run {0}'.format(i) for i in range(10)]
    return base_train_re, base_test_re
base_train_re, base_test_re = model_result(train_result, test_result)

In [33]:
base_train_re

Unnamed: 0,run 0,run 1,run 2,run 3,run 4,run 5,run 6,run 7,run 8,run 9
fold 0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
fold 1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
fold 2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
fold 3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
fold 4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
base_test_re.round(3)

Unnamed: 0,run 0,run 1,run 2,run 3,run 4,run 5,run 6,run 7,run 8,run 9
fold 0,0.594,0.642,0.767,0.668,0.578,0.78,0.618,0.597,0.689,0.783
fold 1,0.674,0.594,0.606,0.641,0.649,0.622,0.646,0.668,0.63,0.705
fold 2,0.72,0.795,0.674,0.63,0.627,0.661,0.661,0.531,0.752,0.639
fold 3,0.634,0.668,0.594,0.665,0.602,0.727,0.752,0.595,0.599,0.618
fold 4,0.594,0.634,0.668,0.642,0.609,0.611,0.649,0.653,0.641,0.656


**Conclusion:** Overfitting..., I also tried to change logistic regression parameters, but it doesn't improve the score and cannot handle overfitting problem.

### 3. feature bining
```
if x < quantile(0.1):
     x = 0
if quantile(0.1) < x < quantile(0.9)
     x = 1-9
if x > quantile(0.9):
     x = 10
```

In [35]:
def binning(data, feature, n_bins):
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    est.fit(data[feature].values)
    Xt = est.transform(data[feature].values)
    data[feature] = pd.DataFrame(Xt)
    return data

In [36]:
train_x_bin = binning(train_x, train_x.columns, n_bins = 15)
test_x_bin = binning(test_x, test_x.columns, n_bins = 15)

In [37]:
train_result_bin, test_result_bin = baseline_model(train_x_bin, train_y, run_num = 10, fold = 5)

TRAIN Fold 1, AUC score: 1.0
TEST Fold 1, AUC score: 0.6441
TRAIN Fold 2, AUC score: 1.0
TEST Fold 2, AUC score: 0.7083
TRAIN Fold 3, AUC score: 1.0
TEST Fold 3, AUC score: 0.6493
TRAIN Fold 4, AUC score: 1.0
TEST Fold 4, AUC score: 0.5851
TRAIN Fold 5, AUC score: 1.0
TEST Fold 5, AUC score: 0.6372


In [38]:
base_train_re_bin, base_test_re_bin = model_result(train_result, test_result)

In [39]:
base_train_re_bin

Unnamed: 0,run 0,run 1,run 2,run 3,run 4,run 5,run 6,run 7,run 8,run 9
fold 0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
fold 1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
fold 2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
fold 3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
fold 4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
base_test_re_bin.round(3)

Unnamed: 0,run 0,run 1,run 2,run 3,run 4,run 5,run 6,run 7,run 8,run 9
fold 0,0.594,0.642,0.767,0.668,0.578,0.78,0.618,0.597,0.689,0.783
fold 1,0.674,0.594,0.606,0.641,0.649,0.622,0.646,0.668,0.63,0.705
fold 2,0.72,0.795,0.674,0.63,0.627,0.661,0.661,0.531,0.752,0.639
fold 3,0.634,0.668,0.594,0.665,0.602,0.727,0.752,0.595,0.599,0.618
fold 4,0.594,0.634,0.668,0.642,0.609,0.611,0.649,0.653,0.641,0.656


**Conclusion:** I got better results but still overfitting...

### 3. feature selection by statistics test

In [41]:
sig_features = []
for each_feature in train.columns[2:]:
    X = train[each_feature]
    X = sm.add_constant(X)
    y = train.iloc[:,1]
    model = sm.OLS(y, X)
    result = model.fit()
    pvalue = result.pvalues[1]
    # using 90% significance level
    if pvalue <= 0.1:
        print("Feature {0}, p value is {1}".format(each_feature, round(pvalue, 3)))
        sig_features.append(each_feature)

Feature 0, p value is 0.086
Feature 4, p value is 0.07
Feature 13, p value is 0.089
Feature 16, p value is 0.023
Feature 17, p value is 0.08
Feature 24, p value is 0.006
Feature 30, p value is 0.036
Feature 33, p value is 0.0
Feature 39, p value is 0.024
Feature 43, p value is 0.055
Feature 63, p value is 0.044
Feature 65, p value is 0.0
Feature 73, p value is 0.008
Feature 80, p value is 0.01
Feature 82, p value is 0.043
Feature 90, p value is 0.031
Feature 91, p value is 0.002
Feature 101, p value is 0.062
Feature 105, p value is 0.081
Feature 108, p value is 0.072
Feature 114, p value is 0.049
Feature 117, p value is 0.002
Feature 129, p value is 0.027
Feature 133, p value is 0.028
Feature 134, p value is 0.035
Feature 150, p value is 0.039
Feature 164, p value is 0.05
Feature 165, p value is 0.024
Feature 183, p value is 0.009
Feature 189, p value is 0.014
Feature 194, p value is 0.017
Feature 199, p value is 0.012
Feature 201, p value is 0.025
Feature 209, p value is 0.044
Feature

In [42]:
train_x = train.drop(['id', 'target'], axis = 1)
train_y = train['target']

In [43]:
train_select_x = train_x[sig_features]
train_select_bin_x = train_x_bin[sig_features]

In [44]:
train_result_select, test_result_select = baseline_model(train_select_x, train_y, run_num = 10, fold = 5)
base_train_re_select, base_test_re_select = model_result(train_result_select, test_result_select)

TRAIN Fold 1, AUC score: 0.9666
TEST Fold 1, AUC score: 0.8142
TRAIN Fold 2, AUC score: 0.9705
TEST Fold 2, AUC score: 0.7708
TRAIN Fold 3, AUC score: 0.9635
TEST Fold 3, AUC score: 0.7517
TRAIN Fold 4, AUC score: 0.9457
TEST Fold 4, AUC score: 0.8299
TRAIN Fold 5, AUC score: 0.9783
TEST Fold 5, AUC score: 0.7986


In [45]:
base_train_re_select

Unnamed: 0,run 0,run 1,run 2,run 3,run 4,run 5,run 6,run 7,run 8,run 9
fold 0,0.996094,0.96658,0.938802,0.96658,0.981337,0.977431,0.978299,0.956597,0.952691,0.963542
fold 1,0.945747,0.970486,0.959635,0.946615,0.985243,0.977431,0.931858,0.981337,0.970486,0.967448
fold 2,0.944878,0.963542,0.971354,0.963542,0.949653,0.952691,0.963542,0.963542,0.974392,0.947917
fold 3,0.963542,0.945747,0.96658,0.967448,0.981337,0.952691,0.974392,0.977431,0.985243,0.985243
fold 4,0.974392,0.978299,0.963542,0.959635,0.963542,0.949653,0.974392,0.978299,0.974392,0.970486


In [46]:
base_test_re_select

Unnamed: 0,run 0,run 1,run 2,run 3,run 4,run 5,run 6,run 7,run 8,run 9
fold 0,0.767361,0.814236,0.857639,0.819444,0.798611,0.736111,0.758681,0.854167,0.782986,0.763889
fold 1,0.885417,0.770833,0.779514,0.789931,0.699653,0.826389,0.897569,0.652778,0.838542,0.770833
fold 2,0.866319,0.751736,0.802083,0.842014,0.857639,0.786458,0.838542,0.795139,0.708333,0.897569
fold 3,0.727431,0.829861,0.894097,0.826389,0.795139,0.897569,0.822917,0.798611,0.850694,0.770833
fold 4,0.802083,0.798611,0.791667,0.743056,0.795139,0.814236,0.782986,0.842014,0.711806,0.798611


In [47]:
train_result_bin_select, test_result_bin_elect = baseline_model(train_select_bin_x, train_y, run_num = 10, fold = 5)
base_train_re_bin_select, base_test_re_bin_select = model_result(train_result_bin_select, test_result_bin_elect)

TRAIN Fold 1, AUC score: 0.9566
TEST Fold 1, AUC score: 0.7899
TRAIN Fold 2, AUC score: 0.9961
TEST Fold 2, AUC score: 0.783
TRAIN Fold 3, AUC score: 0.9023
TEST Fold 3, AUC score: 0.8698
TRAIN Fold 4, AUC score: 0.9644
TEST Fold 4, AUC score: 0.7639
TRAIN Fold 5, AUC score: 0.9319
TEST Fold 5, AUC score: 0.8229


In [48]:
base_train_re_bin_select

Unnamed: 0,run 0,run 1,run 2,run 3,run 4,run 5,run 6,run 7,run 8,run 9
fold 0,0.971354,0.956597,0.937934,0.957465,0.948785,0.960503,0.927951,0.94184,0.974392,0.902344
fold 1,0.953559,0.996094,1.0,0.934896,0.957465,0.945747,0.985243,0.949653,1.0,0.945747
fold 2,0.921007,0.902344,0.927951,0.959635,0.920139,0.985243,0.945747,0.986111,0.927083,1.0
fold 3,0.945747,0.96441,0.924045,0.942708,0.97526,1.0,0.949653,0.986111,0.934896,0.978299
fold 4,0.956597,0.931858,0.953559,0.971354,0.989149,0.963542,0.984375,0.985243,0.961372,0.953559


In [49]:
base_test_re_bin_select

Unnamed: 0,run 0,run 1,run 2,run 3,run 4,run 5,run 6,run 7,run 8,run 9
fold 0,0.711806,0.789931,0.845486,0.763889,0.786458,0.826389,0.854167,0.835069,0.723958,0.845486
fold 1,0.826389,0.782986,0.696181,0.842014,0.786458,0.814236,0.727431,0.684028,0.723958,0.798611
fold 2,0.798611,0.869792,0.850694,0.743056,0.736111,0.842014,0.894097,0.727431,0.736111,0.748264
fold 3,0.774306,0.763889,0.829861,0.835069,0.881944,0.779514,0.779514,0.826389,0.802083,0.723958
fold 4,0.723958,0.822917,0.739583,0.746528,0.751736,0.696181,0.699653,0.755208,0.770833,0.822917


### 4. submission

In [55]:
train_select_bin_x = train_x_bin[sig_features]
test_select_bin_x = test_x_bin[sig_features]

In [58]:
# split dataset
skf = StratifiedKFold(n_splits = 5, shuffle = True)
fold_num = 1
y_test = np.zeros(len(test_select_bin_x))
for train_index, valid_index in skf.split(train_select_bin_x, train_y):
    # dataset
    X_train, X_valid = train_select_bin_x.iloc[train_index], train_select_bin_x.iloc[valid_index]
    y_train, y_valid = train_y.iloc[train_index], train_y.iloc[valid_index]
    # model
    reg = LogisticRegression(solver = "liblinear", penalty = "l2")
    reg.fit(X_train, y_train)
    y_train_pred = reg.predict(X_train)
    y_valid_pred = reg.predict(X_valid)
    # result AUC
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_valid, y_valid_pred)
    fold_num += 1
    # predict test set
    y_test_fold = reg.predict_proba(test_select_bin_x)[:, 1]
    y_test += y_test_fold
y_test = y_test/5

In [62]:
sub = pd.read_csv("../input/sample_submission.csv")
sub['target'] = y_test
sub.to_csv("submission_logit.csv", index = False)