## 【問題1】クロスバリデーション
事前学習期間では検証用データをはじめに分割しておき、それに対して指標値を計算することで検証を行っていました。（ホールドアウト法）しかし、分割の仕方により精度は変化します。実践的には クロスバリデーション（交差検証） を行います。分割を複数回行い、それぞれに対して学習と検証を行う方法です。複数回の分割のためにscikit-learnにはKFoldクラスが用意されています。

事前学習期間の課題で作成したベースラインモデルに対してKFoldクラスによるクロスバリデーションを行うコードを作成し実行してください。

In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [26]:
train = pd.read_csv('/Users/user/Downloads/material_dive/kaggle/home-credit-default-risk/application_train.csv')

In [27]:
test = pd.read_csv('/Users/user/Downloads/material_dive/kaggle/home-credit-default-risk/application_test.csv')

In [28]:
train_X = train.drop(columns=['TARGET'])

In [29]:
train_y = train['TARGET']

In [30]:
 np.array(train_X)

array([[100002, 'Cash loans', 'M', ..., 0.0, 0.0, 1.0],
       [100003, 'Cash loans', 'F', ..., 0.0, 0.0, 0.0],
       [100004, 'Revolving loans', 'M', ..., 0.0, 0.0, 0.0],
       ...,
       [456253, 'Cash loans', 'F', ..., 1.0, 0.0, 1.0],
       [456254, 'Cash loans', 'F', ..., 0.0, 0.0, 0.0],
       [456255, 'Cash loans', 'F', ..., 2.0, 0.0, 1.0]], dtype=object)

## クロスバリデーション　定義

In [102]:
import numpy as np
from sklearn.model_selection import KFold
X =  np.array(train_X)
kf = KFold(n_splits=5, shuffle = True, random_state = 1)
kf.get_n_splits(X)

print(kf)

KFold(n_splits=5, random_state=1, shuffle=True)


## クロスバリデーション　インデックスの表示

In [33]:
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [     0      2      3 ... 307508 307509 307510] TEST: [     1      4      6 ... 307493 307495 307501]
TRAIN: [     0      1      2 ... 307508 307509 307510] TEST: [     3     18     22 ... 307499 307503 307504]
TRAIN: [     0      1      2 ... 307508 307509 307510] TEST: [     9     10     15 ... 307489 307492 307502]
TRAIN: [     0      1      2 ... 307506 307507 307508] TEST: [    12     16     23 ... 307505 307509 307510]
TRAIN: [     1      3      4 ... 307505 307509 307510] TEST: [     0      2      5 ... 307506 307507 307508]


## ホットエンコード

In [136]:
X = pd.get_dummies(train_X)
test = pd.get_dummies(test)

print('Training Features shape: ', X.shape)
print('Testing Features shape: ', test.shape)

Training Features shape:  (307511, 245)
Testing Features shape:  (48744, 242)


In [39]:
X.isnull().sum()

SK_ID_CURR                          0
CNT_CHILDREN                        0
AMT_INCOME_TOTAL                    0
AMT_CREDIT                          0
AMT_ANNUITY                        12
                                   ..
WALLSMATERIAL_MODE_Panel            0
WALLSMATERIAL_MODE_Stone, brick     0
WALLSMATERIAL_MODE_Wooden           0
EMERGENCYSTATE_MODE_No              0
EMERGENCYSTATE_MODE_Yes             0
Length: 245, dtype: int64

## 欠損値のインプット

In [69]:
from sklearn.preprocessing import Imputer
def imputer_device(data):
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imr = imr.fit(data)
    imputed_data = imr.transform(data)
    return imputed_data

In [70]:
X_nonan =  imputer_device(X)
test_nonan = imputer_device(test)

## クロスバリデーショんで ロジスティック回帰を使う

In [45]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Make the model with the specified regularization parameter
log_reg = LogisticRegression(C = 0.0001)
clf = log_reg
cv = kf

scores = cross_val_score(clf, X_nonan, y, cv=cv)
scores

array([0.91917467, 0.9192384 , 0.92010016, 0.91691327, 0.92088062])

## 【問題2】グリッドサーチ

## パラメータ捜索範囲の設定

In [81]:
C = 0.0001
C_list = [C]
for i in range(10):
    C *= 10
    C_list.append(C)
    
C_list

[0.0001,
 0.001,
 0.01,
 0.1,
 1.0,
 10.0,
 100.0,
 1000.0,
 10000.0,
 100000.0,
 1000000.0]

In [82]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
parameters = {'C':C_list}
clf = GridSearchCV(log_reg,parameters)
gs = clf.fit(X_nonan, y)

# sorted(clf.cv_results_.keys())
# ['mean_fit_time', 'mean_score_time', 'mean_test_score',...
#  'param_C', 'param_kernel', 'params',...
#  'rank_test_score', 'split0_test_score',...
#  'split2_test_score', ...
#  'std_fit_time', 'std_score_time', 'std_test_score']

In [83]:
gs.best_score_

0.9192646767107518

In [85]:
gs.cv_results_

{'mean_fit_time': array([13.49084703, 10.85066136, 10.42762995, 13.37147872, 12.94564001,
        15.70965354, 12.59363556, 11.83601499, 12.26587828, 12.16538095,
        13.90194464]),
 'std_fit_time': array([1.42771225, 0.23045182, 1.01987911, 2.53587145, 0.29187924,
        3.67126378, 0.3500925 , 0.94032733, 0.48106212, 0.41386339,
        1.62836954]),
 'mean_score_time': array([0.09835299, 0.04865559, 0.0561866 , 0.14818128, 0.06568233,
        0.32414309, 0.06030695, 0.07691169, 0.05446378, 0.06458831,
        0.13391137]),
 'std_score_time': array([0.06339302, 0.00496351, 0.00678361, 0.09529281, 0.01524161,
        0.0733141 , 0.00946882, 0.03175869, 0.00072845, 0.01367336,
        0.11137411]),
 'param_C': masked_array(data=[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0,
                    10000.0, 100000.0, 1000000.0],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False],
        fill_value='?',
           

In [97]:
gs.cv_results_['params']

[{'C': 0.0001},
 {'C': 0.001},
 {'C': 0.01},
 {'C': 0.1},
 {'C': 1.0},
 {'C': 10.0},
 {'C': 100.0},
 {'C': 1000.0},
 {'C': 10000.0},
 {'C': 100000.0},
 {'C': 1000000.0}]

In [98]:
gs.cv_results_['mean_test_score']

array([0.91926468, 0.91926468, 0.91926468, 0.91926468, 0.91926468,
       0.91926468, 0.91926468, 0.91926468, 0.91926468, 0.91926468,
       0.91926468])

## 【問題3】Kernelからの調査
KaggleのKernelから様々なアイデアを見つけ出して、列挙してください。

## アイデア

/feature importanceから重要な特徴量を見つけだす  
/見つけた特徴量を組み合わせて、新たな特徴量を生成する   
/lightGBMがいいらしいので使ってみる。   
/とりあえす　他のモデルも片っ端から試す。　　  
/外れ値なんとかできないか

### 方針  

/欠損値の補完  
/見つけた特徴量を組み合わせて、新たな特徴量を生成する   
/train_data test_dataの列を揃える    
/lightGBMがいいらしいので使ってみる。   
/とりあえす　他のモデルも片っ端から試す    
/cross_validasion

## テストデータとトレインデータの列数を揃えたい

## alignを使って揃える。

In [137]:

# # Align the training and testing data, keep only columns present in both dataframes
train_a, test_a = X.align(test, join = 'inner', axis = 1)
print(train_a.shape)
print(test_a.shape)

(307511, 242)
(48744, 242)


## 違うパターン「揃え方」
##  trainにしかないデータの違いを抽出

In [138]:
# X_train = pd.DataFrame(X_nonan )
# X_test = pd.DataFrame(test_nonan)
print(X.columns.difference(test.columns))

# X_nonan =  imputer_device(X)
# test_nonan = imputer_device(test)

Index(['CODE_GENDER_XNA', 'NAME_FAMILY_STATUS_Unknown',
       'NAME_INCOME_TYPE_Maternity leave'],
      dtype='object')


## 欠損値のインプットのパンダス版

In [166]:
X_filled = X.fillna(X.mean())
test_filled_np = test.fillna(test.mean())

## trainにかないデータを削除

In [167]:
X_filled = X_filled.drop(columns = ['CODE_GENDER_XNA', 'NAME_FAMILY_STATUS_Unknown',
       'NAME_INCOME_TYPE_Maternity leave'])

In [270]:
print(X_filled.shape)
print(test_filled.shape)

(307511, 242)
(48744, 242)


## 相関関係の確認

In [186]:
# Find correlations with the target and sort
train_data = pd.concat([X_filled, train_y], axis=1)

correlations = train_data.corr()['TARGET'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))

Most Positive Correlations:
 DAYS_REGISTRATION                                    0.041975
OCCUPATION_TYPE_Laborers                             0.043019
FLAG_DOCUMENT_3                                      0.044346
REG_CITY_NOT_LIVE_CITY                               0.044395
FLAG_EMP_PHONE                                       0.045982
NAME_EDUCATION_TYPE_Secondary / secondary special    0.049824
REG_CITY_NOT_WORK_CITY                               0.050994
DAYS_ID_PUBLISH                                      0.051457
CODE_GENDER_M                                        0.054713
DAYS_LAST_PHONE_CHANGE                               0.055218
NAME_INCOME_TYPE_Working                             0.057481
REGION_RATING_CLIENT                                 0.058899
REGION_RATING_CLIENT_W_CITY                          0.060893
DAYS_BIRTH                                           0.078239
TARGET                                               1.000000
Name: TARGET, dtype: float64

Most Negati

In [348]:
print(train_data.shape)#相関ように、train dataにターゲットが入っている。

(307511, 243)


## 相関のいい外部ソース　を選択

In [178]:
# Extract the EXT_SOURCE variables and show correlations
ext_data = train_data[['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()
ext_data_corrs

Unnamed: 0,TARGET,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH
TARGET,1.0,-0.099152,-0.160303,-0.157397,0.078239
EXT_SOURCE_1,-0.099152,1.0,0.135721,0.110606,-0.358525
EXT_SOURCE_2,-0.160303,0.135721,1.0,0.096653,-0.091885
EXT_SOURCE_3,-0.157397,0.110606,0.096653,1.0,-0.181252
DAYS_BIRTH,0.078239,-0.358525,-0.091885,-0.181252,1.0


In [272]:
# Make a new dataframe for polynomial features
poly_features = train_data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'TARGET']]
poly_features_test = test_filled[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]

# imputer for handling missing values
# from sklearn.preprocessing import Imputer
# imputer = Imputer(strategy = 'median')

poly_target = poly_features['TARGET']

poly_features = poly_features.drop(columns = ['TARGET'])

# # Need to impute missing values
# poly_features = imputer.fit_transform(poly_features)
# poly_features_test = imputer.transform(poly_features_test)

from sklearn.preprocessing import PolynomialFeatures
                                  
# Create the polynomial object with specified degree
poly_transformer = PolynomialFeatures(degree = 3)

In [273]:
poly_features

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH
0,0.083037,0.262949,0.139376,-9461
1,0.311267,0.622246,0.510853,-16765
2,0.502130,0.555912,0.729567,-19046
3,0.502130,0.650442,0.510853,-19005
4,0.502130,0.322738,0.510853,-19932
...,...,...,...,...
307506,0.145570,0.681632,0.510853,-9327
307507,0.502130,0.115992,0.510853,-20775
307508,0.744026,0.535722,0.218859,-14966
307509,0.502130,0.514163,0.661024,-11961


## 組み合わせ作成

In [274]:
# Train the polynomial features
poly_transformer.fit(poly_features)

# Transform the features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features.shape)

Polynomial Features shape:  (307511, 35)


In [279]:
poly_features_test

array([[ 1.00000000e+00,  7.52614491e-01,  7.89654351e-01, ...,
        -4.89615795e+02,  5.90566991e+07, -7.12332761e+12],
       [ 1.00000000e+00,  5.64990202e-01,  2.91655532e-01, ...,
        -3.38620166e+03,  1.41278897e+08, -5.89442945e+12],
       [ 1.00000000e+00,  5.01179811e-01,  6.99786830e-01, ...,
        -7.48039386e+03,  2.45326120e+08, -8.04568669e+12],
       ...,
       [ 1.00000000e+00,  7.33503500e-01,  6.32769551e-01, ...,
        -1.28160051e+03,  7.19238232e+07, -4.03638756e+12],
       [ 1.00000000e+00,  3.73090081e-01,  4.45700983e-01, ...,
        -4.95260708e+03,  1.16176497e+08, -2.72522698e+12],
       [ 1.00000000e+00,  5.01179811e-01,  4.56540667e-01, ...,
        -1.03398023e+03,  5.30490449e+07, -2.72171659e+12]])

In [275]:
poly_features

array([[ 1.00000000e+00,  8.30369674e-02,  2.62948593e-01, ...,
        -1.83785678e+02,  1.24755987e+07, -8.46859039e+11],
       [ 1.00000000e+00,  3.11267311e-01,  6.22245775e-01, ...,
        -4.37517365e+03,  1.43582987e+08, -4.71205850e+12],
       [ 1.00000000e+00,  5.02129806e-01,  5.55912083e-01, ...,
        -1.01375679e+04,  2.64650402e+08, -6.90893871e+12],
       ...,
       [ 1.00000000e+00,  7.44026400e-01,  5.35721752e-01, ...,
        -7.16860892e+02,  4.90203102e+07, -3.35210198e+12],
       [ 1.00000000e+00,  5.02129806e-01,  5.14162820e-01, ...,
        -5.22638430e+03,  9.45696770e+07, -1.71120670e+12],
       [ 1.00000000e+00,  7.34459669e-01,  7.08568896e-01, ...,
        -2.18762433e+02,  3.23681708e+07, -4.78920655e+12]])

## feature_nameの作成

In [276]:
poly_transformer.get_feature_names(input_features = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])[:15]

['1',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'DAYS_BIRTH',
 'EXT_SOURCE_1^2',
 'EXT_SOURCE_1 EXT_SOURCE_2',
 'EXT_SOURCE_1 EXT_SOURCE_3',
 'EXT_SOURCE_1 DAYS_BIRTH',
 'EXT_SOURCE_2^2',
 'EXT_SOURCE_2 EXT_SOURCE_3',
 'EXT_SOURCE_2 DAYS_BIRTH',
 'EXT_SOURCE_3^2',
 'EXT_SOURCE_3 DAYS_BIRTH',
 'DAYS_BIRTH^2']

## test train poly feature 作成

In [280]:
# Create a dataframe of the features 
poly_features = pd.DataFrame(poly_features, 
                             columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 
                                                                           'EXT_SOURCE_3', 'DAYS_BIRTH']))

poly_features_test = pd.DataFrame(poly_features_test, 
                             columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 
                                                                           'EXT_SOURCE_3', 'DAYS_BIRTH']))

# Add in the target
poly_features['TARGET'] = poly_target

# Find the correlations with the target
poly_corrs = poly_features.corr()['TARGET'].sort_values()

# Display most negative and most positive
print(poly_corrs.head(10))
print(poly_corrs.tail(5))

EXT_SOURCE_2 EXT_SOURCE_3                -0.194235
EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3   -0.189593
EXT_SOURCE_2^2 EXT_SOURCE_3              -0.176589
EXT_SOURCE_2 EXT_SOURCE_3^2              -0.171729
EXT_SOURCE_1 EXT_SOURCE_2                -0.166538
EXT_SOURCE_1 EXT_SOURCE_3                -0.164933
EXT_SOURCE_2                             -0.160303
EXT_SOURCE_3                             -0.157397
EXT_SOURCE_1 EXT_SOURCE_2^2              -0.156791
EXT_SOURCE_1 EXT_SOURCE_3^2              -0.151139
Name: TARGET, dtype: float64
EXT_SOURCE_1 EXT_SOURCE_2 DAYS_BIRTH    0.155847
EXT_SOURCE_2 DAYS_BIRTH                 0.156879
EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH    0.181417
TARGET                                  1.000000
1                                            NaN
Name: TARGET, dtype: float64


In [281]:
poly_features_test

Unnamed: 0,1,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH,EXT_SOURCE_1^2,EXT_SOURCE_1 EXT_SOURCE_2,EXT_SOURCE_1 EXT_SOURCE_3,EXT_SOURCE_1 DAYS_BIRTH,EXT_SOURCE_2^2,...,EXT_SOURCE_2^3,EXT_SOURCE_2^2 EXT_SOURCE_3,EXT_SOURCE_2^2 DAYS_BIRTH,EXT_SOURCE_2 EXT_SOURCE_3^2,EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH,EXT_SOURCE_2 DAYS_BIRTH^2,EXT_SOURCE_3^3,EXT_SOURCE_3^2 DAYS_BIRTH,EXT_SOURCE_3 DAYS_BIRTH^2,DAYS_BIRTH^3
0,1.0,0.752614,0.789654,0.159520,-19241.0,0.566429,0.594305,0.120057,-14481.055414,0.623554,...,0.492392,0.099469,-11997.802403,0.020094,-2423.698322,2.923427e+08,0.004059,-489.615795,5.905670e+07,-7.123328e+12
1,1.0,0.564990,0.291656,0.432962,-18064.0,0.319214,0.164783,0.244619,-10205.983005,0.085063,...,0.024809,0.036829,-1536.577117,0.054673,-2281.043619,9.516956e+07,0.081161,-3386.201665,1.412789e+08,-5.894429e+12
2,1.0,0.501180,0.699787,0.610991,-20038.0,0.251181,0.350719,0.306217,-10042.641046,0.489702,...,0.342687,0.299203,-9812.640816,0.261238,-8567.521115,2.809794e+08,0.228089,-7480.393855,2.453261e+08,-8.045687e+12
3,1.0,0.525734,0.509677,0.612704,-13976.0,0.276396,0.267955,0.322119,-7347.658072,0.259771,...,0.132399,0.159163,-3630.555667,0.191336,-4364.443591,9.955450e+07,0.230013,-5246.681115,1.196786e+08,-2.729912e+12
4,1.0,0.202145,0.425687,0.500106,-13040.0,0.040863,0.086051,0.101094,-2635.970697,0.181210,...,0.077139,0.090624,-2362.974127,0.106467,-2776.067480,7.238455e+07,0.125079,-3261.377501,8.503876e+07,-2.217342e+12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,1.0,0.501180,0.648575,0.643026,-19970.0,0.251181,0.325053,0.322271,-10008.560818,0.420649,...,0.272823,0.270488,-8400.368742,0.268174,-8328.493414,2.586523e+08,0.265879,-8257.233066,2.564392e+08,-7.964054e+12
48740,1.0,0.501180,0.684596,0.500106,-11186.0,0.251181,0.343106,0.250643,-5606.197362,0.468671,...,0.320850,0.234385,-5242.555692,0.171221,-3829.752193,8.566112e+07,0.125079,-2797.681651,6.257651e+07,-1.399666e+12
48741,1.0,0.733503,0.632770,0.283712,-15922.0,0.538027,0.464139,0.208104,-11678.842724,0.400397,...,0.253359,0.113597,-6375.125880,0.050933,-2858.384957,1.604135e+08,0.022837,-1281.600508,7.192382e+07,-4.036388e+12
48742,1.0,0.373090,0.445701,0.595456,-13968.0,0.139196,0.166287,0.222159,-5211.322249,0.198649,...,0.088538,0.118287,-2774.734348,0.158031,-3707.043157,8.695850e+07,0.211130,-4952.607075,1.161765e+08,-2.725227e+12


In [278]:
poly_features

Unnamed: 0,1,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH,EXT_SOURCE_1^2,EXT_SOURCE_1 EXT_SOURCE_2,EXT_SOURCE_1 EXT_SOURCE_3,EXT_SOURCE_1 DAYS_BIRTH,EXT_SOURCE_2^2,...,EXT_SOURCE_2^2 EXT_SOURCE_3,EXT_SOURCE_2^2 DAYS_BIRTH,EXT_SOURCE_2 EXT_SOURCE_3^2,EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH,EXT_SOURCE_2 DAYS_BIRTH^2,EXT_SOURCE_3^3,EXT_SOURCE_3^2 DAYS_BIRTH,EXT_SOURCE_3 DAYS_BIRTH^2,DAYS_BIRTH^3,TARGET
0,1.0,0.083037,0.262949,0.139376,-9461.0,0.006895,0.021834,0.011573,-785.612748,0.069142,...,0.009637,-654.152107,0.005108,-346.733022,2.353667e+07,0.002707,-183.785678,1.247560e+07,-8.468590e+11,1
1,1.0,0.311267,0.622246,0.510853,-16765.0,0.096887,0.193685,0.159012,-5218.396475,0.387190,...,0.197797,-6491.237078,0.162388,-5329.192190,1.748916e+08,0.133318,-4375.173647,1.435830e+08,-4.712058e+12,0
2,1.0,0.502130,0.555912,0.729567,-19046.0,0.252134,0.279140,0.366337,-9563.564279,0.309038,...,0.225464,-5885.942404,0.295894,-7724.580288,2.016572e+08,0.388325,-10137.567875,2.646504e+08,-6.908939e+12,0
3,1.0,0.502130,0.650442,0.510853,-19005.0,0.252134,0.326606,0.256514,-9542.976957,0.423074,...,0.216129,-8040.528832,0.169746,-6314.981929,2.349331e+08,0.133318,-4959.747997,1.845150e+08,-6.864416e+12,0
4,1.0,0.502130,0.322738,0.510853,-19932.0,0.252134,0.162057,0.256514,-10008.451286,0.104160,...,0.053210,-2076.117157,0.084225,-3286.224555,1.282190e+08,0.133318,-5201.667828,2.029540e+08,-7.918677e+12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,1.0,0.145570,0.681632,0.510853,-9327.0,0.021191,0.099226,0.074365,-1357.735625,0.464623,...,0.237354,-4333.535804,0.177886,-3247.790830,5.929720e+07,0.133318,-2434.073642,4.444059e+07,-8.113830e+11,0
307507,1.0,0.502130,0.115992,0.510853,-20775.0,0.252134,0.058243,0.256514,-10431.746713,0.013454,...,0.006873,-279.510194,0.030271,-1231.020288,5.006225e+07,0.133318,-5421.666121,2.204844e+08,-8.966503e+12,0
307508,1.0,0.744026,0.535722,0.218859,-14966.0,0.553575,0.398591,0.162837,-11135.099105,0.286998,...,0.062812,-4295.209004,0.025661,-1754.727146,1.199916e+08,0.010483,-716.860892,4.902031e+07,-3.352102e+12,0
307509,1.0,0.502130,0.514163,0.661024,-11961.0,0.252134,0.258176,0.331920,-6005.974605,0.264363,...,0.174750,-3162.050698,0.224665,-4065.229651,7.355897e+07,0.288836,-5226.384299,9.456968e+07,-1.711207e+12,1


In [350]:
print('train_file\n ' + str(X_filled.shape))
print('test_file\n' + str(test_filled.shape))
print('add features of train\n' + str(poly_features.shape))
print('add featrers of test\n' + str(poly_features_test.shape))

train_file
 (307511, 242)
test_file
(48744, 242)
add features of train
(307511, 36)
add featrers of test
(48744, 35)


 ## 追加特徴量と合体

In [284]:
final_train = pd.concat([X_filled, poly_features], 1)
final_test = pd.concat([test_filled, poly_features_test], 1)

In [285]:
final_train.shape

(307511, 278)

In [287]:
final_test.shape

(48744, 277)

## 欠損の確認

In [291]:
print(final_train.isnull().sum())
print(final_test.isnull().sum())

SK_ID_CURR                   0
CNT_CHILDREN                 0
AMT_INCOME_TOTAL             0
AMT_CREDIT                   0
AMT_ANNUITY                  0
                            ..
EXT_SOURCE_3^3               0
EXT_SOURCE_3^2 DAYS_BIRTH    0
EXT_SOURCE_3 DAYS_BIRTH^2    0
DAYS_BIRTH^3                 0
TARGET                       0
Length: 278, dtype: int64
SK_ID_CURR                   0
CNT_CHILDREN                 0
AMT_INCOME_TOTAL             0
AMT_CREDIT                   0
AMT_ANNUITY                  0
                            ..
EXT_SOURCE_2 DAYS_BIRTH^2    0
EXT_SOURCE_3^3               0
EXT_SOURCE_3^2 DAYS_BIRTH    0
EXT_SOURCE_3 DAYS_BIRTH^2    0
DAYS_BIRTH^3                 0
Length: 277, dtype: int64


## train testを列揃える

In [293]:
print(final_train.columns.difference(final_test.columns))

Index(['TARGET'], dtype='object')


In [295]:
final_train_X = final_train.drop(columns = ['TARGET'])
final_train_y = train_y

In [352]:
print('X_train' + str(final_train_X.shape) + '  ' + 'y_train' + str(final_train_y.shape ))

X_train(307511, 277)  y_train(307511,)


## 普通の分割(train, valid)

In [353]:
from sklearn.model_selection import train_test_split

f_train_X, f_valid_X, f_train_y, f_valid_y = train_test_split(
         final_train_X, final_train_y, test_size=0.3, random_state=42)

## lgbm で予測してみる。（パラはカーネルのコピ）

In [299]:
from lightgbm import LGBMClassifier
lgb = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

lgb.fit(f_train_X, f_train_y)

LGBMClassifier(boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.9497036, importance_type='split',
               learning_rate=0.02, max_depth=8, min_child_samples=20,
               min_child_weight=39.3259775, min_split_gain=0.0222415,
               n_estimators=10000, n_jobs=-1, nthread=4, num_leaves=34,
               objective=None, random_state=None, reg_alpha=0.041545473,
               reg_lambda=0.0735294, silent=-1, subsample=0.8715623,
               subsample_for_bin=200000, subsample_freq=0, verbose=-1)

In [301]:
f_train_pred = lgb.predict_proba(f_train_X)
# f_valid_pred = lgb.predict_proba(f_valid_X)
# import numpy as np
# from sklearn.metrics import roc_auc_score

# print('train_roc \n' + str(roc_auc_score(f_train_y, f_train_pred[:,1])))
# print('valid_roc \n' + str(roc_auc_score(f_valid_y, f_valid_pred[:,1])))


In [303]:
f_valid_pred = lgb.predict_proba(f_valid_X)
import numpy as np
from sklearn.metrics import roc_auc_score

print('train_roc \n' + str(roc_auc_score(f_train_y, f_train_pred[:,1])))
print('valid_roc \n' + str(roc_auc_score(f_valid_y, f_valid_pred[:,1])))

train_roc 
0.9748507051255825
valid_roc 
0.7421219632729228


## submit

In [304]:
f_test_pred = lgb.predict(final_test)

In [305]:
f_test_pred = lgb.predict(final_test)
# Submission dataframe
submit = test[['SK_ID_CURR']]
submit['TARGET'] = f_test_pred

submit.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0
1,100005,0
2,100013,0
3,100028,0
4,100038,0


In [306]:
# Save the submission to a csv file
submit.to_csv('lgb_baseline.csv', index = False)

## kaggle score of lgbm 1

In [354]:
0.50689

0.50689

##  lgbm 2

In [307]:
from lightgbm import LGBMClassifier
lgb = LGBMClassifier(
            nthread=4,
            n_estimators=100)

lgb.fit(f_train_X, f_train_y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, nthread=4, num_leaves=31,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [308]:
f_train_pred = lgb.predict_proba(f_train_X)
f_valid_pred = lgb.predict_proba(f_valid_X)
import numpy as np
from sklearn.metrics import roc_auc_score

print('train_roc \n' + str(roc_auc_score(f_train_y, f_train_pred[:,1])))
print('valid_roc \n' + str(roc_auc_score(f_valid_y, f_valid_pred[:,1])))

train_roc 
0.8110193112721575
valid_roc 
0.7565460786645102


In [343]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

def cross(function):
    X =  final_train_X
    y =  final_train_y
    kf = KFold(n_splits=4, shuffle = True, random_state = 1)
    kf.get_n_splits(X)
    cv = kf


    clf = function
    scores = cross_val_score(clf, X, y, cv=cv, scoring='roc_auc')
    print(scores)

In [328]:
# def lgb_function(est = 100)
lgb = LGBMClassifier(
            max_depth = 8,
            learning_rate=0.05,
            num_leaves=30,)

lgb.fit(f_train_X, f_train_y)

f_train_pred = lgb.predict_proba(f_train_X)
f_valid_pred = lgb.predict_proba(f_valid_X)
import numpy as np
from sklearn.metrics import roc_auc_score

print('train_roc \n' + str(roc_auc_score(f_train_y, f_train_pred[:,1])))
print('valid_roc \n' + str(roc_auc_score(f_valid_y, f_valid_pred[:,1])))

train_roc 
0.7795640415785626
valid_roc 
0.7546147221029142


In [344]:
cross(lgb)

[0.75664562 0.75423454 0.75336798 0.75362216]


In [329]:
clf = lgb
f_test_pred = clf.predict(final_test)
# Submission dataframe
submit = test[['SK_ID_CURR']]
submit['TARGET'] = f_test_pred
# Save the submission to a csv file
submit.to_csv('lgb3_baseline.csv', index = False)

## kaggle score of lgbm 2

In [330]:
0.50066

0.50066

In [325]:
from sklearn.ensemble import RandomForestClassifier

# Make the random forest classifier
rf = RandomForestClassifier(n_estimators = 100, max_depth=11, random_state = 50, verbose = 1, n_jobs = -1)

rf.fit(f_train_X, f_train_y)

f_train_pred = rf.predict_proba(f_train_X)
f_valid_pred = rf.predict_proba(f_valid_X)
import numpy as np
from sklearn.metrics import roc_auc_score

print('train_roc \n' + str(roc_auc_score(f_train_y, f_train_pred[:,1])))
print('valid_roc \n' + str(roc_auc_score(f_valid_y, f_valid_pred[:,1])))


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   59.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.9s finished


train_roc 
0.8225263592419438
valid_roc 
0.7363314138732366


In [345]:
cross(rf)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   44.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]:

[0.7395778  0.73334163 0.73510907 0.73465504]


In [331]:
clf = rf
f_test_pred = clf.predict(final_test)
# Submission dataframe
submit = test[['SK_ID_CURR']]
submit['TARGET'] = f_test_pred
# Save the submission to a csv file
submit.to_csv('rf_baseline.csv', index = False)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.7s finished


## kaggle score of random_forest

In [None]:
0.50133

In [339]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Make the model with the specified regularization parameter
log_reg = LogisticRegression(C = 1)
log_reg.fit(f_train_X, f_train_y)

f_train_pred = log_reg.predict_proba(f_train_X)
f_valid_pred = log_reg.predict_proba(f_valid_X)
import numpy as np
from sklearn.metrics import roc_auc_score

print('train_roc \n' + str(roc_auc_score(f_train_y, f_train_pred[:,1])))
print('valid_roc \n' + str(roc_auc_score(f_valid_y, f_valid_pred[:,1])))

train_roc 
0.5831266464266416
valid_roc 
0.5827508545424509


In [346]:
cross(log_reg)

[0.58476108 0.58154429 0.5842632  0.58147453]


In [340]:
clf = log_reg
f_test_pred = clf.predict(final_test)
# Submission dataframe
submit = test[['SK_ID_CURR']]
submit['TARGET'] = f_test_pred
# Save the submission to a csv file
submit.to_csv('log_reg_baseline.csv', index = False)

## kaggle score of log_reg

In [None]:
0.50000

## submitすると急激に下がる。　汎用性が確保できていないと思われる。　　　
## 新たな特徴量を作り、　汎用性をあげる必要あり