In [12]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

In [13]:
train = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)

X_train = train.loc[:, train.columns != 'claim']
y_train = train.loc[:, 'claim']

In [14]:
train.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,86.489,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,9953.6,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,15827.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1
3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,-36.837,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1
4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,144.12,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,,0.23049,1


In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 957919 entries, 0 to 957918
Columns: 119 entries, f1 to claim
dtypes: float64(118), int64(1)
memory usage: 877.0 MB


In [16]:
train.describe()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
count,942672.0,942729.0,942428.0,942359.0,942514.0,942398.0,942415.0,942546.0,942670.0,942696.0,...,942554.0,942420.0,942509.0,942686.0,942481.0,942360.0,942330.0,942512.0,942707.0,957919.0
mean,0.090201,0.345964,4068.744207,0.201214,0.304869,-0.071458,1620.843815,377164.2,1806054000000000.0,5323.442367,...,-19.926398,2.07453,23.885245,1.748777,63152.97354,1.208876,4.276905e+16,3959.204669,0.559267,0.498492
std,0.043564,0.146251,6415.82944,0.21251,0.145343,2.123777,1276.281403,345432.5,2335204000000000.0,10068.380032,...,18.578439,0.895793,45.58136,10.088848,92435.016241,0.114959,6.732441e+16,3155.991777,0.408426,0.499998
min,-0.14991,-0.019044,-9421.7,-0.082122,-0.00699,-12.791,-224.8,-29843.0,-1153300000000000.0,-26404.0,...,-105.86,0.27704,-27.691,-26.589,-81977.0,0.90527,-8944400000000000.0,-415.24,-0.15124,0.0
25%,0.070227,0.28305,418.43,0.035086,0.24052,-1.1207,481.545,91209.0,11531000000000.0,75.87675,...,-28.812,1.4877,-0.62888,-4.473975,2443.2,1.1468,232110000000000.0,1306.2,0.27656,0.0
50%,0.090135,0.3891,1279.5,0.137,0.32779,-0.38011,1446.1,289670.0,504305000000000.0,1073.2,...,-14.636,1.6621,1.7277,0.88571,19479.0,1.1772,1.3275e+16,3228.0,0.47344,0.0
75%,0.1165,0.45845,4444.4,0.2971,0.41283,0.92194,2495.9,560560.0,3103100000000000.0,5693.2,...,-5.3253,2.522325,18.991,6.840775,88488.0,1.242,5.2787e+16,6137.9,0.74621,1.0
max,0.41517,0.51899,39544.0,1.3199,0.55475,11.202,5426.6,1913700.0,1.0424e+16,85622.0,...,1.6134,4.5659,217.84,47.757,526050.0,1.8867,3.2499e+17,13151.0,2.7436,1.0


In [17]:
train.claim.value_counts()

0    480404
1    477515
Name: claim, dtype: int64

In [18]:
train.loc[:, train.columns != 'claim'].isna().sum().describe()

count      118.000000
mean     15430.355932
std        112.771725
min      15168.000000
25%      15363.500000
50%      15441.000000
75%      15508.500000
max      15678.000000
dtype: float64

In [19]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(X_train)
X_train.loc[:, :] = imp.transform(X_train)
test.loc[:, :] = imp.transform(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [20]:
sc = StandardScaler()
sc.fit(X_train)
X_train.loc[:, :] = sc.transform(X_train)
test.loc[:, :] = sc.transform(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [23]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf_importances = pd.DataFrame()
predictions = [0] * len(test)

features = X_train.columns
train = pd.concat([X_train, y_train], axis=1)
for fold, (train_idx, val_idx) in enumerate(skf.split(X=train.loc[:, train.columns != 'claim'], y=train['claim'])):
    print(f"###### SKFold {fold} ######")
    X_train, y_train = train[features].iloc[train_idx], train['claim'].iloc[train_idx]
    X_valid, y_valid = train[features].iloc[val_idx], train['claim'].iloc[val_idx]
    
    model = RandomForestClassifier(n_estimators=500, max_depth=16, n_jobs=-1, random_state=42)
    start = time.time()
    model.fit(X_train, y_train)
    elapsed = time.time() - start
    
    fi_tmp = pd.DataFrame()
    fi_tmp['feature'] = features
    fi_tmp['importance'] = model.feature_importances_
    fi_tmp['fold'] = fold
    rf_importances = rf_importances.append(fi_tmp)

    y_pred = model.predict(X_valid)
    predictions += model.predict_proba(test)[:, -1]
    auc = roc_auc_score(y_valid, y_pred)
    print(f"fold {fold} - rf roc auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

predictions /= 5
# predictions = np.where(predictions > 0.5, 1, 0)
test['claim'] = predictions
test.reset_index(inplace=True)
test[['id', 'claim']].to_csv('predictions.csv', index=False)

###### SKFold 0 ######


KeyboardInterrupt: 

In [24]:
rf_importances.groupby(['feature', 'fold']).mean()

KeyError: 'feature'