In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import random_projection
import lightgbm as lgb
from math import exp, log
path = '/home/ec2-user/Sanjay/main/santander/'

In [3]:
X_train = pd.read_csv(path + 'codes/data/' + '1_raw_data_train.csv')
X_test = pd.read_csv(path + 'codes/data/' + '1_raw_data_test.csv')

test_df = pd.read_csv(path + 'test.csv',  usecols=['ID'])
train_df = pd.read_csv(path + 'train.csv', usecols=['target'])
y_train = train_df[['target']]['target'].map(lambda x : log(1+x))
X_train.shape, X_test.shape, train_df.shape

((4459, 4730), (49343, 4730), (4459, 1))

In [4]:
Random_Projection_train = pd.read_csv(path + 'codes/data/' + '2_Random_Projection_train.csv')
Random_Projection_test = pd.read_csv(path + 'codes/data/' + '2_Random_Projection_test.csv')

kmeans_train = pd.read_csv(path + 'codes/data/' + '3_kmeans_train.csv')
kmeans_test = pd.read_csv(path + 'codes/data/' + '3_kmeans_test.csv')

aggr_train = pd.read_csv(path + 'codes/data/' + '4_aggr_train.csv')
aggr_test = pd.read_csv(path + 'codes/data/' + '4_aggr_test.csv')

pca_train = pd.read_csv(path + 'codes/data/' + '5_pca_train.csv')
pca_test = pd.read_csv(path + 'codes/data/' + '5_pca_test.csv')

print(Random_Projection_train.shape, Random_Projection_test.shape, 
      kmeans_train.shape, kmeans_test.shape,
      aggr_train.shape, aggr_test.shape, 
      pca_train.shape, pca_test.shape )

(4459, 50) (49343, 50) (4459, 9) (49343, 9) (4459, 9) (49343, 9) (4459, 20) (49343, 20)


In [13]:
train = X_train.join(Random_Projection_train).join(kmeans_train).join(aggr_train).join(pca_train).fillna(0)
test = X_test.join(Random_Projection_test).join(kmeans_test).join(aggr_test).join(pca_test).fillna(0)
train.shape, test.shape

((4459, 4818), (49343, 4818))

### Weak RandomForestRegressor

In [6]:
from sklearn import model_selection
from sklearn import ensemble
NUM_OF_FEATURES = 4500
def rmsle(y, pred):
    return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(pred), 2)))

x1, x2, y1, y2 = model_selection.train_test_split(
    train, y_train.values, test_size=0.20, random_state=5)
model = ensemble.RandomForestRegressor(n_jobs=-1, random_state=7)
model.fit(x1, y1)
print(rmsle(y2, model.predict(x2)))

col = pd.DataFrame({'importance': model.feature_importances_, 'feature': train.columns}).sort_values(
    by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values
train = train[col]
test = test[col]
train.shape

0.09410954790021511


(4459, 4500)

### Kolmogorov-Smirnov test

In [14]:
from scipy.stats import ks_2samp
THRESHOLD_P_VALUE = 0.01 #need tuned
THRESHOLD_STATISTIC = 0.2 #need tuned
diff_cols = []
for col in train.columns:
    statistic, pvalue = ks_2samp(train[col].values, test[col].values)
    if pvalue <= THRESHOLD_P_VALUE and np.abs(statistic) > THRESHOLD_STATISTIC:
        diff_cols.append(col)
for col in diff_cols:
    if col in train.columns:
        train.drop(col, axis=1, inplace=True)
        test.drop(col, axis=1, inplace=True)
train.shape

(4459, 4777)

#### LGBM Models

In [15]:
parameters = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 60,
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_seed' : 42,
    'bagging_freq': 5,
    'verbose': 50
}

In [16]:
def lgbm_model(X_tr, y_tr, X_va, y_va, test, fold):
    tr_data = lgb.Dataset(X_tr, label=y_tr)
    va_data = lgb.Dataset(X_va, label=y_va)
    
    model = lgb.train(parameters,
                      tr_data,
                      valid_sets=va_data,
                      num_boost_round=4000,
                      early_stopping_rounds=120,
                      verbose_eval=300)
    
    y_pred = model.predict(X_va)
    valid = pd.DataFrame(y_pred, columns=['prob' + str(fold)])
    test_pred = model.predict(test)
    test = pd.DataFrame(test_pred, columns=['prob' + str(fold)])
    return test

In [17]:
from sklearn.model_selection import KFold
X = train
y = y_train
kf = KFold(n_splits=10, shuffle=True, random_state=2017)
kf.get_n_splits(X), kf.split(X)
fold=1
valid_final = test_df[['ID']]
for train_idx, test_idx in kf.split(X):
    print("FOLD ", fold, " ----------------------------------------------------------------------------------")
    X_tr = X[X.index.isin(train_idx)]
    y_tr = y[y.index.isin(train_idx)]
    X_va = X[X.index.isin(test_idx)]
    y_va = y[y.index.isin(test_idx)]
    print(X_tr.shape, y_tr.shape, X_va.shape, y_va.shape)
    valid = lgbm_model(X_tr, y_tr, X_va, y_va, test, fold)
    valid_final = valid_final.join(valid)
    print(valid.shape, valid_final.shape)
    fold = fold+1

FOLD  1  ----------------------------------------------------------------------------------
(4013, 4777) (4013,) (446, 4777) (446,)
Training until validation scores don't improve for 120 rounds.
[300]	valid_0's rmse: 1.38206
[600]	valid_0's rmse: 1.35358
Early stopping, best iteration is:
[771]	valid_0's rmse: 1.35061
(49343, 1) (49343, 2)
FOLD  2  ----------------------------------------------------------------------------------
(4013, 4777) (4013,) (446, 4777) (446,)
Training until validation scores don't improve for 120 rounds.
[300]	valid_0's rmse: 1.4491
[600]	valid_0's rmse: 1.41166
Early stopping, best iteration is:
[770]	valid_0's rmse: 1.41053
(49343, 1) (49343, 3)
FOLD  3  ----------------------------------------------------------------------------------
(4013, 4777) (4013,) (446, 4777) (446,)
Training until validation scores don't improve for 120 rounds.
[300]	valid_0's rmse: 1.44821
[600]	valid_0's rmse: 1.37789
[900]	valid_0's rmse: 1.36471
Early stopping, best iteration i

In [11]:
valid_final['target'] = valid_final.iloc[:,1:fold].mean(axis=1)
valid_final['target'] = valid_final['target'].map(lambda x : exp(x)-1)

In [12]:
valid_final = valid_final.iloc[:49342]
valid_final.to_csv(path + 'submit/' + 'lgbm_10fold_1.csv', index=False)
valid_final[['ID', 'target']].to_csv(path + 'submit/' + 'lgbm_1.csv', index=False)