In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import random_projection
import lightgbm as lgb
from math import exp, log
path = '/home/ec2-user/Sanjay/main/santander/'

In [32]:
train_df = pd.read_csv(path + 'train.csv')
test_df = pd.read_csv(path + 'test.csv')

X_train = train_df.drop(["ID", "target"], axis=1)
y_train = train_df[['target']]['target'].map(lambda x : log(1+x))

X_test = test_df.drop(["ID"], axis=1)
ntrain = len(X_train)
ntest = len(X_test)
X_train.shape, X_test.shape

((4459, 4991), (49343, 4991))

In [33]:
print("Preparetion")
colsToRemove = []
for col in X_train.columns:
    if X_train[col].std() == 0: 
        colsToRemove.append(col)
    
X_train.drop(colsToRemove, axis=1, inplace=True)
X_test.drop(colsToRemove, axis=1, inplace=True)

X_train.shape, X_test.shape, len(colsToRemove)

Preparetion


((4459, 4735), (49343, 4735), 256)

In [34]:
colsToRemove = []
colsScaned = []
dupList = {}
columns = X_train.columns

for i in range(len(columns)-1):
    v = X_train[columns[i]].values
    dupCols = []
    for j in range(i+1,len(columns)):
        if np.array_equal(v, X_train[columns[j]].values):
            colsToRemove.append(columns[j])
            if columns[j] not in colsScaned:
                dupCols.append(columns[j]) 
                colsScaned.append(columns[j])
                dupList[columns[i]] = dupCols
                
X_train.drop(colsToRemove, axis=1, inplace=True) 
X_test.drop(colsToRemove, axis=1, inplace=True)
X_train.shape, X_test.shape, len(dupList)

((4459, 4730), (49343, 4730), 4)

In [35]:
weight = ((X_train != 0).sum()/len(X_train)).values

In [36]:
tmp_train = X_train[X_train!=0]
tmp_test = X_test[X_test!=0]
tmp = pd.concat([X_train,X_test]).fillna(0)
tmp_train.shape, tmp_test.shape, tmp.shape

((4459, 4730), (49343, 4730), (53802, 4730))

In [37]:
X_train["weight_count"] = (tmp_train*weight).sum(axis=1)
X_test["weight_count"] = (tmp_test*weight).sum(axis=1)
X_train["count_not0"] = (X_train != 0).sum(axis=1)
X_test["count_not0"] = (X_test != 0).sum(axis=1)
X_train["sum"] = X_train.sum(axis=1)
X_test["sum"] = X_test.sum(axis=1)
X_train["var"] = tmp_train.var(axis=1)
X_test["var"] = tmp_test.var(axis=1)
X_train["mean"] = tmp_train.mean(axis=1)
X_test["mean"] = tmp_test.mean(axis=1)
X_train["std"] = tmp_train.std(axis=1)
X_test["std"] = tmp_test.std(axis=1)
X_train["max"] = tmp_train.max(axis=1)
X_test["max"] = tmp_test.max(axis=1)
X_train["min"] = tmp_train.min(axis=1)
X_test["min"] = tmp_test.min(axis=1)

In [38]:
del(tmp_train)
del(tmp_test)

In [39]:
n_com = 50
transformer = random_projection.SparseRandomProjection(n_components = n_com)

In [40]:
RP = transformer.fit_transform(tmp)
rp = pd.DataFrame(RP)

In [41]:
columns = ["RandomProjection{}".format(i) for i in range(n_com)]
rp.columns = columns

In [42]:
rp_train = rp[:ntrain]
rp_test = rp[ntrain:]
rp_test.index = X_test.index

In [43]:
X_train = pd.concat([X_train,rp_train],axis=1)
X_test = pd.concat([X_test,rp_test],axis=1)
X_train.shape, X_test.shape

((4459, 4788), (49343, 4788))

### 5fold CV LightGBM

In [44]:
parameters = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 60,
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_seed' : 42,
    'bagging_freq': 5,
    'verbose': 50
}

In [45]:
def lgbm_model(X_tr, y_tr, X_va, y_va, test, fold):
    tr_data = lgb.Dataset(X_tr, label=y_tr)
    va_data = lgb.Dataset(X_va, label=y_va)
    
    model = lgb.train(parameters,
                      tr_data,
                      valid_sets=va_data,
                      num_boost_round=4000,
                      early_stopping_rounds=120,
                      verbose_eval=300)
    
    y_pred = model.predict(X_va)
    valid = pd.DataFrame(y_pred, columns=['prob' + str(fold)])
    test_pred = model.predict(test)
    test = pd.DataFrame(test_pred, columns=['prob' + str(fold)])
    return test

In [46]:
from sklearn.model_selection import KFold
X = X_train
y = y_train
kf = KFold(n_splits=10, shuffle=True, random_state=2017)
kf.get_n_splits(X), kf.split(X)
fold=1
valid_final = test_df[['ID']]
for train_idx, test_idx in kf.split(X):
    print("FOLD ", fold, " ----------------------------------------------------------------------------------")
    X_tr = X[X.index.isin(train_idx)]
    y_tr = y[y.index.isin(train_idx)]
    X_va = X[X.index.isin(test_idx)]
    y_va = y[y.index.isin(test_idx)]
    print(X_tr.shape, y_tr.shape, X_va.shape, y_va.shape)
    valid = lgbm_model(X_tr, y_tr, X_va, y_va, X_test, fold)
    valid_final = valid_final.join(valid)
    print(valid.shape, valid_final.shape)
    fold = fold+1

FOLD  1  ----------------------------------------------------------------------------------
(4013, 4788) (4013,) (446, 4788) (446,)
Training until validation scores don't improve for 120 rounds.
[300]	valid_0's rmse: 1.35247
Early stopping, best iteration is:
[429]	valid_0's rmse: 1.34552
(49343, 1) (49343, 2)
FOLD  2  ----------------------------------------------------------------------------------
(4013, 4788) (4013,) (446, 4788) (446,)
Training until validation scores don't improve for 120 rounds.
[300]	valid_0's rmse: 1.36723
[600]	valid_0's rmse: 1.33795
Early stopping, best iteration is:
[575]	valid_0's rmse: 1.33765
(49343, 1) (49343, 3)
FOLD  3  ----------------------------------------------------------------------------------
(4013, 4788) (4013,) (446, 4788) (446,)
Training until validation scores don't improve for 120 rounds.
[300]	valid_0's rmse: 1.37197
[600]	valid_0's rmse: 1.32835
[900]	valid_0's rmse: 1.32394
Early stopping, best iteration is:
[849]	valid_0's rmse: 1.32

In [47]:
valid_final['target'] = valid_final.iloc[:,1:fold].mean(axis=1)
valid_final['target'] = valid_final['target'].map(lambda x : exp(x)-1)

In [48]:
valid_final = valid_final.iloc[:49342]
valid_final.to_csv(path + 'submit/' + 'lgbm_Random_Projection_Aggregate_10_fold1.csv', index=False)
valid_final[['ID', 'target']].to_csv(path + 'submit/' + 'lgbm_Random_Projection_Aggregate1.csv', index=False)

### Temp

In [60]:
data = pd.read_csv(path + 'submit/' + 'lgbm_Random_Projection_Aggregate_10_fold1.csv')

In [61]:
data = data[['ID', 'prob4']].rename(columns={'prob4':'target'})
data['target'] = data['target'].map(lambda x : exp(x)-1)

In [63]:
data[['ID', 'target']].to_csv(path + 'submit/' + 'temp.csv', index=False)