In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import random_projection
import lightgbm as lgb
from math import exp, log
from catboost import CatBoostRegressor
path = '/home/ec2-user/Sanjay/main/santander/'

In [3]:
train_df = pd.read_csv(path + 'train.csv')
test_df = pd.read_csv(path + 'test.csv')

X_train = train_df.drop(["ID", "target"], axis=1)
y_train = train_df[['target']]['target'].map(lambda x : log(1+x))

X_test = test_df.drop(["ID"], axis=1)
ntrain = len(X_train)
ntest = len(X_test)
X_train.shape, X_test.shape

((4459, 4991), (49343, 4991))

In [4]:
print("Preparetion")
colsToRemove = []
for col in X_train.columns:
    if X_train[col].std() == 0: 
        colsToRemove.append(col)
    
X_train.drop(colsToRemove, axis=1, inplace=True)
X_test.drop(colsToRemove, axis=1, inplace=True)

X_train.shape, X_test.shape, len(colsToRemove)

Preparetion


((4459, 4735), (49343, 4735), 256)

In [5]:
colsToRemove = []
colsScaned = []
dupList = {}
columns = X_train.columns

for i in range(len(columns)-1):
    v = X_train[columns[i]].values
    dupCols = []
    for j in range(i+1,len(columns)):
        if np.array_equal(v, X_train[columns[j]].values):
            colsToRemove.append(columns[j])
            if columns[j] not in colsScaned:
                dupCols.append(columns[j]) 
                colsScaned.append(columns[j])
                dupList[columns[i]] = dupCols
                
X_train.drop(colsToRemove, axis=1, inplace=True) 
X_test.drop(colsToRemove, axis=1, inplace=True)
X_train.shape, X_test.shape, len(dupList)

((4459, 4730), (49343, 4730), 4)

In [6]:
weight = ((X_train != 0).sum()/len(X_train)).values

In [7]:
tmp_train = X_train[X_train!=0]
tmp_test = X_test[X_test!=0]
tmp = pd.concat([X_train,X_test]).fillna(0)
tmp_train.shape, tmp_test.shape, tmp.shape

((4459, 4730), (49343, 4730), (53802, 4730))

In [8]:
X_train["weight_count"] = (tmp_train*weight).sum(axis=1)
X_test["weight_count"] = (tmp_test*weight).sum(axis=1)
X_train["count_not0"] = (X_train != 0).sum(axis=1)
X_test["count_not0"] = (X_test != 0).sum(axis=1)
X_train["sum"] = X_train.sum(axis=1)
X_test["sum"] = X_test.sum(axis=1)
X_train["var"] = tmp_train.var(axis=1)
X_test["var"] = tmp_test.var(axis=1)
X_train["mean"] = tmp_train.mean(axis=1)
X_test["mean"] = tmp_test.mean(axis=1)
X_train["std"] = tmp_train.std(axis=1)
X_test["std"] = tmp_test.std(axis=1)
X_train["max"] = tmp_train.max(axis=1)
X_test["max"] = tmp_test.max(axis=1)
X_train["min"] = tmp_train.min(axis=1)
X_test["min"] = tmp_test.min(axis=1)

In [9]:
del(tmp_train)
del(tmp_test)

In [10]:
n_com = 100
transformer = random_projection.SparseRandomProjection(n_components = n_com)
RP = transformer.fit_transform(tmp)
rp = pd.DataFrame(RP)

In [11]:
columns = ["RandomProjection{}".format(i) for i in range(n_com)]
rp.columns = columns
rp_train = rp[:ntrain]
rp_test = rp[ntrain:]
rp_test.index = X_test.index

In [12]:
X_train = pd.concat([X_train,rp_train],axis=1)
X_test = pd.concat([X_test,rp_test],axis=1)
X_train.shape, X_test.shape

((4459, 4838), (49343, 4838))

### 5fold CV CatBoost

In [13]:
model = CatBoostRegressor(iterations=800, 
                          learning_rate=0.05, 
                          depth=12, 
                          l2_leaf_reg=9, 
                          rsm=1, 
                          one_hot_max_size=1, 
                          random_strength=1, 
                          bagging_temperature=1, 
                          thread_count=16, 
                          random_seed=2, 
                          metric_period = 100,
                          loss_function='RMSE')

In [14]:
def lgbm_model(X_tr, y_tr, X_va, y_va, test, fold):

    model.fit(X_tr, y_tr, use_best_model=True, eval_set=(X_va, y_va), verbose=True)
    
    y_pred = model.predict(X_va)
    valid = pd.DataFrame(y_pred, columns=['prob' + str(fold)])
    test_pred = model.predict(test)
    test = pd.DataFrame(test_pred, columns=['prob' + str(fold)])
    return test

In [15]:
from sklearn.model_selection import KFold
X = X_train
y = y_train
kf = KFold(n_splits=5, shuffle=True, random_state=2017)
kf.get_n_splits(X), kf.split(X)
fold=1
valid_final = test_df[['ID']]
for train_idx, test_idx in kf.split(X):
    print("FOLD ", fold, " ----------------------------------------------------------------------------------")
    X_tr = X[X.index.isin(train_idx)]
    y_tr = y[y.index.isin(train_idx)]
    X_va = X[X.index.isin(test_idx)]
    y_va = y[y.index.isin(test_idx)]
    print(X_tr.shape, y_tr.shape, X_va.shape, y_va.shape)
    valid = lgbm_model(X_tr, y_tr, X_va, y_va, X_test, fold)
    valid_final = valid_final.join(valid)
    print(valid.shape, valid_final.shape)
    fold = fold+1

FOLD  1  ----------------------------------------------------------------------------------
(3567, 4838) (3567,) (892, 4838) (892,)
0:	learn: 13.8839842	test: 13.9684641	best: 13.9684641 (0)	total: 8.54s	remaining: 7h 7m 5s
100:	learn: 1.4372076	test: 1.4698139	best: 1.4698139 (100)	total: 13m 53s	remaining: 6h 38m 42s
200:	learn: 1.3017608	test: 1.3999876	best: 1.3999876 (200)	total: 27m 40s	remaining: 6h 25m 21s
300:	learn: 1.2021116	test: 1.3869673	best: 1.3869673 (300)	total: 41m 28s	remaining: 6h 11m 55s
400:	learn: 1.1148224	test: 1.3838947	best: 1.3836313 (379)	total: 55m 20s	remaining: 5h 58m 39s
500:	learn: 1.0283170	test: 1.3807782	best: 1.3804806 (480)	total: 1h 9m 9s	remaining: 5h 44m 58s
600:	learn: 0.9623662	test: 1.3804468	best: 1.3797075 (552)	total: 1h 23m	remaining: 5h 31m 21s
700:	learn: 0.9025148	test: 1.3824082	best: 1.3797075 (552)	total: 1h 36m 53s	remaining: 5h 17m 44s
800:	learn: 0.8488571	test: 1.3828851	best: 1.3797075 (552)	total: 1h 50m 43s	remaining: 5h 3m

KeyboardInterrupt: 

In [None]:
valid_final['target'] = valid_final.iloc[:,1:fold].mean(axis=1)
valid_final['target'] = valid_final['target'].map(lambda x : exp(x)-1)
valid_final = valid_final.iloc[:49342]

In [None]:
valid_final.to_csv(path + 'submit/' + 'catboost_Random_Projection_Aggregate_10_fold.csv', index=False)
valid_final[['ID', 'target']].to_csv(path + 'submit/' + 'catboost_Random_Projection_Aggregate.csv', index=False)

In [None]:
valid_final.head()