## Load Required Libraries

In [1]:
### Import required libraries

import numpy as np
import pandas as pd
import gc
import random
random.seed(2018)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

import lightgbm as lgb
import xgboost as xgb

from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

## Load Train and Test Data

In [2]:
# Read train and test files
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# Prepare Data

In [3]:
X_train = train_df.drop(["ID", "target"], axis=1)
y_train = np.log1p(train_df["target"].values)

X_test = test_df.drop(["ID"], axis=1)

In [4]:
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4991)
Test set size: (49342, 4991)


## Remove Contsant Features

In [5]:
# check and remove constant columns
colsToRemove = []
for col in X_train.columns:
    if X_train[col].std() == 0: 
        colsToRemove.append(col)
        
# remove constant columns in the training set
X_train.drop(colsToRemove, axis=1, inplace=True)

# remove constant columns in the test set
X_test.drop(colsToRemove, axis=1, inplace=True) 

print("Removed `{}` Constant Columns\n".format(len(colsToRemove)))
print(colsToRemove)

Removed `256` Constant Columns

['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a7

In [6]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4735)
Test set size: (49342, 4735)


## Remove Duplicate Columns

In [7]:
%%time
# Other way to drop duplicate columns is to transpose DatFrame and use pandas routine - drop_duplicates. (Thanks Scirpus!!)
# df.T.drop_duplicates().T. However, transposing is a bad idea when working with large DataFrames.
# But this is fine in this case because of small Dataset.

# Check and remove duplicate columns
colsToRemove = []
colsScaned = []
dupList = {}

columns = X_train.columns

for i in range(len(columns)-1):
    v = X_train[columns[i]].values
    dupCols = []
    for j in range(i+1,len(columns)):
        if np.array_equal(v, X_train[columns[j]].values):
            colsToRemove.append(columns[j])
            if columns[j] not in colsScaned:
                dupCols.append(columns[j]) 
                colsScaned.append(columns[j])
                dupList[columns[i]] = dupCols
                
# remove duplicate columns in the training set
X_train.drop(colsToRemove, axis=1, inplace=True) 

# remove duplicate columns in the testing set
X_test.drop(colsToRemove, axis=1, inplace=True)

print("Removed `{}` Duplicate Columns\n".format(len(dupList)))
print(dupList)

Removed `4` Duplicate Columns

{'34ceb0081': ['d60ddde1b'], '8d57e2749': ['acc5b709d', 'f333a5f60'], '168b3e5bc': ['f8d75792f'], 'a765da8bc': ['912836770']}
CPU times: user 3min 46s, sys: 1.4 s, total: 3min 47s
Wall time: 3min 47s


In [8]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4730)
Test set size: (49342, 4730)


## Drop Sparse Data

In [9]:
def drop_sparse(train, test):
    flist = [x for x in train.columns if not x in ['ID','target']]
    for f in flist:
        if len(np.unique(train[f]))<2:
            train.drop(f, axis=1, inplace=True)
            test.drop(f, axis=1, inplace=True)
    return train, test

In [10]:
%%time
X_train, X_test = drop_sparse(X_train, X_test)

CPU times: user 560 ms, sys: 0 ns, total: 560 ms
Wall time: 561 ms


In [11]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4730)
Test set size: (49342, 4730)


# Add Features

## SumZeros

In [12]:
def add_SumZeros(train, test, features):
    flist = [x for x in train.columns if not x in ['ID','target']]
    if 'SumZeros' in features:
        train.insert(1, 'SumZeros', (train[flist] == 0).astype(int).sum(axis=1))
        test.insert(1, 'SumZeros', (test[flist] == 0).astype(int).sum(axis=1))
    flist = [x for x in train.columns if not x in ['ID','target']]

    return train, test

In [13]:
%%time
X_train, X_test = add_SumZeros(X_train, X_test, ['SumZeros'])

CPU times: user 9.74 s, sys: 8.66 s, total: 18.4 s
Wall time: 18.4 s


In [14]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4731)
Test set size: (49342, 4731)


## SumValues

In [15]:
def add_SumValues(train, test, features):
    flist = [x for x in train.columns if not x in ['ID','target']]
    if 'SumValues' in features:
        train.insert(1, 'SumValues', (train[flist] != 0).astype(int).sum(axis=1))
        test.insert(1, 'SumValues', (test[flist] != 0).astype(int).sum(axis=1))
    flist = [x for x in train.columns if not x in ['ID','target']]

    return train, test

In [16]:
%%time
X_train, X_test = add_SumValues(X_train, X_test, ['SumValues'])

CPU times: user 9.35 s, sys: 3.49 s, total: 12.8 s
Wall time: 12.8 s


In [17]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4732)
Test set size: (49342, 4732)


## Other Aggregates

In [18]:
def add_OtherAgg(train, test, features):
    flist = [x for x in train.columns if not x in ['ID','target','SumZeros','SumValues']]
    if 'OtherAgg' in features:
        train['Mean']   = train[flist].mean(axis=1)
        train['Median'] = train[flist].median(axis=1)
        train['Mode']   = train[flist].mode(axis=1)
        train['Max']    = train[flist].max(axis=1)
        train['Var']    = train[flist].var(axis=1)
        train['Std']    = train[flist].std(axis=1)
        
        test['Mean']   = test[flist].mean(axis=1)
        test['Median'] = test[flist].median(axis=1)
        test['Mode']   = test[flist].mode(axis=1)
        test['Max']    = test[flist].max(axis=1)
        test['Var']    = test[flist].var(axis=1)
        test['Std']    = test[flist].std(axis=1)
    flist = [x for x in train.columns if not x in ['ID','target','SumZeros','SumValues']]

    return train, test

In [19]:
%%time
X_train, X_test = add_OtherAgg(X_train, X_test, ['OtherAgg'])

CPU times: user 1min 37s, sys: 21.2 s, total: 1min 58s
Wall time: 1min 58s


In [20]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4738)
Test set size: (49342, 4738)


## K-Means

In [21]:
flist = [x for x in X_train.columns if not x in ['ID','target']]

flist_kmeans = []
for ncl in range(2,11):
    cls = KMeans(n_clusters=ncl)
    cls.fit_predict(X_train[flist].values)
    X_train['kmeans_cluster_'+str(ncl)] = cls.predict(X_train[flist].values)
    X_test['kmeans_cluster_'+str(ncl)] = cls.predict(X_test[flist].values)
    flist_kmeans.append('kmeans_cluster_'+str(ncl))
print(flist_kmeans)

['kmeans_cluster_2', 'kmeans_cluster_3', 'kmeans_cluster_4', 'kmeans_cluster_5', 'kmeans_cluster_6', 'kmeans_cluster_7', 'kmeans_cluster_8', 'kmeans_cluster_9', 'kmeans_cluster_10']


In [22]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4747)
Test set size: (49342, 4747)


## PCA

In [23]:
flist = [x for x in X_train.columns if not x in ['ID','target']]

n_components = 20
flist_pca = []
pca = PCA(n_components=n_components)
x_train_projected = pca.fit_transform(normalize(X_train[flist], axis=0))
x_test_projected = pca.transform(normalize(X_test[flist], axis=0))
for npca in range(0, n_components):
    X_train.insert(1, 'PCA_'+str(npca+1), x_train_projected[:, npca])
    X_test.insert(1, 'PCA_'+str(npca+1), x_test_projected[:, npca])
    flist_pca.append('PCA_'+str(npca+1))
print(flist_pca)

['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4', 'PCA_5', 'PCA_6', 'PCA_7', 'PCA_8', 'PCA_9', 'PCA_10', 'PCA_11', 'PCA_12', 'PCA_13', 'PCA_14', 'PCA_15', 'PCA_16', 'PCA_17', 'PCA_18', 'PCA_19', 'PCA_20']


In [24]:
gc.collect()
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4767)
Test set size: (49342, 4767)


In [25]:
X_train.head(n=10)

Unnamed: 0,48df886f9,PCA_20,PCA_19,PCA_18,PCA_17,PCA_16,PCA_15,PCA_14,PCA_13,PCA_12,PCA_11,PCA_10,PCA_9,PCA_8,PCA_7,PCA_6,PCA_5,PCA_4,PCA_3,PCA_2,PCA_1,SumValues,SumZeros,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,11d86fa6a,77c9823f2,8d6c2a0b2,4681de4fd,adf119b9a,cff75dd09,96f83a237,b8a716ebf,...,88458cb21,f40da20f4,7ad6b38bd,c901e7df1,8f55955dc,85dcc913d,5ca0b9b0c,eab8abf7a,8d8bffbae,2a1f6c7f9,9437d8b64,5831f4c76,2e84e09c5,d45fd5508,a165f5761,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466,Mean,Median,Mode,Max,Var,Std,kmeans_cluster_2,kmeans_cluster_3,kmeans_cluster_4,kmeans_cluster_5,kmeans_cluster_6,kmeans_cluster_7,kmeans_cluster_8,kmeans_cluster_9,kmeans_cluster_10
0,0.0,-0.00316,0.001888,-0.012204,0.003142,-0.006776,0.001334,0.00592,0.011289,0.004267,-0.009382,-0.005685,2e-05,0.010768,-0.005125,-0.006784,-0.00118,0.006625,-0.005408,0.016651,-0.051944,103,4628,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,...,0,0,0.0,4000000,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,152382.3,0.0,0.0,40000000.0,3198114000000.0,1788327.0,0,0,1,2,0,0,5,5,0
1,0.0,0.000226,0.010562,-0.005298,0.009436,-0.0044,0.005933,-0.00037,-0.006015,-0.004209,-0.001969,0.008223,0.005729,-0.003254,-0.014305,-0.007763,-0.009793,-0.02238,-0.008216,0.005297,-0.053817,68,4663,0,0.0,0,0,0,0,0,2200000.0,0.0,0.0,0,0,0,0,0.0,0,0.0,...,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,112466.5,0.0,0.0,50000000.0,2467155000000.0,1570718.0,0,0,1,2,0,0,5,5,0
2,0.0,-0.003166,0.004828,-0.004364,0.000771,0.004714,-0.000658,-0.005711,0.000457,-0.015853,-0.00231,0.008593,-0.005416,0.004962,-0.011025,-0.012638,-0.009143,-0.016872,-0.013511,0.003368,-0.072836,19,4712,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,...,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,16109.94,0.0,0.0,12000000.0,115879100000.0,340410.2,0,0,1,2,0,0,5,5,0
3,0.0,-0.002647,0.007636,-0.004435,0.00093,0.005787,-0.000543,-0.006402,0.000483,-0.016586,-0.002838,0.008897,-0.005476,0.003717,-0.011522,-0.012897,-0.009416,-0.017666,-0.012315,0.00078,-0.07581,23,4708,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,...,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,7059.338,0.0,0.0,6000000.0,31870700000.0,178523.7,0,0,1,2,0,0,5,5,0
4,0.0,-0.002545,0.0065,-0.005829,0.001127,0.004537,0.000897,-0.003864,0.001684,-0.0153,-0.00432,0.007367,-0.004973,0.004145,-0.01101,-0.011458,-0.007036,-0.016165,-0.013838,0.00903,-0.067276,27,4704,0,0.0,0,0,0,0,0,2000000.0,0.0,0.0,0,0,0,0,0.0,0,0.0,...,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,37778.86,0.0,0.0,37662000.0,759211800000.0,871327.6,0,0,1,2,0,0,5,5,0
5,0.0,0.047827,0.006105,-0.019545,0.000271,-0.069836,0.002438,0.034377,-0.041054,0.077986,-0.017362,0.032353,0.005598,-0.00298,-0.019075,0.087328,0.153782,0.319307,0.048979,0.08391,0.158245,762,3969,0,0.0,0,0,0,0,0,17020000.0,0.0,8000.0,0,0,0,0,0.0,0,5000000.0,...,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,12000.0,5600000.0,20000000.0,0,0,0,0,0,0,11000,1131738.0,0.0,0.0,200000000.0,35452980000000.0,5954241.0,0,0,1,2,4,6,0,0,4
6,0.0,0.008531,-0.008929,-0.036412,0.014707,-0.005126,-0.024201,0.037632,-0.013128,0.036367,0.004167,0.000872,-0.009468,-0.046889,0.082912,-0.026435,0.002766,0.005434,-0.006597,-0.003591,-0.073832,137,4594,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,...,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,40000,0,0,0,190630.3,0.0,0.0,50000000.0,7133915000000.0,2670939.0,0,0,1,2,0,0,5,5,0
7,0.0,-0.00155,0.003522,-0.006025,-0.002775,-0.008186,-0.004297,0.008274,-0.005692,-0.002403,-0.000871,0.014059,-0.011983,-0.011256,0.019759,-0.026904,-0.004536,-0.021001,0.006024,0.004927,-0.057811,31,4700,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,...,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,159364.1,0.0,0.0,100000000.0,10624740000000.0,3259561.0,0,0,1,2,0,0,5,5,0
8,0.0,-0.001393,0.008128,-0.005401,0.002702,0.005881,0.00119,-0.003817,-0.000848,-0.010614,-0.003626,0.006197,-0.004921,0.005954,-0.009698,-0.002207,0.028045,-0.015366,-0.013116,-1.3e-05,-0.067549,224,4507,0,0.0,0,0,0,0,0,58000.0,0.0,0.0,0,0,22000,0,0.0,0,0.0,...,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,22000,0.0,0.0,0.0,4000000.0,0,0,0,0,0,0,0,49389.22,0.0,0.0,20000000.0,335455000000.0,579184.8,0,0,1,2,0,0,5,5,0
9,0.0,-0.003236,0.006132,-0.007609,0.001704,0.005571,-0.000294,-0.007098,-0.00196,-0.015227,-0.003071,0.007017,-0.003432,0.003703,-0.011151,-0.012636,-0.011438,-0.018123,-0.013626,-0.000833,-0.071682,50,4681,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,...,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,400000,14197.84,0.0,0.0,8108000.0,59231500000.0,243375.2,0,0,1,2,0,0,5,5,0


In [26]:
X_test.head(n=10)

Unnamed: 0,48df886f9,PCA_20,PCA_19,PCA_18,PCA_17,PCA_16,PCA_15,PCA_14,PCA_13,PCA_12,PCA_11,PCA_10,PCA_9,PCA_8,PCA_7,PCA_6,PCA_5,PCA_4,PCA_3,PCA_2,PCA_1,SumValues,SumZeros,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,11d86fa6a,77c9823f2,8d6c2a0b2,4681de4fd,adf119b9a,cff75dd09,96f83a237,b8a716ebf,...,88458cb21,f40da20f4,7ad6b38bd,c901e7df1,8f55955dc,85dcc913d,5ca0b9b0c,eab8abf7a,8d8bffbae,2a1f6c7f9,9437d8b64,5831f4c76,2e84e09c5,d45fd5508,a165f5761,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466,Mean,Median,Mode,Max,Var,Std,kmeans_cluster_2,kmeans_cluster_3,kmeans_cluster_4,kmeans_cluster_5,kmeans_cluster_6,kmeans_cluster_7,kmeans_cluster_8,kmeans_cluster_9,kmeans_cluster_10
0,0.0,-0.015422,-0.02596,-0.01501,0.013223,0.034848,0.015931,-0.009407,-0.011184,-0.006899,-0.003957,-0.000107,-0.002884,0.00596,-0.01054,0.014311,-0.007425,0.002604,-0.008757,-0.02636,-0.035962,73,4658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,462597.297747,0.0,0.0,960504300.0,217819700000000.0,14758720.0,0,2,0,4,5,3,6,4,2
1,0.0,-0.003187,0.005343,-0.007314,0.001708,0.005357,-0.000517,-0.005608,0.000571,-0.015256,-0.003334,0.008128,-0.005141,0.004502,-0.01093,-0.012999,-0.011254,-0.017793,-0.013139,1.2e-05,-0.076567,11,4720,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15814.649601,0.0,0.0,31764780.0,330441900000.0,574840.7,0,0,1,2,0,0,5,5,0
2,0.0,-0.002715,0.003567,-0.01056,0.002895,0.004065,-0.001458,-0.002881,-0.000778,-0.011623,-0.004343,0.007276,-0.003394,0.003564,-0.010795,-0.010144,-0.006391,-0.008043,-0.009664,0.004792,-0.069435,100,4631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,124139.289006,0.0,0.0,80000000.0,3474023000000.0,1863873.0,0,0,1,2,0,0,5,5,0
3,0.0,0.000392,0.002179,0.003718,-0.003416,0.002503,0.005229,-0.00713,0.001936,-0.004888,-0.003589,0.003844,-0.004167,0.000402,-0.01325,-0.000794,-0.009078,-0.006484,-0.008791,0.017488,-0.042201,121,4610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,374390.399463,0.0,0.0,100000000.0,10307230000000.0,3210487.0,0,0,1,2,0,0,5,5,0
4,0.0,0.000423,0.005825,-0.012687,-0.003691,0.008225,0.004645,0.001112,0.006108,-0.017412,-0.004226,0.006913,-0.005243,0.006114,-0.011155,-0.008478,0.002461,-0.018694,-0.013635,-0.000559,-0.075942,9,4722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28132.72773,0.0,0.0,107708700.0,2484236000000.0,1576146.0,0,0,1,2,0,0,5,5,0
5,0.0,-0.000907,0.005268,-0.007718,-0.000173,0.00583,0.000572,-0.006318,-0.001014,-0.015949,-0.003076,0.00842,-0.004693,0.003531,-0.011335,-0.012269,-0.011547,-0.018657,-0.013486,-0.000905,-0.074743,22,4709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49335.937248,0.0,0.0,43385790.0,1355339000000.0,1164190.0,0,0,1,2,0,0,5,5,0
6,0.0,-0.012565,0.009934,-0.003807,0.016233,-0.003965,0.004922,-0.006653,0.004667,-0.011674,-0.00315,0.008853,-0.008786,0.005522,-0.010905,-0.020011,-0.009705,-0.019428,-0.013161,-0.005857,-0.069088,26,4705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,93488.26903,0.0,0.0,117456500.0,6288018000000.0,2507592.0,0,0,1,2,0,0,5,5,0
7,0.0,-0.003463,0.005948,-0.00664,0.001125,0.005994,-0.001681,-0.007044,0.000243,-0.015791,-0.003083,0.008554,-0.005454,0.004694,-0.010396,-0.012062,-0.007078,-0.018902,-0.013618,-0.00044,-0.076234,8,4723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17794.568022,0.0,0.0,44519420.0,564879600000.0,751584.7,0,0,1,2,0,0,5,5,0
8,0.0,-0.002877,0.00204,-0.000512,0.00385,0.004451,-0.00391,-0.004519,0.001254,-0.017997,-0.004196,0.006622,-0.004995,0.003552,-0.008673,-0.015158,-0.010189,-0.019388,-0.013555,-0.002175,-0.073182,27,4704,0.0,24617120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,276673.667565,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66767.415807,0.0,0.0,66540040.0,2212585000000.0,1487476.0,0,0,1,2,0,0,5,5,0
9,0.0,-0.002237,0.005832,0.000989,0.000159,0.002689,-0.002831,-0.002522,0.000725,-0.009394,-0.003668,0.010054,-0.002743,0.006322,-0.007447,-0.008882,-0.001155,-0.014123,-0.009409,0.01055,-0.058867,149,4582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12675000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2600000.0,0.0,0.0,0.0,0.0,0.0,13333333.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,206325.224004,0.0,0.0,79800000.0,4548733000000.0,2132776.0,0,0,1,2,0,0,5,5,0


## Build Train and Test Data for Modeling

## LightGBM

In [27]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "learning_rate" : 0.01,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgtrain, lgval], early_stopping_rounds=100, 
                      verbose_eval=200, evals_result=evals_result)
    
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

In [28]:
# Training LGB
seeds = [42, 2018]
pred_test_full_seed = 0
for seed in seeds:
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=seed)
    pred_test_full = 0
    for dev_index, val_index in kf.split(X_train):
        dev_X, val_X = X_train.loc[dev_index,:], X_train.loc[val_index,:]
        dev_y, val_y = y_train[dev_index], y_train[val_index]
        pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, X_test)
        pred_test_full += pred_test
    pred_test_full /= 5.
    pred_test_full = np.expm1(pred_test_full)
    pred_test_full_seed += pred_test_full
    print("Seed {} completed....".format(seed))
pred_test_full_seed /= np.float(len(seeds))

print("LightGBM Training Completed...")

Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 1.19636	valid_1's rmse: 1.37626
[400]	training's rmse: 1.01159	valid_1's rmse: 1.36386
Early stopping, best iteration is:
[368]	training's rmse: 1.03427	valid_1's rmse: 1.36327
Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 1.18563	valid_1's rmse: 1.42122
[400]	training's rmse: 1.00082	valid_1's rmse: 1.3982
Early stopping, best iteration is:
[434]	training's rmse: 0.978664	valid_1's rmse: 1.39727
Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 1.19531	valid_1's rmse: 1.41335
[400]	training's rmse: 1.01253	valid_1's rmse: 1.39532
Early stopping, best iteration is:
[410]	training's rmse: 1.00552	valid_1's rmse: 1.395
Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 1.19592	valid_1's rmse: 1.40119
[400]	training's rmse: 1.01471	valid_1's rmse: 1.37428
Early stopping, best iteration is:
[425]

In [29]:
# feature importance
print("Features Importance...")
gain = model.feature_importance('gain')
featureimp = pd.DataFrame({'feature':model.feature_name(), 
                   'split':model.feature_importance('split'), 
                   'gain':100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print(featureimp[:15])

Features Importance...
        feature  split       gain
4755        Max    235  10.677235
21    SumValues    596   8.704029
20        PCA_1    398   8.555009
4153  f190486d6    166   5.415135
4756        Var    146   5.374940
22     SumZeros    261   4.462228
15        PCA_6    229   3.524196
4752       Mean    222   2.631528
4757        Std     35   1.763109
19        PCA_2    171   1.710941
2400  58e2e02e6     85   1.386013
18        PCA_3    115   1.209715
10       PCA_11     84   1.205074
1572  26ab20ff9     66   0.871452
6        PCA_15    116   0.814969


## Predictions

In [30]:
sub = pd.read_csv('../input/sample_submission.csv')
sub["target"] = pred_test_full_seed

In [31]:
print(sub.head())
sub.to_csv('sub_lgb_s_k_p_5seeds_v2.csv', index=False)

          ID        target
0  000137c73  3.891063e+06
1  00021489f  2.410462e+06
2  0004d7953  1.231905e+06
3  00056a333  3.515238e+06
4  00056d8eb  2.946344e+06


---------------------------------