In [1]:
import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from IPython.display import display # Allows using display() for dataframes
import warnings
warnings.filterwarnings('ignore') # This is to not display warnings related to deprecation

In [2]:
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

In [3]:
train_df.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [4]:
test_df.head()

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
train_df.shape

(4459, 4993)

In [6]:
test_df.shape

(49342, 4992)

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4992 entries, ID to 9fc776466
dtypes: float64(4991), object(1)
memory usage: 1.8+ GB


In [9]:
train_df.isnull().sum().sum() #Checking if there are any columns with null values

0

In [10]:
test_df.isnull().sum().sum()

0

### Checking and removing constant features (other than the target variable)

In [11]:
cols_to_remove=[]
for col in train_df.columns:
    if col!='ID' and col!='target':
        if train_df[col].std()==0:
            cols_to_remove.append(col)
train_df.drop(cols_to_remove,axis=1,inplace=True)
test_df.drop(cols_to_remove,axis=1,inplace=True)

print('These {} columns have been removed for their constant features:\n'.format(len(cols_to_remove)))
print(cols_to_remove)

These 256 columns have been removed for their constant features:

['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c

### Remove Duplicate Columns

In [12]:
def duplicate_columns(frame):
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for t, v in groups.items():

        cs = frame[v].columns
        vs = frame[v]
        lcs = len(cs)

        for i in range(lcs):
            ia = vs.iloc[:,i].values
            for j in range(i+1, lcs):
                ja = vs.iloc[:,j].values
                if np.array_equal(ia, ja):
                    dups.append(cs[i])
                    break

    return dups

In [13]:
%%time

cols_to_remove=duplicate_columns(train_df)
print(cols_to_remove)

['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']
Wall time: 5min 54s


In [14]:
# Removing duplicate columns in training dataset
train_df.drop(cols_to_remove,axis=1,inplace=True)

# Removing duplicates in test dataset
test_df.drop(cols_to_remove,axis=1,inplace=True)

Sparse data is data that has many zero values. These are actual zero values, they are not zero because they are missing. Columns that have sparse data also need to be removed, as they do not contain much information and so will not be useful as a feature.

In [15]:
def drop_sparse(train,test):
    features_list=[x for x in train.columns if x not in ['ID','target']]
    for f in features_list:
        if len(np.unique(train[f]))<2:
            train.drop(f,axis=1,inplace=True)
            test.drop(f,axis=1,inplace=True)
    return train,test

In [16]:
%%time
train_df,test_df=drop_sparse(train_df,test_df)

Wall time: 530 ms


In [17]:
gc.collect()
print('Training set size:{}'.format(train_df.shape))
print('Test set size:{}'.format(test_df.shape))

Training set size:(4459, 4732)
Test set size:(49342, 4731)


### Building out train and test data for modeling

In [18]:
X_train=train_df.drop(['ID','target'],axis=1)
y_train=np.log1p(train_df['target'].values)
X_test=test_df.drop(['ID'],axis=1)

In [19]:
dev_X,val_X,dev_y,val_y = train_test_split(X_train,y_train,test_size=0.2,random_state=42)

### Using LightGBM

In [20]:
def run_lgb(train_X,train_y,val_X,val_y,test_X):
    params={'objective':'regression','metric':'rmse','num_leaves':40,'learning_rate':0.004,'bagging_fraction':0.6,
           'feature_fraction':0.6,'bagging_frequency':6,'bagging_seed':42,'verbosity':-1,'seed':42}
    lgtrain=lgb.Dataset(train_X,label=train_y)
    lgval=lgb.Dataset(val_X,label=val_y)
    evals_result={}
    model=lgb.train(params,lgtrain,num_boost_round=5000,valid_sets=[lgtrain,lgval],early_stopping_rounds=100,
                   verbose_eval=150,evals_result=evals_result)
    pred_test_y=np.expm1(model.predict(test_X,num_iteration=model.best_iteration))
    return pred_test_y,model,evals_result

In [21]:
pred_test,model,evals_result=run_lgb(dev_X,dev_y,val_X,val_y,X_test)
print('LightGBM Training Completed')

Training until validation scores don't improve for 100 rounds
[150]	training's rmse: 1.50845	valid_1's rmse: 1.539
[300]	training's rmse: 1.3446	valid_1's rmse: 1.46591
[450]	training's rmse: 1.23333	valid_1's rmse: 1.43454
[600]	training's rmse: 1.15002	valid_1's rmse: 1.42157
[750]	training's rmse: 1.08396	valid_1's rmse: 1.41615
[900]	training's rmse: 1.03041	valid_1's rmse: 1.41411
Early stopping, best iteration is:
[877]	training's rmse: 1.03806	valid_1's rmse: 1.4137
LightGBM Training Completed


In [25]:
# Determining top 50 features by importance
print('Feature Importance:\n')
gain=model.feature_importance(importance_type='gain')
featureimp=pd.DataFrame({'feature':model.feature_name(),'split':model.feature_importance(importance_type='split'),
                        'gain':100*gain/gain.sum()}).sort_values(by='gain',ascending=False)
featureimp[:50]
                         

Feature Importance:



Unnamed: 0,feature,split,gain
4130,f190486d6,795,9.217105
2375,58e2e02e6,703,5.445323
3465,eeb9cd3aa,662,4.520153
4020,15ace8c9f,514,3.023536
2614,9fd594eec,362,2.933775
8,20aa07010,425,2.229353
3571,58232a6fb,371,1.503661
834,6eef030c1,313,1.411549
1457,b43a7cfd5,385,1.285384
3661,491b9ee45,276,1.066377


### XGBoost

In [26]:
def run_xgb(train_X,train_y,val_X,val_y,test_X):
    params={'objective':'reg:linear',
           'eval_metric':'rmse',
           'eta':0.001,
           'max_depth':10,
           'subsample':0.6,
           'colsample_bytree':0.6,
           'alpha':0.001,
           'random_state':42,
           'silent':True}
    tr_data=xgb.DMatrix(train_X,train_y)
    va_data=xgb.DMatrix(val_X,val_y)
    
    watchlist=[(tr_data,'train'),(va_data,'valid')]
    model_xgb=xgb.train(params=params,dtrain=tr_data,num_boost_round=2000,evals=watchlist,maximize=False,
                       early_stopping_rounds=100,verbose_eval=100)
    dtest=xgb.DMatrix(test_X)
    xgb_pred_y=np.expm1(model_xgb.predict(dtest,ntree_limit=model_xgb.best_ntree_limit))
    return xgb_pred_y,model_xgb

In [27]:
# Training XGB
pred_test_xgb,model_xgb=run_xgb(dev_X,dev_y,val_X,val_y,X_test)
print('XGBoost training completed.')

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:14.08765	valid-rmse:14.07678
[100]	train-rmse:12.76875	valid-rmse:12.75679
[200]	train-rmse:11.57719	valid-rmse:11.56388
[300]	train-rmse:10.50042	valid-rmse:10.48590
[400]	train-rmse:9.52826	valid-rmse:9.51335
[500]	train-rmse:8.65075	valid-rmse:8.63628
[600]	train-rmse:7.85852	valid-rmse:7.84507
[700]	train-rmse:7.14377	valid-rmse:7.13205
[800]	train-rmse:6.49865	valid-rmse:6.48965
[900]	train-rmse:5.91699	valid-rmse:5.91128
[1000]	train-rmse:5.39177	valid-rmse:5.39058
[1100]	train-rmse:4.91908	valid-rmse:4.92352
[1200]	train-rmse:4.49301	valid-rmse:4.50427
[1300]	train-rmse:4.10936	valid-rmse:4.12789
[1400]	train-rmse:3.76422	valid-rmse:3.79155
[1500]	train-rmse:3.45402	valid-rmse:3.49105


### Catboost

In [28]:
cb_model=CatBoostRegressoroostRegressor(iterations=500,learning_rate=0.05,depth=10,eval_metric='RMSE',
                          random_seed=42,bagging_temperature=0.2,od_type='Iter',metric_period=50,od_wait=20)

In [29]:
cb_model.fit(dev_X,dev_y,eval_set=(val_X,val_y,),use_best_model=True,verbose=50)



0:	learn: 1.7518683	test: 1.6878429	best: 1.6878429 (0)	total: 1.96s	remaining: 16m 16s
50:	learn: 1.4789181	test: 1.5197196	best: 1.5197196 (50)	total: 2m 18s	remaining: 20m 18s
100:	learn: 1.3788775	test: 1.4780503	best: 1.4780503 (100)	total: 4m 27s	remaining: 17m 37s
150:	learn: 1.3203081	test: 1.4647232	best: 1.4647232 (150)	total: 6m 48s	remaining: 15m 43s
200:	learn: 1.2546056	test: 1.4505032	best: 1.4504536 (198)	total: 8m 59s	remaining: 13m 23s
250:	learn: 1.1817435	test: 1.4381513	best: 1.4381513 (250)	total: 11m 4s	remaining: 10m 59s
300:	learn: 1.1208241	test: 1.4320748	best: 1.4314980 (297)	total: 13m 12s	remaining: 8m 44s
350:	learn: 1.0770301	test: 1.4298988	best: 1.4297384 (347)	total: 15m 19s	remaining: 6m 30s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.428228031
bestIteration = 376

Shrink model to first 377 iterations.


<catboost.core.CatBoostRegressor at 0x1b0d4c2cc70>

In [30]:
pred_test_cat=np.expm1(cb_model.predict(X_test))