In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import HTML, Image, display

# Widen the notebook container to 80% of the browser window.
display(HTML("<style>.container { width:80% !important; }</style>"))

Name: Zhou Hong
Student ID: 19025779

# Introduction
This kernel is built for the "Santander Value Prediction Challenge" competition: [https://www.kaggle.com/c/santander-value-prediction-challenge](https://www.kaggle.com/c/santander-value-prediction-challenge). In this competition, we have a dataset of customers' transactions in their bank accounts, and our aim is to predict what the customers need in order to provide personalized service.

### Importing library

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output./

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import xgboost as xgb

In [None]:
import matplotlib.pyplot as plt
import time
import seaborn as sns
from pylab import rcParams
%matplotlib inline
#sklearn library

# Loading data 

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train = pd.read_csv("../input/santander-value-prediction-challenge/train.csv")
test = pd.read_csv("../input/santander-value-prediction-challenge/test.csv")

In [None]:
# Report the size of each dataset; identifier/target columns are not counted as features.
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print(f"In train dataset, the number of Records is {train_rows}, and number of Features is {train_cols - 2}")  # not counting ID and target
print(f"In test dataset, the number of Records is {test_rows}, and number of Features is {test_cols - 1}")  # not counting ID

The test dataset has the same number of features as the train dataset, but contains many more records.

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summarise the columns from position 2 onward (the first two columns are skipped).
train.iloc[:,2:].info()

1844 columns of the train dataset are of float type and 3147 columns are of int type.

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Summarise the columns from position 1 onward (the first column is skipped).
test.iloc[:,1:].info()

All data in the test dataset are of float type, which differs from the train dataset.

In [None]:
# Cast every column from position 2 onward to float while leaving the first two
# columns untouched. The frame is rebuilt rather than assigned through `.iloc`:
# recent pandas versions try to write `.iloc[:, 2:] = ...` into the existing
# columns in place, which can leave integer columns with their integer dtype.
train = pd.concat([train.iloc[:, :2], train.iloc[:, 2:].astype(float)], axis=1)

Now our train dataset has the same data types as the test dataset, without any value being changed.

In [None]:
# Summarise the columns from position 2 onward again, after the float conversion above.
train.iloc[:,2:].info()

# EDA

## Target

In [None]:
# Count the distinct values in the target column.
train.target.nunique()

First, look at our target. It has 1413 unique values, so I think it is numerical data rather than categorical data.

In [None]:
# Histogram of the raw target values, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(train.target, bins=50)
ax.set_title('target Histogram ')
ax.set_xlabel('Target')
ax.set_ylabel('Frequency')
plt.show()

It looks like a skewed distribution, so let's look at the distribution of its log transform.

In [None]:
# Histogram of the log-transformed target.
plt.figure(figsize=(10, 5))
plt.hist(np.log1p(train.target), bins=50) # equal to "np.log(x+1)"   add 1 to avoid log(0)
plt.title('log target Histogram ')
# The x axis shows log1p(target), not the raw target, so label it accordingly.
plt.xlabel('log1p(Target)')
plt.ylabel('Frequency')
plt.show()

That looks like a normal distribution. We may need to use the log transformation on the target for training and prediction.

In [None]:
# Show the first five entries of the target's value counts.
train.target.value_counts().head()

In [None]:
# Natural log of the median target value (plain log here, whereas the histogram above uses log1p).
np.log(train.target.median())

## Features

### missing value

In [None]:
# Report, for train and then test, whether any cell is null.
for frame in (train, test):
    print(frame.isnull().values.any())

Great, we don't have any missing values to deal with this time.

###  all zero columns

First, look at the columns that contain only zeros.

In [None]:
# Columns holding a single distinct value carry no information.
# Count distinct values per column once instead of column by column.
train_distinct_counts = train.nunique()
all_zero_columns = train_distinct_counts[train_distinct_counts == 1].index.tolist()
print(f"There are {len(all_zero_columns)} all zero columns in train dataset")

test_constant_count = int((test.nunique() == 1).sum())
print(f"There is {test_constant_count} all zero column in test dataset")

This is unusual: the columns that provide no information in the train dataset do have values in the test dataset. The reason may be that the test dataset has ten times as many records as the train dataset. However, whatever the reason is, we just need to drop them here, since they are useless for training.

### duplicate columns

Let's check whether there are duplicate columns

In [None]:
def find_duplicate_columns(df):
    """Return the names of columns that duplicate an earlier column of ``df``.

    Columns are compared pairwise with ``Series.equals``. The first column of
    each group of identical columns is kept; every later member of the group
    is reported exactly once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared.

    Returns
    -------
    list
        Labels of the duplicate columns, in the order they are discovered.
    """
    column_count = df.shape[1]
    duplicate_positions = set()
    duplicate_columns = []
    for i in range(column_count):
        if i in duplicate_positions:
            # Already matched to an earlier column; its twins were recorded then.
            continue
        this = df.iloc[:, i]
        for j in range(i + 1, column_count):
            if j in duplicate_positions:
                continue
            if this.equals(df.iloc[:, j]):
                duplicate_positions.add(j)
                # Take the label from `df` itself, not from a global frame.
                duplicate_columns.append(df.columns[j])
    return duplicate_columns

In [None]:
# Running find_duplicate_columns(train) on the full training set took close to
# an hour, so the result of that run is hard-coded here instead of recomputed.
# (The earlier call to `getDuplicateColumns` was removed: no function of that
# name is defined in this notebook.)
#a=find_duplicate_columns(train)

a=['d60ddde1b', 'acc5b709d', '912836770', 'f8d75792f', 'f333a5f60'] 

It took nearly an hour to run, so on later runs I simply copy the resulting list.

## Features analysis

### Features importance

In [None]:
# Preview the first rows of the training frame again before fitting.
train.head()

In [None]:
#use lgbm's parameters I tuned in other kernel 

In [None]:
# LightGBM regressor used only to rank features by importance.
# NOTE(review): both bagging_fraction/subsample and feature_fraction/colsample_bytree
# are passed with different values; these look like aliases of each other in
# LightGBM — confirm which value actually takes effect.
# NOTE(review): `silent` may not be accepted by newer LightGBM releases — confirm
# against the installed version.
clf_lgb=lgb.LGBMRegressor(bagging_fraction=0.5, boosting_type='gbdt', class_weight=None,
              colsample_bytree=1.0, feature_fraction=0.5,
              importance_type='split', learning_rate=0.01, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=500, n_jobs=-1, num_leaves=130,
              objective='regression', random_state=42, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

In [None]:
# Fit on log1p of the columns from position 2 onward, against log1p of the column
# at position 1 (presumably the target — verify the column order).
clf_lgb.fit(np.log1p(train.iloc[:,2:]),np.log1p(train.iloc[:,1]))

In [None]:
# Plot the importance of up to 50 features from the fitted model on a 14x10 figure.
fig, ax = plt.subplots(figsize=(14,10))
lgb.plot_importance(clf_lgb, max_num_features=50, height=0.8,color="tomato",ax=ax)
plt.show()

As demonstrated above, column '58e2e026' provides the most information, and most of the features are useless for prediction. (In fact, I can still get a final score of 1.5 using only the 200 most important features.)

In [None]:
#store the features importance
# Store the feature importances, indexed by feature name, and keep the names
# of the 30 largest.
booster = clf_lgb.booster_
feat_importances = pd.Series(booster.feature_importance(), index=booster.feature_name())
top30 = feat_importances.nlargest(30).index.tolist()

In [None]:
# Prepend the target and ID columns so the list starts with ['ID', 'target', ...].
# The membership check keeps the cell idempotent: re-running it does not insert
# the two names a second time.
for required_column in ('target', 'ID'):
    if required_column not in top30:
        top30.insert(0, required_column)

In [None]:
# build a dataset for rich features
# Build a dataset restricted to the columns listed in `top30`.
# An explicit copy is taken because later cells assign into `richdf`; without it
# pandas treats the frame as a slice of `train` and warns on assignment.
richdf = train[top30].copy()

### feature distribution

In [None]:
# Reshape the richdf columns at positions 2-9 to long form (one 'columns'/'value'
# pair per row) and draw one distribution plot per column.
top30_to_plot =richdf.iloc[:,2:10] .melt(var_name='columns')
g = sns.FacetGrid(top30_to_plot, col='columns')
g = (g.map(sns.distplot, 'value'))

In [None]:
# Apply log1p to every column except the first. The frame is rebuilt rather than
# assigned through `.iloc`, so pandas never has to write float results into the
# existing columns in place, and no slice-assignment warning is raised.
# NOTE: re-running this cell applies the transform again.
richdf = pd.concat([richdf.iloc[:, :1], np.log1p(richdf.iloc[:, 1:])], axis=1)

Let's see the log transform of features

In [None]:
# Same plot as before, now on the log-transformed richdf: columns at positions 2-9,
# reshaped to long form, one distribution plot per column.
top30_to_plot =richdf.iloc[:,2:10] .melt(var_name='columns')
g = sns.FacetGrid(top30_to_plot, col='columns')
g = (g.map(sns.distplot, 'value'))

In [None]:
# Distribution plots for the next eight columns (positions 10-17 of richdf).
# The grid is built from `top30_to_plot1`, the frame created on the line above,
# so this cell no longer redraws the first eight columns.
top30_to_plot1 = richdf.iloc[:,10:18].melt(var_name='columns')
g = sns.FacetGrid(top30_to_plot1, col='columns')
g = (g.map(sns.distplot, 'value'))

We can see that the feature values seem to follow a skewed distribution, but their log transform does not follow a normal distribution. The number of values around 0 is comparatively large. Let's drop the zeros.

In [None]:
# Turn zeros into NaN so that dropna() removes them, then redraw the per-column
# distribution plots on the remaining values.
top30_to_plot['value'] = top30_to_plot['value'].replace(0.0,np.nan)
g = sns.FacetGrid(top30_to_plot.dropna(), col='columns')
g = (g.map(sns.distplot, 'value'))

In [None]:
# Turn zeros into NaN in the second group of columns and redraw its plots.
# The values are taken from `top30_to_plot1` itself; reading them from
# `top30_to_plot` would overwrite this frame with the first group's values.
top30_to_plot1['value'] = top30_to_plot1['value'].replace(0.0,np.nan)
g = sns.FacetGrid(top30_to_plot1.dropna(), col='columns')
g = (g.map(sns.distplot, 'value'))

Great, a perfect normal distribution! We should use the log transform in modeling.

### correlation

Check the correlation between features and target.

In [None]:
# Pairwise correlation between all richdf columns except the first.
corr=richdf.iloc[:,1:].corr()

In [None]:
#forked from https://www.kaggle.com/samratp/beginner-guide-to-eda-and-modeling
#I have other heatmap but this one is so beautiful!
# Boolean mask that hides the upper triangle (including the diagonal) so each
# pair of columns is drawn only once. The builtin `bool` is used because the
# `np.bool` alias is no longer available in current NumPy releases.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(16,16))

cmap = sns.diverging_palette(220, 10, as_cmap=True)

sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5, center=0,square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.title("Correlation HeatMap", fontsize=20)
plt.show()

We can see that some columns are highly correlated with all the other columns, while some are not that correlated.

In [None]:
# Release the EDA-only frames, including the second long-form plotting frame.
del richdf,corr,top30_to_plot,top30_to_plot1

## Features engineering

As shown above, there are no missing values in this dataset, some columns are constant, and some are duplicates. These need to be dropped first.
Secondly, the target and the features follow skewed distributions, so we need to use a log transform to get normally distributed data.

In [None]:
# Constant columns: those with a single distinct value in train are removed
# from both frames (names absent from a frame are ignored).
distinct_counts = train.nunique()
all_zero_columns = distinct_counts.index[distinct_counts == 1].tolist()
train = train.drop(columns=all_zero_columns, errors='ignore')
test = test.drop(columns=all_zero_columns, errors='ignore')

# Duplicate columns: hard-coded list, removed from both frames.
duplicte_columns = ['d60ddde1b', 'acc5b709d', '912836770', 'f8d75792f', 'f333a5f60']
train = train.drop(columns=duplicte_columns, errors='ignore')
test = test.drop(columns=duplicte_columns, errors='ignore')

# Log transform: features, target and test features all go through log1p.
X = np.log1p(train.drop(columns=["ID", "target"]))
y = np.log1p(train["target"].to_numpy())
test = np.log1p(test.drop(columns=["ID"]))

## Modeling

I used several strategies to build the model; the first is random forest regression.

https://www.kaggle.com/daphnetree/ramdom-forest?scriptVersionId=20810061

This is the final model and score：

In [None]:
# Random forest configuration; the estimator is only constructed here, not fitted.
# NOTE(review): criterion='mse', max_features='auto' and min_impurity_split look
# like they come from an older scikit-learn — confirm the installed version
# still accepts them.
rf=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=0.1, min_samples_split=0.3,
                      min_weight_fraction_leaf=0.0, n_estimators=300,
                      n_jobs=None, oob_score=True, random_state=None, verbose=0,
                      warm_start=False)
# Show the stored image for this model (presumably a screenshot of its score).
Image("../input/imageforscore/rf.png")

In [None]:
# Score table keyed by model name; starts with the hand-entered random forest score.
Models_score = {'Random Forest': 1.69}

Then I spent a long time tuning the parameters of the lightgbm and xgboost algorithms.
The tuning process can be viewed in my Kaggle kernels.
I used grid search to find the best value for each parameter; sometimes I ran it with two parameters at the same time,
e.g. GridSearchCV(clf,parameter,cv = 5,scoring = 'neg_mean_squared_error',verbose=5)

https://www.kaggle.com/daphnetree/xbg-model#Final-model-and-prediction 

https://www.kaggle.com/daphnetree/lgb-model (tuning process)

https://www.kaggle.com/zhouhong0/lgbbest (tuning lambda and alpha and get the best result)

My best models is as follow:

In [None]:
# Best LightGBM configuration found during tuning (constructed only, not fitted here).
lgbbest=lgb.LGBMRegressor(bagging_fraction=0.5, boosting_type='gbdt', class_weight=None,
              colsample_bytree=1.0, feature_fraction=0.5,
              importance_type='split', learning_rate=0.01, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=500, n_jobs=-1, num_leaves=130,
              objective='regression', random_state=42, reg_alpha=0.0,
              reg_lambda=1, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)
# Best XGBoost configuration found during tuning (constructed only, not fitted here).
# The misspelt `n_job=4` keyword was dropped: it is not an XGBRegressor parameter,
# so only `n_jobs=1` ever controlled parallelism.
# NOTE(review): missing=0 appears to make XGBoost treat zeros as missing values —
# confirm this is intended.
xgbbest=xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9, gamma=0.3,
             importance_type='gain', learning_rate=0.02, max_delta_step=0,
             max_depth=5, min_child_weight=5, missing=0, n_estimators=500,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=0, silent=None, subsample=0.7, verbosity=1)


In [None]:
# Show the stored image lgb.png (presumably a screenshot of the LightGBM score).
Image("../input/imageforscore/lgb.png")

In [None]:
# Show the stored image xgb.png (presumably a screenshot of the XGBoost score).
Image("../input/imageforscore/xgb.png")

In [None]:
# Record the hand-entered scores of the two boosted models.
Models_score.update({'Lightgbm': 1.40192, 'xgboost': 1.43379})

I also tried two ensemble methods, stacking and soft voting, to boost the final result.

First I tried stacking with random forest, lightgbm and xgboost. But the result is only slightly better than the random forest's result, and far from those of lgb and xgboost.

With soft voting, the result is much better than with stacking, but still worse than lgb's result. 

In [None]:
# Show the stored image for stacking with lgb, xgb and rf (presumably a score screenshot).
Image("../input/imageforscore/stacking_lgb_xgb_rf.png")

In [None]:
# Show the stored image for soft voting with lgb, xgb and rf (presumably a score screenshot).
Image("../input/imageforscore/softvoting_lgb_xgb_rf.png")

In [None]:
# Scores of the three-model ensembles. The two values were attached to the wrong
# keys: the notebook text says stacking was only slightly better than the random
# forest (1.69), while soft voting was much better than stacking but still behind
# LightGBM (1.40192).
# NOTE(review): confirm against the score screenshots shown in the cells above.
Models_score['softvoting_lgb_xgb_rf']=1.45082
Models_score['stacking_lgb_xgb_rf']=1.62068

I thought it was the random forest model that brought the score down, so I removed it from the ensemble models.

The soft voting model was then significantly boosted, reaching 1.39.

However, stacking still performs badly.
https://www.kaggle.com/daphnetree/stacking?scriptVersionId=20842799

In [None]:
# Record the hand-entered score for soft voting over lgb and xgb, then show its stored image.
Models_score['softvoting_lgb_xgb']=1.39769
Image("../input/imageforscore/softvoting_lgb_xgb.png")

In [None]:
# Record the hand-entered score for stacking over lgb and xgb, then show its stored image.
Models_score['stacking_lgb_xgb']=1.5950
Image("../input/imageforscore/stacking_lgb_xgb.png")

In [None]:
# Turn the score table into a two-column frame, ordered from highest to lowest score.
modeldf = (
    pd.DataFrame(list(Models_score.items()), columns=['Model', 'RMSE'])
    .sort_values('RMSE', ascending=False)
)

In [None]:
# Enlarge the default figure and font size, then draw one bar per model with its score.
rcParams['figure.figsize'] = 25, 10
rcParams['font.size'] = 15
ax = sns.barplot(x="Model", y="RMSE", data=modeldf)

# Conclusion
In this assignment, I made a rough analysis of the dataset, exploring the properties of the features.

However, according to the discussion on Kaggle, participants found a leak in the features, which appear to be a time series, and this turned the competition into a leak-hunting game. 

There are many zero values in the dataset; I think they mean NaN or that nothing happened. The target and most features follow skewed distributions, and their log transforms follow normal distributions.

It also contains constant columns and duplicate columns.

In feature engineering, I dropped the constant columns and duplicate columns and used the log transform of the remaining features for machine learning. 

I learned a lot while tuning the parameters of the lightgbm and xgboost models, and I found that lightgbm runs faster and gives a better result.

I used grid search with cross-validation to tune the parameters. It was hard work.

Maybe I spent too little time on the random forest; its result is disappointing.

I'm not familiar with stacking, and I think that is the reason for its bad performance; its parameters may need to be tuned in another way.

The soft voting performs best thanks to the good performance of xgb and lgbm, it averages the two predictions to form the final prediction.