In [1]:
## Import the required python utilities
from plotly.offline import init_notebook_mode, iplot
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import pandas as pd
import numpy as np

## Import sklearn important modules
from sklearn.decomposition import PCA, SparsePCA, MiniBatchSparsePCA, KernelPCA, IncrementalPCA
from sklearn.decomposition import TruncatedSVD, FastICA, NMF, FactorAnalysis
from sklearn.manifold import TSNE

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

init_notebook_mode(connected=True)
path = "../input/"

## Introduction to Dataset Decomposition Techniques  

### Problem Statement: Santander Value Prediction

Santander Group wants to identify the value of transactions for each potential customer. This is a first step that Santander needs to nail in order to personalize their services at scale. The dataset can be downloaded from this [link](https://www.kaggle.com/c/santander-value-prediction-challenge/data). In this kernel I have explained different approaches for dataset decomposition. 


### Introduction

Decomposition of dataset into lower dimensions often becomes an important task while deailing with datasets having larger number of features. Dimensionality Reduction refers to the process of converting a dataset having vast dimensions into a dataset with lesser number of dimensions. This process is done by ensuring that the information conveyed by the original dataset is not lost. 

**Credits** - Big credits to the awesome [kernel](https://www.kaggle.com/arthurtok/interactive-intro-to-dimensionality-reduction) shared by [Anisotropic](https://www.kaggle.com/arthurtok/). 


### Contents

1. Dataset Preparation    
2. Feature Statistics    
3. Eigen Values and Eigen Vectors   
4. Principal Components Analysis   
&nbsp;&nbsp;&nbsp;&nbsp; 4.1 Finding Right Number of Components   
&nbsp;&nbsp;&nbsp;&nbsp; 4.2 PCA Implementation    
&nbsp;&nbsp;&nbsp;&nbsp; 4.3 Variants of PCA  
5. Truncated SVD   
6. Fast ICA   
7. Factor Analysis   
8. Negative Matrix Factorization  
9. Linear Discriminant Analysis     
10. Tsne Visualization   

### 1. Dataset Preparation  

As the first step, load the required dataset. Also separate out the target variable and remove it from the original dataset. This step is done so that entire dataframe can be used directly in decomposition. 

In [2]:
train = pd.read_csv('train.csv')

View the data

In [3]:
train.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


Note that ID is the name of customer and is of no use. '48df886f9' alike is the name of the feature.

Extract the target and show its distribution

In [4]:
target = train['target'] # This is the label

In [5]:
target.describe()

count    4.459000e+03
mean     5.944923e+06
std      8.234312e+06
min      3.000000e+04
25%      6.000000e+05
50%      2.260000e+06
75%      8.000000e+06
max      4.000000e+07
Name: target, dtype: float64

In [6]:
target.values.shape

(4459,)

In [7]:
target_value_hist = go.Histogram(x=target.values, opacity=0.45, marker=dict(color="red"))
target_value_hist_layout = dict(height=400, title='Distribution of Target Values', legend=dict(orientation="h"));
fig_target_value_hist = go.Figure(data=[target_value_hist], layout=target_value_hist_layout);
iplot(fig_target_value_hist)

Extract the training features

In [8]:
train = train.drop(["target", "ID"], axis=1) # This is the data part

In [9]:
print ("Rows: " + str(train.shape[0]) + ", Columns: " + str(train.shape[1]))
train.head()

Rows: 4459, Columns: 4991


Unnamed: 0,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,0.0,0,0.0,0,0,0,0,0,2200000.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,0.0,0,0.0,0,0,0,0,0,2000000.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


There are 4459 rows and 4992 features in the dataset which means that this dataset consists of more number of columns than number of rows. 

Normalize the dataset so that every value is in the same range of [0,1]

In [10]:
standardized_train = MinMaxScaler().fit_transform(train.values) # It is assumed that the features are column wise

In [11]:
# Replace original data with the min_max scales one
train.iloc[:,:] = standardized_train

Plot the data

In [12]:
import cv2

In [13]:
cv2.imwrite('The normalized training data.png',standardized_train*255)

True

In [27]:
cv2.imshow('The normalized training data.png',standardized_train)
cv2.waitKey(-1)
cv2.destroyAllWindows()

On may find that most of the plot are dark place. Features are distributed like stars in the sky.

### 2. Feature Statistics  

Computing the basic statistics about features such as mean, variance, standard deviation can help to understand about features. In this part, we will compute the following details about the features. 

In [12]:
train.describe()

Unnamed: 0,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,...,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,0.000733,0.000348,0.001336,0.000306,0.000264,0.001483,0.000422,0.000449,0.008039,0.002587,...,0.006153,0.003598,0.006197,0.005411,0.000224,0.000403,0.000336,0.002231,0.001493,0.005276
std,0.019466,0.016071,0.028498,0.01594,0.015147,0.027866,0.018782,0.018072,0.030068,0.026464,...,0.053527,0.035836,0.034717,0.047365,0.014975,0.015667,0.015172,0.030846,0.026486,0.041925
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001877,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Transpose the table so that the meaning of columns and rows are correct

In [13]:
feature_df = train.describe().T

In [14]:
feature_df.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
48df886f9,4459.0,0.000733,0.019466,0.0,0.0,0.0,0.0,1.0
0deb4b6a8,4459.0,0.000348,0.016071,0.0,0.0,0.0,0.0,1.0
34b15f335,4459.0,0.001336,0.028498,0.0,0.0,0.0,0.0,1.0
a8cb14b00,4459.0,0.000306,0.01594,0.0,0.0,0.0,0.0,1.0
2f0771a37,4459.0,0.000264,0.015147,0.0,0.0,0.0,0.0,1.0


In [15]:
feature_df.reset_index().head()

Unnamed: 0,index,count,mean,std,min,25%,50%,75%,max
0,48df886f9,4459.0,0.000733,0.019466,0.0,0.0,0.0,0.0,1.0
1,0deb4b6a8,4459.0,0.000348,0.016071,0.0,0.0,0.0,0.0,1.0
2,34b15f335,4459.0,0.001336,0.028498,0.0,0.0,0.0,0.0,1.0
3,a8cb14b00,4459.0,0.000306,0.01594,0.0,0.0,0.0,0.0,1.0
4,2f0771a37,4459.0,0.000264,0.015147,0.0,0.0,0.0,0.0,1.0


In [16]:
feature_df = feature_df.reset_index().rename(columns = {'index' : 'feature'})
feature_df.head()

Unnamed: 0,feature,count,mean,std,min,25%,50%,75%,max
0,48df886f9,4459.0,0.000733,0.019466,0.0,0.0,0.0,0.0,1.0
1,0deb4b6a8,4459.0,0.000348,0.016071,0.0,0.0,0.0,0.0,1.0
2,34b15f335,4459.0,0.001336,0.028498,0.0,0.0,0.0,0.0,1.0
3,a8cb14b00,4459.0,0.000306,0.01594,0.0,0.0,0.0,0.0,1.0
4,2f0771a37,4459.0,0.000264,0.015147,0.0,0.0,0.0,0.0,1.0


In [17]:
feature_df['distinct_vals'] = feature_df['feature'].apply(lambda x : len(train[x].value_counts()))
feature_df['squared_deviation'] = feature_df['feature'].apply(lambda x : np.var(train[x]))
feature_df['target_corr'] = feature_df['feature'].apply(lambda x : np.corrcoef(target, train[x])[0][1])
feature_df.head()

Unnamed: 0,feature,count,mean,std,min,25%,50%,75%,max,distinct_vals,squared_deviation,target_corr
0,48df886f9,4459.0,0.000733,0.019466,0.0,0.0,0.0,0.0,1.0,32,0.000379,0.010188
1,0deb4b6a8,4459.0,0.000348,0.016071,0.0,0.0,0.0,0.0,1.0,5,0.000258,0.013805
2,34b15f335,4459.0,0.001336,0.028498,0.0,0.0,0.0,0.0,1.0,29,0.000812,0.014694
3,a8cb14b00,4459.0,0.000306,0.01594,0.0,0.0,0.0,0.0,1.0,3,0.000254,-0.002917
4,2f0771a37,4459.0,0.000264,0.015147,0.0,0.0,0.0,0.0,1.0,6,0.000229,0.016647


Plot critical extimations

In [18]:
distinct_vals_hist = go.Histogram(x=feature_df['distinct_vals'], opacity=0.45, marker=dict(color="red"))
distinct_vals_hist_layout = dict(height=400, title='Distribution of Distinct Values for Each Feature', legend=dict(orientation="h"));
fig_distinct_vals_hist = go.Figure(data=[distinct_vals_hist], layout=distinct_vals_hist_layout);
iplot(fig_distinct_vals_hist)

In [19]:
feature_std_hist = go.Histogram(x=feature_df['std'], opacity=0.45, marker=dict(color="red"))
feature_std_hist_layout = dict(height=400, title='Distribution of the STDs of Features', legend=dict(orientation="h"));
fig_feature_std_hist = go.Figure(data=[feature_std_hist], layout=feature_std_hist_layout);
iplot(fig_feature_std_hist)

So there are 256 columns in the dataset having zero variance ie. they have constant values.

In [20]:
target_corr_hist = go.Histogram(x=feature_df['target_corr'], opacity=0.45, marker=dict(color="red"))
target_corr_hist_layout = dict(height=400, title='Distribution of the target_corr', legend=dict(orientation="h"));
fig_target_corr_hist = go.Figure(data=[target_corr_hist], layout=target_corr_hist_layout);
iplot(fig_target_corr_hist)

Note that most of the data have some degree of correlation with the target. Though minor data do not have correlation with the target, we CANNOT say that these data have no relation with the target. This is because:

In [21]:
np.corrcoef(np.cos(np.linspace(0,2*np.pi,100)), np.sin(np.linspace(0,2*np.pi,100)))[0][1]

7.601674702090559e-18

Nevertheless, we can get rid of those 250 features with 0 std values.

In [22]:
train[feature_df['feature'][feature_df['std']!=0]].shape

(4459, 4735)

In [23]:
train.drop(feature_df['feature'][feature_df['std']==0], axis=1).shape

(4459, 4735)

In [24]:
train = train.drop(feature_df['feature'][feature_df['std']==0], axis=1)

In [25]:
train.shape

(4459, 4735)

Observe correlations

In [28]:
train_coef_matrix = np.corrcoef(train.values.T)
print(train_coef_matrix.shape)

(4735, 4735)


In [61]:
cv2.imwrite('The normalized correlation matrix of all features.png',train_coef_matrix*255)
cv2.imshow('The normalized correlation matrix of all features',train_coef_matrix)
cv2.waitKey(-1)
cv2.destroyAllWindows()

It seems that most features are not strongly correlate with others

### 3. Estimate directly

### 3.1. Estimate with lightboost

In [28]:
import lightgbm as lgb

In [29]:
# lgbtrain = lgb.Dataset(train.values, label=target.values/1e7)
lgbtrain = lgb.Dataset(train.values, label=np.log1p(target.values))
# lgbtrain = lgb.Dataset(train.values, label=target.values)

In [30]:
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    "learning_rate": 0.01,
    "num_leaves": 180,
    "feature_fraction": 0.50,
    "bagging_fraction": 0.50,
    'bagging_freq': 4,
    "max_depth": -1,
    "reg_alpha": 0.3,
    "reg_lambda": 0.1,
    "min_child_weight":10,
    'zero_as_missing':True 
    }

lgb_cv = lgb.cv(
    params = lgbm_params,
    train_set = lgbtrain,
    num_boost_round=2000,
    stratified=False,
    nfold = 5,
    verbose_eval=50,
    seed = 23,
    early_stopping_rounds=20)

[50]	cv_agg's rmse: 1.58742 + 0.0305161
[100]	cv_agg's rmse: 1.50458 + 0.0316924
[150]	cv_agg's rmse: 1.46337 + 0.0325538
[200]	cv_agg's rmse: 1.44265 + 0.0327991
[250]	cv_agg's rmse: 1.43354 + 0.032863
[300]	cv_agg's rmse: 1.43088 + 0.0337591
[350]	cv_agg's rmse: 1.42942 + 0.0337614


The parameters seems not propriate

### 3.2. Estimate with SVR

In [31]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [32]:
X_train, X_validate, y_train, y_validate = train_test_split(train.values, np.log1p(target.values), test_size=0.2, random_state=42)

In [34]:
svr_lin = SVR(kernel='linear', C=1.0)
svr_lin.fit(X_train, y_train)
svr_lin_predicted_train_values = svr_lin.predict(X_train)
print(mean_squared_error(svr_lin_predicted_train_values, y_train))
svr_lin_predicted_validate_values = svr_lin.predict(X_validate)
print(mean_squared_error(svr_lin_predicted_validate_values, y_validate))

1.507725056802162
2.9714373887689662


Overfitting badly

In [35]:
svr_poly = SVR(kernel='poly', C=1.0)
svr_poly.fit(X_train, y_train)
svr_poly_predicted_train_values = svr_poly.predict(X_train)
print(mean_squared_error(svr_poly_predicted_train_values, y_train))
svr_poly_predicted_validate_values = svr_poly.predict(X_validate)
print(mean_squared_error(svr_poly_predicted_validate_values, y_validate))

3.131977187531916
2.8979772086075193


In [36]:
svr_rbf = SVR(kernel='rbf', C=1.0)
svr_rbf.fit(X_train, y_train)
svr_rbf_predicted_train_values = svr_rbf.predict(X_train)
print(mean_squared_error(svr_rbf_predicted_train_values, y_train))
svr_rbf_predicted_validate_values = svr_rbf.predict(X_validate)
print(mean_squared_error(svr_rbf_predicted_validate_values, y_validate))

3.0678371137786593
2.8502377931274188


### 4. PCA - Principal Component Analysis    

Principal Component Analysis is the technique for finding most informative vectors of a high-dimensional datasets. In other words, PCA extracts the important variables in form of components from a datasets containing large number of features. The important features are extracted with the goal to capture maximum possible information from the dataset.  

The first principal component is a linear combination of dataset features having maximum variance. It determines the direction of highest variability in the data. If the components are uncorrelated, their directions should be orthogonal. This suggests the correlation b/w the components in zero. All succeeding principal component follows the similar concept i.e. they capture the remaining variation without being correlated with the previous component. 


### 4.1 Finding Right Number of Components

We can use PCA with N number of components and obtain the right number which matches a threshold value of explained variance. 

In [37]:
def _get_number_components(model, threshold):
    component_variance = model.explained_variance_ratio_
    explained_variance = 0.0
    components = 0

    for var in component_variance:
        explained_variance += var
        components += 1
        if(explained_variance >= threshold):
            break
    return components

### Get the optimal number of components
pca = PCA()
train_pca = pca.fit_transform(standardized_train)
components = _get_number_components(pca, threshold=0.85)
components

780

In [38]:
train_pca.shape

(4459, 4459)

So, for a threshold value = 0.85, we can choose 993 components. These components will explain about 85% of the variance of the dataset

In [39]:
def plot_3_components(x_trans, title):
    trace = go.Scatter3d(x=x_trans[:,0], y=x_trans[:,1], z = x_trans[:,2],
                          name = target, mode = 'markers', text = target, showlegend = False,
                          marker = dict(size = 8, color=x_trans[:,1], colorscale ='Rainbow', 
                          line = dict(width = 1, color = '#fefefe'), opacity = 0.7))
    layout = go.Layout(title = title, showlegend= True)
    fig = dict(data=[trace], layout=layout)
    iplot(fig)

def plot_2_components(x_trans, title):
    trace = go.Scatter(x=x_trans[:,0], y=x_trans[:,1], name=target, mode='markers',
        text = target, showlegend = False,
        marker = dict(size = 8, color=x_trans[:,1], line = dict(width = 1, color = '#fefefe'), opacity = 0.7))
    layout = go.Layout(title = title, hovermode= 'closest',
        xaxis= dict(title= 'First Component',
            ticklen = 5, zeroline= False, gridwidth= 2),
        yaxis=dict(title= 'Second Component',
            ticklen = 5, gridwidth = 2), showlegend= True)
    fig = dict(data=[trace], layout=layout)
    iplot(fig)

### 4.2 Implementing PCA

Lets implement the PCA and visualize the first three and two components. 

In [40]:
### Implement PCA 
obj = model = PCA(n_components = 4500)
X_pca = obj.fit_transform(train.values)
print(X_pca.shape)

(4459, 4459)


In [41]:
## Visualize the Components 
plot_3_components(X_pca, 'PCA - First Three Components (Zoom In to view)')

In [42]:
lgbtrain_pca = lgb.Dataset(X_pca, label=np.log1p(target.values))

In [None]:
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    "learning_rate": 0.01,
    "num_leaves": 180,
    "feature_fraction": 0.50,
    "bagging_fraction": 0.50,
    'bagging_freq': 4,
    "max_depth": -1,
    "reg_alpha": 0.3,
    "reg_lambda": 0.1,
    "min_child_weight":10,
    'zero_as_missing':True 
    }

lgb_cv = lgb.cv(
    params = lgbm_params,
    train_set = lgbtrain_pca,
    num_boost_round=2000,
    stratified=False,
    nfold = 5,
    verbose_eval=50,
    seed = 23,
    early_stopping_rounds=20)

This indicate that important infomation is lost

### 4.3 PCA Variants 

Sklearn provides different Variants of PCA which can be helpful as well. 

**4.3.1 Kernel PCA:** 

KernelPCA is an extension of PCA which achieves non-linear dimensionality reduction through the use of kernels. It has many applications including denoising, compression and structured prediction (kernel dependency estimation). 

In [None]:
from sklearn.decomposition import PCA, KernelPCA

In [None]:
kpca = KernelPCA(kernel="rbf",n_components = components)
X_kpca = kpca.fit_transform(train.values)

In [None]:
## Visualize the Components 
plot_3_components(X_kpca, 'kernel PCA - First Three Components (Zoom In to view)')

In [None]:
lgbtrain_kpca = lgb.Dataset(X_kpca, label=np.log1p(target.values))

In [None]:
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    "learning_rate": 0.01,
    "num_leaves": 180,
    "feature_fraction": 0.50,
    "bagging_fraction": 0.50,
    'bagging_freq': 4,
    "max_depth": -1,
    "reg_alpha": 0.3,
    "reg_lambda": 0.1,
    "min_child_weight":10,
    'zero_as_missing':True 
    }

lgb_cv = lgb.cv(
    params = lgbm_params,
    train_set = lgbtrain_kpca,
    num_boost_round=2000,
    stratified=False,
    nfold = 5,
    verbose_eval=50,
    seed = 23,
    early_stopping_rounds=20)

Above experiments indicates that limited component PCA technique could cause infomation loss

## 5. Try deep learning

In [26]:
import tensorflow as tf


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.



In [27]:
class DNNModel():

    def __init__(self):
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        with self.graph.as_default():
            self.model_scope = tf.variable_scope('model', reuse=tf.AUTO_REUSE)
            self.input_scope = tf.variable_scope('input', reuse=tf.AUTO_REUSE)
            with self.input_scope:
                self.input_feature = tf.placeholder(tf.float32,shape=[None,train.shape[1]],name='input_feature')
                self.input_truth = tf.placeholder(tf.float32,shape=[None,1],name='ground_truth')
            with self.model_scope:
                # Define the model
                self.hidden_1 = tf.layers.dense(self.input_feature,
                                                units=1000,
                                                activation= tf.nn.sigmoid, # tf.nn.leaky_relu,
                                                use_bias=True,
                                                kernel_initializer=None,
                                                bias_initializer=tf.zeros_initializer(),
                                                kernel_regularizer=None,
                                                bias_regularizer=None,
                                                activity_regularizer=None,
                                                kernel_constraint=None,
                                                bias_constraint=None,
                                                trainable=True,
                                                name=None,
                                                reuse=None)
                self.hidden_2a = tf.layers.dense(self.hidden_1,
                                                units=1000,
                                                activation=tf.nn.sigmoid,
                                                use_bias=True,
                                                kernel_initializer=None,
                                                bias_initializer=tf.zeros_initializer(),
                                                kernel_regularizer=None,
                                                bias_regularizer=None,
                                                activity_regularizer=None,
                                                kernel_constraint=None,
                                                bias_constraint=None,
                                                trainable=True,
                                                name=None,
                                                reuse=None)
                self.hidden_2b = tf.layers.dense(self.hidden_2a,
                                                units=1000,
                                                activation=tf.nn.sigmoid,
                                                use_bias=True,
                                                kernel_initializer=None,
                                                bias_initializer=tf.zeros_initializer(),
                                                kernel_regularizer=None,
                                                bias_regularizer=None,
                                                activity_regularizer=None,
                                                kernel_constraint=None,
                                                bias_constraint=None,
                                                trainable=True,
                                                name=None,
                                                reuse=None)
                self.hidden_2c = tf.layers.dense(self.hidden_2b+self.hidden_1,
                                                units=1000,
                                                activation=tf.nn.sigmoid,
                                                use_bias=True,
                                                kernel_initializer=None,
                                                bias_initializer=tf.zeros_initializer(),
                                                kernel_regularizer=None,
                                                bias_regularizer=None,
                                                activity_regularizer=None,
                                                kernel_constraint=None,
                                                bias_constraint=None,
                                                trainable=True,
                                                name=None,
                                                reuse=None)
                self.hidden_2d = tf.layers.dense(self.hidden_2c+self.hidden_2a,
                                                units=1000,
                                                activation=tf.nn.sigmoid,
                                                use_bias=True,
                                                kernel_initializer=None,
                                                bias_initializer=tf.zeros_initializer(),
                                                kernel_regularizer=None,
                                                bias_regularizer=None,
                                                activity_regularizer=None,
                                                kernel_constraint=None,
                                                bias_constraint=None,
                                                trainable=True,
                                                name=None,
                                                reuse=None)
                self.hidden_2e = tf.layers.dense(self.hidden_2d+self.hidden_2b,
                                                units=1000,
                                                activation=tf.nn.sigmoid,
                                                use_bias=True,
                                                kernel_initializer=None,
                                                bias_initializer=tf.zeros_initializer(),
                                                kernel_regularizer=None,
                                                bias_regularizer=None,
                                                activity_regularizer=None,
                                                kernel_constraint=None,
                                                bias_constraint=None,
                                                trainable=True,
                                                name=None,
                                                reuse=None)
                self.hidden_2f = tf.layers.dense(self.hidden_2e+self.hidden_2c,
                                                units=1000,
                                                activation=tf.nn.sigmoid,
                                                use_bias=True,
                                                kernel_initializer=None,
                                                bias_initializer=tf.zeros_initializer(),
                                                kernel_regularizer=None,
                                                bias_regularizer=None,
                                                activity_regularizer=None,
                                                kernel_constraint=None,
                                                bias_constraint=None,
                                                trainable=True,
                                                name=None,
                                                reuse=None)
                self.hidden_2g = tf.layers.dense(self.hidden_2f+self.hidden_2d,
                                                units=1000,
                                                activation=tf.nn.sigmoid,
                                                use_bias=True,
                                                kernel_initializer=None,
                                                bias_initializer=tf.zeros_initializer(),
                                                kernel_regularizer=None,
                                                bias_regularizer=None,
                                                activity_regularizer=None,
                                                kernel_constraint=None,
                                                bias_constraint=None,
                                                trainable=True,
                                                name=None,
                                                reuse=None)
                self.hidden_2h = tf.layers.dense(self.hidden_2g+self.hidden_2e,
                                                units=1000,
                                                activation=tf.nn.sigmoid,
                                                use_bias=True,
                                                kernel_initializer=None,
                                                bias_initializer=tf.zeros_initializer(),
                                                kernel_regularizer=None,
                                                bias_regularizer=None,
                                                activity_regularizer=None,
                                                kernel_constraint=None,
                                                bias_constraint=None,
                                                trainable=True,
                                                name=None,
                                                reuse=None)
                self.hidden_2i = tf.layers.dense(self.hidden_2h+self.hidden_2f,
                                                units=1000,
                                                activation=tf.nn.sigmoid,
                                                use_bias=True,
                                                kernel_initializer=None,
                                                bias_initializer=tf.zeros_initializer(),
                                                kernel_regularizer=None,
                                                bias_regularizer=None,
                                                activity_regularizer=None,
                                                kernel_constraint=None,
                                                bias_constraint=None,
                                                trainable=True,
                                                name=None,
                                                reuse=None)
                self.hidden_3 = tf.layers.dense(self.hidden_2i+self.hidden_2g,
                                                units=1,
                                                activation=None,
                                                use_bias=True,
                                                kernel_initializer=None,
                                                bias_initializer=tf.zeros_initializer(),
                                                kernel_regularizer=None,
                                                bias_regularizer=None,
                                                activity_regularizer=None,
                                                kernel_constraint=None,
                                                bias_constraint=None,
                                                trainable=True,
                                                name=None,
                                                reuse=None)
            
                self.loss = tf.losses.mean_squared_error(labels=self.input_truth,
                                                        predictions=self.hidden_3,
                                                        weights=1.0,
                                                        scope=None,
                                                        loss_collection=tf.GraphKeys.LOSSES)
                self.variable_summaries(self.loss)
                self.optimizer = tf.train.AdamOptimizer(0.0001).minimize(self.loss, 
                                                                        var_list=[var for var in tf.trainable_variables()]) # original 0.0001
                self.merged_summary = tf.summary.merge_all()
                
                self.saver = tf.train.Saver(max_to_keep=10000000)
                self.train_writer = None
            
    
    def variable_summaries(self,var):
        """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
        with tf.name_scope(var.name.split(':')[0]+'_'+var.name.split(':')[1]):
            if len(var.shape)==0:
                tf.summary.scalar('value', var)
            if len(var.shape)>0:
                mean = tf.reduce_mean(var)
                tf.summary.scalar('mean', mean)
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
                tf.summary.scalar('stddev', stddev)
                tf.summary.scalar('max', tf.reduce_max(var))
                tf.summary.scalar('min', tf.reduce_min(var))
                tf.summary.histogram('histogram', var)
            if len(var.shape)==4 and (var.shape[-1]==1 or var.shape[-1]==3):
                tf.summary.image('image',var)
    
    # Initialization
    def initialize_sess(self,log_path):
        with self.graph.as_default():
            self.sess.run(tf.global_variables_initializer())
            if self.train_writer is not None:
                self.train_writer.close()
            self.train_writer = tf.summary.FileWriter(log_path, self.sess.graph)
    
    # Model training
    def train(self,feature,truth,step=None):
        summary, _, current_loss = self.sess.run([self.merged_summary,self.optimizer,self.loss],
                                                                  feed_dict={self.input_feature:feature, 
                                                                             self.input_truth: truth})
        self.train_writer.add_summary(summary, global_step = step)
        
        return current_loss
        
    # Model estimating
    def estimate(self,feature):
        result = self.sess.run([self.hidden_3],
                                         feed_dict={self.input_feature:feature})
        
        return result
    
    def save_model(self,model_path):
        with self.graph.as_default():
            self.saver.save(self.sess,model_path + '\\model')
    
    def load_model(self,model_path):
        with self.graph.as_default():
            self.saver.restore(self.sess, tf.train.latest_checkpoint(model_path))
    
    def __del__(self):
        self.sess.close()

In [28]:
test_model = DNNModel()

In [29]:
with test_model.graph.as_default():
    for var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES ):
        print(var.name,var.shape)

model/dense/kernel:0 (4735, 1000)
model/dense/bias:0 (1000,)
model/dense_1/kernel:0 (1000, 1000)
model/dense_1/bias:0 (1000,)
model/dense_2/kernel:0 (1000, 1000)
model/dense_2/bias:0 (1000,)
model/dense_3/kernel:0 (1000, 1000)
model/dense_3/bias:0 (1000,)
model/dense_4/kernel:0 (1000, 1000)
model/dense_4/bias:0 (1000,)
model/dense_5/kernel:0 (1000, 1000)
model/dense_5/bias:0 (1000,)
model/dense_6/kernel:0 (1000, 1000)
model/dense_6/bias:0 (1000,)
model/dense_7/kernel:0 (1000, 1000)
model/dense_7/bias:0 (1000,)
model/dense_8/kernel:0 (1000, 1000)
model/dense_8/bias:0 (1000,)
model/dense_9/kernel:0 (1000, 1000)
model/dense_9/bias:0 (1000,)
model/dense_10/kernel:0 (1000, 1)
model/dense_10/bias:0 (1,)
model/beta1_power:0 ()
model/beta2_power:0 ()
model/model/dense/kernel/Adam:0 (4735, 1000)
model/model/dense/kernel/Adam_1:0 (4735, 1000)
model/model/dense/bias/Adam:0 (1000,)
model/model/dense/bias/Adam_1:0 (1000,)
model/model/dense_1/kernel/Adam:0 (1000, 1000)
model/model/dense_1/kernel/Ada

In [30]:
log_path = 'E:\\Python Workspace\\Santander Value Prediction Challenge\\Logs'
test_model.load_model(model_path=log_path)

INFO:tensorflow:Restoring parameters from E:\Python Workspace\Santander Value Prediction Challenge\Logs\model


In [30]:
# Train the model
from sklearn.utils import shuffle
num_epochs = 500
batch_size = 100
num_batchs = train.shape[0]//batch_size -1

temp_count = 0

In [31]:
temp_count = 0
log_path = 'E:\\Python Workspace\\Santander Value Prediction Challenge\\Logs'
test_model.initialize_sess(log_path=log_path)
for epoch in range(num_epochs):
    all_feature_shuffled, all_truth_shuffled = shuffle(train.values-0.5, np.log1p(target.values), random_state=epoch)
    for batch_num in range(num_batchs):
        feature_batch = all_feature_shuffled[batch_num*batch_size:(1+batch_num)*batch_size]
        truth_batch = np.expand_dims(all_truth_shuffled[batch_num*batch_size:(1+batch_num)*batch_size],-1)
        # Learning
        current_loss = test_model.train(feature_batch,truth_batch,step=temp_count)

        temp_count += 1
        if temp_count%50==1:
            print('temp_count: ',temp_count,'current_loss:',np.sqrt(current_loss))
            
test_model.save_model(log_path)

temp_count:  1 current_loss: 15.653296
temp_count:  51 current_loss: 1.9674063
temp_count:  101 current_loss: 1.749079
temp_count:  151 current_loss: 1.6594568
temp_count:  201 current_loss: 1.818427
temp_count:  251 current_loss: 1.6941551
temp_count:  301 current_loss: 1.8737876
temp_count:  351 current_loss: 1.6838222
temp_count:  401 current_loss: 1.8502116
temp_count:  451 current_loss: 1.6037098
temp_count:  501 current_loss: 1.8355166
temp_count:  551 current_loss: 1.8628697
temp_count:  601 current_loss: 1.8304194
temp_count:  651 current_loss: 1.5769958
temp_count:  701 current_loss: 1.9161303
temp_count:  751 current_loss: 1.7594094
temp_count:  801 current_loss: 1.770742
temp_count:  851 current_loss: 1.9337054
temp_count:  901 current_loss: 1.8117758
temp_count:  951 current_loss: 1.6067218
temp_count:  1001 current_loss: 1.8549409
temp_count:  1051 current_loss: 1.6944523
temp_count:  1101 current_loss: 1.7709773
temp_count:  1151 current_loss: 1.6338995
temp_count:  1201 

temp_count:  9851 current_loss: 1.7063934
temp_count:  9901 current_loss: 1.6658914
temp_count:  9951 current_loss: 1.6103961
temp_count:  10001 current_loss: 1.5798397
temp_count:  10051 current_loss: 1.5449967
temp_count:  10101 current_loss: 1.5189183
temp_count:  10151 current_loss: 1.6699767
temp_count:  10201 current_loss: 1.4834794
temp_count:  10251 current_loss: 1.616264
temp_count:  10301 current_loss: 1.6129668
temp_count:  10351 current_loss: 1.7194326
temp_count:  10401 current_loss: 1.6119529
temp_count:  10451 current_loss: 1.6683772
temp_count:  10501 current_loss: 1.6711011
temp_count:  10551 current_loss: 1.5672597
temp_count:  10601 current_loss: 1.7048507
temp_count:  10651 current_loss: 1.7343006
temp_count:  10701 current_loss: 1.7486913
temp_count:  10751 current_loss: 1.5812665
temp_count:  10801 current_loss: 1.5374457
temp_count:  10851 current_loss: 1.6423122
temp_count:  10901 current_loss: 1.4982674
temp_count:  10951 current_loss: 1.8869298
temp_count:  11

temp_count:  19401 current_loss: 1.570726
temp_count:  19451 current_loss: 1.4898676
temp_count:  19501 current_loss: 1.6063435
temp_count:  19551 current_loss: 1.5427822
temp_count:  19601 current_loss: 1.5502476
temp_count:  19651 current_loss: 1.4766989
temp_count:  19701 current_loss: 1.6643118
temp_count:  19751 current_loss: 1.6039445
temp_count:  19801 current_loss: 1.5881728
temp_count:  19851 current_loss: 1.6995994
temp_count:  19901 current_loss: 1.542207
temp_count:  19951 current_loss: 1.448581
temp_count:  20001 current_loss: 1.5494655
temp_count:  20051 current_loss: 1.4276297
temp_count:  20101 current_loss: 1.6089612
temp_count:  20151 current_loss: 1.6176412
temp_count:  20201 current_loss: 1.7834475
temp_count:  20251 current_loss: 1.3897353
temp_count:  20301 current_loss: 1.4853282
temp_count:  20351 current_loss: 1.5598828
temp_count:  20401 current_loss: 1.3674115
temp_count:  20451 current_loss: 1.6968856
temp_count:  20501 current_loss: 1.5513549
temp_count:  2

In [32]:
feature_part_A = train.values

In [33]:
with test_model.graph.as_default():
    feature_part_B = test_model.sess.run(test_model.hidden_1, feed_dict={test_model.input_feature: feature_part_A})
print(feature_part_B.shape)

(4459, 1000)


In [34]:
with test_model.graph.as_default():
    feature_part_C = test_model.sess.run(test_model.hidden_2a, feed_dict={test_model.input_feature: feature_part_A})
print(feature_part_C.shape)

(4459, 1000)


In [35]:
with test_model.graph.as_default():
    feature_part_D = test_model.sess.run(test_model.hidden_2b, feed_dict={test_model.input_feature: feature_part_A})
print(feature_part_D.shape)

(4459, 1000)


In [36]:
with test_model.graph.as_default():
    feature_part_E = test_model.sess.run(test_model.hidden_2c, feed_dict={test_model.input_feature: feature_part_A})
print(feature_part_E.shape)

(4459, 1000)


In [37]:
with test_model.graph.as_default():
    feature_part_F = test_model.sess.run(test_model.hidden_2d, feed_dict={test_model.input_feature: feature_part_A})
print(feature_part_F.shape)

(4459, 1000)


In [38]:
with test_model.graph.as_default():
    feature_part_G = test_model.sess.run(test_model.hidden_2e, feed_dict={test_model.input_feature: feature_part_A})
print(feature_part_G.shape)

(4459, 1000)


In [39]:
with test_model.graph.as_default():
    feature_part_H = test_model.sess.run(test_model.hidden_2f, feed_dict={test_model.input_feature: feature_part_A})
print(feature_part_H.shape)

(4459, 1000)


In [40]:
with test_model.graph.as_default():
    feature_part_I = test_model.sess.run(test_model.hidden_2g, feed_dict={test_model.input_feature: feature_part_A})
print(feature_part_E.shape)

(4459, 1000)


In [41]:
with test_model.graph.as_default():
    feature_part_J = test_model.sess.run(test_model.hidden_2h, feed_dict={test_model.input_feature: feature_part_A})
print(feature_part_E.shape)

(4459, 1000)


In [42]:
with test_model.graph.as_default():
    feature_part_K = test_model.sess.run(test_model.hidden_2i, feed_dict={test_model.input_feature: feature_part_A})
print(feature_part_E.shape)

(4459, 1000)


In [43]:
feature_concat = np.concatenate([feature_part_B,
                                 feature_part_C,
                                 feature_part_D,
                                 feature_part_E,
                                 feature_part_F,
                                 feature_part_G,
                                 feature_part_H,
                                 feature_part_I,
                                 feature_part_J,
                                 feature_part_K],axis=-1)
print(feature_concat.shape)

(4459, 10000)


In [44]:
import lightgbm as lgb
lgbtrain_kdnn = lgb.Dataset(feature_concat, label=np.log1p(target.values))

In [45]:
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    "learning_rate": 0.01,
    "num_leaves": 180,
    "feature_fraction": 0.50,
    "bagging_fraction": 0.50,
    'bagging_freq': 4,
    "max_depth": -1,
    "reg_alpha": 0.3,
    "reg_lambda": 0.1,
    "min_child_weight":10,
    'zero_as_missing':True 
    }

lgb_cv = lgb.cv(
    params = lgbm_params,
    train_set = lgbtrain_kdnn,
    num_boost_round=2000,
    stratified=False,
    nfold = 5,
    verbose_eval=50,
    seed = 23,
    early_stopping_rounds=20)

[50]	cv_agg's rmse: 1.51216 + 0.0344623
[100]	cv_agg's rmse: 1.40655 + 0.0387267
[150]	cv_agg's rmse: 1.36181 + 0.040527
[200]	cv_agg's rmse: 1.34209 + 0.0407245
[250]	cv_agg's rmse: 1.33254 + 0.0412806
[300]	cv_agg's rmse: 1.32888 + 0.0405519
[350]	cv_agg's rmse: 1.32683 + 0.040182
[400]	cv_agg's rmse: 1.3259 + 0.0401887
[450]	cv_agg's rmse: 1.32483 + 0.0399169


In [None]:
lgb_reg = lgb.train(lgbm_params, lgbtrain_kdnn, num_boost_round = 20000, verbose_eval=50, seed = 23, early_stopping_rounds=100)

In [None]:
pred_test = lgb_reg.predict(X_test)
sub_df = pd.DataFrame({"ID":test_df["ID"].values})
sub_df["target"] = np.expm1(pred_test)
sub_df.to_csv("DNN_lgb.csv", index=False)