In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt
%matplotlib inline
pd.set_option('display.width', 1000)
import seaborn as sns
from seaborn import heatmap
color = sns.color_palette()
sns.set(style="whitegrid")
from sklearn.model_selection import train_test_split
import sklearn.metrics as skm
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import sklearn

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input mount.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition tables from the read-only Kaggle input directory.
df_train = pd.read_csv(
    '/kaggle/input/santander-value-prediction-challenge/train.csv'
)
df_test = pd.read_csv(
    '/kaggle/input/santander-value-prediction-challenge/test.csv'
)

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# Preview the first five test rows.
df_test.head(n=5)

In [None]:
# Report (rows, columns) for the train and test tables.
for caption, frame in (('Shape of train_df ', df_train),
                       ('Shape of test_df ', df_test)):
    print(caption, frame.shape)

In [None]:
# Missing-value count for every column, train table first, then test table.
train_nulls_by_col = df_train.isnull().sum()
print('num of null values in train set:\n', train_nulls_by_col)

# Visual gap between the two (long) listings.
print('\n\n\n')

test_nulls_by_col = df_test.isnull().sum()
print('num of null values in test set:\n', test_nulls_by_col)

In [None]:
# Grand total of missing cells in each table (column counts summed again).
train_null_total = df_train.isnull().sum().sum()
test_null_total = df_test.isnull().sum().sum()

print('num of null values in train set:', train_null_total)
print('num of null values in test set:', test_null_total)

In [None]:
# Print pandas' concise summary of the training frame.
df_train.info()

In [None]:
# Print pandas' concise summary of the test frame.
df_test.info()

In [None]:
# One row per training column: its name (labelled "Count", so that the grouped
# count below lands under that header) and its dtype ("Column Type").
df_dtype = (
    df_train.dtypes
    .rename("Column Type")
    .rename_axis("Count")
    .reset_index()
)

# Number of columns per dtype.
df_dtype.groupby("Column Type").agg('count').reset_index()

In [None]:
# Scatter the target values in ascending order against their rank.
sorted_target = np.sort(df_train['target'].values)

plt.figure(figsize=(8,6))
plt.scatter(range(len(sorted_target)), sorted_target)
plt.title('Distribution of Target', fontsize=14)
plt.xlabel('index', fontsize=12)
plt.ylabel('Target', fontsize=12)
plt.show()

In [None]:
# Histogram of the raw target: 50 bins, no density curve.
target_values = df_train['target'].values

plt.figure(figsize=(12, 8))
sns.histplot(target_values, bins=50, kde=False)
plt.title('Target Histogram', fontsize=16)
plt.xlabel('Target', fontsize=14)
plt.show()

In [None]:
# Same histogram after a log(1 + x) transform of the target.
log_target_values = np.log1p(df_train['target'].values)

plt.figure(figsize=(12, 8))
sns.histplot(log_target_values, bins=50, kde=False)
plt.title('Log of Target Histogram', fontsize=16)
plt.xlabel('Target', fontsize=14)
plt.show()

In [None]:
from scipy.stats import spearmanr
# Spearman rank correlation between every feature column and the target.
feature_cols = [c for c in df_train.columns if c not in ["ID", "target"]]
target_values = df_train["target"].values

labels = list(feature_cols)
values = [spearmanr(df_train[c].values, target_values)[0] for c in feature_cols]

corr_df = (
    pd.DataFrame({'col_labels': labels, 'corr_values': values})
    .sort_values(by='corr_values')
)

# Keep only features with |rho| > 0.1 (undefined correlations drop out here too).
corr_df = corr_df[corr_df['corr_values'].abs() > 0.1]

# Horizontal bar chart, one bar per retained feature.
ind = np.arange(corr_df.shape[0])
width = 0.9
fig, ax = plt.subplots(figsize=(12,30))
rects = ax.barh(ind, np.array(corr_df.corr_values.values), color='b')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df.col_labels.values, rotation='horizontal')
ax.set_title("Correlation coefficient of the variables")
ax.set_xlabel("Correlation coefficient")
plt.show()

In [None]:
# Keep the regression target as its own Series and preview its first rows.
y_train = df_train['target']
y_train.head(5)

In [None]:
# Summary statistics of the raw target.
y_train.describe()

In [None]:
sns.set(style="white", palette="muted", color_codes=True)

# Set up the matplotlib figure
f, axes = plt.subplots(figsize=(13, 6), sharex=True)
sns.despine(left=True)

# Density-normalised histogram of the raw target with a KDE curve on top.
# `sns.distplot` is deprecated (since seaborn 0.11); `histplot`, which this
# notebook already uses for the other histograms, is its replacement.
sns.histplot(y_train, stat="density", kde=True, color="b", ax=axes)
# The bars are normalised to a density (as distplot did when drawing a KDE),
# so the axis is labelled accordingly.
plt.ylabel('Density')
plt.xlabel('Value')

#plt.setp(axes, yticks=[])
plt.tight_layout()

In [None]:
# Line plot of the target values in ascending order.
x = np.sort(y_train.values)

plt.figure(figsize=(17, 6))
plt.plot(x)
plt.title('')
plt.xlabel('Data points')
plt.ylabel('Value')
plt.legend(['Target'], loc='upper left')
plt.show()

In [None]:
# Summary statistics of the log(1 + x)-transformed target.
np.log1p(y_train).describe()

In [None]:
sns.set(style="white", palette="muted", color_codes=True)

# Set up the matplotlib figure
f, axes = plt.subplots(figsize=(13, 6), sharex=True)
sns.despine(left=True)

# Density-normalised histogram of log(1 + target) with a KDE curve on top.
# `sns.distplot` is deprecated (since seaborn 0.11); `histplot`, which this
# notebook already uses for the other histograms, is its replacement.
sns.histplot(np.log1p(y_train), stat="density", kde=True, color="b", ax=axes)
# The bars are normalised to a density (as distplot did when drawing a KDE),
# so the axis is labelled accordingly.
plt.ylabel('Density')
plt.xlabel('Value')

#plt.setp(axes, yticks=[])
plt.tight_layout()

In [None]:
# Line plot of the log(1 + x)-transformed target in ascending order.
x = np.sort(np.log1p(y_train).values)

plt.figure(figsize=(17, 6))
plt.plot(x)
plt.title('')
plt.xlabel('Data points')
plt.ylabel('Value')
plt.legend(['Target'], loc='upper left')
plt.show()

In [None]:
# From here on df_train holds feature columns only (target and ID removed).
df_train = df_train.drop(columns=["target", "ID"])

In [None]:
from sklearn.feature_selection import mutual_info_regression
# Mutual information between each log1p-transformed feature and the
# log1p-transformed target. Despite its name, `corr` holds mutual-information
# scores (one per feature column), not correlation coefficients.
# The estimator uses random numbers internally, so a fixed random_state is
# passed to make the scores reproducible from run to run.
corr = mutual_info_regression(np.log1p(df_train), np.log1p(y_train), random_state=42)

In [None]:
# Mutual-information scores in ascending order, one point per feature.
x = np.sort(corr)

plt.figure(figsize=(18, 6))
plt.plot(x, color = "b")
plt.title('')
plt.xlabel('Features')
plt.ylabel('Mutual information')
plt.legend(['Mutual information'], loc='upper left')
plt.show()

In [None]:
# Pairwise Pearson correlation between all log1p-transformed feature columns.
data_corr = np.log1p(df_train).corr(method='pearson')

In [None]:
# Heatmap of the feature-feature correlation matrix; the colour scale is
# pinned to [0, 1].
f, axes = plt.subplots(figsize=(18, 18), sharex=True)
heatmap(data_corr, vmin=0, vmax=1, cmap="BuPu", ax=axes)
plt.xlabel('Features')
plt.ylabel('Features')
plt.tight_layout()

In [None]:
# Collect the features whose correlation with at least one OTHER feature
# exceeds 0.99, then show how many there are.
#
# The self-correlation on the diagonal is masked out by position instead of by
# testing `!= 1`: that test also discarded exact duplicate columns (whose
# correlation is 1) and relied on the diagonal being exactly 1.0 in floating
# point. Working on the whole matrix at once also avoids boolean-indexing the
# full correlation frame once per column.
corr_matrix = data_corr.values
off_diagonal = ~np.eye(corr_matrix.shape[0], dtype=bool)

# NaN correlations compare as False and are therefore never flagged.
has_high_partner = ((corr_matrix > 0.99) & off_diagonal).any(axis=0)

colnew = data_corr.columns[has_high_partner].tolist()
len(colnew)

In [None]:
# List the names of the highly correlated features collected in `colnew`.
print(colnew)

In [None]:
# Re-read the training table (an earlier cell removed its target and ID
# columns) and discard only the ID column this time.
df_train = pd.read_csv(
    '/kaggle/input/santander-value-prediction-challenge/train.csv'
).drop(columns=['ID'])
df_train.head()

In [None]:
# Total number of missing cells in the reloaded training table.
df_train.isnull().sum().sum()

In [None]:
# How many rows repeat an earlier row (True) versus not (False).
df_train.duplicated(keep='first').value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# Feature matrix and target vector as plain NumPy arrays.
y = df_train['target'].values
X = df_train.drop(columns=['target']).values

# Hold out 20% of the rows for validation; the seed fixes the partition.
X_train_split, X_validation, y_train_split, y_validation = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Shapes of the four arrays produced by the split.
for caption, arr in (("X_train_split shape:", X_train_split),
                     ("X_validation shape:", X_validation),
                     ("y_train_split shape:", y_train_split),
                     ("y_validation shape:", y_validation)):
    print(caption, arr.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import sklearn

# Standardize the features with statistics learned from the training split
# only, so nothing about the validation rows leaks into preprocessing.
# NOTE: this rebinds X_train_split / X_validation to their standardized
# versions; later cells that use these names receive scaled features.
scaler = StandardScaler()
X_train_split = scaler.fit_transform(X_train_split)
X_validation = scaler.transform(X_validation)

mlp = MLPRegressor(activation='tanh', solver='sgd', hidden_layer_sizes=(5, 5, 5, 5),
                   max_iter=500, alpha=0.05, learning_rate='adaptive')

# The pipeline gets its own StandardScaler rather than sharing `scaler`:
# fitting a Pipeline refits its steps in place, which would overwrite the
# training-split statistics stored in `scaler`.
pipeline = Pipeline([('transformer', StandardScaler()), ('estimator', mlp)])

# Fit on the training split only. Fitting on the full (X, y) trained the model
# on the very rows it is scored on below, and fitted the pipeline's scaler on
# raw features while scoring on already-standardized ones. Because the split
# is standardized at this point, the pipeline's own scaler changes it very
# little.
pip = pipeline.fit(X_train_split, y_train_split)
print('pipeline score', pipeline.score(X_validation, y_validation))


In [None]:
# 3-fold cross-validation of the pipeline, run on the validation rows only.
# Folds are contiguous blocks (no shuffling); scores come from the
# estimator's default scorer.
cv = KFold(n_splits=3, shuffle=False)
scorescrossval = cross_val_score(estimator=pipeline,
                                 X=X_validation,
                                 y=y_validation,
                                 cv=cv)
print('scores cross val', scorescrossval)


In [None]:
# cross_val_score used the regressor's default scorer, which is R^2, so the
# summary is labelled R^2 rather than "accuracy".
print('R^2: %0.2f (+/- %0.2f)' % (scorescrossval.mean(), scorescrossval.std() * 2))

# Refit the bare MLP on the (already standardized) training split and measure
# R^2 on the held-out validation split.
# NOTE: `mlp` is the same object that serves as the pipeline's 'estimator'
# step, so this refit also replaces the weights `pipeline` predicts with.
mlp.fit(X_train_split, y_train_split)
predictions = mlp.predict(X_validation)
RTWO = sklearn.metrics.r2_score(y_validation, predictions)
# The score used to be computed and then discarded; report it.
print('validation R^2: %0.4f' % RTWO)

In [None]:
# Validation predictions from the full pipeline (scaler + MLP).
# NOTE(review): X_validation has already been standardized by `scaler` in an
# earlier cell, and `pipeline` applies its own scaling step on top of that --
# confirm the pipeline was fitted on inputs standardized the same way.
prediction=pipeline.predict(X_validation)

In [None]:
from sklearn.neural_network import MLPRegressor
# A second MLP with the same hyper-parameters, trained on the training split.
mlpr = MLPRegressor(
    activation='tanh',
    solver='sgd',
    hidden_layer_sizes=(5,5,5,5),
    max_iter=500,
    alpha=0.05,
    learning_rate='adaptive',
)
mlpr.fit(X_train_split, y_train_split)

# Predictions for the same rows the model was just fitted on.
pred = mlpr.predict(X_train_split)

In [None]:
import sklearn.metrics as skm
import numpy as np
# Clamp predictions to the range of target values present in df_train.
target_min = df_train['target'].min()
target_max = df_train['target'].max()
nonneg_pred = np.clip(pred, target_min, target_max)

# `squared=False` was deprecated in scikit-learn 1.4 and is scheduled for
# removal in 1.6; taking the square root of the MSE works on every version.
# `pred` holds predictions for X_train_split, so this is the training error.
rmse = np.sqrt(skm.mean_squared_error(y_train_split, nonneg_pred))
print('MLPRegressor - RMSE: {:.5f}' .format(rmse))

# Also report the error on the held-out validation rows, which is the number
# that says something about generalisation.
val_pred = np.clip(mlpr.predict(X_validation), target_min, target_max)
val_rmse = np.sqrt(skm.mean_squared_error(y_validation, val_pred))
print('MLPRegressor - validation RMSE: {:.5f}'.format(val_rmse))

In [None]:
# Re-read the test table and split off the ID column: `ids` keeps the
# identifiers for the submission file, df_test keeps the feature columns.
df_test = pd.read_csv("/kaggle/input/santander-value-prediction-challenge/test.csv")
ids = df_test.pop('ID')

In [None]:
# Train the submission model on every labelled row.
# The MLP is wrapped in a Pipeline with a StandardScaler so that it sees
# standardized features, as the validated models above did; previously the
# final model was fitted on, and predicted from, raw unscaled features.
final_model = Pipeline([
    ('transformer', StandardScaler()),
    ('estimator', MLPRegressor(hidden_layer_sizes=(5, 5, 5, 5),
                               max_iter=500, alpha=0.05, solver='sgd',
                               learning_rate='adaptive', activation='tanh')),
])
final_model.fit(X, y)

# Predict from the bare array, matching how the model was fitted (on a NumPy
# array). Assumes df_test's columns are in the same order as the training
# feature columns -- TODO confirm.
pred = final_model.predict(df_test.values)

# Clamp predictions to the range of target values present in df_train.
nonneg_pred = np.clip(pred, df_train['target'].min(), df_train['target'].max())

# Write the submission file: one row per test ID.
sub = pd.DataFrame({'ID': ids, 'target': nonneg_pred})
sub.to_csv('submission.csv', index=False)