# Prerequisites

#!pip install kaggle

# Create the directory that will hold the Kaggle API token.
!mkdir $HOME/.kaggle
# Colab-only: open a browser file picker; kaggle.json must be among the uploads
# because the next shell command moves it out of the working directory.
from google.colab import files
uploaded = files.upload()

# Move the token into the config directory and make it readable by the owner only.
!mv kaggle.json $HOME/.kaggle/kaggle.json
!chmod 600 $HOME/.kaggle/kaggle.json
# Report the name and size of every uploaded file.
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
# Download the competition data with the Kaggle CLI.
!kaggle competitions download -c santander-value-prediction-challenge

# **The Challenge**


According to Epsilon research, 80% of customers are more likely to do business with you if you provide personalized service. Banking is no exception.

The digitalization of everyday lives means that customers expect services to be delivered in a personalized and timely manner… and often before they've even realized they need the service. In their 3rd Kaggle competition, Santander Group aims to go a step beyond recognizing that there is a need to provide a customer a financial service and intends to determine the amount or value of the customer's transaction. This means anticipating customer needs in a more concrete, but also simple and personal way. With so many choices for financial services, this need is greater now than ever before.

In this competition, Santander Group is asking Kagglers to help them identify the value of transactions for each potential customer. This is a first step that Santander needs to nail in order to personalize their services at scale.
The evaluation metric for this competition is Root Mean Squared Logarithmic Error.

The RMSLE is calculated as

$$\epsilon=\sqrt{\frac{1}{n}\sum_{i=1}^{n}\left[\log(p_i+1)-\log(a_i+1)\right]^2}$$
Where:

\\(\epsilon\\) is the RMSLE value (score)

\\(n\\) is the total number of observations in the (public/private) data set,

\\(p_i\\) is your prediction of target, and

\\(a_i\\) is the actual target for \\(i\\).

\\(\log(x)\\) is the natural logarithm of \\(x\\)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.sparse import csr_matrix

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.linear_model import *
from lightgbm import LGBMRegressor

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,f_regression,mutual_info_regression
from pymrmre import mrmr

import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:.2f}'.format

from IPython.core.interactiveshell import InteractiveShell
from IPython.display import *
InteractiveShell.ast_node_interactivity = "all"

In [None]:
train= pd.read_csv('../input/santander-value-prediction-challenge/train.csv') #/content/train.csv.zip') #


In [None]:
train.shape #,test.shape
train.info()
train.describe()

In [None]:
train.set_index('ID',inplace=True)

In [None]:
# One row per column of `train`: aggregate statistics first, then a few
# per-column diagnostics appended as extra fields.
train_stat = train.agg(
    ['nunique', 'sum', 'var', 'kurtosis', 'std', 'min', 'max', 'skew']
).transpose()
train_stat['Null'] = train.isna().sum()                      # NaN count per column
train_stat['Zeros'] = train.eq(0).sum()                      # exact-zero count per column
train_stat['Duplicated'] = train.transpose().duplicated()    # repeats an earlier column
# Share of rows that are zero in each column (zeros are counted as "missing" here).
train_stat['Missing'] = train_stat['Zeros'] / float(len(train))

In [None]:
# Headline numbers taken from the per-column statistics table.
print('Null Values:', train_stat['Null'].sum())
print('Zero Values:', train_stat['Zeros'].sum())
print('Constant Valued Columns:', (train_stat['nunique'] == 1).sum())
# A column is "zeroed" when its zero count equals the number of rows.
print('Zeroed Valued Columns:', (train_stat['Zeros'] == train.shape[0]).sum())
print('Duplicated Columns:', train_stat['Duplicated'].sum())

In [None]:
HTML('<p>The dataset has <b>'+str(np.sum(train.duplicated()))+' duplicate</b> Records.')

# Sparsity and other characteristics

In [None]:
# Sparsity pattern of the training frame: plt.spy marks the non-zero entries.
# Trailing semicolons suppress the echoed return values in the notebook.
plt.figure(figsize=(10,7));
plt.spy(train,aspect='auto');
plt.suptitle('Sparsity in train data',fontsize=20);

In [None]:
# Distribution of the number of distinct values per column,
# with the x-axis clipped to the 0-1000 range.
sns.distplot(train_stat['nunique']);
plt.xlim(0,1000);
plt.suptitle('Unique Values Density');

In [None]:
# Side-by-side view of the target: raw values on the left, log1p-transformed on the right.
fig, axs = plt.subplots(1,2,figsize=(20,6))
sns.distplot(train.target,ax=axs[0]);
sns.distplot(np.log1p(train.target),ax=axs[1]);
axs[0].title.set_text('Target Distribution');
axs[1].title.set_text('Log1p(Target) Distribution');

In [None]:
# Sorted target values against their rank: first the raw target, then log1p(target).
# NOTE(review): neither call passes `ax`, so both curves appear to land on the same
# axes despite very different scales — confirm this overlay is intended.
sns.lineplot(data=train ,x=range(train.shape[0]), y=np.sort(train.target));
sns.lineplot(data=train ,x=range(train.shape[0]), y=np.sort(np.log1p(train.target)));

In [None]:
# Per-column diagnostics from train_stat: zero counts (left) and distinct-value counts (right).
f,ax= plt.subplots(1,2,figsize=(20,5));
sns.distplot(train_stat.Zeros,ax=ax[0]);
sns.distplot(train_stat['nunique'],ax=ax[1]);
ax[0].set_title('Zero values in Columns',fontsize=15);
ax[1].set_title('Unique values in Columns',fontsize=15);

In [None]:
# Columns whose values repeat an earlier column (the first occurrence is kept).
dup_cols = train.columns[train.T.duplicated()]
# Columns holding a single distinct value.
const_cols=train.columns[train.nunique()==1]
# Union of both sets, de-duplicated; reused later to drop the same columns from the test set.
cols_to_remove = dup_cols.append(const_cols).unique()
# Bare expression: echoed in the notebook output for inspection.
cols_to_remove
# Drop them from the training frame in place.
train.drop(columns=cols_to_remove,inplace=True)

In [None]:
# Work on a copy so `train` keeps its 'target' column.
X = train.copy()
# Pop the target out of X and keep it, log1p-transformed, as a one-column frame.
Y= pd.DataFrame(np.log1p(X.pop('target')),index=train.index)
# Min-max scale every feature, preserving column names and the row index.
X = pd.DataFrame(MinMaxScaler().fit_transform(X),columns=X.columns,index=X.index)
# mRMR feature selection: one solution of 50 features.
solutions = mrmr.mrmr_ensemble(features=X,targets=Y,solution_length=50,solution_count=1)
# NOTE(review): presumably solutions[0][0] is the list of selected feature names
# for the single target/solution — confirm against the pymrmre return structure.
feat_mrmr=solutions[0][0]

In [None]:
# Two univariate top-50 selectors: F-test scores and mutual information.
X_freg = SelectKBest(score_func=f_regression, k=50).fit(X, Y)
X_mut = SelectKBest(score_func=mutual_info_regression, k=50).fit(X, Y)
# Turn each selector's boolean support mask into the selected column labels.
feat_freg = X.columns[X_freg.get_support()]
feat_mut = X.columns[X_mut.get_support()]

In [None]:
#feat_freg
#type(feat_mut)
#type(feat_mrmr)
# Final feature set: union of the mutual-information picks and the mRMR picks,
# de-duplicated. The F-test picks (feat_freg) are not included.
feat_final = feat_mut.append(pd.Index(feat_mrmr)).unique()

In [None]:
def stat_cols(df):
    """Append row-wise summary statistics to ``df`` and return it.

    All statistics are computed from the columns that ``df`` holds when the
    function is called. Previously each statistic was computed from the frame
    *after* earlier statistics had been appended, so e.g. ``"Values"``
    included the freshly added ``"sum"`` column and every later statistic
    included all the ones before it.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame. It is modified in place: ten columns are added.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with the columns ``sum``, ``Values``,
        ``Variance_NonZero``, ``Mean_NonZero``, ``Median_NonZero``,
        ``Std_Dev_NonZero``, ``Skew_NonZero``, ``Kurt_NonZero``,
        ``Max_NonZero`` and ``Min_NonZero`` appended in that order.
        ``*_NonZero`` statistics are NaN for rows without enough non-zero
        values (e.g. mean of an all-zero row).
    """
    # Snapshot both inputs before any column is appended, so an added
    # statistic can never leak into the ones computed after it.
    row_total = df.sum(axis=1)
    # Zeros become NaN, which pandas reductions skip by default.
    nonzero = df.where(df != 0)

    df["sum"] = row_total
    # NOTE(review): this is the sum of the non-zero entries, which equals the
    # plain row sum; if a count of non-zero entries was intended, confirm and
    # use nonzero.count(axis=1) instead.
    df["Values"] = nonzero.sum(axis=1)
    df["Variance_NonZero"] = nonzero.var(axis=1)
    df["Mean_NonZero"] = nonzero.mean(axis=1)
    df["Median_NonZero"] = nonzero.median(axis=1)
    df["Std_Dev_NonZero"] = nonzero.std(axis=1)
    df["Skew_NonZero"] = nonzero.skew(axis=1)
    df["Kurt_NonZero"] = nonzero.kurtosis(axis=1)
    df["Max_NonZero"] = nonzero.max(axis=1)
    df["Min_NonZero"] = nonzero.min(axis=1)

    return df

In [None]:
# Design matrix: the selected features, min-max scaled, stored as a CSR matrix.
# The fitted scaler is kept so the same scaling can be applied to the test set.
scaler = MinMaxScaler()
X = csr_matrix(scaler.fit_transform(train[feat_final]))
# Regression target on the log1p scale.
y = np.log1p(train['target'])

In [None]:
# Base LightGBM regressor handed to the hyper-parameter search.
# The target `y` is already log1p-transformed, so RMSE on it corresponds to the
# competition's RMSLE; 'rmse' is used because 'rmsle' is not a LightGBM metric name.
reg=LGBMRegressor(objective='regression_l2',n_jobs=-1,metric='rmse',verbosity=-1,boosting='gbdt')
# Search space for the Bayesian hyper-parameter search (skopt dimensions).
params = {
        'n_estimators': Integer(50, 100),
        'num_leaves':   Integer(5,100),
        'min_child_samples': Integer(1, 50),
        'feature_fraction': Real(0.1, 0.8),
        'max_depth' : Integer(1, 50),
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'reg_lambda': Real(1e-9, 1000, 'log-uniform'),
        # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova
        # objectives only; it likely has no effect on this regression — confirm and
        # consider dropping it from the search space.
        'scale_pos_weight': Real(1, 10),
        'max_bin': Integer(100, 1000),
        'min_child_weight': Real(1, 10),
        'subsample_for_bin': Integer(100, 3000)
        }


In [None]:
from sklearn.metrics import make_scorer
def rmsle(y_true, y_pred):
    """Return ``('RMSLE', value, False)`` for a pair of target vectors.

    ``value`` is the root of the mean squared difference ``y_pred - y_true``.
    No logarithm is applied here, so the result is an RMSLE only when both
    inputs are already on the log1p scale.

    NOTE(review): the (name, value, flag) tuple looks like the
    (eval_name, eval_result, is_higher_better) shape of a LightGBM custom
    metric — confirm against the intended consumer.
    """
    residual = y_pred - y_true
    mean_squared = np.mean(np.power(residual, 2))
    return 'RMSLE', np.sqrt(mean_squared), False
    
def rmsle_score(y_true, y_pred):
    """Return the root of the mean squared difference ``y_pred - y_true``.

    No logarithm is applied here, so the result is an RMSLE only when both
    inputs are already on the log1p scale. Returns a bare scalar, the form
    a scikit-learn scorer wrapper expects from a metric function.
    """
    residual = y_pred - y_true
    mean_squared = np.mean(np.power(residual, 2))
    return np.sqrt(mean_squared)

custom_rmsle = make_scorer(rmsle_score,greater_is_better=False)

In [None]:
# Bayesian hyper-parameter search over `params` for the LightGBM regressor `reg`:
# 15 sampled configurations, each scored with 3-fold cross-validation using the
# custom scorer; random_state fixes the search for reproducibility.
bayes_cv_tuner = BayesSearchCV(
                              estimator = reg,
                              search_spaces = params,
                              cv = 3,
                              scoring = custom_rmsle,
                              #n_jobs = 3,
                              n_iter = 15,
                              verbose = 1,
                              random_state = 42
                               )

In [None]:
# Run the search on the scaled sparse features and the log1p target.
result= bayes_cv_tuner.fit(X,y)
print(bayes_cv_tuner.best_params_)
# The scorer was built with greater_is_better=False, so scikit-learn reports the
# negated error: the best score printed here is expected to be negative.
print("Best Score = ",np.round(bayes_cv_tuner.best_score_, 4))
#pd.DataFrame(bayes_cv_tuner.cv_results_)
# Keep the best estimator found by the search; the bare name echoes it in the notebook.
bst_bayes = bayes_cv_tuner.best_estimator_
bst_bayes

In [None]:
# Refit a LightGBM regressor with hard-coded hyper-parameters on the full
# training matrix.
# NOTE(review): the values are presumably copied from an earlier search run —
# confirm they still match the current search result.
# 'rmse' replaces 'rmsle', which is not a LightGBM metric name; `y` is already
# log1p-transformed, so RMSE on it corresponds to the competition's RMSLE.
reg = LGBMRegressor(boosting='gbdt', feature_fraction=0.1,
              learning_rate=0.10751053203260903, max_bin=634, max_depth=21,
              metric='rmse', min_child_samples=50, min_child_weight=1.0,
              num_leaves=100, objective='regression_l2',
              reg_lambda=42.10605429311716, scale_pos_weight=7.735846770006023,
              subsample_for_bin=2784, verbosity=2)
result= reg.fit(X,y)

In [None]:
rmsle(y,reg.predict(X))

In [None]:
mod1 = RidgeCV(cv=5)
mod1.fit(X,y)

In [None]:
rmsle(y,mod1.predict(X))

In [None]:
# Load the test set and apply the same preprocessing as the training data.
test=pd.read_csv('../input/santander-value-prediction-challenge/test.csv')
# Drop the duplicate/constant columns that were identified on the training set.
test.drop(columns=cols_to_remove,inplace=True)
X_test = test[feat_final]
#X = stat_cols(X)
# Reuse the scaler fitted on the training features; do not refit on test data.
X_test= scaler.transform(X_test)
X_test=csr_matrix(X_test)
# The model was trained on log1p(target), so expm1 maps predictions back to the original scale.
test['target']=np.expm1(reg.predict(X_test))
# Write the submission file with ID and predicted target, two decimals.
test[['ID', 'target']].to_csv('submission1.csv', index=False, float_format='%.2f')

In [None]:
!pip install scikit-optimize
!pip install pymrmre

In [None]:
!pip install arfs==0.1

In [None]:
!pip install scikit-learn==0.23.1

In [None]:
!pip install Boruta

In [None]:
from sklearn.linear_model import *
from lightgbm import LGBMRegressor

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,f_regression,mutual_info_regression