In [None]:
import os
# Walk the Kaggle input mount and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_paths:
    print(path)

In [None]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats
import seaborn as sns
from warnings import filterwarnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Install a global "ignore" filter: every warning raised after this cell is
# hidden, including deprecation notices from the libraries used below.
filterwarnings(action='ignore')

In [None]:
# Load the train and test tables from the competition's input directory.
input_dir = '/kaggle/input/santander-value-prediction-challenge'
train_df = pd.read_csv(f'{input_dir}/train.csv')
test_df = pd.read_csv(f'{input_dir}/test.csv')

In [None]:
# Preview only the first rows; the frame is far too wide/long to display whole.
train_df.head()

## EDA

### Normalizing the target

In [None]:
fig, (ax_before, ax_after) = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Left panel: raw target distribution with a fitted normal curve for reference.
sns.distplot(train_df['target'], fit=stats.norm, ax=ax_before)
ax_before.set_title('Before Normalization')

# log1p (rather than log) stays finite for values at or near 0.
# NOTE: this overwrites the column in place, so re-running the cell applies
# the transform a second time; reload the data before re-running.
train_df['target'] = np.log1p(train_df['target'])

# Right panel: the same plot after the log transform.
sns.distplot(train_df['target'], fit=stats.norm, ax=ax_after)
ax_after.set_title('After Normalization')
plt.show()

In [None]:
# 'ID' is only a row identifier, so it is removed from the training features.
# It is dropped *before* the duplicate check: presumably every ID is unique
# (TODO confirm), in which case rows compared with their ID still attached
# could never be reported as duplicates.
train_df.drop(['ID'], axis=1, inplace=True)
print('num of duplicated rows in train set: ', train_df.duplicated().sum())

# Keep the test IDs: they are needed again when the submission file is built.
ids = test_df['ID']
test_df.drop(['ID'], axis=1, inplace=True)
print('num of duplicated rows in test set: ', test_df.duplicated().sum())

In [None]:
# Total count of missing cells in each frame (summed over columns, then rows).
for split_name, frame in (('train', train_df), ('test', test_df)):
    print(f'num of null values in {split_name} set', frame.isnull().sum().sum())

In [None]:
# Columns whose variance is exactly 0 are constant and carry no information.

zero_var_train = [
    name for name in train_df.columns if train_df[name].var() == 0
]
print('num of columns with zero variance in the train set: ', len(zero_var_train))

zero_var_test = [
    name for name in test_df.columns if test_df[name].var() == 0
]
print('num of columns with zero variance in the test set: ', len(zero_var_test))

In [None]:
# check for duplicate columns

def duplicate_columns(df):
    """Return the labels of columns that are exact copies of a later column.

    Each column is compared, left to right, with every column to its right.
    As soon as an identical column is found, the label of the *earlier* column
    is recorded and the scan moves on, so for a group of k identical columns
    the first k - 1 labels are returned and the last copy is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan. It is not modified.

    Returns
    -------
    list
        Column labels, in column order, that have an identical column
        (same dtype and same values) further to the right.
    """
    dups = []
    columns = df.columns

    for i in range(len(columns)):
        col1 = df.iloc[:, i]
        for j in range(i + 1, len(columns)):
            col2 = df.iloc[:, j]
            # Columns of different dtypes cannot be duplicates of each other,
            # so skip this pair -- but keep scanning: a matching column of the
            # same dtype may still follow further to the right.
            if col1.dtype != col2.dtype:
                continue
            # Same dtype: compare the values.
            if col1.equals(col2):
                dups.append(columns[i])
                break
    return dups


# Labels of the training columns reported as duplicates by duplicate_columns().
train_dups = duplicate_columns(train_df)
print('num of duplicated cols in the train set: ', len(train_dups))

# The same scan on the test set is left disabled.
# NOTE(review): presumably skipped because of its cost -- confirm before
# re-enabling.
# test_dups = duplicate_columns(test_df)
# print('num of duplicated cols in the test set: ', len(test_dups))

In [None]:
# dropping useless features

# Union of the constant and the duplicated training columns. The same labels
# are removed from both frames so train and test keep matching feature sets.
useless_features = list(set(zero_var_train) | set(train_dups))

train_df = train_df.drop(columns=useless_features)
test_df = test_df.drop(columns=useless_features)

## Adding some statistical features

In [None]:
# adding some statistical features to boost the model

# Row-wise summary statistics appended to both frames.
#
# Every statistic is computed from one fixed snapshot of the raw feature
# columns, so that
#   * the training 'target' never leaks into the features, and train/test
#     statistics are built from the same set of inputs;
#   * a statistic added earlier (e.g. 'max') is not fed into the ones that
#     follow (e.g. 'mean', 'sum');
#   * re-running the cell recomputes the same values instead of compounding.

# Maps each log feature to the statistic it is derived from.
log_sources = {
    'log_sum': 'sum',
    'log_non_zero': 'non_zero_sum',
    'log_zero': 'zero_sum',
    'log_mean': 'mean',
    'log_max': 'max',
    'log_min': 'min',
    'log_variance': 'variance',
    'log_mode': 'mode',
    'log_median': 'median',
}
stat_names = ['max', 'min', 'mean', 'non_zero_sum', 'zero_sum', 'sum',
              'variance', 'median', 'mode']
# Columns that must never take part in the statistics themselves.
non_feature_columns = ['target'] + stat_names + list(log_sources)

for df in [train_df, test_df]:
    # errors='ignore': 'target' is absent from the test frame, and the
    # statistic columns are absent on the first run.
    base = df.drop(columns=non_feature_columns, errors='ignore')

    row_stats = pd.DataFrame({
        'max': base.max(axis=1),
        'min': base.min(axis=1),
        'mean': base.mean(axis=1),
        'non_zero_sum': (base != 0).sum(axis=1),
        'zero_sum': (base == 0).sum(axis=1),
        'sum': base.sum(axis=1),
        'variance': base.var(axis=1),
        'median': base.median(axis=1),
        # mode(axis=1) returns one column per tied mode; column 0 holds the
        # smallest mode of each row and is always fully populated.
        'mode': base.mode(axis=1)[0],
    })
    for log_name, source_name in log_sources.items():
        row_stats[log_name] = np.log1p(row_stats[source_name])

    # Assign in place so the notebook-level train_df / test_df are updated.
    for stat_name in row_stats.columns:
        df[stat_name] = row_stats[stat_name]

## Correlation

### Pearson

In [None]:
# Only each column's correlation with the target is needed, so corrwith() is
# used instead of building the full column-by-column correlation matrix.
# 'target' correlates perfectly with itself and therefore ranks first.
target_pearson = train_df.corrwith(train_df['target'])
pearson_selection = target_pearson.nlargest(20).index

In [None]:
# Pairwise Pearson correlations among the columns picked in pearson_selection.
pearson_corr = train_df[pearson_selection].corr()

plt.figure(figsize=(20, 15))
sns.heatmap(pearson_corr, cmap='Greys', annot=True)

### Spearman

In [None]:
# Rank correlation of every column with the target only; corrwith() avoids
# computing the full column-by-column Spearman matrix.
target_spearman = train_df.corrwith(train_df['target'], method='spearman')
spearman_selection = target_spearman.nlargest(20).index

In [None]:
# The columns were selected by Spearman correlation, so the heatmap must show
# Spearman coefficients too (DataFrame.corr() defaults to Pearson).
plt.figure(figsize=(20, 15))
sns.heatmap(train_df[spearman_selection].corr(method='spearman'),
            cmap='Greys', annot=True)

## Train and Validation split

In [None]:
# Separate the target column from the predictor columns.
y = train_df['target']
X = train_df.drop(columns=['target'])

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42)

## Feature importance using SHAP

### XGBoost

In [None]:
from xgboost import XGBRegressor
import shap

# Baseline XGBoost model, fitted only to rank features by SHAP importance.
# XGBRegressor's logging switch is `verbosity` (0 = silent); `verbose` is not
# one of its parameters.
xgb = XGBRegressor(verbosity=0).fit(X_train, y_train)

In [None]:
# Compute SHAP values of the fitted XGBoost model on the training rows and
# draw them as a beeswarm plot.
explainer = shap.Explainer(xgb)
shap_values = explainer(X_train)

shap.plots.beeswarm(shap_values)

In [None]:
# Rank features by their mean absolute SHAP value on the validation rows.
X_importance = X_val

explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_importance)

# One importance score per feature: average |SHAP| over all rows.
shap_sum = np.mean(np.abs(shap_values), axis=0)
importance_df = (
    pd.DataFrame([X_importance.columns.tolist(), shap_sum.tolist()])
    .T
    .rename(columns={0: 'column_name', 1: 'shap_importance'})
    .sort_values('shap_importance', ascending=False)
)

In [None]:
# Keep every feature whose SHAP importance for XGBoost is non-zero.
has_importance = importance_df['shap_importance'] != 0
shap_selected_features_xgb = importance_df.loc[
    has_importance, 'column_name'].tolist()

In [None]:
# Number of features kept from the XGBoost SHAP ranking.
len(shap_selected_features_xgb)

### CatBoost

In [None]:
from catboost import CatBoostRegressor

# Baseline CatBoost model, fitted only to rank features by SHAP importance.
cat = CatBoostRegressor(random_state=42, verbose=False).fit(X_train, y_train)

# SHAP values are computed once, on the held-out rows in X_importance.
explainer = shap.TreeExplainer(cat)
shap_values = explainer.shap_values(X_importance)

# One importance score per feature: average |SHAP| over all rows.
shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame(
    [X_importance.columns.tolist(), shap_sum.tolist()]).T
importance_df.columns = ['column_name', 'shap_importance']
importance_df = importance_df.sort_values('shap_importance', ascending=False)

In [None]:
# Compute SHAP values of the fitted CatBoost model on the training rows and
# draw them as a beeswarm plot.
explainer = shap.Explainer(cat)
shap_values = explainer(X_train)

shap.plots.beeswarm(shap_values)

In [None]:
# Keep every feature whose SHAP importance for CatBoost is non-zero.
has_importance = importance_df['shap_importance'] != 0
shap_selected_features_cat = importance_df.loc[
    has_importance, 'column_name'].tolist()

In [None]:
# Number of features kept from the CatBoost SHAP ranking.
len(shap_selected_features_cat)

### LightGBM

In [None]:
from lightgbm import LGBMRegressor

# Rank features by their mean absolute SHAP value on the validation rows.
X_importance = X_val

# Baseline LightGBM model, fitted only to rank features by SHAP importance.
lgbm = LGBMRegressor(random_state=42).fit(X_train, y_train)

# SHAP values are computed once, on the held-out rows in X_importance.
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X_importance)

# One importance score per feature: average |SHAP| over all rows.
shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame(
    [X_importance.columns.tolist(), shap_sum.tolist()]).T
importance_df.columns = ['column_name', 'shap_importance']
importance_df = importance_df.sort_values('shap_importance', ascending=False)

In [None]:
# Compute SHAP values of the fitted LightGBM model on the training rows and
# draw them as a beeswarm plot.
explainer = shap.Explainer(lgbm)
shap_values = explainer(X_train)

shap.plots.beeswarm(shap_values)

In [None]:
# Keep every feature whose SHAP importance for LightGBM is non-zero.
has_importance = importance_df['shap_importance'] != 0
shap_selected_features_lgbm = importance_df.loc[
    has_importance, 'column_name'].tolist()

In [None]:
# Number of features kept from the LightGBM SHAP ranking.
len(shap_selected_features_lgbm)

In [None]:
# Final feature set: the features that all three models rated as important.
# Features shared by XGBoost and CatBoost first ...
common_elements = np.intersect1d(
    shap_selected_features_xgb, shap_selected_features_cat)
# ... then narrowed to those LightGBM selected as well.
shap_selection = np.intersect1d(common_elements, shap_selected_features_lgbm)

In [None]:
# Number of features selected by all three models.
len(shap_selection)

## Training model

In [None]:
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_log_error, mean_squared_error

In [None]:
# Restrict every feature matrix to the SHAP-selected columns.
X_train = X_train.loc[:, shap_selection]
X_val = X_val.loc[:, shap_selection]
test_df = test_df.loc[:, shap_selection]

In [None]:
# xgboost
# Changes from the previous parameter set:
#   * objective 'reg:squarederror' replaces the deprecated alias 'reg:linear'
#   * verbosity=0 replaces the deprecated silent=1
#   * nthread is dropped; it duplicates n_jobs
#   * early_stopping_rounds is dropped: it needs an eval_set at fit time,
#     which neither the plain fit() call below nor VotingRegressor supplies
xgb = XGBRegressor(colsample_bytree=0.055, colsample_bylevel=0.5,
                   gamma=1.5, learning_rate=0.02, max_depth=32,
                   objective='reg:squarederror', booster='gbtree',
                   min_child_weight=57, n_estimators=1000, reg_alpha=0,
                   reg_lambda=0, eval_metric='rmse', subsample=0.7,
                   verbosity=0, n_jobs=-1, random_state=42)

# randomforest
rf = RandomForestRegressor(random_state=42)

# catboost
cb = CatBoostRegressor(random_state=42, verbose=False)

# lightgbm
lgbm = LGBMRegressor(objective='regression', num_leaves=144,
                     learning_rate=0.005, n_estimators=720, max_depth=13,
                     metric='rmse', is_training_metric=True,
                     max_bin=55, bagging_fraction=0.8, verbose=-1,
                     bagging_freq=5, feature_fraction=0.9, random_state=42)


# defining ensemble: averages the predictions of the four regressors
ensemble_regressor = VotingRegressor(
    [('rf', rf), ('xgb', xgb), ('cb', cb), ('lgbm', lgbm)])

# Fit each model on its own, then the ensemble (VotingRegressor fits its own
# clones of the base models), and report the validation MSE of each.
for reg in (rf, xgb, cb, lgbm, ensemble_regressor):
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_val)
    print(reg.__class__.__name__, mean_squared_error(y_val, y_pred))

## Predicting the test set

In [None]:
# Predict with the ensemble, then apply expm1 to the predictions.
log_prediction = ensemble_regressor.predict(test_df)
prediction = np.expm1(log_prediction)

In [None]:
# Submission table: one row per test ID with its predicted target.
sub = pd.DataFrame({'ID': ids, 'target': prediction})

In [None]:
# Write the submission to the working directory, without the index column.
sub.to_csv('sub.csv', index=False)