In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection, metrics
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from IPython.display import display
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test tables into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/santander-value-prediction-challenge/train.csv',
        '/kaggle/input/santander-value-prediction-challenge/test.csv',
    )
)

In [None]:
# Report (rows, columns) for both frames.
for label, frame in (('Shape of train_df ', train_df), ('Shape of Test_df ', test_df)):
    print(label, frame.shape)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Print a concise summary of the training frame.
train_df.info()

In [None]:
# Print a concise summary of the test frame.
test_df.info()

In [None]:
# Scatter the target values in ascending order against their rank so that
# extreme values stand out at the right-hand end of the curve.
sorted_target = np.sort(train_df['target'].values)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(range(train_df.shape[0]), sorted_target)
ax.set_xlabel('index', fontsize=12)
ax.set_ylabel('Target', fontsize=12)
ax.set_title('Distribution of Target', fontsize=14)
plt.show()

There aren't many visible outliers, but the target spans a very wide range, which we can examine with a histogram.

In [None]:
# Histogram of the raw target values (50 bins, no KDE overlay).
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(train_df['target'].values, bins=50, kde=False, ax=ax)
ax.set_xlabel('Target', fontsize=14)
ax.set_title('Histogram of Target', fontsize=16)
plt.show()

The distribution is heavily skewed, so let's plot the histogram of the log-transformed target instead.

In [None]:
import warnings
# Suppress every warning for the remainder of the session.
# NOTE(review): this filter is unconditional, so it also hides deprecation
# notices raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

In [None]:
# Histogram of log1p(target). ``sns.distplot`` is deprecated in seaborn;
# ``sns.histplot`` is its replacement for a plain histogram and is what the
# raw-target histogram in this notebook already uses.
log_target = np.log1p(train_df["target"].values)

plt.figure(figsize=(12, 8))
sns.histplot(log_target, bins=50, kde=False)
# The x-axis carries the transformed values, so label it accordingly.
plt.xlabel('log1p(Target)', fontsize=12)
plt.title("Log of Target Histogram", fontsize=14)
plt.show()

In [None]:
# Count the training columns that contain at least one missing value.
columns_with_nan = train_df.columns[train_df.isnull().any()]
print("All Features in Train data with NaN Values =", str(len(columns_with_nan)))

So there are no missing values

Now let's check whether there are any constant columns. A column is constant when its standard deviation (equivalently, its variance) is zero, so we compute it for every feature column and remove the columns where it is zero.

In [None]:
# Every column except the identifier and the label is a candidate feature.
feature_columns = [col for col in train_df.columns if col not in ('ID', 'target')]

# A standard deviation of exactly zero on the training rows marks a constant column.
feature_std = train_df[feature_columns].std()
const_columns_to_remove = feature_std.index[feature_std == 0].tolist()

# Drop the same columns from both frames so train and test keep identical features.
train_df.drop(columns=const_columns_to_remove, inplace=True)
test_df.drop(columns=const_columns_to_remove, inplace=True)

# Report how much the feature set shrank and which columns were dropped.
print('train_df rows and columns after removing constant columns: ', train_df.shape)
print('Following `{}` Constant Column\n are removed'.format(len(const_columns_to_remove)))
print(const_columns_to_remove)

In [None]:
# Shape of the training frame at this point in the notebook.
train_df.shape

Now we will check whether any column names are duplicated.

In [None]:
# Boolean array flagging column labels that repeat an earlier label.
# This looks at the column names only, not at the values stored in the columns.
print(train_df.columns.duplicated())


In [None]:
# Keep only the first occurrence of each column label.
# NOTE(review): this de-duplicates by column *name*; columns that hold identical
# values under different names are not detected by this check.
is_first_occurrence = ~train_df.columns.duplicated()
train_df = train_df.loc[:, is_first_occurrence]
print('Train rows and columns after removing duplicate columns: ', train_df.shape)

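This dataset does not have any duplicated column names. Note that this check compares column labels only; columns holding identical values under different names would not be detected by it.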

In [None]:
# Feature matrix: everything except the row identifier and the label.
X_train = train_df.drop(columns=['ID', 'target'])
# The models are fitted on log1p(target).
y_train = np.log1p(train_df['target'].to_numpy())
# Test features: the test frame without its identifier column.
X_test_original = test_df.drop(columns='ID')

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
(X_train_split, X_validation,
 y_train_split, y_validation) = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

Now let's first train the model using LightGBM

In [None]:
def light_gbm_model_run(train_x, train_y, validation_x, validation_y, test_x):
    """Train a LightGBM regressor with early stopping and predict on ``test_x``.

    The booster is fitted on ``(train_x, train_y)`` and evaluated with RMSE on
    both the training set and ``(validation_x, validation_y)``. Training runs
    for at most 5000 rounds and stops once the validation metric has not
    improved for 100 rounds.

    Parameters
    ----------
    train_x, train_y
        Training features and labels.
    validation_x, validation_y
        Hold-out features and labels used for evaluation and early stopping.
    test_x
        Features to predict on.

    Returns
    -------
    tuple
        ``(pred_test_light_gbm, model_light_gbm, evals_result_lgbm)``: the test
        predictions after ``np.expm1``, the trained booster, and the recorded
        evaluation history.

    Notes
    -----
    Predictions are mapped back with ``np.expm1``, so the labels are assumed
    to be on the ``log1p`` scale -- confirm against the caller.
    """
    params = {
        "objective": "regression",
        # Regression problem, so RMSE is used as the evaluation metric.
        "metric": "rmse",
        "num_leaves": 100,
        "learning_rate": 0.001,
        "bagging_fraction": 0.6,
        "feature_fraction": 0.6,
        # The LightGBM parameter is ``bagging_freq``. The previous key,
        # "bagging_frequency", is not a recognised name, which left bagging
        # disabled and made ``bagging_fraction`` a no-op.
        "bagging_freq": 6,
        "bagging_seed": 42,
        "verbosity": -1,
        "seed": 42,
    }

    lgbm_train = lgb.Dataset(train_x, label=train_y)
    # Tie the validation set to the training set so both share the same binning.
    lg_validation = lgb.Dataset(validation_x, label=validation_y, reference=lgbm_train)

    evals_result_lgbm = {}

    # Early stopping, logging and metric recording are configured through
    # callbacks: the ``early_stopping_rounds`` / ``verbose_eval`` /
    # ``evals_result`` keyword arguments were removed in LightGBM 4.0, while
    # the callbacks are available from LightGBM 3.3 onwards.
    model_light_gbm = lgb.train(
        params,
        lgbm_train,
        num_boost_round=5000,
        valid_sets=[lgbm_train, lg_validation],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=150),
            lgb.record_evaluation(evals_result_lgbm),
        ],
    )

    # Predict with the best iteration found and undo the log1p transform.
    pred_test_light_gbm = np.expm1(
        model_light_gbm.predict(test_x, num_iteration=model_light_gbm.best_iteration)
    )

    return pred_test_light_gbm, model_light_gbm, evals_result_lgbm

In [None]:
# Training and output of LightGBM Model
# Fit LightGBM on the training split, monitor it on the validation split and
# predict the test set.
(predicted_y_test_org_light_gbm,
 model_lgbm,
 evals_result) = light_gbm_model_run(
    train_x=X_train_split,
    train_y=y_train_split,
    validation_x=X_validation,
    validation_y=y_validation,
    test_x=X_test_original,
)

In [None]:
# Per-feature importance from the trained booster: split counts and total gain.
split_light_gbm = model_lgbm.feature_importance(importance_type='split')
gain_light_gbm = model_lgbm.feature_importance(importance_type='gain')

# Express each feature's gain as a percentage of the total gain.
gain_share_pct = 100 * gain_light_gbm / gain_light_gbm.sum()

feature_imp_light_gbm = pd.DataFrame({
    'feature': model_lgbm.feature_name(),
    'split': split_light_gbm,
    'gain': gain_share_pct,
})
feature_imp_light_gbm = feature_imp_light_gbm.sort_values(by='gain', ascending=False)

# Show the 50 features with the largest gain share.
print(feature_imp_light_gbm.head(50))

Next, let's train the model with XGBoost

In [None]:
def xgb_model_run(train_x, train_y, validation_x, validation_y, test_x,
                  num_boost_round=5000, early_stopping_rounds=100):
    """Train an XGBoost regressor with early stopping and predict on ``test_x``.

    The booster is fitted on ``(train_x, train_y)`` and evaluated with RMSE on
    the training set and on ``(validation_x, validation_y)``; the validation
    set is listed last in the watchlist, so it drives early stopping.

    Parameters
    ----------
    train_x, train_y
        Training features and labels.
    validation_x, validation_y
        Hold-out features and labels used for evaluation and early stopping.
    test_x
        Features to predict on.
    num_boost_round : int, default 5000
        Upper bound on boosting rounds. This was hard-coded to 50, which is
        smaller than the early-stopping patience (so early stopping could
        never trigger) and, with ``eta=0.001``, leaves the model almost
        untrained. The default now matches the round budget used by the
        LightGBM model in this notebook; pass a smaller value for a quick run.
    early_stopping_rounds : int, default 100
        Stop when the validation RMSE has not improved for this many rounds.

    Returns
    -------
    tuple
        ``(predict_test_xgb, model_xgb)``: the test predictions after
        ``np.expm1`` and the trained booster.

    Notes
    -----
    Predictions are mapped back with ``np.expm1``, so the labels are assumed
    to be on the ``log1p`` scale -- confirm against the caller.
    """
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': 0.001,
        'max_depth': 10,
        'subsample': 0.6,
        'colsample_bytree': 0.6,
        'alpha': 0.001,
        'random_state': 42,
    }

    training_data = xgb.DMatrix(train_x, label=train_y)
    validation_data = xgb.DMatrix(validation_x, label=validation_y)

    # The last entry of the watchlist is the one used for early stopping.
    watchlist = [(training_data, 'train'), (validation_data, 'valid')]

    model_xgb = xgb.train(
        params,
        training_data,
        num_boost_round=num_boost_round,
        evals=watchlist,
        maximize=False,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=100,
    )

    # ``ntree_limit`` / ``best_ntree_limit`` were removed in XGBoost 2.0;
    # ``iteration_range`` is the supported way to predict with the best
    # iteration. ``(0, 0)`` means "use every tree" and serves as the fallback
    # when no best iteration was recorded.
    best_iteration = getattr(model_xgb, 'best_iteration', None)
    if best_iteration is not None:
        iteration_range = (0, int(best_iteration) + 1)
    else:
        iteration_range = (0, 0)

    data_test = xgb.DMatrix(test_x)
    # Undo the log1p transform applied to the labels.
    predict_test_xgb = np.expm1(
        model_xgb.predict(data_test, iteration_range=iteration_range)
    )

    return predict_test_xgb, model_xgb

In [None]:
# Fit XGBoost on the same split and predict the test set.
predicted_y_test_org_xgbm, model_xgb = xgb_model_run(
    train_x=X_train_split,
    train_y=y_train_split,
    validation_x=X_validation,
    validation_y=y_validation,
    test_x=X_test_original,
)

Training the model in CatBoost

In [None]:
# CatBoost hyper-parameters, gathered in one place so they are easy to tune.
catboost_params = {
    'iterations': 500,
    'learning_rate': 0.01,
    'depth': 10,
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'bagging_temperature': 0.2,
    'od_type': 'Iter',
    'metric_period': 50,
    'od_wait': 20,
}
model_catboost = CatBoostRegressor(**catboost_params)

# Fit on the training split, evaluate on the validation split and keep the
# best model seen on that evaluation set.
model_catboost.fit(
    X_train_split,
    y_train_split,
    eval_set=(X_validation, y_validation),
    use_best_model=True,
    verbose=50,
)

# Predict the test set and undo the log1p transform applied to the labels.
catboost_log_predictions = model_catboost.predict(X_test_original)
predicted_y_test_org_catboost = np.expm1(catboost_log_predictions)

Create the output file by blending the predictions of all three models (weights: 0.5 LightGBM, 0.3 XGBoost, 0.2 CatBoost)

In [None]:
# Start from the sample submission so the output keeps its layout.
submission_final = pd.read_csv('/kaggle/input/santander-value-prediction-challenge/sample_submission.csv')

# Wrap each model's test predictions in a single-column frame.
submission_lgb = pd.DataFrame({'target': predicted_y_test_org_light_gbm})
submission_xgb = pd.DataFrame({'target': predicted_y_test_org_xgbm})
submission_catboost = pd.DataFrame({'target': predicted_y_test_org_catboost})

# Weighted blend: 0.5 LightGBM + 0.3 XGBoost + 0.2 CatBoost.
weighted_lgb = submission_lgb['target'] * 0.5
weighted_xgb = submission_xgb['target'] * 0.3
weighted_catboost = submission_catboost['target'] * 0.2
submission_final['target'] = weighted_lgb + weighted_xgb + weighted_catboost

In [None]:
# Preview the first rows of the blended submission.
submission_final.head()

In [None]:
# Write the submission to the working directory without the index column.
submission_final.to_csv('./submission.csv', index=False)