# The Task

The task is to calculate the loss associated with loan defaults. 

Although the features are anonymized, they have properties relating to real-world features. 

Submissions are scored on the root mean squared error.

# Preparing environment

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import RidgeCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import mean_squared_error,make_scorer

# Loading data and having a look at it

In [None]:
# Read the competition's training table and preview its first rows.
train_csv_path = '../input/tabular-playground-series-aug-2021/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Read the competition's test table and preview its first rows.
test_csv_path = '../input/tabular-playground-series-aug-2021/test.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
# Report the total number of missing cells, first for train and then for test.
for frame in (train, test):
    print(frame.isnull().values.sum())

In [None]:
# Display pandas' summary statistics (`describe`) for the test set.
test.describe()

In [None]:
# Bar chart of how many training rows take each value of the 'loss' target,
# with the bars ordered by loss value.
loss_count = train['loss'].value_counts().sort_index()
fig = plt.figure(figsize=(10, 6))
sns.barplot(x=loss_count.index, y=loss_count)

# Separating target from features, training data from validation data

In [None]:
# The target is the 'loss' column; every other column of `train` is kept as a feature.
# NOTE(review): if `train` carries an 'id' column like `test` does, it stays in X —
# confirm that is intended.
y = train['loss']
X = train.drop(columns='loss')

In [None]:
# Hold out 20% of the rows for validation; random_state=0 keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,random_state=0)

In [None]:
# Neural networks tend to perform best when their inputs are on a common scale

# max_ = train.max(axis=0)
# min_ = train.min(axis=0)
# train = (train - min_) / (max_ - min_)
# valid = (valid - min_) / (max_ - min_)

from sklearn.preprocessing import MinMaxScaler
# MinMaxScaler uses its default feature range of [0, 1].
# It is fitted on the training split only and then reused on the validation split.
scaler = MinMaxScaler()
scaled_X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
scaled_X_valid = pd.DataFrame(scaler.transform(X_valid), columns=X_valid.columns)
scaled_X_train.head()

In [None]:
# Display the (rows, columns) shape of the scaled training features.
scaled_X_train.shape

# Feature selection

In [None]:
# Keep the 70 features with the highest univariate f_regression score.
# fit_transform scores every column against y_train and returns only the selected columns.
fselector = SelectKBest(k=70, score_func=f_regression)
X_train_selected = fselector.fit_transform(scaled_X_train, y_train)
# Show the 20 largest feature scores, best first.
top_scores = sorted(fselector.scores_, reverse=True)[:20]
print(top_scores)

In [None]:
# Select the same features on the validation set, reusing the selector fitted on
# the training split (transform only, no refit).
X_valid_selected = fselector.transform(scaled_X_valid)

# Basic models

In [None]:
def model_score(model):
    """Fit `model` on the selected training features and return its validation RMSE.

    Uses the notebook-level `X_train_selected` / `y_train` for fitting and
    `X_valid_selected` / `y_valid` for scoring.

    Args:
        model: An estimator exposing `fit(X, y)` and `predict(X)`.

    Returns:
        Root mean squared error of the predictions on the validation split.
    """
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_valid_selected)
    # Take the square root explicitly instead of passing `squared=False`:
    # that flag is deprecated and later removed in recent scikit-learn releases,
    # while sqrt(MSE) gives the same RMSE on every version.
    return np.sqrt(mean_squared_error(y_valid, y_pred))

In [None]:
# %%time
## result 9.08 in 1min 30s

# ada_model = AdaBoostRegressor()
# print(model_score(ada_model))
# ada_model.get_params()

In [None]:
# %%time
## result 8.11 in 2min 51s

# bag_model = BaggingRegressor(n_estimators = 50)
# print(model_score(bag_model))

In [None]:
# %%time
## result 7.94 in 31 s

# kn_model = KNeighborsRegressor(n_neighbors=2000,leaf_size=100)
# model_score(kn_model)

In [None]:
# %%time
##  result 7.95 in 18s

# cat_model = CatBoostRegressor(depth=4, iterations=400, l2_leaf_reg=12, learning_rate=0.03, silent=True)
# print(model_score(cat_model))
# cat_model.get_params()

In [None]:
# %%time
# # result 7.96 in 1min
# # result 7.95 in 9min

# xgb_model = XGBRegressor()
# print(model_score(xgb_model))
# xgb_model.get_params()

In [None]:
#  best feature score - 70

# lgbm_model = LGBMRegressor(learning_rate=0.01, max_depth=50, n_estimators=100, num_leaves=300)
# features_scores = {}
# for num_of_features in range(30,100):
#     y = train['loss']
#     X = train.loc[:, train.columns != 'loss']
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,random_state=0)
#     fselector = SelectKBest(score_func=f_regression, k=num_of_features)
#     # Run score function on (X, y) and get the appropriate features. Then reduce X to the selected features.
#     X_train_selected = fselector.fit_transform(X_train, y_train)
#     X_valid_selected = fselector.transform(X_valid)
#     features_scores.update({num_of_features : model_score(lgbm_model)})

# features_scores = sorted(features_scores.items(), key=lambda item: item[1])
# for s in features_scores:
#     print(s)

In [None]:
# %%time
# # result 7.93 in 5s
# # result 7.91 in 49s

# lgbm_model = LGBMRegressor(learning_rate=0.01, max_depth=50, n_estimators=100, num_leaves=300)
# print(model_score(lgbm_model))
# lgbm_model.get_params()

In [None]:
# result - 8.07 in 5min 55s

# %%time
# rf_model = RandomForestRegressor()
# model_score(rf_model)

In [None]:
#  result - 7.93 in 1min 33s

# %%time
# gb_model = GradientBoostingRegressor()
# model_score(gb_model)

In [None]:
%%time
# result 7.889 - lgbm + cat = knn
# result 7.88.. - lgbm + cat = ridge   in 3 min
# result 7.887 - lgbm + cat + knn + xgb + gb = ridge   in 3 min (same when features are normalized)
# NOTE(review): before this fix a misplaced parenthesis nested the knn/xgb/gb entries
# inside the 'cat' tuple, so they were not separate base estimators; the five-model
# score above should be re-measured.

# Each base estimator is its own (name, estimator) pair.
estimators = [
    ('lgbm', LGBMRegressor(learning_rate=0.01, max_depth=50, n_estimators=100, num_leaves=300)),
    ('cat', CatBoostRegressor(depth=4, iterations=400, l2_leaf_reg=12, learning_rate=0.03, silent=True)),
    ('knn', KNeighborsRegressor(n_neighbors=2000, leaf_size=100)),
    ('xgb', XGBRegressor()),
    ('gb', GradientBoostingRegressor()),
]

# Stack the base models; RidgeCV combines their predictions into the final estimate.
st_model = StackingRegressor(estimators=estimators,
                             final_estimator=RidgeCV())

st_model.fit(X_train_selected, y_train)
y_pred = st_model.predict(X_valid_selected)
# Validation RMSE. The square root is taken explicitly rather than via `squared=False`,
# which recent scikit-learn releases deprecated and later removed.
print(np.sqrt(mean_squared_error(y_valid, y_pred)))

## Neural Networks

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
import tensorflow as tf

In [None]:
# Build the Keras regression network.
# The input width equals the number of selected features.
n_features = X_train_selected.shape[1]

# Activation functions worth trying here: 'relu', 'elu', 'selu', and 'swish'.

# Layers are listed in order: Dense blocks with he_normal initialisation,
# 30% dropout for regularisation, batch normalisation for more stable training,
# and a single linear unit as the regression output.
neu_model = Sequential([
    Dense(32, input_dim=n_features, activation='relu', kernel_initializer='he_normal'),
    Dropout(rate=0.3),
    BatchNormalization(),
    Dense(16, activation='relu', kernel_initializer='he_normal'),
    Dropout(rate=0.3),
    BatchNormalization(),
    Dense(16, activation='relu', kernel_initializer='he_normal'),
    Dropout(rate=0.3),
    BatchNormalization(),
    Dense(16, activation='relu', kernel_initializer='he_normal'),
    BatchNormalization(),
    Dense(1, activation='linear'),
])

In [None]:
def custom_loss_function(y_true, y_pred):
    """Mean squared error between targets and predictions, usable as a Keras loss.

    Args:
        y_true: Tensor (or array-like) of ground-truth values.
        y_pred: Tensor (or array-like) of predicted values.

    Returns:
        Tensor holding the squared error averaged over the last axis.
    """
    # Align dtypes so integer targets can be subtracted from float predictions.
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)
    # Square the residual exactly once; reduce_mean below already does the averaging,
    # so no extra division by the number of samples is needed.
    squared_difference = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_difference, axis=-1)

In [None]:
# compile the keras model
# neg_mean_squared_error_scorer = make_scorer(mean_squared_error,squared=False,greater_is_better=False)

# The optimizer tells the network how to change its weights.
# Adam is a stochastic gradient descent (SGD) algorithm that has an adaptive learning rate.
# Popular optimizers - rmsprop, Adam, and sgd.

# Compile with mean squared error as the training loss and the 'adam' optimizer.
neu_model.compile(loss='mse', optimizer='adam')

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training when the monitored quantity (Keras default, `monitor` is not set here)
# stops improving, and roll the model back to its best weights.
early_stopping = EarlyStopping(
    min_delta=0.001, # minimum amount of change to count as an improvement
    patience=50, # how many epochs to wait before stopping
    restore_best_weights=True,
)

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Gradually decreasing the learning rate over the course of training can improve performance
# (the weights "settle in" to a minimum).
# This schedule multiplies the learning rate by 0.2 whenever the monitored loss fails to
# improve for an epoch (patience=0).
# min_lr must sit below the optimizer's starting rate: the model is compiled with the
# default 'adam' optimizer (learning rate 0.001), so a floor of 0.001 would leave no room
# for any reduction and the callback would never change the rate.
lr_schedule = ReduceLROnPlateau(
    patience=0,
    factor=0.2,
    min_lr=1e-5,
)

In [None]:
# Batch size passed to `fit`: the number of training rows divided by 500,
# i.e. roughly 500 gradient steps per epoch.
minibatch = round(X_train_selected.shape[0]/500)
minibatch

In [None]:
%%time
# Train the network, evaluating on the validation split during training.
# `callbacks` accepts a list, so early stopping and the learning-rate schedule run together.
history = neu_model.fit(
    X_train_selected,
    y_train,
    validation_data=(X_valid_selected, y_valid),
    epochs=100,
    batch_size=minibatch,
    callbacks=[early_stopping, lr_schedule],
    verbose=1,
)


# history = neu_model.fit(X_train_selected, y_train, validation_data=(X_valid_selected, y_valid),
#                         epochs=100, batch_size=minibatch, verbose=1)

In [None]:
# evaluate on valid set

# result on first try - 7.95
# normalized - 7.92
# changed batch_size from 200 to 400 - loss dropped from 62 to 59.9 but RMSE rose to 8.01 
# - Overfitting, needs Early Stopping or other Keras Callback/ less units/ dropout layer
# on batch_size 200 with dropouts and batch_normalization but without Early Stopping - 7.94
# with Early Stopping back and one more layer - 7.91
# 2 additional layers made Val_loss high and jumpy - 7.92

# RMSE of the network on the validation split. The square root is taken explicitly
# because the `squared=False` flag of mean_squared_error is deprecated and later
# removed in recent scikit-learn releases.
y_pred = neu_model.predict(X_valid_selected)
print(np.sqrt(mean_squared_error(y_valid, y_pred)))

In [None]:
# `fit` returns a History object whose `.history` attribute records the losses
# produced during training; plotted per epoch they form the learning curves.

# Turn the recorded history into a DataFrame (one row per epoch).
history_df = pd.DataFrame(history.history)
# Plot training and validation loss with pandas, skipping the rows before index label 5.
curves = history_df.loc[5:, ['loss', 'val_loss']]
curves.plot();
best_val_loss = history_df['val_loss'].min()
print("Minimum validation loss: {}".format(best_val_loss))

# Model selection

In [None]:
# %%time
# param_grid = {'depth': [6,4],
#               'learning_rate' : [0.03, 0.05],
#               'l2_leaf_reg': [7,12,17],
#               'iterations': [400,300],
#               'silent': [True]
#              }
# model = CatBoostRegressor()
# # my_scorer = make_scorer(mean_squared_error, squared=False)
# neg_mean_squared_error_scorer = make_scorer(mean_squared_error,squared=False,greater_is_better=False)

# grid_search = GridSearchCV(estimator = model, param_grid = param_grid, scoring = neg_mean_squared_error_scorer,
#                            cv = 3, n_jobs = 1, verbose = 2)

# grid_search.fit(X_train_selected, y_train)
# best_model = grid_search.best_estimator_

# scores = {}
# for i in range(len(grid_search.cv_results_['params'])):
#     parameters = str(grid_search.cv_results_['params'][i])
#     mean_score = -grid_search.cv_results_['mean_test_score'][i]
#     rank = grid_search.cv_results_['rank_test_score'][i]
#     scores.update({ parameters : ([mean_score,rank]) })

# scores = sorted(scores.items(), key=lambda item: item[1][1])
# for s in scores:
#     print(s)


# y_pred = best_model.predict(X_valid_selected)
# print('Result for the best model:',grid_search.best_params_,mean_squared_error(y_valid, y_pred, squared=False))

# Generating submission

In [None]:
# Scale the test set with the scaler already fitted on the training split,
# restoring the original column names on the result.
scaled_test = pd.DataFrame(scaler.transform(test), columns=test.columns)
scaled_test.head()

In [None]:
# Model used for the submission: the stacking ensemble.
my_model = st_model

# Reduce the scaled test set to the features chosen by the fitted selector, then predict.
test_selected = fselector.transform(scaled_test)
predictions = my_model.predict(test_selected)
predictions

In [None]:
# Submission table: the test 'id' column plus the predicted 'loss', indexed by 'id'.
output = test['id'].to_frame().assign(loss=predictions).set_index('id')
output

In [None]:
# Write the submission file; index=True keeps the 'id' index as the first CSV column.
output.to_csv('submission_loss_norm.csv', index=True)