# Tabular Playground Challenge

# Step 1: Reading and Understanding the Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use('seaborn-deep')
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 10
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 8
plt.rcParams['ytick.labelsize'] = 8
plt.rcParams['legend.fontsize'] = 12
plt.rcParams['figure.titlesize'] = 14
plt.rcParams['figure.figsize'] = (12, 8)

pd.options.mode.chained_assignment = None
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 400)
import warnings
warnings.filterwarnings('ignore')
import sklearn.metrics as skm
import sklearn.model_selection as skms
import sklearn.preprocessing as skp
import random
seed = 12
np.random.seed(seed)

from datetime import date

In [None]:
# important functions
def datasetShape(df):
    """Print how many rows and columns `df` has."""
    n_rows, n_cols = df.shape
    print("The dataframe has", n_rows, "rows and", n_cols, "columns.")
    
# select numerical and categorical features
def divideFeatures(df):
    """Split `df` into its numeric columns and its object-dtype columns.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    (numerical_features, categorical_features) : tuple of pandas.DataFrame
        Column subsets of `df` selected by dtype.
    """
    numerical_features = df.select_dtypes(include=[np.number])
    # The `np.object` alias was removed in NumPy 1.24 (AttributeError); the dtype
    # name "object" selects the same columns on every NumPy version.
    categorical_features = df.select_dtypes(include=["object"])
    return numerical_features, categorical_features

In [None]:
# Folder holding the competition files (a Kaggle input path).
base = '/kaggle/input/tabular-playground-series-jan-2021/'
# Read train.csv into `df` and preview its first rows.
data_file = base + "train.csv"
df = pd.read_csv(data_file)
df.head()

In [None]:
# Read test.csv into `df_test` (reusing `data_file` for the path) and preview its first rows.
data_file = base + "test.csv"
df_test = pd.read_csv(data_file)
df_test.head()

In [None]:
# check dataset shape: prints the row and column counts of the training frame
datasetShape(df)

In [None]:
# Drop the 'id' column from the training frame. Reassigning (instead of
# inplace=True) with errors='ignore' keeps this cell safe to run more than once;
# the in-place version raised KeyError on a second run.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Remove exact duplicate rows and report the frame's shape before and after.
shape_before = df.shape
df.drop_duplicates(inplace=True)
shape_after = df.shape
print(shape_before)
print(shape_after)

# Step 2: EDA

### Univariate Analysis

In [None]:
# boxplots of numerical features for outlier detection
# The number of grid rows is derived from the column count, so the cell keeps
# working when the frame has more (or fewer) columns than a fixed 3x5 grid allows.
n_grid_cols = 5
n_grid_rows = max(1, -(-len(df.columns) // n_grid_cols))  # ceiling division

fig = plt.figure(figsize=(16,20))
for i in range(len(df.columns)):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, i+1)
    # draw on the panel that was just created rather than on the implicit current axes
    sns.boxplot(y=df.iloc[:,i], ax=ax)
plt.tight_layout()
plt.show()

In [None]:
# check for missing values: counts how many columns contain at least one NaN
df.isna().any().sum()

In [None]:
import matplotlib.gridspec as gridspec

# One full-width panel showing the distribution of the target.
fig = plt.figure(constrained_layout=True, figsize=(16,6))
grid = gridspec.GridSpec(ncols=2, nrows=1, figure=fig)
ax1 = fig.add_subplot(grid[0, :])  # span both grid columns
ax1.set_title('Histogram')
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat='density' and kde=True draws the same normalised histogram plus KDE curve.
sns.histplot(df.loc[:,'target'], stat='density', kde=True, ax=ax1)
plt.show()

In [None]:
# Pairwise plots for every pair of columns in the frame.
sns.pairplot(df)
plt.show()

# correlation heatmap for all features
corr = df.corr()
# Mask the upper triangle (diagonal included) so each pair is shown only once.
# The builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask = mask, annot=True)
plt.show()

No data cleaning is required. Let's check the features for skewness next.

# Step 3: Data Preparation

### Outlier Treatment

A log transform of the feature values with `np.log1p()` is the candidate treatment; we first inspect how skewed the features are.

In [None]:
# plot sample skewed feature
# histplot(stat='density', kde=True) replaces sns.distplot, which is deprecated
# and scheduled for removal from seaborn.
plt.figure(figsize=(10,4))
sns.histplot(df['cont1'], stat='density', kde=True)
plt.show()

In [None]:
# Skewness of every column, sorted from the most positive to the most negative.
skewed_features = df.apply(pd.Series.skew).sort_values(ascending=False)
skewed_features

In [None]:
# # transform skewed features
# for feat in skewed_features.index:
#     if abs(skewed_features.loc[feat]) > 0.0005:
#         df[feat] = np.log1p(df[feat])
#         if 'Close' not in feat:
#             df_test[feat] = np.log1p(df_test[feat])

The skewness (log) transform is not applied, so the code above stays commented out.

In [None]:
# plot the same sample feature again; the log transform above is commented out,
# so this shows the untransformed values
# histplot(stat='density', kde=True) replaces sns.distplot, which is deprecated
# and scheduled for removal from seaborn.
plt.figure(figsize=(10,4))
sns.histplot(df['cont1'], stat='density', kde=True)
plt.show()

# Step 4: Data Modelling

### Split Train-Test Data

In [None]:
# Shuffle the rows reproducibly and renumber the index.
df_shuffle = df.sample(frac=1, random_state=seed).reset_index(drop=True)

# pop() removes 'target' from df_shuffle, so what is left is the feature matrix
df_y = df_shuffle.pop('target')
df_X = df_shuffle

# split into train (90%) and hold-out test (10%)
X_train, X_test, y_train, y_test = skms.train_test_split(
    df_X, df_y, train_size=0.9, random_state=seed
)

# report the size of each split in absolute and relative terms
n_total = len(df_shuffle)
n_train = X_train.shape[0]
n_test = X_test.shape[0]
print(f"Train set has {n_train} records out of {n_total} which is {round(n_train/n_total*100)}%")
print(f"Test set has {n_test} records out of {n_total} which is {round(n_test/n_total*100)}%")

### Feature Scaling

In [None]:
import sklearn.linear_model as sklm

In [None]:
# scaler = skp.RobustScaler()
scaler = skp.MinMaxScaler()
# scaler = skp.StandardScaler()

# Fit the scaler on the training split only and scale every feature column.
# The original row index is passed through so X_train / X_test keep the same
# index as y_train / y_test (rebuilding the frame without it resets the index).
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)

# scale test data with transform() so it reuses the statistics learned on train
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_train.columns, index=X_test.index)

# view sample data
X_train.describe()

## Model Building

In [None]:
def expm1(x):
    """Return exp(x) - 1 for `x` by delegating to np.expm1 (the inverse of np.log1p)."""
    inverse_log1p = np.expm1(x)
    return inverse_log1p
def getRmse(y_train, y_train_pred):
    """Print the root mean squared error between true and predicted values.

    Parameters
    ----------
    y_train : array-like
        True target values.
    y_train_pred : array-like
        Predicted target values, one per entry of `y_train`.

    Returns
    -------
    None. The RMSE is only printed.
    """
    # The square root turns sklearn's MSE into the RMSE this function is named for
    # (the previous version printed the plain MSE).
    # If the target is ever log1p-transformed, pass both arguments through expm1() first.
    print(np.sqrt(skm.mean_squared_error(y_train, y_train_pred)))

### Ridge

In [None]:
# Ridge regression baseline with a very small penalty.
lmr = sklm.Ridge(alpha=0.001)
lmr.fit(X_train, y_train)

# predict both splits, then print the error for train and for test
y_train_pred, y_test_pred = lmr.predict(X_train), lmr.predict(X_test)
getRmse(y_train, y_train_pred)
getRmse(y_test, y_test_pred)

In [None]:
# candidate alphas for the grid search
params = {'alpha': [0.0001, 0.001, 0.005, 0.01, 0.03, 0.05, 0.1, 0.5, 1.0, 5.0, 10]}
ridge = sklm.Ridge()

# 5-fold cross validation scored by negative mean squared error
model_cv_ridge = skms.GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
    verbose=3,
)
model_cv_ridge.fit(X_train, y_train)
print(model_cv_ridge.best_estimator_)

# predict with the search object and print the error on both splits
y_train_pred, y_test_pred = model_cv_ridge.predict(X_train), model_cv_ridge.predict(X_test)
getRmse(y_train, y_train_pred)
getRmse(y_test, y_test_pred)

### CatBoost

In [None]:
import catboost as cb

# CatBoost regressor with an RMSE loss, trained silently; the hold-out split is
# handed to fit() as the evaluation set.
cbr = cb.CatBoostRegressor(verbose=0, loss_function='RMSE')
cbr.fit(X_train, y_train, eval_set=(X_test, y_test))
print(cbr.best_score_)

# predict both splits, then print the error for train and for test
y_train_pred, y_test_pred = cbr.predict(X_train), cbr.predict(X_test)
getRmse(y_train, y_train_pred)
getRmse(y_test, y_test_pred)

### Gradient Boosting

In [None]:
import sklearn.ensemble as ske

# criterion='mse' was renamed to 'squared_error' in scikit-learn 1.0 and the old
# spelling was removed in 1.2, where it makes fit() fail.
# NOTE: 'squared_error' requires scikit-learn >= 1.0.
xgb = ske.GradientBoostingRegressor(criterion='squared_error', random_state=1)
xgb.fit(X_train, y_train)

# predict
y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)
getRmse(y_train, y_train_pred)
getRmse(y_test, y_test_pred)

### Extra Trees

In [None]:
# criterion='mse' was removed in scikit-learn 1.2. Squared error is this
# estimator's default criterion (spelled 'mse' before 1.0, 'squared_error' since),
# so omitting the argument gives the same model on old and new releases.
xgb = ske.ExtraTreesRegressor(random_state=1)
xgb.fit(X_train, y_train)

# predict
y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)
getRmse(y_train, y_train_pred)
getRmse(y_test, y_test_pred)

### RandomForest

In [None]:
# criterion='mse' was removed in scikit-learn 1.2. Squared error is this
# estimator's default criterion (spelled 'mse' before 1.0, 'squared_error' since),
# so omitting the argument gives the same model on old and new releases.
xgb = ske.RandomForestRegressor(random_state=1)
xgb.fit(X_train, y_train)

# predict
y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)
getRmse(y_train, y_train_pred)
getRmse(y_test, y_test_pred)

### XGBoost

In [None]:
import xgboost as xg

# XGBoost regressor with a squared-error objective and a fixed random state.
xgb = xg.XGBRegressor(random_state=1, objective ='reg:squarederror')
xgb.fit(X_train, y_train)

# predict both splits, then print the error for train and for test
y_train_pred, y_test_pred = xgb.predict(X_train), xgb.predict(X_test)
getRmse(y_train, y_train_pred)
getRmse(y_test, y_test_pred)

## Deep Learning Model

In [None]:
import tensorflow as tf
# Show which TensorFlow version the notebook runs on.
print("TF version:-", tf.__version__)
import keras as k
# Seed TensorFlow's random generator with the notebook-wide seed.
tf.random.set_seed(seed)

In [None]:
# Training MSE below which myCallback stops training.
# NOTE(review): the check is `mse < THRESHOLD`, so with 0 it cannot fire for a
# non-negative MSE -- confirm whether early stopping is meant to be disabled.
THRESHOLD = 0
# File the ModelCheckpoint callback saves the best model to.
bestModelPath = './best_model.hdf5'

class myCallback(k.callbacks.Callback):
    """Stop training once the epoch's 'mse' metric falls below THRESHOLD."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's logged 'mse' and request a stop when it is below THRESHOLD.

        `logs` defaults to None instead of a shared mutable dict. A missing 'mse'
        entry is skipped, because comparing None with a number raises TypeError.
        """
        current_mse = (logs or {}).get('mse')
        if current_mse is not None and current_mse < THRESHOLD:
            print("\n\nStopping training as we have reached our goal.")   
            self.model.stop_training = True

# Callbacks for training: the threshold-based stopper plus a checkpoint that only
# saves when the monitored validation loss improves.
mycb = myCallback()
checkpoint = k.callbacks.ModelCheckpoint(
    filepath=bestModelPath,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

callbacks_list = [mycb, checkpoint]
            
def plotHistory(history):
    """Print the lowest recorded 'val_mse' and plot every series in `history.history`."""
    recorded = history.history
    best_val_mse = min(recorded["val_mse"])
    print("Min. Validation MSE", best_val_mse)
    curves = pd.DataFrame(recorded)
    curves.plot(figsize=(12,6))
    plt.show()

In [None]:
epochs = 40

# Two hidden ReLU layers with dropout, followed by a single linear output unit.
model_1 = k.models.Sequential([
    k.layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    k.layers.Dropout(0.2),
    
#     k.layers.Dense(4096, activation='relu'),
#     k.layers.Dropout(0.2),

    k.layers.Dense(256, activation='relu'),
    k.layers.Dropout(0.2),

    k.layers.Dense(1, activation='linear'),
])
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line.
model_1.summary()

# metrics is passed as a list: a bare string is rejected by newer Keras releases.
model_1.compile(optimizer='adam',
              loss='mse',
              metrics=['mse']
)
# callbacks_list is already a list of callbacks; it is passed as-is, not nested in another list.
history = model_1.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs,
                 callbacks=callbacks_list)

In [None]:
# Print the minimum validation MSE and plot the recorded training history.
plotHistory(history)

# Test Evaluation & Submission

In [None]:
def getTestResults(m=None):
    """Predict the competition test frame, training a model on the full training frame if needed.

    Reads the notebook-level frames `df` (features plus 'target') and `df_test`.
    Features are min-max scaled with a scaler fitted on the whole of `df`.

    Parameters
    ----------
    m : object with a `predict` method, optional
        Already-fitted model to use. When None (default) a CatBoostRegressor is
        trained here on the scaled training frame.
        NOTE(review): a model passed in was presumably trained on data scaled by a
        different scaler fit (the 90% split) -- confirm this is acceptable.

    Returns
    -------
    Predictions for `df_test`: the model's own output when `m` is None, a flat
    list when `m` is given.
    """
    # shuffle the training rows reproducibly, then separate features and target
    df_final = df.sample(frac=1, random_state=1).reset_index(drop=True)
    test_cols = [x for x in df.columns if 'target' not in x]
    df_final_test = df_test[test_cols]
    df_y = df_final.pop('target')
    df_X = df_final

#     scaler = skp.RobustScaler()
    scaler = skp.MinMaxScaler()
#     scaler = skp.StandardScaler()

    df_X = pd.DataFrame(scaler.fit_transform(df_X), columns=df_X.columns)

    X_test = pd.DataFrame(scaler.transform(df_final_test), columns=df_X.columns)
    
    if m is None:
        # Default model. The other estimators tried in this notebook (Ridge,
        # ExtraTrees, RandomForest, XGBRegressor) can be swapped in here.
        lmr = cb.CatBoostRegressor(loss_function='RMSE', verbose=0)
        lmr.fit(df_X, df_y)
    else:
        lmr = m

    # predict
    y_train_pred = lmr.predict(df_X)
    y_test_pred = lmr.predict(X_test)
    if m is not None:
        # Flatten (n, 1) network output to n values. np.ravel also accepts 1-D
        # output, where indexing each element with [0] raised an error.
        y_test_pred = list(np.ravel(y_test_pred))
    getRmse(df_y, y_train_pred)
    return y_test_pred

# ML models: train the default model inside getTestResults and predict df_test
results = getTestResults()

# Neural Network model: alternative that predicts with the checkpointed Keras model
# results = getTestResults(k.models.load_model(bestModelPath))

In [None]:
# Build the submission frame: the test ids next to the predicted target.
submission = df_test[['id']].assign(target=results)
submission.head()

In [None]:
# Write the submission to CSV without the index column.
submission.to_csv('./submission_Catboost.csv', index=False)

Metrics - RMSE

1 - NN - .71704

2 - Ridge(0.0001) - .72782

3 - CatBoost - .70001

4 - ExtraTrees - .70887

5 - RF - .70985

6 - XGB - .70463

#### CatBoost performed best.