In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

This is my second notebook for the February Tabular Playground Series. In the first one (https://www.kaggle.com/godzill22/tbs-feb-2021-with-lightgbm) I learnt how to use the LightGBM algorithm. In this notebook I want to learn how to use a neural network to solve a regression problem and find out whether an ANN can improve my score.

In [None]:
# Import main libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno

# Statistics
from scipy import stats
from scipy.stats import skew, kurtosis

# Preprocessing

## Exploratory Data Analysis first.

In [None]:
# Load the competition's train and test CSV files into DataFrames
train = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2021/test.csv")

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# Split the column names by prefix: "cont*" are numerical, "cat*" are categorical
num_feat = list(filter(lambda name: name.startswith("cont"), train.columns))
cat_feat = list(filter(lambda name: name.startswith("cat"), train.columns))

In [None]:
# Visualise missing values across the numerical columns of the training set
missingno.matrix(train[num_feat], figsize=(18,4))

In [None]:
# Keep the test ids for the submission file, then strip the id column from both frames
id_col = test['id']
train = train.drop(columns="id")
test = test.drop(columns="id")

### Numerical Features

In [None]:
# Describe continuous features (transposed so each feature is one row)
train[num_feat].describe().T

In [None]:
# Temporarily add the target so it can be selected together with the numerical features.
# Guarded so that re-running this cell does not append "target" a second time.
if "target" not in num_feat:
    num_feat.append("target")

In [None]:
# Pairwise correlations of the columns listed in num_feat, drawn as an annotated heatmap
fig = plt.figure(figsize=(14, 9))
corr = train[num_feat].corr()
sns.heatmap(data=corr, annot=True, cmap='coolwarm', cbar=False)

There is some correlation between features but, interestingly, none of them is correlated with the target. As we can see in the plot below, there is essentially no correlation between feature 'cont9' and the target.

In [None]:
# Correlation of every feature with the target (the target's own entry removed), largest first
target_ser = corr['target'].drop('target').sort_values(ascending=False)

plt.figure(figsize=(10, 5))
sns.barplot(x=target_ser.index, y=target_ser.values, palette='cool')
plt.title("Correlation numeric features");

In [None]:
# Restore num_feat to the continuous features only.
# Guarded so that re-running this cell does not raise ValueError once "target" is gone.
if "target" in num_feat:
    num_feat.remove("target")

In [None]:
# One kernel-density panel per numerical feature, laid out on an 8x2 grid
fig = plt.figure(figsize=(18,30))

for i, col in enumerate(num_feat):
    plt.subplot(8,2, i+1)
    # `fill` is the current name of the deprecated `shade` option
    sns.kdeplot(x=train[col], color='b', fill=True)
    plt.grid()

# Adjust the spacing once, after all panels exist, instead of on every iteration
plt.tight_layout()
# plt.show() is the supported way to render from pyplot; Figure.show() targets GUI backends
plt.show()

In most cases the distribution of the continuous features seems to be multimodal, except for feature "cont6".

In [None]:
# Scatter each numerical feature against the target, one panel per feature on an 8x2 grid
fig = plt.figure(figsize=(18,30))

for i, col in enumerate(num_feat):
    plt.subplot(8,2, i+1)
    sns.scatterplot(x=col, y="target", data=train, alpha=0.3)

# Adjust the spacing once, after all panels exist, instead of on every iteration
plt.tight_layout()
# plt.show() is the supported way to render from pyplot; Figure.show() targets GUI backends
plt.show()

In [None]:
#Discretization of continuous variables

#from sklearn.preprocessing import KBinsDiscretizer

#est = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='quantile')

#train.loc[:,num_feat] = est.fit_transform(train.loc[:,num_feat])

Discretization didn't help to improve my base model.

#### Skewness and Kurtosis

In [None]:
# Kurtosis of each numerical feature in the training set
train[num_feat].kurtosis()

In [None]:
# Skewness of each numerical feature in the training set
train[num_feat].skew()

In [None]:
def signaltonoise(a, axis=0, ddof=0):
    """Return the signal-to-noise ratio (mean divided by standard deviation).

    Args:
        a: Array-like input; converted with ``np.asanyarray``.
        axis: Axis along which the mean and standard deviation are computed.
        ddof: Delta degrees of freedom passed to ``std``.

    Returns:
        ``mean / std`` along ``axis`` as a NumPy array, with 0 wherever the
        standard deviation is 0.
    """
    a = np.asanyarray(a)
    m = a.mean(axis)
    sd = a.std(axis=axis, ddof=ddof)
    # np.where evaluates m/sd for every element before selecting, so a zero
    # standard deviation would otherwise emit a divide-by-zero RuntimeWarning
    # even though that quotient is discarded.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(sd == 0, 0, m/sd)

# Report the signal-to-noise ratio of each numerical feature, rounded to two decimals
# (the format spec is ".2f"; "2f" would only set a minimum width and print six decimals)
for col in num_feat:
    print(f"Column '{col}' signal-to-noise ratio:  {signaltonoise(train[col]):.2f}")

#### Target column

In [None]:
# Histogram of the target, titled with its skewness (the stray no-op `plt` expression is removed)
sns.displot(train['target'])
plt.title(f"skewness: {train['target'].skew()}");

Our target column distribution is bimodal and is skewed to the left.

### Categorical features

In [None]:
# For each categorical column, list the values that occur in train but not in test
for col in cat_feat:
    train_values = set(train[col].unique())
    test_values = set(test[col].unique())
    set_diff = train_values.difference(test_values)
    print(f"Train and Test Dataset column: {col} has different of unuque value: {set_diff}")

In [None]:
# Frequency of each 'cat6' value in the training set
train['cat6'].value_counts()

In [None]:
# Frequency of each 'cat6' value in the test set
test['cat6'].value_counts()

There is a difference in column 'cat6', and that needs to be remembered when we one-hot encode or use the get_dummies method.

In [None]:
# Count plot of every categorical feature, one panel each on an 8x3 grid
fig = plt.figure(figsize=(18,30))

for i, col in enumerate(cat_feat):
    plt.subplot(8,3, i+1)
    sns.countplot(x=train[col], palette='mako_r')

# plt.show() is the supported way to render from pyplot; Figure.show() targets GUI backends
plt.show()

In [None]:
# Check distribution of categorical features against target feature.
fig = plt.figure(figsize=(18, 30))

for i, col in enumerate(cat_feat):
    plt.subplot(8, 2, i+1)
    sns.boxenplot(x=col, y='target', data=train)

# plt.show() is the supported way to render from pyplot; Figure.show() targets GUI backends
plt.show()

### LabelEncode categorical features

Normally, when converting categorical features, we would be able to distinguish between nominal and ordinal ones, but not in this case. I've noticed that some Kagglers have chosen a method appropriate for ordinal transformation. I think the only reason for that would be to reduce the dimensionality of the dataset, as it is already quite big for the computational power of our PC. For that reason I will use LabelEncoder from sklearn.

### Preparing the dataset for machine learning

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column: the encoder learns the mapping from
# the training column and the same mapping is then applied to train and test.
le = LabelEncoder()

for col in cat_feat:
    le.fit(train[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [None]:
# Check skewness again, this time for every column of the test set
test.skew()

I am going to remove column 'cat6' as it is skewed the most.

In [None]:
# Remove 'cat6' from both datasets and from the list of categorical feature names
for frame in (train, test):
    frame.drop(columns='cat6', inplace=True)
cat_feat.remove('cat6')

Creating dummy variables didn't improve the model; as a matter of fact, it made the model worse.

In [None]:
#def get_dummies(df):
    #dummy_df = pd.get_dummies(df[cat_feat], drop_first=True)
    #df.drop(cat_feat, axis=1, inplace=True)
    #new_df = pd.concat([dummy_df, df], axis=1)
    #return new_df

#train = get_dummies(train)
#test = get_dummies(test)

### Modeling ANN

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Names of all columns starting with "c" (i.e. the "cat*" and "cont*" features).
# NOTE(review): all_cols is not referenced again in the visible cells — confirm it is still needed.
all_cols = [col for col in train.columns if col.startswith("c")]

In [None]:
# Feature matrix = every column but the last; label vector = the last column.
# NOTE(review): relies on the target being the final column of `train` — confirm.
X = train.iloc[:, :-1].to_numpy()
y = train.iloc[:, -1].to_numpy()

In [None]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.1, random_state=45)

In [None]:
# We need to normalize the dataset for tensorflow
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler

In [None]:
# Learn the scaling statistics from the training part only, then apply them to both parts
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [None]:
# (rows, features) of the scaled training matrix
X_train.shape

In [None]:
# Create a simple model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Baseline network: four ReLU hidden layers of 23 units each, then one linear output unit
hidden_layers = [Dense(23, activation="relu") for _ in range(4)]
model = Sequential(hidden_layers + [Dense(1)])

# Train with Adam on mean absolute error
model.compile(loss='mae', optimizer='adam')

In [None]:
# Train the baseline for 10 epochs, evaluating on the hold-out split after every epoch
model.fit(x=X_train, y=y_train, validation_data=(X_val, y_val) , epochs=10, batch_size=128)

In [None]:
# Plot the per-epoch losses recorded during the baseline fit
loss_df = pd.DataFrame(model.history.history)
loss_df.plot();

**Model evaluation**

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# The model was compiled with loss='mae', so evaluate() returns the MAE on the hold-out split
base_mae = model.evaluate(X_val, y_val, verbose=0)
print(f"Base Model MAE: {base_mae}")

In [None]:
# Express the baseline MAE as a percentage of the mean target value
mean_y = y.mean()
off = (base_mae / mean_y) * 100
print(f"We are off about {off:.2f}%")

In [None]:
# model.evaluate(X_train, y_train, verbose=0)

In [None]:
# Baseline predictions for the hold-out split
base_pred = model.predict(X_val)

In [None]:
# RMSE = square root of the mean squared error between hold-out labels and predictions
base_rmse = np.sqrt(mean_squared_error(y_val, base_pred))
print(f"Base model RMSE: {base_rmse}")

OK, so this is our baseline. There is a lot to do in order to beat my best score from the previous notebook (0.84364). Well, we'll see; after all, we are doing it for fun and to learn something new.

In [None]:
# Import everything from tensorflow.keras: the models in this notebook are built with
# tensorflow.keras Sequential/Dense, and mixing in objects from the standalone `keras`
# package (callbacks, optimizers) can fail when the two packages differ in version.
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.optimizers import Adam, RMSprop, SGD

In [None]:
def create_ann():
    """Build and compile a dropout-regularised feed-forward regressor.

    Returns:
        A compiled Sequential model with three ReLU hidden layers of 64, 64 and
        54 units, each followed by Dropout(0.5), and a single linear output
        unit. It is compiled with the Adam optimizer and MAE loss.
    """
    hidden_units = (64, 64, 54)

    layers = []
    for units in hidden_units:
        layers.append(Dense(units, activation='relu'))
        layers.append(Dropout(0.5))
    # Single output unit without activation for regression
    layers.append(Dense(1))

    model = Sequential(layers)
    model.compile(optimizer='adam', loss="mae")
    return model


def evaluate_model(model, features=None, targets=None):
    """Print the RMSE of ``model`` on a validation set.

    Args:
        model: Fitted model exposing ``predict``.
        features: Validation feature matrix. Defaults to the notebook-level
            ``X_val`` at call time.
        targets: Validation labels. Defaults to the notebook-level ``y_val``
            at call time.

    Passing the data explicitly avoids depending on whichever split the global
    ``X_val``/``y_val`` names currently refer to (later cells rebind them).
    """
    if features is None:
        features = X_val
    if targets is None:
        targets = y_val
    y_pred = model.predict(features)
    rmse = np.sqrt(mean_squared_error(targets, y_pred))
    print(f"RMSE: {rmse}")

In [None]:
# Stop once the validation loss has not improved for 5 epochs, and roll the model back to
# its best epoch; otherwise the weights from 5 epochs past the optimum would be kept.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5,
                           restore_best_weights=True)

ann_model = create_ann()

ann_model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=128, epochs=100, callbacks=[early_stop])

In [None]:
# Plot the full training history; truncating it with .head() would show only the first 5 epochs
loss_df = pd.DataFrame(ann_model.history.history)
loss_df.plot();

In [None]:
# Print the RMSE of the dropout network
evaluate_model(ann_model)

Now let's use KFold to make sure we train our model on different samples, and then average the results.

In [None]:
from sklearn.model_selection import KFold

In [None]:
def create_ann():
    """Build and compile the network used for cross-validation.

    Returns:
        A compiled Sequential model with three ReLU hidden layers of 64 units,
        each followed by Dropout(0.5), and a single linear output unit. It is
        compiled with RMSprop (learning rate 0.001) and mean-squared-error loss.
    """
    # Instantiate a model
    model = Sequential()
    # Add hidden layer 
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    # Add hidden layer
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    # Add hidden layer
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    # Add output layer
    model.add(Dense(1))
    
    # Compile the model.
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated.
    model.compile(optimizer=RMSprop(learning_rate=0.001), loss="mean_squared_error")
    
    return model

In [None]:
# 5-fold cross-validation: every fold trains a fresh network on 4/5 of the rows and
# scores it on the remaining 1/5, so each row receives exactly one out-of-fold prediction.
# NOTE: the loop rebinds the notebook-level X_train / X_val / y_train / y_val names.
kf = KFold(n_splits=5, shuffle=True, random_state=45)
oof = np.zeros(len(X))
score_list = []

# Raw validation predictions of each fold, kept for inspection
y_pred_list = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    X_train, X_val = X[train_idx], X[test_idx]
    y_train, y_val = y[train_idx], y[test_idx]
    
    # Fit the scaler on the training part of the fold only
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_val = sc.transform(X_val)
    
    # Roll back to the best epoch when stopping early
    early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3,
                               restore_best_weights=True)
    
    ann_model = create_ann()
    ann_model.fit(X_train, 
                  y_train, 
                  validation_data=(X_val, y_val), 
                  batch_size=128, epochs=30, 
                  callbacks=[early_stop])
    
    fold_pred = ann_model.predict(X_val)
    y_pred_list.append(fold_pred)
    
    # Store this fold's own predictions. Averaging y_pred_list here would mix
    # predictions made for different rows, because each fold validates on a
    # different subset of the data.
    oof[test_idx] = fold_pred.reshape(len(X_val),)
    score = np.sqrt(mean_squared_error(y_val, oof[test_idx]))
    score_list.append(score)
    print(f"RMSE fold -{fold} : {score}")
    
print(f"RMSE mean 5 folds: {np.mean(score_list)}")

In [None]:
# Repeat the cross-validation summary: mean of the per-fold RMSE scores
print(f"RMSE mean 5 folds: {np.mean(score_list)}")

### Submit to Kaggle

In [None]:
# Re-fit the scaler on the full training matrix and apply it to the test features.
# `X` is a NumPy array, so the test DataFrame is passed as an array as well; this keeps
# the scaler's fit and transform inputs of the same kind (no feature-name mismatch).
sc = StandardScaler()
scaled_X = sc.fit_transform(X)
scaled_test = sc.transform(test.values)

# Continue training the model currently bound to `ann_model` on all rows, then predict.
# No epochs/batch_size are given, so the Keras defaults apply.
ann_model.fit(scaled_X, y)
ann_y_pred = ann_model.predict(scaled_test)

In [None]:
# Assemble the submission: saved test ids paired with the flattened predictions
sub_k = pd.DataFrame({"id": id_col,
                      "target": ann_y_pred.reshape(-1)})

In [None]:
# Write the submission file without the DataFrame index column
sub_k.to_csv("sub_tbs_feb_ann.csv", index=False)

My score on submission was 0.87081, and since this competition is decided at the 4th and 5th decimal place, this score isn't good at all.
So, dear Kagglers, can someone point me in the right direction and tell me what else can be done to improve this neural network?