# Tabular Playground Series – February 2021

This notebook contains

- necessary preprocessing of data (i.e. processing of categorical data)
- EDA
- feature engineering
- different models

# Contents

<a name="toc"></a>

<ol>
<li><a href='#env'>Load data and import libs</a></li>
<li><a href='#preprocessing'>Data preprocessing</a></li>
<li><a href='#nn'>Models</a>
<ul>
    <li><a href='#nn'>Simple neural network with TensorFlow</a></li>
    <li><a href='#rf'>Random Forest Regressor</a></li>
</ul>
</li>
    
<li><a href='#result'>Predict and submit</a></li>
</ol>

# Load data and import libs <a name="env"></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
# All three competition files live in the same input directory.
DATA_DIR = '/kaggle/input/tabular-playground-series-feb-2021'

df = pd.read_csv(DATA_DIR + '/train.csv')                 # training data
dft = pd.read_csv(DATA_DIR + '/test.csv')                 # test data
subm = pd.read_csv(DATA_DIR + '/sample_submission.csv')   # sample submission file

In [None]:
# Preview the first three rows of the training data.
df.head(3)

In [None]:
# Preview the first three rows of the test data.
dft.head(3)

There are categorical (catX) and continuous (contY) features.

In [None]:
# Column dtypes and non-null counts for the training data.
df.info()

In [None]:
# Column dtypes and non-null counts for the test data.
dft.info()

There are no NaN values in data.

In [None]:
# Box plot of the target: shows the spread and the extreme values at a glance.
fig, ax = plt.subplots(figsize=(20, 2))
ax.boxplot(df['target'], vert=False)
ax.set_title('target (box plot)')
ax.grid(True)

# Histogram of the target. sns.displot is a figure-level function that always opens
# its own figure, so pairing it with plt.figure() left an empty figure behind and
# ignored the requested size; the axes-level sns.histplot draws on the axes we pass.
fig, ax = plt.subplots(figsize=(20, 2))
sns.histplot(df['target'], ax=ax)
ax.set_title('target (histogram)')
plt.show()

There are some outliers in the target values.

It makes sense to drop target=0.

<a href="#toc">Back to Contents</a>

# Data preprocessing <a name="preprocessing"></a>

In [None]:
# Collect every category letter that occurs in any of the ten cat columns of the
# training data.
vocab = set()
for col in ["cat{}".format(i) for i in range(10)]:
    vocab = vocab | set(df[col].unique())

# Give each letter an integer code, numbered in the set's iteration order.
# NOTE(review): a set of strings has no guaranteed iteration order and it can change
# between interpreter sessions, so which code a letter receives is arbitrary --
# avoid relying on a specific code further down.
dict_replace = {letter: code for code, letter in enumerate(vocab)}
dict_replace

In [None]:
cat_columns = ["cat{}".format(i) for i in range(10)]


def _one_hot_categoricals(frame):
    """Replace the letters in the cat columns of `frame` with their integer codes
    from `dict_replace` (modifying `frame` in place), then return a one-hot
    encoded copy of the frame."""
    for name in cat_columns:
        frame[name] = frame[name].map(dict_replace)
    return pd.get_dummies(frame, columns=cat_columns)


# Apply the same encoding to the training and the test frame.
df = _one_hot_categoricals(df)
dft = _one_hot_categoricals(dft)

One column (besides target) from df.columns is not present in dft.columns.

In [None]:
# Number of columns in the encoded training frame vs. the encoded test frame.
tuple(frame.columns.shape for frame in (df, dft))

In [None]:
# Print every training column that the test frame lacks, in training column order.
only_in_train = df.columns[~df.columns.isin(dft.columns)]
for name in only_in_train:
    print(name)

Let's add the missing column with zero values to dft.

In [None]:
# Give the test frame exactly the training feature columns, in the same order.
# Columns the test data lacks are created and filled with zeros; columns that exist
# only in the test data are dropped. Aligning by name avoids hard-coding an encoded
# column name (the integer codes are arbitrary) and keeps the column order identical
# to the training matrix, which matters because the models are fed plain arrays.
dft = dft.reindex(columns=df.columns.drop('target'), fill_value=0)

<a href="#toc">Back to Contents</a>

# Simple neural network with TF <a name="nn"></a>

In [None]:
# List the columns of the encoded training frame.
df.columns

In [None]:
# Feature matrix: every column except the row id and the target, as a NumPy array.
X = df.drop(columns=['id', 'target']).to_numpy()
# Regression target, kept as a pandas Series.
y = df['target']

In [None]:
# Width of the first two dense layers: one unit per input feature.
NUM = X.shape[1]

model = Sequential([
    # Two full-width ReLU layers, each followed by dropout that randomly zeroes
    # 20% of the layer's outputs during training to limit overfitting.
    Dense(NUM, input_dim=NUM, activation='relu'),
    Dropout(0.2),
    Dense(NUM, activation='relu'),
    Dropout(0.2),
    # Narrower ReLU layer (a quarter of the width) with lighter dropout.
    Dense(NUM // 4, activation='relu'),
    Dropout(0.1),
    # Single linear unit: the regression output.
    Dense(1, activation=None),
])

# Other optimizers (e.g. Adam or RMSprop from tf.keras.optimizers) can be swapped in here.
model.compile(loss='mse', optimizer='SGD', metrics=['mse', 'mae'])

In [None]:
# Train for 10 epochs, holding out 20% of the rows for validation; the returned
# History object keeps the per-epoch metrics plotted below.
history = model.fit(X, y, epochs=10, validation_split=0.2)

In [None]:
# Two stacked panels: loss on top, mean absolute error below, each showing the
# training curve and the validation curve per epoch.
fig, (ax_loss, ax_mae) = plt.subplots(2, 1, figsize=(20, 5))

panels = (
    (ax_loss, 'loss', 'val_loss'),
    (ax_mae, 'mae', 'val_mae'),
)
for ax, train_key, val_key in panels:
    ax.plot(history.history[train_key], label=train_key)
    ax.plot(history.history[val_key], label=val_key)
    ax.grid(True)
    ax.legend()

plt.show()

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

# Random Forest Regressor <a name="rf"></a>

In [None]:
# Hold out 30% of the rows (shuffled with a fixed seed) for evaluating the forest.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=4,
    shuffle=True,
)

In [None]:
# Report the shape of each piece of the train/test split.
splits = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for data_name, part in splits.items():
    print("{} {}".format(data_name, part.shape))

In [None]:
# Candidate forest sizes: 10, 20, ..., 90. A forest needs at least one tree, so the
# grid starts at 10; a candidate of 0 trees is rejected by the estimator and every
# cross-validation fit for it fails.
param_grid = {'n_estimators': range(10, 100, 10)}
# Fixed seed so that repeated runs grow the same forests.
rf = RandomForestRegressor(random_state=4)
# 5-fold cross-validated search over the grid, using all available cores.
rf_gs = GridSearchCV(rf, param_grid=param_grid, n_jobs=-1, cv=5, verbose=1)

In [None]:
# Run the cross-validated grid search on the training split.
rf_gs.fit(X_train, y_train)

In [None]:
# Score the best forest found by the search on the hold-out split. The search
# object must be used here: GridSearchCV fits clones of the estimator it is given,
# so `rf` itself is never fitted and calling rf.score() raises NotFittedError.
rf_gs.score(X_test, y_test)

<a href="#toc">Back to Contents</a>

# Predict and submit <a name="result"></a>

Contents of submission file

In [None]:
# Show the first row of the sample submission to see the expected columns.
subm.head(1)

In [None]:
# Predict with the fitted search object, which delegates to the best refitted
# forest; `rf` itself is never fitted because GridSearchCV trains clones of it.
# The test columns are selected by name in the training feature order, since the
# model receives a plain array and matches features by position only.
feature_columns = df.columns.drop(['id', 'target'])
prediction = rf_gs.predict(dft[feature_columns].values)

In [None]:
# Pair each test id with its predicted target and write the submission file.
submission = pd.DataFrame({'id': dft['id'], 'target': np.squeeze(prediction.T)})
submission.to_csv("submission.csv", header=True, index=False)

#### Conclusion


The result after submission on Kaggle is <b>0.86841</b>.
You can try to improve the result with another NN architecture (change activation functions, number of layers and neurons, optimizer, ...).

<font color='green' size=20>Good luck!</font>

<a href="#toc">Back to Contents</a>