# TPS Jan 2022: Tree, Kernel, NN and ensemble models

### This notebook presents the use of some typical models, such as tree models (Random Forest, XGBoost), a kernel model (Support Vector Machines) and Artificial Neural Networks, as well as their ensemble, to solve the TPS problem of January 2022.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.metrics import mean_squared_error as mse

from xgboost import XGBRegressor as xgb

from sklearn.svm import SVR

from tensorflow.keras import models, layers, optimizers, metrics

import warnings
# Suppress every warning for the rest of the notebook.
warnings.simplefilter('ignore')

# Seed NumPy's global random number generator.
# NOTE(review): this presumably does not seed TensorFlow's own generator — confirm
# if reproducible neural-network training is required.
np.random.seed(2022)

# Data

In [None]:
# Read the competition's training and test tables from the input directory.
train = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/train.csv'
)
test = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/test.csv'
)

# Show five random training rows as the cell output.
train.sample(5)

In [None]:
# Report how many rows the training table contains.
print(f'There are {train.shape[0]} samples in the train dataset')

In [None]:
# One-hot encode the three categorical columns of each split.
categorical_cols = ['country', 'store', 'product']
X_train = pd.get_dummies(train[categorical_cols])
X_test = pd.get_dummies(test[categorical_cols])

# Encoding the splits separately only produces matching columns when both
# contain exactly the same categories. Align the test frame on the training
# columns (dummies missing from test become 0, dummies unseen in training are
# dropped) so every model receives the feature layout it was fitted on.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Regression target.
y_train = train['num_sold']

# Parse the dates once; they are turned into calendar features later.
date_train = pd.to_datetime(train['date'])
date_test = pd.to_datetime(test['date'])

# The raw frames are not needed beyond this point.
del train, test

## Visualize target

The distribution of `num_sold` is skewed.


In [None]:
# Histogram of the raw target, drawn on an explicitly created axes object.
fig, ax = plt.subplots(figsize=(10, 7))
y_train.hist(ax=ax, bins=100, grid=False, color='green')
ax.set_xlabel('Num sold', fontsize=16)
ax.set_ylabel('Frequence', fontsize=16)
plt.show()

## Transform target

The distribution becomes more symmetric when we take the logarithm of `num_sold`.

In [None]:
# Histogram of the log-transformed target.
plt.figure(figsize=(10,7))
np.log(y_train).hist(bins=100, grid=False, color='green')
# The plotted quantity is log(num_sold), not num_sold, so label it as such.
plt.xlabel('log(Num sold)', fontsize=16)
plt.ylabel('Frequence', fontsize=16)
plt.show()

# Feature engineering

In [None]:
# Derive calendar features for each split from that split's OWN dates.
# Each feature frame is paired with its matching date series: taking the
# training dates for both frames would give the test rows the calendar
# features of unrelated training rows.
for df, dates in [(X_train, date_train), (X_test, date_test)]:
    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    # ISO week number, cast to a plain int64 column.
    df['week'] = dates.dt.isocalendar().week.astype('int64')
    df['day'] = dates.dt.day
    df['dayofweek'] = dates.dt.dayofweek

In [None]:
# Show five random rows of the engineered training features.
X_train.sample(5)

## Transform target, normalize features and split train/validation

In [None]:
# Train on log(num_sold); predictions are mapped back with exp() at submission time.
y_train = np.log(y_train)

# Standardize every feature using statistics computed on the training set
# only, and apply the very same shift and scale to the test set.
feature_mean = X_train.mean()
feature_std = X_train.std()
# A constant training column has a standard deviation of 0; dividing by it
# would fill the column with NaN/inf. Use 1 instead so it is only centered.
feature_std = feature_std.replace(0, 1)

# Select the test columns in training order before scaling so both frames
# end up with the same layout (a missing column raises a KeyError).
X_test = (X_test[X_train.columns] - feature_mean) / feature_std
X_train = (X_train - feature_mean) / feature_std

X_train.sample(5)

In [None]:
# Hold out 30% of the training rows for validation, with a fixed seed.
X_train_subset, X_val, y_train_subset, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=2022,
)

# Models

In [None]:
# Collects each model's test-set predictions; they are averaged at submission time.
y_test_all_models = []

## Random Forest Regressor

### Training

In [None]:
# Random forest with 100 trees. random_state is pinned (same seed as the
# train/validation split) so re-running this cell rebuilds the same forest
# instead of depending on how far the global NumPy generator has advanced.
model = rfr(n_estimators=100, random_state=2022)
model.fit(X_train_subset, y_train_subset)

### Validation

In [None]:
# RMSE on the rows the model was fitted on.
y_train_pred = model.predict(X_train_subset)
rmse_train = np.sqrt(mse(y_train_subset, y_train_pred))

# RMSE on the held-out validation rows.
y_pred = model.predict(X_val)
rmse_val = np.sqrt(mse(y_val, y_pred))

In [None]:
# Predicted-vs-true scatter plots for the training and validation subsets,
# drawn side by side with the identity line as reference.
fig, axes = plt.subplots(1, 2, figsize=(20, 7))

panels = [
    ('Train subset', y_train_subset, y_train_pred, rmse_train),
    ('Validation subset', y_val, y_pred, rmse_val),
]

for ax, (title, truth, predicted, rmse) in zip(axes, panels):
    lo, hi = min(truth), max(truth)
    ax.plot(truth, predicted, 'r.')
    # Perfect predictions would fall on this diagonal.
    ax.plot([lo, hi], [lo, hi], 'k')
    ax.set_xlim(lo, hi)
    ax.set_ylim(lo, hi)
    ax.set_xlabel('True', fontsize=16)
    ax.set_ylabel('Prediction', fontsize=16)
    ax.text(7, 6, f'RMSE={round(rmse,3)}')
    ax.set_title(title, fontsize=20)

plt.show()

### Prediction

In [None]:
# Predict the test set with the fitted random forest and keep the result for the ensemble.
y_test_rfr = model.predict(X_test)
y_test_all_models.append(y_test_rfr)

## Extreme Gradient Boosting Regressor

### Training

In [None]:
# Gradient-boosted trees: at most 100 boosting rounds with a learning rate of 0.3.
model = xgb(n_estimators=100, learning_rate=0.3)
# Fit with early stopping monitored on the validation subset.
# NOTE(review): because the validation subset drives early stopping, the validation
# RMSE computed afterwards is presumably slightly optimistic — confirm if it matters.
# NOTE(review): newer xgboost releases appear to expect early_stopping_rounds on the
# constructor rather than in fit() — verify against the installed version.
model.fit(X_train_subset, y_train_subset, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=0)

### Validation

In [None]:
# Predict both subsets, then score each against its true (log-scale) target.
y_train_pred, y_pred = (
    model.predict(features) for features in (X_train_subset, X_val)
)

rmse_train, rmse_val = (
    np.sqrt(mse(truth, predicted))
    for truth, predicted in ((y_train_subset, y_train_pred), (y_val, y_pred))
)

In [None]:
# Side-by-side predicted-vs-true scatter plots (training subset on the left,
# validation subset on the right) with the identity line for reference.
fig, (ax_train, ax_val) = plt.subplots(1, 2, figsize=(20, 7))

for ax, truth, predicted, rmse, title in (
    (ax_train, y_train_subset, y_train_pred, rmse_train, 'Train subset'),
    (ax_val, y_val, y_pred, rmse_val, 'Validation subset'),
):
    lower, upper = min(truth), max(truth)
    ax.plot(truth, predicted, 'r.')
    # A perfect model would place every point on this diagonal.
    ax.plot([lower, upper], [lower, upper], 'k')
    ax.set_xlim(lower, upper)
    ax.set_ylim(lower, upper)
    ax.set_xlabel('True', fontsize=16)
    ax.set_ylabel('Prediction', fontsize=16)
    ax.text(7, 6, f'RMSE={round(rmse,3)}')
    ax.set_title(title, fontsize=20)

plt.show()

### Prediction

In [None]:
# Predict the test set with the fitted XGBoost model and keep the result for the ensemble.
y_test_xgb = model.predict(X_test)
y_test_all_models.append(y_test_xgb)

## Support Vector Machines

### Training

In [None]:
# Support vector regression with an RBF kernel (C=100, gamma=1), fitted on the training subset.
model = SVR(kernel='rbf', C=100, gamma=1)
model.fit(X_train_subset, y_train_subset)

### Validation

In [None]:
# In-sample error: how well the model reproduces the rows it was fitted on.
y_train_pred = model.predict(X_train_subset)
rmse_train = np.sqrt(mse(y_train_subset, y_train_pred))

# Out-of-sample error on the validation subset.
y_pred = model.predict(X_val)
rmse_val = np.sqrt(mse(y_val, y_pred))

In [None]:
# Predicted-vs-true scatter plots for both subsets, one panel each, with the
# identity line showing where perfect predictions would lie.
fig, axes = plt.subplots(1, 2, figsize=(20, 7))

panels = [
    ('Train subset', y_train_subset, y_train_pred, rmse_train),
    ('Validation subset', y_val, y_pred, rmse_val),
]

for ax, (title, truth, predicted, rmse) in zip(axes, panels):
    lo, hi = min(truth), max(truth)
    ax.plot(truth, predicted, 'r.')
    ax.plot([lo, hi], [lo, hi], 'k')
    ax.set_xlim(lo, hi)
    ax.set_ylim(lo, hi)
    ax.set_xlabel('True', fontsize=16)
    ax.set_ylabel('Prediction', fontsize=16)
    ax.text(7, 6, f'RMSE={round(rmse,3)}')
    ax.set_title(title, fontsize=20)

plt.show()

### Prediction

In [None]:
# Predict the test set with the fitted SVR and keep the result for the ensemble.
y_test_svm = model.predict(X_test)
y_test_all_models.append(y_test_svm)

## Artificial Neural Networks

### Network

In [None]:
# Feed-forward regressor: one hidden layer of 512 ReLU units followed by a
# single output unit, taking one input per feature column.
n_features = X_train.shape[1]
model = models.Sequential([
    layers.Dense(512, activation='relu', input_shape=(n_features,)),
    layers.Dense(1),
])

# Optimize the MSE loss with RMSprop; MAE is tracked as an additional metric.
model.compile(
    optimizer=optimizers.RMSprop(learning_rate=0.01),
    loss='mse',
    metrics=['mae'],
)

### Training

In [None]:
# Train for a fixed 3000 epochs on the training subset. No validation data or
# callbacks are passed, so training never stops early.
history = model.fit(X_train_subset, y_train_subset, epochs=3000, batch_size=128, verbose=1)

### Validation

In [None]:
# Predict both subsets, then compute the RMSE of each against its true target.
y_train_pred, y_pred = (
    model.predict(features) for features in (X_train_subset, X_val)
)

rmse_train, rmse_val = (
    np.sqrt(mse(truth, predicted))
    for truth, predicted in ((y_train_subset, y_train_pred), (y_val, y_pred))
)

In [None]:
# Predicted-vs-true scatter plots for the network: training subset on the
# left, validation subset on the right, identity line as reference.
fig, (ax_train, ax_val) = plt.subplots(1, 2, figsize=(20, 7))

for ax, truth, predicted, rmse, title in (
    (ax_train, y_train_subset, y_train_pred, rmse_train, 'Train subset'),
    (ax_val, y_val, y_pred, rmse_val, 'Validation subset'),
):
    lower, upper = min(truth), max(truth)
    ax.plot(truth, predicted, 'r.')
    ax.plot([lower, upper], [lower, upper], 'k')
    ax.set_xlim(lower, upper)
    ax.set_ylim(lower, upper)
    ax.set_xlabel('True', fontsize=16)
    ax.set_ylabel('Prediction', fontsize=16)
    ax.text(7, 6, f'RMSE={round(rmse,3)}')
    ax.set_title(title, fontsize=20)

plt.show()

### Prediction

In [None]:
# Predict the test set with the network, then flatten the predictions to one
# dimension before storing them alongside the other models' outputs.
y_test_ann = model.predict(X_test)
flat_ann_pred = y_test_ann.reshape(-1)
y_test_all_models.append(flat_ann_pred)

# Submission

In [None]:
submission = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')

# Ensemble model: average the models' log-scale predictions with equal weight,
# then map the average back to the original sales scale.
ensemble_log_pred = np.mean(y_test_all_models, axis=0)
ensemble_pred = np.exp(ensemble_log_pred)

# Round to the nearest integer before casting. A bare astype('int64')
# truncates toward zero and would push every prediction downward.
submission['num_sold'] = np.rint(ensemble_pred).astype('int64')
submission.to_csv('submission.csv', index=False)

submission.sample(5)