# Tabular Playground: Playing with sklearn

In this notebook, we will: 
- Run a shallow EDA, just enough to build baseline models
- Build a simple model with Scikit Learn and XGBoost
- Stack models with Scikit Learn
- Tune hyperparameters with Scikit Learn
- Visualize the predictions and create a submission

In [None]:
# Imports
# Data
import numpy as np
import pandas as pd 
# Viz
import seaborn as sns
import matplotlib.pyplot as plt 
# Tensorflow
import tensorflow as tf 
# Scikit Learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer, MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error
import sklearn
sklearn.set_config(display='diagram')
# XGBoost
from xgboost import XGBRegressor

# Warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Loading data
root = '/kaggle/input/tabular-playground-series-feb-2021/'
train = pd.read_csv(f'{root}/train.csv')
assert all(train.isna().sum() == 0), 'Some NA in the data'

# Features: every column whose name contains 'cat' (categorical) or 'cont' (continuous)
features = [c for c in train.columns if ('cat' in c) or ('cont' in c)]
target = 'target'
cat_features = [c for c in train.columns if 'cat' in c]
cont_features = [c for c in train.columns if 'cont' in c]

# Train, val
# Fixed seed so the hold-out split (and every metric computed on it) is the same
# after a kernel restart; without it each run validates on different rows.
SEED = 42
train, test = train_test_split(train, random_state=SEED)

# EDA

### Categorical Features
* Letters are the different classes
* No NaNs
* The features can be imbalanced (e.g. `cat0`, `cat6`). Some even appear to follow a Zipf-like distribution. 
* One-Hot encoding is a good baseline to handle those features. 

In [None]:
# Sorted union of every level that appears in any categorical column
categories = np.sort(list({level for f in cat_features for level in train[f].unique()}))
# Count each level with a single value_counts pass per column (instead of one full
# column scan plus a scalar .loc write per (column, level) pair). Levels that a
# column never takes are reported as 0.
category_summary = pd.DataFrame(
    {
        f: train[f].value_counts().reindex(categories, fill_value=0).to_numpy()
        for f in cat_features
    },
    index=categories,
    columns=cat_features,
)
display(category_summary.astype(int))

### Continuous Features
* They are all in the interval $[0, 1]$, somewhat already transformed. 
* Some features are discontinuous (eg. `cont1`)
* Many features look like a mixture of Gaussians; we could look into fitting those
* No NaNs

In [None]:
# Distribution of every continuous feature and of the target, one panel per column.
plot_columns = cont_features + [target]
n_cols = 5
# Ceil division so the grid always has room for every column (3 rows for 15 columns).
n_rows = max(1, -(-len(plot_columns) // n_cols))
fig, axs = plt.subplots(n_rows, n_cols, figsize=(30, 5 * n_rows), squeeze=False)
panels = axs.ravel()
for ax, f in zip(panels, plot_columns):
    # histplot(kde=True, stat='density') is the replacement for the deprecated
    # distplot: normalised histogram with a KDE curve on top.
    sns.histplot(train[f], kde=True, stat='density', ax=ax)
    ax.set_title(f)
# Hide panels left over when the column count is not a multiple of n_cols.
for ax in panels[len(plot_columns):]:
    ax.set_visible(False)
# plt.show() rather than fig.show(): the latter warns on the non-GUI inline backend.
plt.show()

### Target

See plot above
* Not normally distributed, looks more bi-modal
* Huge bias that needs to be taken into account, we will use: `sklearn.compose.TransformedTargetRegressor`

In [None]:
# Helpers
def val_model(model, test): 
    """Plot predictions against the true target and print MSE / RMSE.

    Args:
        model: fitted object exposing ``predict``; it is called on ``test[features]``.
        test: DataFrame holding the notebook-level ``features`` columns and the
            ``target`` column.

    Returns:
        None. Shows a scatter plot (with the identity line as reference) and
        prints the two metrics.
    """
    predicted = model.predict(test[features])
    actual = test[target]

    # Scatter of truth vs. prediction; a perfect model would sit on the red diagonal
    plt.figure(figsize=(10, 7))
    plt.scatter(actual, predicted, marker='+', label='Prediction')
    diagonal = [actual.min(), actual.max()]
    plt.plot(diagonal, diagonal, c='r', label='Perfect Prediction')
    plt.xlabel('Target')
    plt.ylabel('Prediction')
    plt.legend()
    plt.show()

    # Metric
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    print(f'MSE: {mse}\tRMSE: {rmse}')
    
def create_submission(model, fp=None): 
    """Predict on the competition test file and build the submission table.

    Reads ``test.csv`` from the notebook-level ``root`` folder, checks that the
    feature columns contain no missing values, and predicts with ``model``.

    Args:
        model: fitted object exposing ``predict``; called on the ``features`` columns.
        fp: output CSV path. When falsy (default) nothing is written.

    Returns:
        The submission DataFrame (columns ``id`` and ``target``) when ``fp`` is
        falsy; otherwise None, after displaying the head and saving to ``fp``.
    """
    holdout = pd.read_csv(f'{root}/test.csv')
    assert all(holdout[features].isna().sum() == 0), 'Unexpected NAs'
    predicted = model.predict(holdout[features])

    submission = pd.DataFrame(columns=['id', 'target'])
    submission['id'] = holdout['id']
    submission['target'] = predicted

    # No path given: hand the frame back to the caller instead of saving it
    if not fp:
        return submission

    display(submission.head())
    print(f'Saving: {fp}')
    submission.to_csv(fp, index=False)

# Preprocessing

In [None]:
# Preprocessing
# Continuous columns get median imputation; categorical columns are one-hot encoded,
# with levels unseen at fit time ignored (encoded as all zeros).
column_transformer = ColumnTransformer(
    [
        (
            'numerical', 
            SimpleImputer(strategy='median'), 
            cont_features, 
        ), 
        (
            'categorical', 
            OneHotEncoder(handle_unknown='ignore'), 
            cat_features, 
        )
    ],
    # Always return a dense array. With the default threshold (0.3) the stacked
    # output silently becomes sparse whenever its density drops below 30%, and the
    # mean-centring StandardScaler that follows refuses sparse input.
    sparse_threshold=0,
)
scaler = StandardScaler()
preprocessing = Pipeline([
    ('column', column_transformer), 
    ('scaler', scaler), 
])

# Scikit Learn
### A simple model - XGB

In [None]:
# Baseline model: a shallow XGBoost regressor behind the shared preprocessing,
# trained on a quantile-transformed target.
baseline_xgb = XGBRegressor(
    objective='reg:squarederror',
    tree_method='gpu_hist',
    n_estimators=100,
    max_depth=2,
)
baseline_pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('regressor', baseline_xgb),
    ],
    verbose=False,
)
# TransformedTargetRegressor applies the transformer to the target before fitting
# and inverts it on the predictions.
model = TransformedTargetRegressor(
    regressor=baseline_pipeline,
    transformer=QuantileTransformer(),
)

model.fit(train[features], train[target])

In [None]:
# Score the XGBoost baseline on the held-out validation split
val_model(model, test)

### Stacking

In [None]:
# Base learners
# NOTE(review): `rf` is built but left out of the stack below. Its criterion name
# 'mse' was renamed 'squared_error' in scikit-learn 1.0 - update it before
# re-enabling the model on a recent scikit-learn.
rf = RandomForestRegressor(
    criterion='mse',
    n_estimators=50,
    max_depth=2,
    max_features=4,
)
xgb = XGBRegressor(
    objective='reg:squarederror',
    tree_method='gpu_hist',
    n_estimators=100,
    max_depth=2,
)
ridge = Ridge()
lasso = Lasso()

# Level-0 estimators; a Ridge meta-learner combines their 5-fold out-of-fold predictions
base_learners = [
    ('xgboost', xgb),
    ('ridge', ridge),
    ('lasso', lasso),
]
stacked = StackingRegressor(
    estimators=base_learners,
    final_estimator=Ridge(),
    cv=5,
    passthrough=False,
    verbose=0,
)

# Full model: shared preprocessing -> stack, fitted on a standardised target
stacked_pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('regressor', stacked),
    ],
    verbose=False,
)
stacked_model = TransformedTargetRegressor(
    regressor=stacked_pipeline,
    transformer=StandardScaler(),
)

stacked_model.fit(train[features], train[target])

In [None]:
# Score the stacked model on the held-out validation split
val_model(stacked_model, test)

### Hyperparameters

In [None]:
# List every tunable parameter name of the stacked model, one per line
print(*stacked_model.get_params().keys(), sep='\n')

In [None]:
# Parameter names are nested paths: the first 'regressor' is the pipeline wrapped by
# TransformedTargetRegressor, the second is the pipeline step holding the stack.
stack_path = 'regressor__regressor'
param_grid = {
    f'{stack_path}__ridge__alpha': [1],             # Ridge penalty
    f'{stack_path}__lasso__alpha': [1],             # Lasso penalty
    f'{stack_path}__xgboost__n_estimators': [100],  # XGB number of estimators
    f'{stack_path}__xgboost__max_depth': [2],       # XGB depth
}

# 5-fold grid search scored on negative MSE (higher is better)
cross_val_stacked = GridSearchCV(
    stacked_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=False,
    verbose=False,
)

cross_val_stacked.fit(train[features], train[target])

In [None]:
# Score the grid-searched stacked model on the held-out validation split
val_model(cross_val_stacked, test)

### Model Submission

In [None]:
# Predict on the competition test file with the stacked model and save submission.csv
# NOTE(review): this submits `stacked_model`, not the grid-searched
# `cross_val_stacked` - confirm that is intended.
create_submission(stacked_model, fp='submission.csv')

# TensorFlow Example

In [None]:
# Preprocess the data the same way as before: fit on the training split only,
# then apply the fitted transformation to the validation split.
train_X = preprocessing.fit_transform(train[features])
test_X = preprocessing.transform(test[features])
train_y = train[target].to_numpy()
test_y = test[target].to_numpy()

In [None]:
def build_tf_neural_network(input_shape=70, n_layers=3, units=1024, leak_relu=0.1, dropout=0, skip_conn=True, normalization=False):
    """Build and compile a fully connected Keras regression network.

    The network stacks ``n_layers`` blocks of
    [optional BatchNormalization -> Dense(units, LeakyReLU) -> Dropout], each
    optionally concatenated with the raw inputs, followed by three linear
    Dense(64) layers and a single linear output unit.

    Args:
        input_shape: number of input features (int), or the per-sample shape as a
            tuple/list.
        n_layers: number of hidden blocks.
        units: width of each hidden Dense layer.
        leak_relu: slope passed to the LeakyReLU activation.
        dropout: dropout rate applied after each hidden Dense layer.
        skip_conn: if True, concatenate the raw inputs to each block's output.
        normalization: if True, apply BatchNormalization at the start of each block.

    Returns:
        A ``tf.keras.Model`` compiled with Adam (learning rate 1e-5), mean squared
        error loss and a root-mean-squared-error metric.
    """
    # `(input_shape)` is just the int itself, not a 1-tuple. Keras expects a shape
    # tuple, so normalise ints to `(n,)` and pass tuples/lists through as tuples.
    if isinstance(input_shape, (tuple, list)):
        shape = tuple(input_shape)
    else:
        shape = (int(input_shape),)
    inputs = tf.keras.Input(shape=shape, name="input")
    x = inputs
    for i_layer in range(n_layers): 
        if normalization: 
            x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dense(units, activation=tf.keras.layers.LeakyReLU(leak_relu))(x)
        x = tf.keras.layers.Dropout(dropout)(x)
        if skip_conn: 
            # Skip connection: re-inject the raw features after every block
            x = tf.keras.layers.Concatenate()([x, inputs])
    # Some more layers (no activation, i.e. purely linear)
    x = tf.keras.layers.Dense(64, activation=None)(x)
    x = tf.keras.layers.Dense(64, activation=None)(x)
    x = tf.keras.layers.Dense(64, activation=None)(x)
    outputs = tf.keras.layers.Dense(1, activation=None)(x)
    model = tf.keras.Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.00001), 
        loss=tf.keras.losses.MeanSquaredError(), 
        metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )
    return model

In [None]:
# Size the input layer from the preprocessed matrix instead of relying on the
# hard-coded default of 70: the one-hot width depends on which category levels
# are present in the training split.
nn_model = build_tf_neural_network(input_shape=train_X.shape[1])
nn_model.summary()

In [None]:
# Training: stop early once the validation loss has gone 10 epochs without improving
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, min_delta=0)
fit_options = dict(
    epochs=50,
    batch_size=4096,
    validation_data=(test_X, test_y),
    callbacks=[early_stopping],
    verbose=False,
)
history = nn_model.fit(x=train_X, y=train_y, **fit_options)

# History: training vs. validation loss per epoch
loss_fig, loss_ax = plt.subplots(figsize=(10, 6))
for curve_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Val Loss')):
    loss_ax.plot(history.history[curve_key], label=curve_label)
loss_ax.legend()
plt.show()

# Test: predict on the validation matrix and report the metrics
test_predict = nn_model.predict(test_X)
mse = mean_squared_error(test_y, test_predict)
rmse = np.sqrt(mse)
print(f'MSE: {mse}\tRMSE: {rmse}')

In [None]:
# Chain the preprocessing and the network so that val_model can call
# `predict` directly on the raw feature columns.
nn_steps = [
    ('preprocessing', preprocessing),
    ('regressor', nn_model),
]
nn_pipeline = Pipeline(steps=nn_steps, verbose=False)
val_model(nn_pipeline, test)

 # Conclusion
 
 These models are not good, as the validation plots show, but the score achieved is close to that of the current top models. 
 
 ##### Next steps
 * More EDA / Feature engineering: 
   * what should we do with the seemingly discrete `cont` features?
   * can we fit a multi-modal model on each feature? 
   * the target is bi-modal; how should we handle this? 
* Tune hyperparameters: I did not spend much time on this
* Stack more models: Random Forest, SVM Regressor, etc
