# Tabular Playground Series - March 2021

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import gc

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

In [None]:
# Show every column when displaying wide frames. The fully qualified option
# name is used because the bare 'max_columns' pattern also matches
# 'styler.render.max_columns' in recent pandas releases, which makes
# set_option raise an OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)

# Load Data

In [None]:
# Read the competition training file and shuffle all rows with a fixed seed
# so that every run sees the same row order.
train = (
    pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
    .sample(frac=1, random_state=1)
)

print(train.shape)

In [None]:
# Total footprint of the frame in MiB; deep=True also counts the contents of
# object columns, and index=True includes the index itself.
bytes_per_mib = 1024 ** 2
mb = train.memory_usage(index=True, deep=True).sum() / bytes_per_mib
print(mb)

In [None]:
# Preview the first rows of the shuffled training frame.
train.head()

# Missing Values

In [None]:
# Count missing entries per column and lay the counts out as a single wide row.
train.isna().sum().to_frame().transpose()

# Label Distribution

In [None]:
# Share of rows belonging to each target class (class counts divided by the
# total number of rows).
train['target'].value_counts().div(len(train)).to_frame()

In [None]:
# Keep the labels as a plain array, then strip the identifier and label
# columns in place so that only predictor columns remain in the frame.
y_train = train['target'].values
train.drop(columns=['id', 'target'], inplace=True)

# Column Types

In [None]:
# List each remaining column next to its dtype (name left-padded to 10 chars).
for col_name, col_dtype in train.dtypes.items():
    print(f'{col_name:<10}{col_dtype}')

In [None]:
# Object-dtype columns are treated as categorical; every other column is
# treated as numeric. `features` lists numeric names first, then categorical.
cat_features = [
    name for name, dtype in zip(train.columns, train.dtypes) if dtype == 'O'
]
num_features = [name for name in train.columns if name not in cat_features]
features = [*num_features, *cat_features]

for feature_group in (cat_features, num_features):
    print(feature_group)

# Explore Categorical Levels

In [None]:
# Number of distinct levels observed in each categorical column.
train.loc[:, cat_features].nunique()

In [None]:
# Exclude 'cat10' from both feature lists so the preprocessor never sees it.
# NOTE(review): presumably dropped because of its level count in the cell
# above — confirm the reason before reusing this notebook.
for feature_list in (cat_features, features):
    feature_list.remove('cat10')

# Preprocessing

In [None]:
# Numeric columns: fill gaps with the column mean, then standardize.
num_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_transformer = Pipeline(steps=num_steps)

# Categorical columns: fill gaps with the literal level 'Missing', then
# one-hot encode. handle_unknown='ignore' keeps transform() from failing on
# levels that were not present when the encoder was fitted.
cat_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=cat_steps)

# Route each column group through its own pipeline; columns not named in
# either list are not passed to any transformer here.
column_routes = [
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Learn the imputation/scaling/encoding parameters from the training frame
# and produce the transformed design matrix in a single call.
X_train = preprocessor.fit_transform(train)

for label, array in (('X_train shape:', X_train), ('y_train shape:', y_train)):
    print(label, array.shape)

In [None]:
# The raw frame is no longer needed once X_train/y_train exist; drop the
# reference and ask the garbage collector to run a collection.
del train
gc.collect()

# Sample Training Data

In [None]:
# Hold out 80% of the rows for validation and keep the remaining 20% as the
# sample used for hyperparameter search; the split is stratified on the label
# and seeded for reproducibility.
X_sample, X_valid, y_sample, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.8,
    stratify=y_train,
    random_state=1,
)

for part in (X_sample, X_valid):
    print(part.shape)

# Model Selection

## Logistic Regression

In [None]:
%%time 

lr_clf = LogisticRegression(max_iter=1000, solver='saga', penalty='elasticnet')

lr_parameters = {
    'l1_ratio': [0, 1],
    'C': [0.01, 0.1, 1, 10]
}

lr_grid = GridSearchCV(lr_clf, lr_parameters, cv=5, refit='True', n_jobs=-1, verbose=10, scoring='roc_auc')
lr_grid.fit(X_sample, y_sample)

lr_model = lr_grid.best_estimator_

print('Best Parameters:', lr_grid.best_params_)
print('Best CV Score:  ', lr_grid.best_score_)
print('Training Acc:   ', lr_model.score(X_sample, y_sample))
print('Validation Acc: ', lr_model.score(X_valid, y_valid))

In [None]:
# One row per searched parameter combination, with its mean CV score.
lr_cv = lr_grid.cv_results_
lr_summary = pd.DataFrame(lr_cv['params']).assign(cv_score=lr_cv['mean_test_score'])

# One curve per l1_ratio value: CV score as a function of C on a log axis.
fig, ax = plt.subplots()
for ratio in lr_parameters['l1_ratio']:
    subset = lr_summary[lr_summary['l1_ratio'] == ratio]
    ax.plot(subset['C'], subset['cv_score'], label=ratio)
ax.set_xscale('log')
ax.set_xlabel('Regularization Parameter (C)')
ax.set_ylabel('CV Score')
ax.legend(title='L1 Ratio', loc='lower right')
ax.grid()
plt.show()

# Same results as a plain-text table.
for params, score in zip(lr_cv['params'], lr_cv['mean_test_score']):
    print(f"l1: {params['l1_ratio']:<.3f},  C: {params['C']:>8.3f},  score: {score:.4f}")

## Decision Trees

In [None]:
%%time 

dt_clf = DecisionTreeClassifier(random_state=1)

dt_parameters = {
    'max_depth': [2, 4, 6, 8, 10, 12, 14, 16],
    'min_samples_leaf': [2, 4, 8, 16]
}

dt_grid = GridSearchCV(dt_clf, dt_parameters, cv=5, refit='True', n_jobs=-1, verbose=10, scoring='roc_auc')
dt_grid.fit(X_sample, y_sample)

dt_model = dt_grid.best_estimator_

print('Best Parameters:', dt_grid.best_params_)
print('Best CV Score:  ', dt_grid.best_score_)
print('Training Acc:   ', dt_model.score(X_sample, y_sample))
print('Validation Acc: ', dt_model.score(X_valid, y_valid))

In [None]:
# One row per searched parameter combination, with its mean CV score.
dt_cv = dt_grid.cv_results_
dt_summary = pd.DataFrame(dt_cv['params']).assign(cv_score=dt_cv['mean_test_score'])

# One curve per min_samples_leaf value: CV score as a function of tree depth.
fig, ax = plt.subplots()
for leaf_size in dt_parameters['min_samples_leaf']:
    subset = dt_summary[dt_summary['min_samples_leaf'] == leaf_size]
    ax.plot(subset['max_depth'], subset['cv_score'], label=leaf_size)
ax.set_xlabel('Maximum Depth')
ax.set_ylabel('CV Score')
ax.legend(title='Min Samples')
ax.grid()
plt.show()

# Same results as a plain-text table.
for params, score in zip(dt_cv['params'], dt_cv['mean_test_score']):
    print(f"depth: {params['max_depth']:>3},  min_inst: {params['min_samples_leaf']:>4},  score: {score:.4f}")

## Random Forest

In [None]:
%%time 

rf_clf = RandomForestClassifier(random_state=1, n_estimators=50)

rf_parameters = {
    'max_depth': [4, 8, 16, 20, 24, 28, 32],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(rf_clf, rf_parameters, cv=5, refit='True', n_jobs=-1, verbose=10, scoring='roc_auc')
rf_grid.fit(X_sample, y_sample)

rf_model = rf_grid.best_estimator_

print('Best Parameters:', rf_grid.best_params_)
print('Best CV Score:  ', rf_grid.best_score_)
print('Training Acc:   ', rf_model.score(X_sample, y_sample))
print('Validation Acc: ', rf_model.score(X_valid, y_valid))

In [None]:
# One row per searched parameter combination, with its mean CV score.
rf_cv = rf_grid.cv_results_
rf_summary = pd.DataFrame(rf_cv['params']).assign(cv_score=rf_cv['mean_test_score'])

# One curve per min_samples_leaf value: CV score as a function of tree depth.
fig, ax = plt.subplots()
for leaf_size in rf_parameters['min_samples_leaf']:
    subset = rf_summary[rf_summary['min_samples_leaf'] == leaf_size]
    ax.plot(subset['max_depth'], subset['cv_score'], label=leaf_size)
ax.set_xlabel('Maximum Depth')
ax.set_ylabel('CV Score')
ax.legend(title='Min Samples')
ax.grid()
plt.show()

# Same results as a plain-text table.
for params, score in zip(rf_cv['params'], rf_cv['mean_test_score']):
    print(f"depth: {params['max_depth']:>3},  min_inst: {params['min_samples_leaf']:>4},  score: {score:.4f}")

# Train Final Model

In [None]:
# Show the winning random-forest hyperparameters from the grid search.
print(rf_grid.best_params_)

In [None]:
# Fit the final forest on the full preprocessed training set.
# NOTE(review): max_depth and min_samples_leaf are typed in by hand —
# presumably copied from the rf_grid.best_params_ output; confirm they still
# match whenever the search is re-run.
final_params = {
    'random_state': 1,
    'n_estimators': 50,
    'max_depth': 24,
    'min_samples_leaf': 4,
}
final_model = RandomForestClassifier(**final_params).fit(X_train, y_train)

# Accuracy on the same rows the model was fitted on.
train_accuracy = final_model.score(X_train, y_train)
print(train_accuracy)

# Save Final Model

In [None]:
# Persist the fitted preprocessor and the final model side by side, so that
# new data can be transformed and scored with the same pair of objects.
artifacts = {
    'tps_preprocessor_01.joblib': preprocessor,
    'tps_model_01.joblib': final_model,
}
for file_name, fitted_object in artifacts.items():
    joblib.dump(fitted_object, file_name)
print('Model written to file.')