In [None]:
# found this competition ~2 days before the submission deadline - trying a *really* quick attempt :/
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the competition's train and test tables (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2022/train.csv',
        '../input/tabular-playground-series-feb-2022/test.csv',
    )
)

In [None]:
# Show the column index of each frame (train first, then test).
for frame in (df_train, df_test):
    print(frame.columns)

In [None]:
# Remove the 'row_id' column from both frames, then show the remaining columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it again after the column is gone no longer raises
# a KeyError.
df_train = df_train.drop(columns=['row_id'], errors='ignore')
df_test = df_test.drop(columns=['row_id'], errors='ignore')
print(df_train.columns)
print(df_test.columns)

In [None]:
# Report how many training rows are duplicates, remove them, then report again.
duplicate_mask = df_train.duplicated()
print(duplicate_mask.sum())
df_train = df_train.drop_duplicates()
remaining_mask = df_train.duplicated()
print(remaining_mask.sum())

In [None]:
# Total number of missing cells in each frame (train first, then test).
for frame in (df_train, df_test):
    per_column_missing = frame.isna().sum()
    print(per_column_missing.sum())

In [None]:
# Look at the label column as loaded.
target_column = df_train['target']
print(target_column)

In [None]:
# Class balance: share of training rows per class, ordered by class label.
class_counts = df_train['target'].value_counts().sort_index()
n_samples = class_counts / class_counts.sum()
print(n_samples)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the label column, then overwrite that column with the
# encoded values. `enc` is kept because the predictions are later mapped back
# with enc.inverse_transform.
enc = LabelEncoder().fit(df_train['target'])
df_train['target'] = enc.transform(df_train['target'])
print(df_train['target'])

In [None]:
# Hold out part of the training data as a validation set for model testing.
from sklearn.model_selection import train_test_split

y = df_train['target']
X = df_train.drop(columns=['target'])

# 20% of the rows go to validation; the seed is fixed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Check the class balance in each split: share of rows per class, ordered by
# class label. A single loop replaces the two copy-pasted blocks, and every
# printout is labelled so the two distributions can be told apart.
for split_name, y_split in (('train', y_train), ('val', y_val)):
    n_samples = y_split.value_counts().sort_index()
    n_samples = n_samples / n_samples.sum()
    print(split_name, 'split:')
    print(n_samples)

# Hyperparameter tuning

In [None]:
# Search space for the random-forest hyperparameter search.
from hyperopt import hp

# Candidate max_leaf_nodes values: 2 through 20, plus None.
mln = list(range(2, 21))
mln.append(None)
print(mln)

params = dict(
    n_estimators=hp.quniform('n_estimators', 50, 200, 25),
    max_depth=hp.quniform('max_depth', 1, 20, 1),
    max_leaf_nodes=hp.choice('max_leaf_nodes', mln),
    max_features=hp.choice('max_features', ['sqrt', 'log2']),
)

# Objective function

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK

def objective(params):
    """Fit a random forest with the given settings and score it on the validation split.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``RandomForestClassifier``. ``n_estimators`` and
        ``max_depth`` are cast to ``int``; ``max_leaf_nodes`` is cast to ``int``
        unless it is ``None``. The caller's dict is not modified.

    Returns
    -------
    dict
        ``loss`` (the negated validation accuracy), ``status`` (``STATUS_OK``)
        and ``accuracy``.

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    # Work on a copy so the dict handed in by the caller is left untouched.
    model_kwargs = dict(params)

    # The search space may deliver these as floats; the model is given ints.
    model_kwargs['n_estimators'] = int(model_kwargs['n_estimators'])
    model_kwargs['max_depth'] = int(model_kwargs['max_depth'])

    if model_kwargs['max_leaf_nodes'] is not None:
        model_kwargs['max_leaf_nodes'] = int(model_kwargs['max_leaf_nodes'])

    # fit model
    model = RandomForestClassifier(**model_kwargs, random_state=42)
    model.fit(X_train, y_train)

    # make predictions with fitted model
    y_pred = model.predict(X_val)

    # metrics
    accuracy = accuracy_score(y_val, y_pred)

    # The loss is the negated accuracy, so a lower loss means a better model.
    return {
        'loss'     : -accuracy,
        'status'   : STATUS_OK,
        'accuracy' : accuracy,
        }

# Run trials

In [None]:
# Coarse hyperparameter search over `params`. It runs 50 evaluations of
# `objective`, so it is switched off by default; set the flag to True to run it.
RUN_COARSE_SEARCH = False

if RUN_COARSE_SEARCH:
    from hyperopt import fmin, tpe, Trials, space_eval

    trials = Trials()
    best = fmin(objective,
                space = params,
                algo = tpe.suggest,
                max_evals = 50,
                trials = trials,
                max_queue_len = 5)
    print(best)
    # fmin reports hp.choice parameters as indices into their option lists;
    # space_eval maps the result back onto the actual values of the space.
    print(space_eval(params, best))

# Best parameters
* max_depth = 18
* max_features = log2
* max_leaf_nodes = 20
* n_estimators = 150

In [None]:
# run another search, around best parameters
# defining search space
# Narrower second search: only n_estimators is sampled (100 to 200 in steps of
# 10); the other three settings are held fixed.
params_fine = dict(
    n_estimators=hp.quniform('n_estimators', 100, 200, 10),
    max_depth=18,
    max_leaf_nodes=None,
    max_features='log2',
)

In [None]:
# Fine search over `params_fine` (15 evaluations of `objective`). Switched off
# by default; set the flag to True to run it.
RUN_FINE_SEARCH = False

if RUN_FINE_SEARCH:
    # Imported here so this cell does not rely on another cell having run.
    from hyperopt import fmin, tpe, Trials

    trials = Trials()
    best_fine = fmin(objective,
                     space = params_fine,
                     algo = tpe.suggest,
                     max_evals = 15,
                     trials = trials)
    print(best_fine)

# Cross validation for model performance

In [None]:
# 5-fold cross-validated accuracy of the chosen configuration on the full
# training data. Switched off by default; set the flag to True to run it.
RUN_CROSS_VALIDATION = False

if RUN_CROSS_VALIDATION:
    from sklearn.model_selection import cross_val_score

    model = RandomForestClassifier(
        #n_estimators   = 190,
        n_estimators   = 500,
        max_depth      = 18,
        max_leaf_nodes = None,
        max_features   = 'log2')

    scores = cross_val_score(model,
                             X,
                             y,
                             cv = 5,
                             scoring = 'accuracy',
                             verbose = 1,
                             n_jobs = -1)
    # Summarise the per-fold accuracies.
    print(scores.mean())
    print(scores.std())

In [None]:
#print(np.mean(scores))
#print(np.std(scores))

# Fit model with optimised hyperparameters on full training data

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Final model: fit on all labelled rows (X, y) with the chosen settings.
# random_state fixes the seed so the submission can be reproduced (the same
# seed is used in `objective`); n_jobs=-1 lets the 500 trees be built on all
# available cores.
model = RandomForestClassifier(
    n_estimators   = 500,
    max_depth      = 18,
    max_leaf_nodes = None,
    max_features   = 'log2',
    random_state   = 42,
    n_jobs         = -1)

model.fit(X, y)

# Select the test columns in the training order so the features line up with X.
pred = model.predict(df_test[X.columns])
# Map the encoded predictions back to the original labels.
pred = enc.inverse_transform(pred.astype(int))

In [None]:
# Show the test-set predictions.
print(pred)

In [None]:
# Build the submission from the sample file: replace its 'target' column with
# the predictions and write the result to submission.csv.
submission = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')
# Bracket assignment always writes a column. Attribute assignment
# (submission.target = ...) only does so when the column already exists and
# otherwise just sets a plain Python attribute on the object.
submission['target'] = pred
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
submission_preview = submission.head()
print(submission_preview)