# RandomForestClassifier

Related notebooks:

1. EDA - https://www.kaggle.com/agorinenko/feb-2022-part1-eda
2. CatBoostClassifier - https://www.kaggle.com/agorinenko/feb-2022-part2-cat-boost-classifier
3. LGBMClassifier - https://www.kaggle.com/agorinenko/feb-2022-part3-lgbm-classifier
4. XGBClassifier - https://www.kaggle.com/agorinenko/feb-2022-part4-xgb-classifier

In [None]:
from functools import partial

import numpy as np
import pandas as pd

from hyperopt import Trials, fmin, tpe, hp
from hyperopt import STATUS_OK


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, cross_val_score


from sklearn.ensemble import RandomForestClassifier

# Load data from the EDA notebook

In [None]:
# The training frame is indexed by row_id, while the test frame keeps row_id
# as an ordinary column.
train_df = pd.read_csv(
    '../input/feb-2022-eda/train.csv',
    index_col="row_id",
)
test_df = pd.read_csv(
    '../input/feb-2022-eda/test.csv',
)

Let's separate the target variable and the features.

In [None]:
# Every column except the identifier and the target is a feature. The encoded
# target column ('target_num', added to train_df in a later cell) is excluded
# as well, so re-running this cell after the encoding cell cannot leak the
# target into the feature list or request a column that test_df lacks.
non_feature_columns = {'row_id', 'target', 'target_num'}
features_columns = [col for col in train_df.columns if col not in non_feature_columns]

Encode the target variable.

In [None]:
# Encode the target labels as integer codes and store them in a new column.
# The fitted encoder `le` is kept at notebook level so the codes can be
# mapped back to the original labels.
target_col = 'target_num'

le = LabelEncoder()
le.fit(train_df['target'])
train_df[target_col] = le.transform(train_df['target'])

train_df.head()

In [None]:
# Feature matrices for train and test share the same column list; everything,
# including the encoded target, is cast to float64.
X_train = train_df.loc[:, features_columns].astype(np.float64)
X_test = test_df.loc[:, features_columns].astype(np.float64)

y_train = train_df[target_col].astype(np.float64)

# Optimize hyperparameters

The search cells below are commented out; the "Train model" section uses fixed parameter values.

In [None]:
# Disabled: pinned hyperopt install for the (also disabled) search cells.
# NOTE(review): if re-enabled, `%pip install` targets the kernel's environment
# more reliably than `!pip install`.
# !pip install hyperopt==0.2.5 -q

In [None]:
# Disabled hyperopt objective, kept for reference.
# It applies `params` to `model`, scores the model with 4-fold stratified
# cross-validation (accuracy) and returns the negated mean score as the loss,
# because hyperopt's `fmin` minimizes.
# def objective(params, model, X_train, y_train):
#     model.set_params(**params)

#     skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
#     score = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='accuracy', cv=skf, n_jobs=-1)

#     return {'loss': -score.mean(), 'params': params, 'status': STATUS_OK}

In [None]:
# Disabled: base estimator that the hyperopt search passes to `objective`.
# model = RandomForestClassifier(random_state=42)

In [None]:
# Disabled hyperopt (TPE) search over RandomForestClassifier hyperparameters,
# kept for reference. Each `*_choice` sequence lists the candidate values and
# `hp.choice` picks one of them; `fmin` reports the chosen option by its index
# in that sequence, so `best` has to be mapped back through the `*_choice`
# sequences to get actual parameter values.
# NOTE(review): before re-enabling, confirm against the installed versions:
#   - 'auto' is no longer accepted for `max_features` in recent scikit-learn;
#   - newer hyperopt releases expect `np.random.default_rng(...)` for `rstate`
#     instead of `np.random.RandomState(...)`.
# %%time

# trials = Trials()
# max_depth_choice = np.arange(50, 1050, 100, dtype=int)
# max_features_choice = ['auto', 'sqrt', 'log2']
# n_estimators_choice = np.arange(80, 200, 10, dtype=int)
# criterion_choice = ['gini', 'entropy']
# class_weight_choice = ['balanced', 'balanced_subsample']
# min_samples_split_choice = np.arange(0.1, 1, 0.1, dtype=float)
# min_samples_leaf_choice = np.arange(1, 5, 1, dtype=int)
# min_weight_fraction_leaf_choice = np.arange(0, 0.6, 0.1, dtype=float)

# search_space = {
#     'max_depth': hp.choice('max_depth', max_depth_choice),
#     'max_features': hp.choice('max_features', max_features_choice),
#     'n_estimators': hp.choice('n_estimators', n_estimators_choice),
#     'criterion': hp.choice('criterion', criterion_choice),
#     'class_weight': hp.choice('class_weight', class_weight_choice),
#     'min_samples_split': hp.choice('min_samples_split', min_samples_split_choice),
#     'min_samples_leaf': hp.choice('min_samples_leaf', min_samples_leaf_choice),
#     'min_weight_fraction_leaf': hp.choice('min_weight_fraction_leaf', min_weight_fraction_leaf_choice),
# }
# best = fmin(
#     fn=partial(objective, model=model, X_train=X_train, y_train=y_train),
#     space=search_space,
#     algo=tpe.suggest,
#     max_evals=50,
#     trials=trials,
#     rstate=np.random.RandomState(1),
#     show_progressbar=True
# )

In [None]:
# Disabled: translate the indices stored in `best` back into the actual
# parameter values (by indexing the matching `*_choice` sequence) and print
# each of them.
# max_depth = max_depth_choice[best['max_depth']]
# max_features = max_features_choice[best['max_features']]
# n_estimators = n_estimators_choice[best['n_estimators']]
# criterion = criterion_choice[best['criterion']]
# class_weight = class_weight_choice[best['class_weight']]
# min_samples_split = min_samples_split_choice[best['min_samples_split']]
# min_samples_leaf = min_samples_leaf_choice[best['min_samples_leaf']]
# min_weight_fraction_leaf = min_weight_fraction_leaf_choice[best['min_weight_fraction_leaf']]


# print(f'max_depth: {max_depth}')
# print(f'max_features: {max_features}')
# print(f'n_estimators: {n_estimators}')
# print(f'criterion: {criterion}')
# print(f'class_weight: {class_weight}')
# print(f'min_samples_split: {min_samples_split}')
# print(f'min_samples_leaf: {min_samples_leaf}')
# print(f'min_weight_fraction_leaf: {min_weight_fraction_leaf}')

# Train model

In [None]:
%%time

# Final model with fixed hyperparameters.
# - min_samples_split=0.1 is a fraction: a node is split only if it holds at
#   least 10% of the training samples.
# - n_jobs=-1 grows the trees on all available cores; with a fixed
#   random_state the same trees are grown regardless of the number of jobs.
model = RandomForestClassifier(
    random_state=42,
    max_depth=950,
    max_features='log2',
    n_estimators=160,
    criterion='gini',
    class_weight='balanced',
    min_samples_split=0.1,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Validate

In [None]:
# 5-fold cross-validation on the training data, scored by accuracy.
# `scoring` is a single metric name, so the per-fold results are returned
# under the "test_score" key.
scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

In [None]:
# Average the per-fold accuracies into a single headline number.
mean_accuracy = scores["test_score"].mean()
print(f'Mean validation accuracy score: {mean_accuracy}')

# Predict

In [None]:
# Predict the encoded class for every test row. The model was fitted on
# float64 labels, so the predictions are presumably float codes as well;
# save_submission casts them to int64 before decoding.
y_pred = model.predict(X_test)

# Submission

In [None]:
def save_submission(y_pred, path="submission.csv"):
    """Decode predictions and write the submission CSV.

    Relies on the notebook-level ``le`` (the fitted label encoder) and
    ``test_df`` (the test frame, which must contain a ``row_id`` column).

    Parameters
    ----------
    y_pred : array-like
        Encoded class predictions, one per row of ``test_df``. Values are
        cast to ``np.int64`` before being decoded with ``le``.
    path : str, default "submission.csv"
        Destination of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The two-column frame (``row_id``, ``target``) written to ``path``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test rows.
    """
    y_pred = np.asarray(y_pred)

    # Validate the length up front. Checking it after the column assignment
    # (as an assert) could never fire: pandas rejects a mismatched length
    # first, with a far less helpful message.
    if len(y_pred) != len(test_df):
        raise ValueError(
            f"Expected {len(test_df)} predictions (one per test row), got {len(y_pred)}"
        )

    y_pred_class = le.inverse_transform(y_pred.astype(np.int64))

    submission = test_df[['row_id']].copy()
    submission["target"] = y_pred_class

    # Sanity check on the layout: exactly row_id + target.
    assert submission.shape[1] == 2

    submission.to_csv(path, index=False)
    return submission

In [None]:
# Write submission.csv and preview its first rows.
submission = save_submission(y_pred.flatten())
submission.head()