# Random Forest Optuna Model

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import optuna 
import optuna.visualization as optvis
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
def opt_plot(study, plot):
    """Return one of Optuna's built-in visualizations for ``study``.

    ``plot`` selects the figure by number: 0 optimization history, 1 slice,
    2 parallel coordinate, 3 contour, 4 parameter importances. Any other
    value yields ``None``.
    """
    # Position in this tuple is the numeric code accepted by ``plot``.
    plotter_names = (
        'plot_optimization_history',
        'plot_slice',
        'plot_parallel_coordinate',
        'plot_contour',
        'plot_param_importances',
    )
    for code, plotter_name in enumerate(plotter_names):
        if plot == code:
            # Resolve the plotting function only once a code matches.
            return getattr(optvis, plotter_name)(study)
    return None

# Load Data

In [None]:
# Load the training CSV into a DataFrame and report its (rows, columns) shape.
train = pd.read_csv('../input/reducing-commercial-aviation-fatalities/train.csv')
print(train.shape)

In [None]:
# Preview the first rows of the raw training data.
train.head()

# Checking Missing Values

In [None]:
# Count missing values per column and display the counts as one wide row.
train.isnull().sum(axis='index').to_frame().transpose()

In [None]:
# Share of rows belonging to each `event` class (counts divided by row total).
train['event'].value_counts().div(len(train)).to_frame()

In [None]:
# Pull out the target labels before the `event` column is removed below.
y_train = train['event'].values
# Remove the target and the columns not used as features; mutates `train`.
train.drop(columns=['crew', 'experiment', 'time', 'seat', 'event'], inplace=True)

In [None]:
# Feature matrix: the first 27 columns of `train` (or all of them if fewer).
x_train = train.iloc[:, :27]
x_train.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified split of row positions. With test_size=0.9 only 10% of the rows
# are used for fitting and 90% are held out for validation.
# NOTE(review): presumably deliberate to keep each trial fast - confirm.
train_idx, valid_idx = train_test_split(
    range(len(x_train)),
    test_size=0.9,
    random_state=1,
    stratify=y_train,
)

print(len(train_idx), len(valid_idx), sep='\n')

# A one-element list of (train, validation) index pairs, usable as `cv=`.
indices = [(train_idx, valid_idx)]

# Random Forest Model

In [None]:
%%time 

def rf_objective(trial):
    """Optuna objective: score one random-forest configuration.

    Samples ``n_estimators``, ``max_depth`` and ``min_inst`` (passed to the
    model as ``min_samples_leaf``) from ``trial``, evaluates a
    ``RandomForestClassifier`` on the predefined split(s) in ``indices`` and
    returns the mean ``neg_log_loss`` score, where higher is better.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the scores returned by ``cross_val_score``.
    """
    n = trial.suggest_int('n_estimators', 20, 300)
    md = trial.suggest_int('max_depth', 2, 556)
    mi = trial.suggest_int('min_inst', 1, 64)

    # Parallelise inside the forest rather than across CV splits: `indices`
    # contains a single split, so `cross_val_score(n_jobs=-1)` had only one
    # task to run and the forest itself was built on one core. This is
    # intended as a speed-only change; `random_state` still fixes the model.
    clf = RandomForestClassifier(
        random_state=1,
        max_depth=md,
        min_samples_leaf=mi,
        n_estimators=n,
        n_jobs=-1,
    )

    scores = cross_val_score(clf, x_train, y_train, cv=indices, scoring='neg_log_loss')

    return scores.mean()
    
# The objective returns neg_log_loss (closer to zero is better), so the study
# maximizes it. Runs 20 trials of `rf_objective`.
rf_study = optuna.create_study(direction='maximize')
rf_study.optimize(rf_objective, n_trials=20)

In [None]:
# plot=0: optimization history of the study.
opt_plot(rf_study, plot=0)

In [None]:
# plot=1: slice plot of the study.
opt_plot(rf_study, plot=1)

In [None]:
# plot=2: parallel-coordinate plot of the study.
opt_plot(rf_study, plot=2)

In [None]:
# plot=3: contour plot of the study.
opt_plot(rf_study, plot=3)

In [None]:
# plot=4: hyperparameter importances of the study.
opt_plot(rf_study, plot=4)