In [22]:
import pickle

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
dataset = pd.read_csv('../dataset/train_preprocessing.csv')
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Relatives,Alone
0,0,3,1,22,7.25,0,1,0
1,1,1,0,38,71.2833,1,1,0
2,1,3,0,26,7.925,0,0,1
3,1,1,0,35,53.1,0,1,0
4,0,3,1,35,8.05,0,0,1


In [3]:
X = dataset.drop('Survived', axis=1)
Y = dataset.Survived
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=101)

In [4]:
steps = [('scaller', MinMaxScaler()), 
         ('classifier', RandomForestClassifier())]

pipeline = Pipeline(steps)

In [18]:
param_grid = {
    'classifier__n_estimators': [100, 300, 500, 700, 1000],
    'classifier__max_depth': [None, 1, 2, 3],
    'classifier__criterion': ['gini', 'entropy']
}

search = GridSearchCV(pipeline, param_grid, n_jobs=-1, scoring='recall')
search.fit(x_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaller',
                                        MinMaxScaler(copy=True,
                                                     feature_range=(0, 1))),
                                       ('classifier',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                      

In [19]:
search.best_params_

{'classifier__criterion': 'entropy',
 'classifier__max_depth': None,
 'classifier__n_estimators': 500}

In [20]:
predictor = search.best_estimator_

In [21]:
print(predictor.score(x_train, y_train))
print(accuracy_score(predictor.predict(x_test), y_test))
print(confusion_matrix(predictor.predict(x_test), y_test))
print(classification_report(predictor.predict(x_test), y_test))

0.9864864864864865
0.8251121076233184
[[119  22]
 [ 17  65]]
              precision    recall  f1-score   support

           0       0.88      0.84      0.86       141
           1       0.75      0.79      0.77        82

    accuracy                           0.83       223
   macro avg       0.81      0.82      0.81       223
weighted avg       0.83      0.83      0.83       223



In [24]:
pd.DataFrame(confusion_matrix(predictor.predict(x_test), y_test), columns=['Predict 0', 'Predict 1'], index=['Actual 0', 'Actual 1'])

Unnamed: 0,Predict 0,Predict 1
Actual 0,119,22
Actual 1,17,65


In [23]:
acc = f'{accuracy_score(predictor.predict(x_test), y_test):.4f}'
with open(f'../models/random-forest-{acc}.pkl', 'wb') as file:
    pickle.dump(predictor, file)