# Baseline Model
---
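Antes de começar a experimentar com modelos mais complexos, é válido estabelecer um baseline de resultado. Normalmente opta-se por um modelo simples, como uma árvore de decisão ou uma regressão logística. Neste notebook, o baseline utilizado é uma Random Forest (`RandomForestClassifier`), com hiperparâmetros escolhidos por busca aleatória e validação cruzada estratificada.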

In [1]:
#Imports
import pandas as pd 
import numpy as np

from copy import deepcopy

from data_treatment import treat_train_data, treat_test_data

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterSampler

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the training and test CSV files into DataFrames
train_data_path = "./data/train.csv"
test_data_path = "./data/test.csv"

train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [3]:
# Split the training frame into the target column (y) and the remaining feature columns (X)
print(train_df.shape)
X = train_df.drop(columns=["Survived"])
y = train_df["Survived"]
print(X.shape, y.shape)
print(X.head())

(891, 12)
(891, 11) (891,)
   PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5       3                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1  female  38.0      1      0          PC 17599  71.2833   C85        C  
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  
3  female  35.0      1      0            113803  53.1000  C123        S  
4    male  35.0      0      0            373450   8.0500   NaN        S  


In [4]:
# Define the hyperparameter search space for the randomized search
param_grid = {
    'n_estimators': [2, 5, 10, 25, 50, 100, 250, 500, 1000],
    'min_samples_leaf': [1, 2, 3, 4, 5, 10],
    'max_features': [None, "sqrt", "log2"],
}

# Draw 30 configurations from the grid. The fixed random_state makes the sampled
# configurations identical on every Restart & Run All, so the search is repeatable.
param_list = list(ParameterSampler(param_grid, n_iter=30, random_state=42))

In [5]:
# Randomized search: score every sampled configuration with stratified 5-fold
# cross-validation and keep the one with the highest mean held-out accuracy.
skf = StratifiedKFold(n_splits=5)

# Without shuffling, the folds are the same for every configuration, so compute the
# row positions once instead of re-splitting (and deep-copying X) per configuration.
folds = list(skf.split(X, y))

best_conf = None
best_acc = -np.inf  # any real score beats this, so best_conf is set by the first config
for param_conf in param_list:

    accs = []
    for train_index, test_index in folds:

        # split() yields positional indices, so select rows with .iloc; label-based
        # lookup would only work while the index happens to be the default 0..n-1.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # The treatment parameters come from the training fold and are reused on the
        # held-out fold (presumably to avoid leaking held-out statistics; the treat_*
        # helpers live in data_treatment and are not visible here).
        X_train, treat_params = treat_train_data(X_train)
        X_test = treat_test_data(X_test, treat_params)

        # Fixed random_state so that score differences reflect the hyperparameters
        # rather than run-to-run randomness of the forest.
        model = RandomForestClassifier(n_estimators=param_conf["n_estimators"],
                                       min_samples_leaf=param_conf["min_samples_leaf"],
                                       max_features=param_conf["max_features"],
                                       random_state=42).fit(X_train, y_train)
        acc = model.score(X_test, y_test)
        accs.append(acc)

    # Mean accuracy over the folds, computed once and reused below
    mean_acc = np.mean(accs)
    if mean_acc > best_acc:
        print("Updated best acc: %s" % mean_acc)
        best_acc = mean_acc
        best_conf = param_conf
        

Updated best acc: 0.583535423065643
Updated best acc: 0.7856735987505439
Updated best acc: 0.7992446947233958
Updated best acc: 0.812696739150865
Updated best acc: 0.8216037594328934


In [6]:
# After finding the best params, retrain the model with the complete training set

# Treat data. The treated training frame gets its own name and the raw X is passed as
# a copy, so X is never overwritten and re-running this cell does not treat
# already-treated data. A "Survived" column added to test_df by a previous run of this
# cell is dropped before treatment for the same reason.
X_full, params = treat_train_data(X.copy())
X_test = treat_test_data(test_df.drop(columns=["Survived"], errors="ignore"), params)

# Fixed random_state so the final model (and the submission) is reproducible
best_model = RandomForestClassifier(n_estimators=best_conf["n_estimators"],
                                    min_samples_leaf=best_conf["min_samples_leaf"],
                                    max_features=best_conf["max_features"],
                                    random_state=42).fit(X_full, y)

# predict() already returns a NumPy array, so no extra np.array() wrapping is needed.
# Assumes treat_test_data keeps one row per test passenger in the original order.
predictions = best_model.predict(X_test)
test_df["Survived"] = predictions

In [7]:
# Write the submission file: passenger id and predicted label only, without the index column
test_df.to_csv("submission_rf.csv", columns=["PassengerId", "Survived"], index=False)