# <span style='color:#A80808'>Objective</span>

This notebook provides a baseline Random Forest model. A [quick model comparison](https://www.kaggle.com/sytuannguyen/model-selection) has shown that Random Forest is a good choice for this problem.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier as rfc

# <span style='color:#A80808'>Data</span>

The data used to train the model was prepared in this [notebook](https://www.kaggle.com/sytuannguyen/spaceship-titanic-feature-engineering).

In [None]:
# Read the prepared training table, then split the label column off from the features.
train_path = '../input/spaceship-titanic-feature-engineering/train.csv'
train = pd.read_csv(train_path)

# pop() removes 'Transported' from `train`, so the frame holds only features afterwards;
# the label is cast to int64 for the classifier.
train_targets = train.pop('Transported').astype('int64')

train.head(3)

# <span style='color:#A80808'>Random Forest model</span>

In [None]:
# Hyperparameters forwarded to RandomForestClassifier for every cross-validation fold.
params = dict(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For classifiers
    # it was an alias for 'sqrt', so 'sqrt' gives the same model on every version.
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,        # use all available cores
    random_state=42,  # fixed seed so repeated runs build the same forests
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)

# <span style='color:#A80808'>Cross-validation</span>

In [None]:
# 10-fold stratified cross-validation. One forest is trained per fold and kept in
# `models` so that later cells can combine their predictions.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

scores = []
models = []
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train_targets)):
    # skf.split yields positional indices, so both the features and the targets
    # are selected with .iloc (plain [] on a Series looks up by label instead).
    X_train = train.iloc[train_idx]
    X_val = train.iloc[val_idx]
    y_train = train_targets.iloc[train_idx]
    y_val = train_targets.iloc[val_idx]

    model = rfc(**params)
    model.fit(X_train, y_train)
    models.append(model)

    y_pred = model.predict(X_val)

    # accuracy_score takes (y_true, y_pred) in that order.
    score = accuracy_score(y_val, y_pred)

    print(f'Fold {fold}, accuracy score: {score}')
    print('_'*60)
    scores.append(score)

print(f'Average accuracy score: {np.mean(scores)}')

# <span style='color:#A80808'>Prediction</span>

In [None]:
# Read the prepared test table that the fold models will predict on.
test_path = '../input/spaceship-titanic-feature-engineering/test.csv'
test = pd.read_csv(test_path)

In [None]:
# One prediction vector per cross-validation model, in the same order as `models`.
preds = [fold_model.predict(test) for fold_model in models]

# <span style='color:#A80808'>Submission</span>

In [None]:
# Start from the competition's sample submission; its 'Transported' column is overwritten below.
sample_path = '../input/spaceship-titanic/sample_submission.csv'
submission = pd.read_csv(sample_path)

In [None]:
# Majority vote over the per-fold models: stack their predictions into one 2-D
# array (one row per model) and take the most frequent label in each column.
fold_votes = np.array(preds)
majority = stats.mode(fold_votes, axis=0)[0]

# reshape(-1) flattens the vote result to 1-D before it is cast to boolean labels.
submission['Transported'] = majority.reshape(-1).astype(bool)
submission.to_csv("submission.csv", index=False)

submission.head()