# **Titanic using RandomForest**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
import warnings
# Suppress every warning for the rest of the notebook. This also hides
# deprecation notices emitted by pandas and scikit-learn in later cells.
warnings.filterwarnings('ignore')

**Read the training dataset**

In [None]:
# Load the training CSV from the relative input directory into a DataFrame.
train_df = pd.read_csv("../input/titanic/train.csv")
train_df.head()  # preview the first rows

In [None]:
# Summary statistics of the training frame, displayed as the cell output.
train_df.describe()

**Fill in missing values**

In [None]:
# Per-column count of missing values in the training frame.
train_df.isna().sum()

In [None]:
# Impute missing values by assigning the filled column back to the frame.
# Calling fillna(..., inplace=True) on a column selected with train_df[...] is
# chained assignment: under pandas Copy-on-Write it only updates a temporary
# object and leaves train_df unchanged.
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].mean())  # mean age
train_df["Embarked"] = train_df["Embarked"].fillna("S")  # constant "S"
train_df.isnull().sum()  # display the per-column null counts after filling

**Format only the data you need**

In [None]:
# Keep only the model's input columns, then one-hot encode the string-valued
# ones (get_dummies leaves numeric columns as they are).
x_train = pd.get_dummies(
    train_df.loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Parch', 'SibSp']]
)
x_train.head()

In [None]:
# Select the target as a 1-D Series rather than a single-column DataFrame.
# scikit-learn classifiers expect y with shape (n_samples,); passing a column
# vector makes the forest emit a DataConversionWarning on every fit.
y_train = train_df['Survived']
y_train.head()

**Train a RandomForest classifier**

In [None]:
# Base estimator with a fixed seed so repeated runs build the same trees.
clf = RandomForestClassifier(max_features='sqrt', random_state=10)

# One-step pipeline; the step name 'classify' is the prefix of the grid keys.
pipe = Pipeline(steps=[('classify', clf)])

# Search space: forest sizes 20..29 crossed with depth limits 3..9.
param = {
    'classify__n_estimators': list(range(20, 30)),
    'classify__max_depth': list(range(3, 10)),
}

# Exhaustive search over the grid, scored by accuracy with 10-fold CV.
grid = GridSearchCV(pipe, param, scoring='accuracy', cv=10)
grid.fit(x_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

**Read the test dataset**

In [None]:
# Load the test CSV from the relative input directory into a DataFrame.
test_df = pd.read_csv("../input/titanic/test.csv")
test_df.head()  # preview the first rows

**Fill in missing values (test set)**

In [None]:
# Per-column count of missing values in the test frame.
test_df.isna().sum()

In [None]:
# Impute missing values by assigning the filled column back to the frame.
# Calling fillna(..., inplace=True) on a column selected with test_df[...] is
# chained assignment: under pandas Copy-on-Write it only updates a temporary
# object and leaves test_df unchanged, so NaNs would reach predict().
test_df["Age"] = test_df["Age"].fillna(test_df["Age"].mean())  # mean age
test_df["Fare"] = test_df["Fare"].fillna(test_df["Fare"].mean())  # mean fare
test_df.isnull().sum()  # display the per-column null counts after filling

**Format only the data you need**

In [None]:
# Keep only the model's input columns and one-hot encode the string-valued ones.
x_test = test_df[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Parch', 'SibSp']]
# get_dummies only creates columns for categories present in this frame, so
# align to the training feature matrix: same columns, same order, and 0 for
# any dummy column the test data did not produce.
x_test = pd.get_dummies(x_test).reindex(columns=x_train.columns, fill_value=0)
x_test.head()

**Predict the answer**

In [None]:
# Predict labels for the test features with the fitted grid search object.
# Note: y_test holds model predictions, not ground-truth labels.
y_test = grid.predict(x_test)

In [None]:
# Pair each test passenger id with its predicted label (cast to 32-bit ints).
submission = pd.DataFrame(
    {
        'PassengerId': test_df['PassengerId'].to_numpy(),
        'Survived': y_test.astype(np.int32),
    }
)
submission.head()

In [None]:
# Write the submission frame to submission.csv in the working directory,
# omitting the DataFrame index so only the two data columns are written.
submission.to_csv('submission.csv', index=False)