In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Titanic training set; the cells below give an overview of it.

train_df = pd.read_csv(filepath_or_buffer='../input/titanic/train.csv')

In [None]:
# Preview the first rows of the raw training frame.
train_df.head()

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Summary statistics of the training frame.
train_df.describe()

In [None]:
# Class balance of the target column.
train_df['Survived'].value_counts()

# 1 = survived
# 0 = did not survive

In [None]:
# Passenger counts per survival outcome, split by sex.
fig, ax = plt.subplots(figsize=(8, 6))
# Draw on the Axes created above rather than relying on pyplot's "current axes".
sns.countplot(x='Survived', hue='Sex', data=train_df, ax=ax)
ax.set_ylim(0, 500)
# The bars are raw counts, not probabilities, so title and labels say so.
ax.set_title("Number of passengers by survival outcome and sex")
ax.set_xlabel("Survived (0 = no, 1 = yes)")
ax.set_ylabel("Number of passengers")
plt.show()

# Survived: 1 = survived, 0 = did not survive.
# Bar colours come from the active seaborn palette; the legend shows which
# colour is male and which is female.

In [None]:
# Pairwise correlations between the numeric columns.
# numeric_only=True is required on pandas >= 2.0: at this point the frame
# still holds string columns (Name, Sex, Ticket, ...) and corr() would raise
# on them instead of silently skipping them as older versions did.
sns.heatmap(train_df.corr(numeric_only=True), annot=True)

In [None]:
# Data preparation
# Step 1: count the missing values in every column so they can be handled.

train_df.isna().sum()

In [None]:
# Impute missing ages with the column mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained in-place assignment, which pandas 2.x warns
# about and which does not update train_df under Copy-on-Write.
# NOTE(review): the mean is computed on the whole frame, i.e. before the
# train/test split further down - confirm that this leakage is acceptable.
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].mean())

In [None]:
# Encode the Sex column numerically in a single pass: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
train_df['Sex'] = train_df['Sex'].replace(sex_codes)

In [None]:
# Remove the columns that are not used as model features.
train_df.drop(
    columns=['Name', 'PassengerId', 'Fare', 'Ticket', 'Embarked', 'Cabin'],
    inplace=True,
)

In [None]:
# Re-count missing values per column after the imputation and column removal above.
train_df.isnull().sum()

In [None]:
# Columns that still contain missing values, largest count first.
null_counts_desc = train_df.isnull().sum().sort_values(ascending=False)
missing = null_counts_desc[null_counts_desc != 0]
missing

In [None]:
# Separate the label (Survived) from the feature columns.

y = train_df['Survived']
X = train_df.drop(columns=['Survived'])

In [None]:
# Splitting the dataset: 70% for training, 30% held out for testing.
# The fixed random_state keeps the split identical across runs.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Feature scaling: the scaler is fitted on the training split only.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X=X_train)

In [None]:
# Apply the fitted scaler to the training split and then to the test split.
scaled_X_train, scaled_X_test = map(scaler.transform, (X_train, X_test))

In [None]:
# Training the model: a small, seeded random forest used as the baseline.

from sklearn.ensemble import RandomForestClassifier

# max_features='sqrt' replaces the former 'auto' alias, which meant the same
# thing for classifiers but was deprecated in scikit-learn 1.1 and removed in
# 1.3 (where passing it raises InvalidParameterError).
Randomforest_model = RandomForestClassifier(
    n_estimators=10,
    max_features='sqrt',
    random_state=101,
)

In [None]:
# Fit the baseline forest on the scaled training features.
Randomforest_model.fit(scaled_X_train,y_train)

In [None]:
# Predict labels for the held-out test split, using the same scaling as in training.

y_pred = Randomforest_model.predict(scaled_X_test)

In [None]:
# Evaluate the baseline model: confusion matrix of true vs. predicted labels.

from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Classification report for the baseline predictions on the test split.
print(classification_report(y_test,y_pred))

In [None]:
# Raw feature importances of the fitted forest (one value per column of X).
Randomforest_model.feature_importances_

In [None]:
# Same importances as a one-column table labelled with the feature names.
pd.Series(
    Randomforest_model.feature_importances_,
    index=X.columns,
    name='Feature Importance',
).to_frame()

In [None]:
# Grid search over the forest's main hyper-parameters (5-fold cross-validation).


from sklearn.model_selection import GridSearchCV

n_estimators=[64, 100, 128, 200]
max_features=[2, 3, 4, 5]
bootstrap=[True, False]

grid_parameters= {'n_estimators':n_estimators,
            'max_features':max_features,
            'bootstrap':bootstrap}


# Seed the base estimator so that the cross-validation scores, and therefore
# best_params_, are the same on every run (101 is the seed used for the split
# and the baseline model as well).
# Note: this rebinds Randomforest_model to a fresh, unfitted estimator.
Randomforest_model= RandomForestClassifier(random_state=101)
grid_search= GridSearchCV(Randomforest_model, grid_parameters, cv=5 )

In [None]:
# Run the search. Note that this uses the unscaled X_train, whereas the
# baseline model above was trained on scaled_X_train.
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Predict on the unscaled test split with the grid search; this overwrites the
# baseline model's y_pred.
y_pred= grid_search.predict(X_test)

In [None]:
# Classification report for the grid-search predictions on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Out-of-bag (OOB) score with fixed hyper-parameters.
# NOTE(review): max_features=4 and n_estimators=128 are hard-coded; presumably
# they were copied from an earlier grid-search result - confirm they still
# match grid_search.best_params_.

# random_state makes the bootstrap samples, and hence the OOB score,
# reproducible; 101 matches the seed used elsewhere in this notebook.
Randomforest_model_OOB= RandomForestClassifier(
    max_features=4,
    n_estimators=128,
    oob_score=True,
    random_state=101,
)

In [None]:
# Fit the OOB-enabled forest on the unscaled training split.
Randomforest_model_OOB.fit(X_train, y_train)

In [None]:
# Out-of-bag score of the fitted forest.
Randomforest_model_OOB.oob_score_