In [None]:
#Initialization of python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from pandas import Series, DataFrame
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
# Seed NumPy's global random number generator.
np.random.seed(500)
import warnings
from warnings import simplefilter
# Ignore every FutureWarning raised from this point on.
# NOTE(review): this also hides deprecation notices emitted by the libraries
# imported above, so upcoming breaking changes will not be visible in the output.
simplefilter(action='ignore', category=FutureWarning)

In [None]:
#Load the Titanic train and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/titanic/train.csv", "../input/titanic/test.csv")
)

In [None]:
#Print the column summary of each dataset (train first, then test)
train.info()
test.info()

In [None]:
#Data visualization with respect to survival rate
# Three panels are drawn on the top row of a 2x3 grid in a single figure.

#Distribution of Survival
fig = plt.figure(figsize=(12,6))
plt.subplot2grid((2,3),(0,0))
# value_counts() sorts by frequency, not by class value. Pin the order to
# 0, 1 before attaching the labels so 'Died'/'Survived' can never be swapped
# when class 1 happens to be the more frequent one.
SV = train.Survived.value_counts().reindex([0, 1], fill_value=0)
SV.index = ['Died','Survived']
SV.plot(kind='bar'); plt.title("Distribution of Survival")
print(SV, '\n')

#Distribution of Survival considering Gender
plt.subplot2grid((2,3),(0,1))
sns.countplot(x='Sex', hue='Survived', data=train)
# Mean of the 0/1 Survived column per group = survival rate of that group.
print(train[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean())
plt.legend(('Died', 'Survived')); plt.title("Survival considering Gender")

#Distribution of Survival considering Ticket Class
print('\n')
plt.subplot2grid((2,3),(0,2))
sns.countplot(x='Pclass', hue='Survived', data=train)
print(train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean())
plt.legend(('Died', 'Survived')); plt.title("Survival considering Ticket Class")

In [None]:
#Kernel density estimate of Age, one curve per ticket class
fig = plt.figure(figsize=(12,6))
plt.subplot2grid((2,3),(0,0), colspan=2)
for ticket_class in (1, 2, 3):
    train.loc[train.Pclass == ticket_class, 'Age'].plot(kind='kde')
plt.title("Age Distribution within classes")
plt.xlabel("Age")
plt.legend(('1st Class', '2nd Class','3rd Class'))

#Survival counts and mean survival per port of embarkation
plt.subplot2grid((2,3),(0,2))
sns.countplot(x='Embarked', hue='Survived', data=train)
embarked_survival = train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
print(embarked_survival)
plt.legend(('Died', 'Survived'))
plt.title("Survival considering Port of Embarkation")

In [None]:
#Remove the columns that are not used as model features.
#'Survived' (the target) is additionally removed from the training features.
unused_columns = ['PassengerId','Name','Ticket','Cabin']
Arrange_train = train.drop(columns=unused_columns + ['Survived'])
Arrange_test = test.drop(columns=unused_columns)

In [None]:
#Care of Missing Values
# The filled frames are assigned back instead of calling
# `frame[col].fillna(..., inplace=True)`: that form mutates a temporary column
# selection, is deprecated in recent pandas and leaves the frame unchanged
# under Copy-on-Write, which would let NaNs reach the scaler/PCA steps.
# Each frame is imputed with its own statistics, as before:
#   train: Age -> median, Embarked -> most frequent value
#   test:  Age -> median, Fare -> median
Arrange_train = Arrange_train.fillna({
    'Age': Arrange_train['Age'].median(),
    'Embarked': Arrange_train['Embarked'].mode()[0],
})
Arrange_test = Arrange_test.fillna({
    'Age': Arrange_test['Age'].median(),
    'Fare': Arrange_test['Fare'].median(),
})

In [None]:
#Changing categorical columns (Sex, Embarked) into numerical codes
# One mapping per column, shared by train and test so both are encoded identically.
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}


def _encode_labels(labels, codes):
    """Translate a Series of category labels into numeric codes.

    Parameters
    ----------
    labels : pandas.Series
        Column holding the category labels.
    codes : dict
        Maps every expected label to its numeric code.

    Returns
    -------
    numpy.ndarray
        One code per row, in the original row order. Missing entries stay
        missing (NaN).

    Raises
    ------
    ValueError
        If a non-missing label has no entry in ``codes``; such labels are
        reported instead of being passed through unencoded.
    """
    unexpected = set(labels.dropna().unique()) - set(codes)
    if unexpected:
        raise ValueError(
            "Unmapped category labels: %s" % sorted(map(str, unexpected))
        )
    return labels.map(codes).to_numpy()


#Train data
SetSexTR = _encode_labels(Arrange_train['Sex'], SEX_CODES)
SetEmbTR = _encode_labels(Arrange_train['Embarked'], EMBARKED_CODES)
#Test data
SetSex = _encode_labels(Arrange_test['Sex'], SEX_CODES)
SetEmb = _encode_labels(Arrange_test['Embarked'], EMBARKED_CODES)

In [None]:
#Build the model inputs: the encoded arrays replace the Sex and Embarked columns
#Train data
X_train = Arrange_train.assign(Sex=SetSexTR, Embarked=SetEmbTR)
Y_train = train['Survived'].astype(int)
#Test data
X_test = Arrange_test.assign(Sex=SetSex, Embarked=SetEmb)

In [None]:
#Standardize the features: the scaler is fitted on the training data only
#and then applied to both sets.
#NOTE: X_train / X_test are overwritten here, so this cell must run exactly
#once after the feature-assembly cell.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

#Apply Principal Component Analysis (PCA): keep two components, fitted on
#the training data and reused to project the test data.
pca = PCA(n_components = 2)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_

In [None]:
#Hyperparameter tuning of the classifier with GridSearchCV
# The estimator is seeded explicitly (same value as np.random.seed(500) at the
# top of the notebook): the fits run in worker processes (n_jobs=4), where the
# notebook's global NumPy seed does not apply.
model = RandomForestClassifier(random_state=500)
n_estimators = [10, 50, 100]
max_features = ['sqrt', 'log2']
# NOTE(review): X_train has only 2 columns after the PCA step, so 'sqrt' and
# 'log2' presumably select the same number of features - confirm that this
# axis of the grid is meaningful.
# define grid search
grid = dict(n_estimators=n_estimators,max_features=max_features)
# KFold accepts random_state only together with shuffle=True; passing
# random_state alone raises ValueError in current scikit-learn releases.
k = KFold(n_splits=5, shuffle=True, random_state=100)
grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=k,n_jobs=4, scoring='accuracy',error_score=0, verbose=1)
grid_result = grid_search.fit(X_train, Y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print(grid_result.best_estimator_)

In [None]:
#Hyperparameter tuning of the classifier with RandomizedSearchCV
# Seeded explicitly (same value as np.random.seed(500) at the top of the
# notebook) because the fits run in worker processes (n_jobs=-1).
model = RandomForestClassifier(random_state=500)
rand_param = {
    'n_estimators': [10, 50, 100],
    'max_features': ['sqrt', 'log2']
 }
# Only this many distinct parameter combinations exist, so asking for more
# iterations (previously 200) cannot try anything new; cap n_iter at the size
# of the search space.
n_candidates = int(np.prod([len(values) for values in rand_param.values()]))
n_iter_search = min(200, n_candidates)
# KFold accepts random_state only together with shuffle=True; passing
# random_state alone raises ValueError in current scikit-learn releases.
k = KFold(n_splits=10, shuffle=True, random_state=100)
Rf_search = RandomizedSearchCV(model, param_distributions=rand_param, n_iter=n_iter_search, cv=k, n_jobs=-1, verbose=1, random_state=500)
Rf_result = Rf_search.fit(X_train, Y_train)
# summarize results
print("Best: %f using %s" % (Rf_result.best_score_, Rf_result.best_params_))
print(Rf_result.best_estimator_)

In [None]:
#Train the chosen model and estimate its accuracy with 10-fold cross-validation
# NOTE(review): the hyperparameters below are hard-coded rather than read from
# grid_result.best_params_ / Rf_result.best_params_ - confirm they still match
# the search results.
# random_state makes the forest reproducible: cross_val_score fits clones in
# worker processes (n_jobs=4), where np.random.seed(500) from the first cell
# does not apply.
model = RandomForestClassifier(max_features='log2', n_estimators=50, random_state=500)
score = cross_val_score(model, X_train, Y_train, cv=10, n_jobs=4)
# cross_val_score works on clones, so the model itself still has to be fitted
# on the full training set before it can predict.
model.fit(X_train, Y_train)
print("Cross-validation accuracy: {0:.4f}".format(score.mean()))

#Perform Prediction on test data
y_pred = model.predict(X_test)