In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import warnings
warnings.filterwarnings('ignore')

# Data Loading from Csv

In [None]:
# Read the Titanic train and test splits from the notebook's input directory.
df_train = pd.read_csv('../input/titanic/train.csv')
df_test = pd.read_csv('../input/titanic/test.csv')
# A previously saved submission; its 'Survived' column is attached to df_test in the next cell.
df_test_results = pd.read_csv('../input/dummy-submission/Best_Submission.csv')

In [None]:
# Give the test rows a 'Survived' column (taken from the saved submission) so both splits share a schema.
# NOTE: insert() raises if this cell is run twice on the same kernel, because the column then already exists.
df_test.insert(loc=1, column='Survived', value=df_test_results['Survived'])

# Stack the test rows under the training rows and renumber the rows 0..n-1.
df_train = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [None]:
# Column dtypes and non-null counts of the combined frame.
df_train.info()

# Data Cleaning

## Percentage of Missing Data

In [None]:
def Missing_Data_Plot(df):
  """Draw a bar chart of the percentage of missing values in each column of `df`.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to inspect. Any set of columns is accepted; no particular column
      (such as 'PassengerId') has to be present.

  Returns
  -------
  None. The chart is drawn on the current matplotlib figure; the seaborn figure
  size is set globally to 16x8 as a side effect.
  """
  # isnull().mean() is the fraction of missing entries per column.
  missing_data = df.isnull().mean() * 100

  sns.set(rc={'figure.figsize':(16,8)})
  sns.barplot(x=df.columns, y=missing_data.to_numpy())
  plt.xlabel('Columns')
  plt.ylabel('Percentage of Missing values')
  plt.title('Missing Values')
  # Upper bound 101 so that the 100 % tick is drawn as well.
  plt.yticks(np.arange(0,101,20))

In [None]:
# Missing-value percentages before any imputation.
Missing_Data_Plot(df_train)

## Handling Missing Values

### 1. Age

In [None]:
# Keep only the rows whose Age is present.
temp = df_train.loc[df_train['Age'].notnull()]

In [None]:
# Compare the distinct Parch values in the full frame with those in the rows that have an Age.
print(df_train['Parch'].unique())
print(temp['Parch'].unique())

# Same comparison for SibSp.
print(df_train['SibSp'].unique())
print(temp['SibSp'].unique())

One value of Parch (9) occurs only in rows where Age is missing, and only two rows have that value.
We therefore recode it to 6, since the Parch field is going to be grouped later anyway.

In [None]:
# Recode Parch == 9 to 6 with one .loc call on the frame itself. The chained form
# df_train['Parch'].loc[...] = 6 may assign to a temporary Series and leave df_train unchanged.
df_train.loc[df_train['Parch'] == 9, 'Parch'] = 6

In [None]:
# Rows whose Age is known; the age matrix is computed from these rows only.
temp = df_train.loc[df_train['Age'].notnull()]

# Distinct Parch / SibSp values in order of first appearance.
Parch_key = list(temp['Parch'].unique())
Sib_key = list(temp['SibSp'].unique())

# Map each Parch value to a row position and each SibSp value to a column position of Age_mat.
Parch_dict = {value: position for position, value in enumerate(Parch_key)}
Sib_dict = {value: position for position, value in enumerate(Sib_key)}

# Age_mat[i, j] is the rounded mean Age of the passengers with Parch == Parch_key[i] and
# SibSp == Sib_key[j]. A combination with no passengers yields NaN (mean of an empty array).
Age_mat = np.array([
  [np.mean(temp.loc[(temp['Parch'] == parch) & (temp['SibSp'] == sibsp), 'Age'].to_numpy()).round()
   for sibsp in Sib_key]
  for parch in Parch_key
])

In [None]:
# Show the age matrix (rows follow Parch_dict, columns follow Sib_dict).
print(Age_mat)

In [None]:
def Age_Imputer(df,Age_mat,Parch_dict,Sib_dict):
  """Fill missing 'Age' values of `df` in place from a Parch x SibSp lookup matrix.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'Age', 'Parch' and 'SibSp'. Modified in place.
  Age_mat : numpy.ndarray
      2-D array; Age_mat[i, j] is the age to use for row position i and column position j.
  Parch_dict : dict
      Maps a Parch value to its row position in `Age_mat`.
  Sib_dict : dict
      Maps a SibSp value to its column position in `Age_mat`.

  Returns
  -------
  None.

  Raises
  ------
  KeyError
      If a row with missing Age has a Parch or SibSp value that is not a key of the
      corresponding dictionary.
  """
  for ind in df.index[df['Age'].isnull()]:
    i = Parch_dict[df.at[ind, 'Parch']]
    j = Sib_dict[df.at[ind, 'SibSp']]
    # Write through the frame itself; df['Age'][ind] = ... is chained assignment and
    # is not guaranteed to reach df.
    df.at[ind, 'Age'] = Age_mat[i,j]

In [None]:
# Fill the missing ages of df_train in place.
Age_Imputer(df_train,Age_mat,Parch_dict,Sib_dict)

In [None]:
# Re-check the missing-value picture after the Age imputation.
Missing_Data_Plot(df_train)
df_train.info()

As Cabin has almost 80% of its values missing, there is no way we can impute them without introducing some kind of bias into the data.
Therefore we will drop this column later.

Now we are left with only one missing value of Fare and 2 missing values of Embarked.

### 2. Embarked

In [None]:
# Keep only the rows whose Embarked value is present.
temp = df_train.loc[df_train['Embarked'].notnull()]

In [None]:
# count[k] lists how many passengers of Pclass k+1 embarked at 'S', 'Q' and 'C' (in that order).
count = [
  [int((temp.loc[temp['Pclass'] == pclass, 'Embarked'] == port).sum()) for port in ('S', 'Q', 'C')]
  for pclass in range(1, 4)
]
count

In [None]:
x = np.arange(3)
# One group of bars per port; inside a group each class gets its own bar, shifted sideways.
bar_specs = zip((0.2, 0, -0.2), count, ('r', 'orange', 'g'), ('Class-1', 'Class-2', 'Class-3'))
for shift, per_port, colour, tag in bar_specs:
  # Share of this class's passengers that boarded at each port, in percent.
  share = per_port/np.sum(per_port)*100
  plt.bar(x+shift, share, color=colour, width=0.2, label=tag)
plt.xticks(x, ['S', 'Q', 'C'])
plt.legend()
plt.title('Relation between Class and Embarked')
plt.xlabel('Embarked')
plt.ylabel('%age. of Passengers')
plt.show()

We can see from the plots :
1. That most of the passenger have Embarked from 'S'
2. The passengers that Embarked from 'S' are mostly of Class-3 & Class-2
3. The passengers that Embarked from 'C' are mostly of Class-1

Since there are only 2 missing values we can :
1. Either Fill the most frequent value i.e. 'S'.
2. Or We can Fill the values as per the Class they are in.

I have used the second method, since this notebook is shared with the Playground competition, which has the same dataset but with many more missing values.

In [None]:
def Embarked_Imputer(df):
  """Fill missing 'Embarked' values of `df` in place, based on 'Pclass'.

  Rows with Pclass 2 or 3 receive 'S'; every other row with a missing value receives 'C'.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'Embarked' and 'Pclass'. Modified in place.

  Returns
  -------
  None.
  """
  missing = df['Embarked'].isnull()
  lower_class = df['Pclass'].isin([2, 3])
  # Single-step .loc assignments; df['Embarked'][idk] = ... is chained assignment and
  # is not guaranteed to reach df.
  df.loc[missing & lower_class, 'Embarked'] = 'S'
  df.loc[missing & ~lower_class, 'Embarked'] = 'C'

In [None]:
# Fill the missing Embarked values of df_train in place.
Embarked_Imputer(df_train)

### 3. Fare

For Fare imputation, my hypothesis is that the fare should depend on 3 factors only:
1. Age of the passenger ("Age")
2. Where the passenger boarded the ship ("Embarked")
3. Which class the passenger is travelling in ("Pclass")

Therefore I have made a 2D matrix of fares whose two indices are the "Embarked & Pclass" combination and the "Age_Bin".

In [None]:
# Only the columns that feed the fare matrix have to be complete. A bare dropna() would
# also discard every row without a Cabin value, although Cabin plays no part here.
temp = df_train.dropna(axis=0, subset=['Age', 'Fare', 'Embarked', 'Pclass'])

In [None]:
# One boolean mask per port of embarkation.
mask_1 = (temp['Embarked'] == 'S')
mask_2 = (temp['Embarked'] == 'Q')
mask_3 = (temp['Embarked'] == 'C')

# Select the columns with a list: a set has no defined order, so the column layout of the
# slices would be arbitrary, and recent pandas versions refuse a set as an indexer.
fare_columns = ['Age', 'Fare', 'Pclass']
temp_1 = temp.loc[mask_1, fare_columns]
temp_2 = temp.loc[mask_2, fare_columns]
temp_3 = temp.loc[mask_3, fare_columns]

In [None]:
# Per-port slices in the order S, Q, C.
embarked_filter = [temp_1, temp_2, temp_3]

In [None]:
def Class_Filter(temp):
  """Split a frame into three NumPy arrays, one per passenger class (1, 2, 3).

  Parameters
  ----------
  temp : pandas.DataFrame
      Must contain a 'Pclass' column.

  Returns
  -------
  list of numpy.ndarray
      [rows with Pclass 1, rows with Pclass 2, rows with Pclass 3], each holding all
      columns of `temp`. A class with no rows yields an array with zero rows instead of
      raising KeyError.
  """
  return [np.array(temp.loc[temp['Pclass'] == pclass]) for pclass in (1, 2, 3)]

In [None]:
# class_filtered[p][c] holds the rows of port p (S, Q, C) and class c+1 as a NumPy array.
class_filtered = [Class_Filter(per_port) for per_port in embarked_filter]

In [None]:
# Positions of the Age and Fare columns inside the per-port slices, looked up by name
# rather than by matching the magic values 35 and 53.1 in one particular row.
# The result keeps the np.where-style shape (a tuple holding one index array) because
# age_bin reads the positions as a[0][0] and f[0][0].
_slice_columns = list(temp_1.columns)
a = (np.array([_slice_columns.index('Age')]),)
f = (np.array([_slice_columns.index('Fare')]),)

In [None]:
def age_bin(temp):
  """Return the mean fare of four age groups for one 2-D array of passengers.

  The Age and Fare column positions are read from the module-level `a` and `f`
  (as a[0][0] and f[0][0]). Fares are rounded to 2 decimals before averaging and
  each mean is rounded to 2 decimals again.

  Age groups: (0, 20], (20, 40], (40, 60], (60, 90]. Rows outside these ranges are ignored.
  A group without rows yields NaN.

  Parameters
  ----------
  temp : numpy.ndarray
      2-D array with one passenger per row.

  Returns
  -------
  list
      Four mean fares, one per age group, in ascending age order.
  """
  age_bounds = ((0, 20), (20, 40), (40, 60), (60, 90))
  ages = temp[:, a[0][0]]
  fares = temp[:, f[0][0]].round(2)
  return [np.mean(fares[(ages > low) & (ages <= high)]).round(2) for low, high in age_bounds]

In [None]:
# One row of four age-group means per (port, class) pair, ordered S1, S2, S3, Q1, ..., C3.
# Written as a comprehension so no notebook-level variable named `slice` is created,
# which would shadow the builtin of the same name.
fare = [age_bin(class_filtered[i][j]) for i in range(0,3) for j in range(0,3)]

In [None]:
# Rows: (port, class) combinations; columns: the four age groups returned by age_bin.
Fare_mat = np.array(fare)

In [None]:
# Show the fare matrix.
print(Fare_mat)

Visualising the distribution of 'Fare' by 'Pclass', 'Embarked' and age group

In [None]:
Age_bin = np.arange(4)
plt.figure(figsize=[16,12])
# Rows 0-2 of Fare_mat are the 'S' rows for class 1, 2 and 3.
for shift, row, colour, tag in zip((-0.2, 0, 0.2), (0, 1, 2), ('orange', 'red', 'blue'), ('Class-1', 'Class-2', 'Class-3')):
  plt.bar(Age_bin+shift, Fare_mat[row], color=colour, width=0.2, label=tag)
plt.xticks(Age_bin,['0-20','20-40','40-60','>60'])
plt.xlabel('Age Groups')
plt.ylabel('Fare')
plt.title("Fare Distribution for Emabrked = 'S'")
plt.legend(loc='upper right')
plt.show()

In [None]:
Age_bin = np.arange(4)
plt.figure(figsize=[16,12])
# Rows 3-5 of Fare_mat are the 'Q' rows for class 1, 2 and 3.
for shift, row, colour, tag in zip((-0.2, 0, 0.2), (3, 4, 5), ('orange', 'red', 'blue'), ('Class-1', 'Class-2', 'Class-3')):
  plt.bar(Age_bin+shift, Fare_mat[row], color=colour, width=0.2, label=tag)
plt.xticks(Age_bin,['0-20','20-40','40-60','>60'])
plt.xlabel('Age Groups')
plt.ylabel('Fare')
plt.title("Fare Distribution for Emabrked = 'Q'")
plt.legend(loc='upper right')
plt.show()

In [None]:
Age_bin = np.arange(4)
plt.figure(figsize=[16,12])
# Rows 6-8 of Fare_mat are the 'C' rows for class 1, 2 and 3.
for shift, row, colour, tag in zip((-0.2, 0, 0.2), (6, 7, 8), ('orange', 'red', 'blue'), ('Class-1', 'Class-2', 'Class-3')):
  plt.bar(Age_bin+shift, Fare_mat[row], color=colour, width=0.2, label=tag)
plt.xticks(Age_bin,['0-20','20-40','40-60','>60'])
plt.xlabel('Age Groups')
plt.ylabel('Fare')
plt.title("Fare Distribution for Emabrked = 'C'")
plt.legend(loc='upper right')
plt.show()

In [None]:
def Fare_Imputer(train,new):
  """Return a copy of `train` whose missing 'Fare' values are filled from a lookup matrix.

  Parameters
  ----------
  train : pandas.DataFrame
      Must contain the columns 'Fare', 'Age', 'Pclass' and 'Embarked'. Columns are
      addressed by name, so their positions do not matter. Not modified.
  new : numpy.ndarray
      9 x 4 fare matrix. Rows are the Embarked/Pclass combinations in the order
      S1, S2, S3, Q1, Q2, Q3, C1, C2, C3; columns are the age groups
      (0, 20], (20, 40], (40, 60], (60, 90] used by age_bin.

  Returns
  -------
  pandas.DataFrame
      Same columns as `train` with a fresh 0..n-1 index. Column dtypes are kept
      (the frame is not converted to an all-object array).

  Notes
  -----
  The median of the known fares is used whenever no matrix entry applies: the
  Embarked/Pclass combination is unknown, the age is missing or outside (0, 90],
  or the matrix entry itself is NaN.
  """
  comb_dict = { 'S1': 0, 'S2': 1, 'S3': 2, 'Q1': 3, 'Q2': 4, 'Q3': 5, 'C1': 6, 'C2': 7, 'C3': 8 }
  filled = train.reset_index(drop=True).copy()
  fallback = train['Fare'].median()
  fare_pos = filled.columns.get_loc('Fare')

  for row in np.flatnonzero(filled['Fare'].isnull().to_numpy()):
    age = filled['Age'].iat[row]
    comb = str(filled['Embarked'].iat[row]) + str(filled['Pclass'].iat[row])
    value = np.nan
    if comb in comb_dict and pd.notnull(age) and 0 < age <= 90:
      # Same right-closed groups as age_bin: an age of exactly 20 belongs to group 0,
      # and everything above 60 belongs to the last group.
      age_group = min(math.ceil(age / 20) - 1, 3)
      value = new[comb_dict[comb], age_group]
    filled.iat[row, fare_pos] = fallback if np.isnan(value) else np.round(value, 2)
  return filled

In [None]:
# Replace df_train with the frame returned by the fare imputation.
df_train = Fare_Imputer(df_train,Fare_mat)


In [None]:
# Check dtypes and non-null counts after the imputations.
df_train.info()

In [None]:
# Work on a copy so the feature-engineering steps below leave df_train untouched.
new_df_train = df_train.copy()

# Feature Engineering : Adding extra features


## 1. Deriving a new feature "Alone" if a traveler is alone or not

In [None]:
# Alone_Function creates a new feature "Alone"

def Alone(df):
  """Add the 0/1 columns 'Alone', 'Small Family' and 'Big Family' to `df` in place.

  Exactly one of the three flags is 1 in every row:
    * 'Alone'        - SibSp == 0 and Parch == 0
    * 'Big Family'   - not alone, and SibSp > 2 or Parch > 2
    * 'Small Family' - everything else

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'SibSp' and 'Parch'. Modified in place.

  Returns
  -------
  None.

  Notes
  -----
  The columns are inserted at positions 12, 13 and 14 when the frame is wide enough
  and appended at the end otherwise. If a column already exists (the function was
  run before) it is overwritten instead of raising.
  """
  sibsp = df['SibSp']
  parch = df['Parch']
  is_alone = (sibsp == 0) & (parch == 0)
  is_big = ~is_alone & ((sibsp > 2) | (parch > 2))
  is_small = ~is_alone & ~is_big

  flags = [('Alone', is_alone), ('Small Family', is_small), ('Big Family', is_big)]
  for offset, (name, mask) in enumerate(flags):
    # Store real integers so that sums over these columns are numeric.
    values = mask.astype(int)
    if name in df.columns:
      df[name] = values
    else:
      df.insert(min(12 + offset, len(df.columns)), name, values)

In [None]:
# Add the three family-size flags to new_df_train in place.
Alone(new_df_train)

## Visualising  feature Alone with Survival

In [None]:
col_list = ['Alone','Small Family','Big Family']
survived_rows = (new_df_train['Survived'] == 1)
not_survived_rows = (new_df_train['Survived'] == 0)

# For every group flag: the percentage of the group's members that survived / did not survive.
Survived = [
  (new_df_train[col].loc[survived_rows].sum()/new_df_train[col].sum()*100).round(2)
  for col in col_list
]
Not_Survived = [
  (new_df_train[col].loc[not_survived_rows].sum()/new_df_train[col].sum()*100).round(2)
  for col in col_list
]

In [None]:
plt.figure(figsize = [16,8])
# Percentage of each group that survived.
plt.bar(0, Survived[0], color='orange',width=0.2, label = 'Survived')
plt.bar(1, Survived[1], color='orange',width=0.2)
plt.bar(2, Survived[2], color='orange',width=0.2)

# Percentage of each group that did not survive.
plt.bar(0.2, Not_Survived[0], color='g',width=0.2, label = 'Not-Survived')
plt.bar(1.2, Not_Survived[1], color='g',width=0.2,)
plt.bar(2.2, Not_Survived[2], color='g',width=0.2,)

# Survived-to-not-survived ratio (x100) of each group. Every group is divided by its OWN
# Not_Survived value; dividing groups 1 and 2 by Not_Survived[0] mixed different groups.
plt.bar(0.1, (Survived[0]/Not_Survived[0])*100, color='black',width=0.01, label = 'Chances of Survival')
plt.bar(1.1, (Survived[1]/Not_Survived[1])*100, color='black',width=0.01,)
plt.bar(2.1, (Survived[2]/Not_Survived[2])*100, color='black',width=0.01,)

# The groups are on the x axis and the percentages on the y axis.
plt.xlabel('Type of Group of Passenger')
plt.ylabel('Percentage of Passengers')
plt.title('Group of Passengers vs Survival')
plt.xticks([0.1,1.1,2.1],['Alone','Small Family','Big Family'])
plt.legend()

From the graph it is clear that:
1. Most of the passengers who were travelling alone died, and a person travelling alone had the lowest chance of survival.
2. Most of the small families survived, maybe because a small family has fewer members and can therefore gather more easily at the rescue point.

## 2. Deriving a new feature "Salutation", i.e. the title of the person travelling, which we then group into 'Adults', 'Teen/Kid', 'Officer' and 'Royalty'

In [None]:
#Salutation Function for Extracting Salutation and Length of Name from Name Column
def Salutation(df):
  """Add the columns 'Salutation' and 'Len_Name' to `df` in place, derived from 'Name'.

  'Salutation' is the first space-separated word after the first ", " in the name
  (for "Braund, Mr. Owen Harris" that is "Mr."); 'Len_Name' is the length of the
  whole name string.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a 'Name' column of strings that each contain ", ".

  Returns
  -------
  None.
  """
  names = list(df['Name'])
  titles = [full_name.split(", ")[1].split(" ")[0] for full_name in names]
  name_lengths = [len(full_name) for full_name in names]
  df['Salutation'] = titles
  df['Len_Name'] = name_lengths

In [None]:
# Add 'Salutation' and 'Len_Name' to new_df_train in place.
Salutation(new_df_train)

In [None]:
# Distinct salutations (in order of first appearance) and how often each one occurs.
sal_list = new_df_train['Salutation'].unique()
sal_count = [int((new_df_train['Salutation'] == title).sum()) for title in sal_list]

plt.figure(figsize=[16,8])
sns.barplot(x=sal_list, y=sal_count)
plt.xlabel('Salutation Type')
plt.ylabel('Count of Salutation')

In [None]:
# Group the extracted salutations. Salutation() keeps the trailing dot, so the key has to
# be 'Miss.'; with 'Miss' the title never matched and every Miss was labelled 'Royalty'.
sal_groups = {
  'Mr.': 'Adults', 'Mrs.': 'Adults',
  'Miss.': 'Teen/Kid', 'Master.': 'Teen/Kid',
  'Dr.': 'Officer', 'Major.': 'Officer', 'Col.': 'Officer', 'Capt.': 'Officer',
}
# Every salutation that is not listed above falls into 'Royalty'.
new_df_train['New_Sal'] = new_df_train['Salutation'].map(sal_groups).fillna('Royalty')

In [None]:
# Survival and non-survival percentage of every salutation group, in the order of sal_list.
sal_list = new_df_train['New_Sal'].unique()
Survived_Sal = []
Not_Survived_Sal = []
for group in sal_list:
  outcomes = new_df_train['Survived'].loc[new_df_train['New_Sal'] == group]
  n_total = len(outcomes)
  n_survived = outcomes.sum()
  Survived_Sal.append( round( n_survived / n_total, 4) * 100 )
  Not_Survived_Sal.append( round( ( n_total - n_survived ) / n_total, 4) * 100 )
print(Survived_Sal)
print(Not_Survived_Sal)

In [None]:
# One bar pair per salutation group. Survived_Sal / Not_Survived_Sal were built by iterating
# sal_list, so the positions and tick labels are taken from sal_list as well; hard-coded
# labels only line up if unique() happens to return the groups in that exact order.
Sal_type = np.arange(len(sal_list))
plt.bar(Sal_type-0.2,Survived_Sal,color = 'orange', width = 0.2, label = 'Survived')
plt.bar(Sal_type,Not_Survived_Sal,color = 'g', width = 0.2, label = 'Not-Survived')

plt.xticks(Sal_type,sal_list)
plt.xlabel('Sal Types')
# The plotted values are percentages, not head counts.
plt.ylabel('%age. of Passengers')
plt.title("Survival on basis of Salutation")
plt.legend()
plt.show()

In [None]:
from sklearn.preprocessing import OneHotEncoder
def Encoder(df,col_name):
  """One-hot encode a single column of `df`.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame that contains `col_name`. Not modified.
  col_name : str
      Name of the column to encode.

  Returns
  -------
  pandas.DataFrame
      One float 0/1 column per distinct value, named '<col_name>_<value>' and sorted by
      value, with a fresh 0..n-1 index.

  Notes
  -----
  Implemented with pandas.get_dummies so the function does not depend on the
  OneHotEncoder arguments `sparse=` and `get_feature_names`, which no longer exist
  in current scikit-learn releases.
  """
  dummies = pd.get_dummies(df[col_name], prefix=col_name, dtype=float)
  return dummies.reset_index(drop=True)

In [None]:
# A list, not a set: set iteration order is not fixed, so the order of the encoded
# columns (and everything that depends on column order) could change between runs.
encoded_columns = ['Pclass','Embarked','Sex','Parch','SibSp','New_Sal']

# Encode every listed column from the current frame and append all results in one concat.
encoded_frames = [Encoder(new_df_train,col) for col in encoded_columns]
new_df_train = pd.concat([new_df_train] + encoded_frames,axis=1)

Dropping Extra Columns like **'Name' , 'Sex' , 'Embarked' , 'Cabin' , 'Ticket' , 'New_Sal' , 'Salutation'**

In [None]:
# Text columns and columns that have been replaced by their one-hot encodings.
drop_col_list = {'Name', 'Sex','Embarked','Cabin','Ticket','New_Sal','Salutation'}
new_df_train = new_df_train.drop(columns=list(drop_col_list)).reset_index(drop=True)

In [None]:
# Inspect the remaining columns and their dtypes.
new_df_train.info()

In [None]:
# Cast every column to int except Fare, which is cast to float straight from its original
# values. Casting the whole frame to int first cuts the decimals off Fare, and the later
# cast back to float cannot restore them.
column_types = {col: int for col in new_df_train.columns if col != 'Fare'}
column_types['Fare'] = float
new_df_train = new_df_train.astype(column_types)

In [None]:
# Confirm the dtypes after the cast.
new_df_train.info()

# Feature Selection / Feature Importance

### Importing Necessary Models

In [None]:
# Importing Models for Feature Selection
from sklearn.feature_selection import SelectFromModel

# Importing TrainTest Split and MinMaxScaler for Scaling the Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Importing Grid Search CV for HyperParameterTuning and Stratifiedkfold for CV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Importing Classifier Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# Importing Metrics From Sklearn for Evaluation
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [None]:
# Feature matrix: every column except the target; target vector: the 'Survived' column.
X = new_df_train.drop(columns='Survived')
Y = new_df_train['Survived']

In [None]:
# Seeded forest: the importances below decide which features SelectFromModel keeps, so an
# unseeded forest can select a different feature set on every run.
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt', random_state=42)
clf = clf.fit(X,Y)

# One row per feature, least important first, indexed by feature name.
features = (
  pd.DataFrame({'feature': X.columns, 'Importance': clf.feature_importances_})
  .sort_values(by=['Importance'],ascending = True)
  .set_index('feature')
)

# Ploting the Feature Importance
features.plot(kind = 'barh', figsize = (25,25))

In [None]:
# Wrap the already fitted forest (prefit=True) and reduce X to the features it selects.
# NOTE(review): the name `model` is rebound to the Keras network further down, so this cell
# cannot be re-run after that one without refitting.
model = SelectFromModel(clf,prefit=True)
reduced_df_train = model.transform(X)

In [None]:
# Rows before this position are used for training and validation, the rest for the submission.
# presumably 891 is the number of rows of train.csv - TODO confirm
n_train_rows = 891
train_set = reduced_df_train[:n_train_rows,:]
test_set = reduced_df_train[n_train_rows:,:]

# Column vector of targets; -1 lets NumPy infer the row count instead of hard-coding 1309.
Y = np.array(Y).reshape(-1,1)
X = train_set

# Splitting Data

In [None]:
# Hold out 30 % of the first 891 rows for validation; Y[:891,-1] flattens the target column to 1-D.
X_train,X_test,y_train,y_test = train_test_split(X,Y[:891,-1], random_state = 42,test_size=0.3)

In [None]:
# Fit the scaler on the training part only and apply the same scaling to the validation part.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Using Grid Search CV for Hyperparameter tuning Of Models

# Using Models without HyperParameter Tuning

In [None]:
# Baseline models with default hyper-parameters. The stochastic ones are seeded so that the
# baseline scores can be reproduced and compared with the tuned models.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train,y_train)

classifier_2 = LogisticRegression()
classifier_2.fit(X_train,y_train)

classifier_3 = GradientBoostingClassifier(random_state=42)
classifier_3.fit(X_train,y_train)

classifier_4 = XGBClassifier(random_state=42)
classifier_4.fit(X_train,y_train)

# Majority vote of the three tree-based models.
classifier_5 = VotingClassifier(estimators=[('rf',classifier),('gb',classifier_3),('xgb',classifier_4)], voting='hard',n_jobs=-1,)
classifier_5.fit(X_train,y_train)

In [None]:
# Predict on the validation part with every baseline model and print its report under a bold heading.
baseline_models = [
  ("For Random Forest :", classifier),
  ("For Logistic Regression :", classifier_2),
  ("For Gardient Boosting Classifier :", classifier_3),
  ("For XGBoost Classifier :", classifier_4),
  ("For Voting Classifier :", classifier_5),
]
baseline_predictions = []
for heading, estimator in baseline_models:
  baseline_predictions.append(estimator.predict(X_test))
  print("\033[1m" + heading + "\033[0m")
  print(classification_report(y_test,baseline_predictions[-1]))

y_pred, y_pred_2, y_pred_3, y_pred_4, y_pred_5 = baseline_predictions

In [None]:
# Accuracy and confusion matrix of every baseline model, numbered from 1.
predictions = [y_pred, y_pred_2, y_pred_3,y_pred_4,y_pred_5]
for number, predicted in enumerate(predictions, start=1):
  print("\033[1m" +'For Classifier :' + str(number) + "\033[0m" )
  print(accuracy_score(y_test,predicted))
  print(confusion_matrix(y_test,predicted))

# Calculating Best Parameters for RandomForestClassifier

In [None]:
# Search space for RandomForestClassifier (a one-element list; the search cell uses grid[0]).
# 'auto' is left out of max_features: for classifiers it meant the same as 'sqrt', so it only
# duplicated candidates, and current scikit-learn releases reject the value.
grid   =   [ {
               'max_depth': [4,6,8],
               'min_samples_split': [2, 3, 10],
               'min_samples_leaf': [1, 3, 10],
               'criterion' : ['gini', 'entropy' ],
               'max_features' : ['sqrt','log2'],
               'n_estimators': [50,10] } ]


In [None]:
# 3-fold stratified cross-validation, shared by both grid searches.
cross_val = StratifiedKFold(n_splits = 3)
# Seeded forest, so score differences between candidates come from the hyper-parameters
# and not from a different random state per fit.
GSCV = GridSearchCV( RandomForestClassifier(random_state=42), param_grid = grid[0], cv = cross_val, verbose=0 )
GSCV.fit(X_train,y_train)

In [None]:
# Keep the winning random-forest parameters for the tuned model and report them with their CV score.
RF_param = GSCV.best_params_
print(" Best Parameters:", RF_param, sep="")
print(" Best Score:", GSCV.best_score_, sep="")

# Calculating Best Parameters for Logistic Regression

In [None]:
# Search space for LogisticRegression (a one-element list; the search cell uses grid[0]).
# NOTE(review): not every solver/penalty pair here is a valid combination (e.g. 'lbfgs'
# and 'newton-cg' with 'l1'); confirm how GridSearchCV treats those failing candidates.
grid   =   [  {
               'solver' : ['newton-cg', 'lbfgs', 'liblinear'],
               'penalty' : ['l1', 'l2'],
               'C': [0.001, 0.01, 0.1, 1, 10]  }  ]

In [None]:
# Grid search for logistic regression; reuses `cross_val` and rebinds GSCV.
GSCV = GridSearchCV( LogisticRegression(), param_grid = grid[0], cv = cross_val, verbose=0 )
GSCV.fit(X_train,y_train)

In [None]:
# Keep the winning logistic-regression parameters for the tuned model and report them with their CV score.
LR_param = GSCV.best_params_
print(" Best Parameters:", LR_param, sep="")
print(" Best Score:", GSCV.best_score_, sep="")

# After Hyperparameter Tuning of Models

In [None]:
# Refit the models; the first two use the parameters found by the grid searches above.
classifier = RandomForestClassifier(**RF_param)
classifier.fit(X_train,y_train)

classifier_2 = LogisticRegression(**LR_param)
classifier_2.fit(X_train,y_train)

# Hand-picked gradient-boosting parameters (no grid search for this model in the notebook).
# NOTE(review): loss='deviance' and criterion='mse' are names from older scikit-learn
# releases; confirm them against the installed version before running.
classifier_3 = GradientBoostingClassifier(n_estimators = 180, min_samples_split = 2, min_samples_leaf = 4, max_depth = 50, loss = 'deviance', learning_rate = 0.01, criterion = 'mse')
classifier_3.fit(X_train,y_train)

# Same default configuration as the untuned XGBoost model.
classifier_4 = XGBClassifier()
classifier_4.fit(X_train,y_train)

# Majority vote of the three tree-based models.
classifier_5 = VotingClassifier(estimators=[('rf',classifier), ('gb',classifier_3),('xgb',classifier_4)], voting='hard',n_jobs=-1,)
classifier_5.fit(X_train,y_train)

In [None]:
# Predict on the validation part with every refitted model and print its report.
tuned_models = [
  ("For Random Forest :", classifier),
  ("For Logistic Regression :", classifier_2),
  ("For Gradient Boosting :", classifier_3),
  ("For Xtreme Gradient Boosting :", classifier_4),
  ("For Voting Classifier :", classifier_5),
]
tuned_predictions = []
for heading, estimator in tuned_models:
  tuned_predictions.append(estimator.predict(X_test))
  print(heading)
  print(classification_report(y_test,tuned_predictions[-1]))

y_pred, y_pred_2, y_pred_3, y_pred_4, y_pred_5 = tuned_predictions

In [None]:
# Summary of Predictions after hyper Parameter Tuning:
predictions_2 = [y_pred, y_pred_2, y_pred_3,y_pred_4,y_pred_5]
# Iterate the tuned predictions; looping over `predictions` printed the untuned scores again.
for j, i in enumerate(predictions_2, start=1):
  print('For Classifier :' + str(j))
  print(accuracy_score(y_test,i))
  print(confusion_matrix(y_test,i))

In [None]:
# Accuracy of every classifier before and after hyper-parameter tuning, numbered from 1.
for number, untuned in enumerate(predictions, start=1):
  tuned = predictions_2[number - 1]
  print('For Classifier :' + str(number))
  print('Without HyperParameter Tuning' + str(accuracy_score(y_test,untuned)) )
  print('After HyperParameter Tuning' + str(accuracy_score(y_test,tuned)) )

# Using Keras Sequential Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Small fully connected network: two hidden layers of 8 ReLU units and one sigmoid output.
# NOTE(review): this rebinds `model`, which earlier held the SelectFromModel selector.
dim = X_train.shape[1]
model = Sequential()
model.add(Dense(8, input_dim=dim, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train,y_train, epochs=100, batch_size=10)
# evaluate the keras model on the data it was trained on, so the printed value is the
# training accuracy, not a validation score
_, accuracy = model.evaluate(X_train, y_train)
print('Accuracy: %.2f' % (accuracy*100))

In [None]:
# Round the sigmoid outputs for the validation part to 0 / 1.
y_pred_6 = model.predict(X_test,batch_size=10).round(0)

In [None]:
# Integer class labels, so they can be compared with y_test.
y_pred_6 = y_pred_6.astype(int)

In [None]:
# Validation metrics of the Keras model.
print(classification_report(y_test,y_pred_6))
print(accuracy_score(y_test,y_pred_6))
print(confusion_matrix(y_test,y_pred_6))

# Preparing Submission File

In [None]:
# Predict the test rows with the refitted random forest, using the scaler fitted on the
# training part. The column name is passed as a list, since a set is not an ordered collection.
result = pd.DataFrame(classifier.predict(scaler.transform(test_set)),columns=['Survived'])

In [None]:
# Pair the PassengerId column of the original test file with the predictions and save them as CSV.
passenger_ids = pd.read_csv('../input/titanic/test.csv')['PassengerId']
submission = pd.concat([passenger_ids, result], axis=1)
submission.to_csv('./Submission_5.csv',index=False)