In [None]:

# Standard library
import os

# Data analysis
import numpy as np
import pandas as pd
import scipy.stats as stat

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data preprocessing
# (the original line imported the non-existent name `PowerTransformer0`, which
# raised ImportError and prevented every import below it from running)
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier

# Model validation and hyper-parameter tuning
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve


# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# 0.LIBRARIES REQUIRED

In [None]:
#Data Analysis
import numpy as np
import pandas as pd
import scipy.stats as stat

#Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

#Data Preprocessing
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
import scipy.stats as stat

#Model Creations
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#Model Validation 
from sklearn.model_selection import  cross_val_score, StratifiedKFold, learning_curve

#Hyper Parameter Tuning
from sklearn.model_selection import GridSearchCV


In [None]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
train.shape,test.shape

In [None]:
train.info() # check the dtype of every column (int/float/object) and get a rough idea of the null values

In [None]:
test.info()  # check the dtype of every column (int/float/object) and get a rough idea of the null values

In [None]:
train.head(4)

In [None]:
test.head(4)

In [None]:
train.describe()

# 1.EXPLORATORY DATA ANALYSIS (EDA)

## EDA (Categorical Features)

In [None]:
#quant_features=train.select_dtypes(exclude=['object']).columns
#cat_features=train.select_dtypes(include=['object']).columns
quant_feature=['Age','SibSp','Parch','Fare']
cat_features=['Pclass','Sex','Embarked','Ticket','Cabin','Survived']

In [None]:
# Plot every categorical feature against the target.
for i in cat_features:
    if i == 'Survived':
        # `cat_features` also lists the target itself. Comparing it with itself
        # carries no information, so only show its class balance.
        sns.countplot(x=i, data=train)
        plt.show()
        continue
    # Passenger counts per category, split by survival outcome.
    sns.countplot(x=i, data=train, hue='Survived')
    plt.show()
    # Average of 'Survived' per category.
    sns.barplot(x=i, y='Survived', data=train)
    plt.show()

## EDA Numerical Features

In [None]:
# Check the distribution of the quantitative features; they may need to be standardized/normalized.

In [None]:
# One histogram per numerical feature of the training data.
for feature_name in quant_feature:
    feature_values = train[feature_name]
    sns.histplot(feature_values)
    plt.show()
    

In [None]:
#heatmap to check the correclation between the featues , 
#if it is highly correlated we have to ignore one of the feature
sns.heatmap(train[quant_feature].corr(),annot=True)  

In [None]:
x=pd.pivot_table(train,index='Survived',values=quant_feature)
x

In [None]:
group_survival=train.groupby(['Survived','Pclass'])

In [None]:
# Mean age for every (Survived, Pclass) combination.
# The column is selected with an explicit list: the stray trailing comma in
# ['Age',] built a one-element tuple key instead of a column selection.
group_survival[['Age']].mean()

# 2.FEATURE ENGINEERING

In [None]:
"""
Creating a new dataset by combining both the test and train data ,which will be easy to preprocess and later just before modeling split it again into train and test 
This will avoid the tedious process of preprocessing train and test datasets seperately and avoid confusiions lateron """
dataset =  pd.concat(objs=[train, test], axis=0).reset_index(drop=True)

In [None]:
# Keep the PassengerId values of the test data under the name IDtest: they are
# needed for the submission file, and PassengerId is dropped from the features
# before modelling.
IDtest = test["PassengerId"]

In [None]:
dataset.isnull().sum()

## Handling Null/Nan values

In [None]:
dataset['Embarked']=dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])  
#since it is two Nan in Embarked, filled with the mode or maximum occurance

In [None]:
dataset['Fare']=dataset['Fare'].fillna(dataset['Fare'].median()) 
#Nan-Fare values filled with median of the data

Handling NaN: Age
First we need to find which of the other features are related to Age,
so we plot each of them against Age.

In [None]:
sns.boxplot(x='Sex',y='Age',data=dataset)
# the distribution of age data (range as well as the mean ) against Gender are amlost same.
# we can ignore this feature to fill Age Nan.

In [None]:
sns.boxplot(x='Pclass',y='Age',data=dataset)
#in this we can observe the distribution of Age against different Pclass class is different , 
#so this can be considered to fill Age NaN values

In [None]:
sns.boxplot(x='Parch',y='Age',data=dataset)
#in this we can observe the distribution of Age against different Parch class is different , 
#so this can be considered to fill Age NaN values

In [None]:
sns.boxplot(x='SibSp',y='Age',data=dataset)
#in this we can observe the distribution of Age against different SibSp class is different , 
#so this can be considered to fill Age NaN values

The code below takes each row whose Age is NaN (e.g. rows 5 and 19), finds the other rows that have the same Pclass, Parch and SibSp values,
and fills the missing Age with the median Age of that group.

If no such group can be found, the missing Age is filled with the overall median Age.

In [None]:

# Index labels of the rows whose Age is missing (kept for later inspection).
index_NaN_age = list(dataset.index[dataset["Age"].isnull()])

# Fallback value: the median of all known ages, computed once before filling.
age_med = dataset["Age"].median()

# Median known age of the passengers sharing the same SibSp, Parch and Pclass,
# broadcast to every row. It is NaN where the whole group has no known age.
age_pred = dataset.groupby(["SibSp", "Parch", "Pclass"])["Age"].transform("median")

# Fill with the group median first and with the overall median otherwise.
# Assigning the whole column replaces the chained
# `dataset['Age'].iloc[i] = ...` writes, which trigger SettingWithCopyWarning
# and are not guaranteed to modify `dataset` itself.
dataset["Age"] = dataset["Age"].fillna(age_pred).fillna(age_med)

In [None]:
#total passengers is 1309, but unique ticket count is  only 929, that means there might be more than one passengers in one ticket or same cabin
dataset.shape ,len(dataset['Ticket'].unique())

In [None]:
dataset['Cabin'].unique() #This is to check the non filled nan cabins what string can be used to impute , XX is not in current data so used XX string to impute

In [None]:
"""Below code is used to fill the Nan cabin values with the exisitng cabin values which have same ticket number of the missing cabin(Nan Cabin)
But not much effective since only 14 cabins can be able to fill in that way 
so the remaining cabins considered as a seperate class named XX
"""
# Index labels of the rows whose Cabin is missing (kept for later inspection).
index_NaN_cabin = list(dataset.index[dataset["Cabin"].isnull()])

# First known (non-null) cabin recorded for each ticket, broadcast to every row.
# The original nested loop used `continue` instead of `break`, so it kept
# scanning and ended up copying the cabin of the LAST row with the same ticket,
# which could be NaN or the row itself, undoing a cabin found earlier.
ticket_cabin = dataset.groupby("Ticket")["Cabin"].transform("first")

# Borrow the cabin of a fellow ticket holder where one is known; every cabin
# that is still missing goes into the separate class 'XX'.
dataset["Cabin"] = dataset["Cabin"].fillna(ticket_cabin).fillna("XX")

In [None]:
dataset.isnull().sum()

## Feature Extraction

### Name

In [None]:
"""
Just to check the behaviour name data"""
dataset['Name'].head(5),dataset['Name'].tail(5)

Feature extraction from the Name column
Every name contains a title (Mr, Mrs, Rev, Countess, etc.), which may give us some information
for grouping the passengers. Instead of completely omitting the Name column as irrelevant, we can check what information
we can extract from it.
The title comes after the surname, separated from it by a comma and followed by a period. So we can split each
name on the comma, take the second element of the resulting list, and keep the part before the period, which is the title.


In [None]:
title = [i.split(",")[1].split(".")[0].strip() for i in dataset["Name"]]

In [None]:
dataset['Title']=pd.Series(title) #added the tittle series to the dataframe 
dataset['Title'].unique() #eheck the unique values

In [None]:
group_title=dataset.iloc[:len(train)].groupby('Title')
group_title['Survived'].value_counts(normalize=True)


In [None]:
"""
By observing above table we can find some inputs about the survival rate among the tittle groups 
so we can group the passengers according to the tittle and its survival rates"""

# Title groups; each group is encoded by its number (1-6).
group1 = ['Capt', 'Rev', 'Jonkheer', 'Don', 'Dona']          # grouped based on survival rate
group2 = ['Dr', 'Major', 'Col']                              # grouped based on survival rate
group3 = ['the Countess', 'Countess', 'Sir']                 # grouped based on survival rate
group4 = ['Mr']                                              # title for men
group5 = ['Master']                                          # title for boys
group6 = ['Mrs', 'Miss', 'Mme', 'Ms', 'Mlle', 'Lady']        # titles for ladies

# Lookup from each title to the number of its group, as a string.
title_to_code = {
    member: str(code)
    for code, members in enumerate((group1, group2, group3, group4, group5, group6), start=1)
    for member in members
}

# Replace every title by its group code and store the codes as integers.
dataset['Title'] = dataset['Title'].replace(title_to_code).astype(int)

In [None]:
dataset['Title'].unique()

In [None]:

"""
After grouping as above , if we check the groupby again it gives some meaning tittle class"""
group_title=dataset.iloc[:len(train)].groupby('Title')
group_title['Survived'].value_counts(normalize=True)

In [None]:
sns.barplot(x='Title',y='Survived',data=dataset.iloc[:len(train)])
#This became more meaning full grouping and it obvious that group 3 has highest survival rate and group 1 has least survival rate


In [None]:
#So now drop the name colums from the dataset , adn keep title instead
dataset.drop(labels = ["Name"], axis = 1, inplace = True)

### Cabin

CABIN
Several sources on the internet show that the ship had decks named A, B, C, etc., so there is a strong belief that the letter prefix of the cabin number is
the deck. On that assumption we extract the leading letter of each cabin and treat it as the deck.

In [None]:
# Reduce every cabin entry to its first character; a missing entry becomes 'X'.
dataset["Cabin"] = dataset["Cabin"].map(
    lambda cabin: 'X' if pd.isnull(cabin) else cabin[0]
)

In [None]:
sns.barplot(data=dataset,x='Cabin',y='Survived')

### Ticket

In [None]:
# Replace each ticket by its prefix: purely numeric tickets have no prefix and
# become "X"; otherwise dots and slashes are removed and the first
# space-separated token is kept.
Ticket = [
    "X" if raw_ticket.isdigit()
    else raw_ticket.replace(".", "").replace("/", "").strip().split(' ')[0]
    for raw_ticket in dataset.Ticket
]

dataset["Ticket"] = Ticket
dataset["Ticket"].head()

In [None]:
dataset['Ticket'].unique()

FAMILY
We can observe that the chance of survival differs depending on the size of the family.
Here the family is the combination of the number of parents/children aboard the Titanic and the number of siblings/spouses aboard the Titanic,
so we can find the size of each family on board by adding SibSp and Parch + 1 (the passenger themselves).

### Sib /Parch

In [None]:
dataset['Family']=dataset["SibSp"] + dataset["Parch"] + 1 

The plots below show the survival probability for each family size; it is highest for a family size of 4.
A lower rate for individual travellers does not have to be caused by travelling alone: it may be that most of the lone travellers are male.
We can also observe that families larger than 5 have a lower chance of survival, possibly because of the difficulty of moving together.
Looking at the details below, we can divide the family sizes into a few classes, such as single, small family, medium, large and very large.

In [None]:
sns.barplot(x='Family',y='Survived',data=dataset)
plt.show()
sns.countplot(x='Family',hue='Survived',data=dataset)
plt.show()

In [None]:
# Number of passengers for each family size. This replaces a loop that printed
# one line per passenger (and a `head()` call that was never displayed because
# it was not the last expression of the cell).
dataset['Family'].value_counts().sort_index()

In [None]:
# Bucket the numeric family size into named classes:
#   1 -> Single, 2 -> Couple, 3-4 -> SmallFam, 5-7 -> LargeFam, 8 and more -> BigFam
# pd.cut uses right-closed intervals: (0, 1], (1, 2], (2, 4], (4, 7], (7, inf].
family_bins = [0, 1, 2, 4, 7, np.inf]
family_labels = ['Single', 'Couple', 'SmallFam', 'LargeFam', 'BigFam']

# One vectorized assignment replaces the row-by-row chained
# `dataset['Family'].iloc[i] = ...` writes, which put strings into an integer
# column one cell at a time. The result is converted to plain Python strings
# (object dtype), as the original loop produced, so that the later
# `astype("category")` cell builds its categories from the same values.
dataset['Family'] = pd.cut(dataset['Family'], bins=family_bins, labels=family_labels).astype(object)
        
        

In [None]:
dataset['Family'].unique()

In [None]:
dataset.info()

## Feature Transformations for Linear models

CHECK THE DATA DISTRIBUTION AND SKEW OF EACH NUMERICAL FEATURE

Below we check the skew and try different transformations so that the skew becomes minimal. Among all of them, the Yeo-Johnson transformer gives the least skew.
We also cannot use the Box-Cox transformer, since there are some zero values in Fare.

### Fare

In [None]:
dataset['Fare'].skew()

In [None]:
sns.histplot(dataset['Fare'],kde='True')
print('Skew:',dataset['Fare'].skew())

In [None]:
# Log transform of Fare.
# Fare contains zero values and log(0) is -inf, which also makes the skew
# undefined, so log(1 + Fare) is used: it is finite at zero.
dataset['norm_fare'] = np.log1p(dataset.Fare)
sns.histplot(dataset['norm_fare'], kde=True)
print('Skew:', dataset['norm_fare'].skew())

In [None]:
#square root transformation 
dataset['sq_fare'] = dataset['Fare']**(1/2)
sns.histplot(dataset['sq_fare'],kde='True')
print('Skew:',dataset['sq_fare'].skew())

In [None]:
dataset['exp_fare']=dataset['Fare']**(1/2.7)
sns.histplot(dataset['exp_fare'],kde='True')
print('Skew:',dataset['exp_fare'].skew())

In [None]:

"""
Below is one of the two power transformers  yeo johnson and box cox but we cant use boxcox on 0 
or negative values so used yeo john transformer here"""
# PowerTransformer is already imported in the import cells at the top of the
# notebook, so the duplicate imports that stood here were removed.
# The transformer is kept in `pt` because a later cell reuses it for SibSp.
pt = PowerTransformer(method='yeo-johnson')

# Fit on Fare as a single-column 2-D array and keep the result in its own frame.
ptdata = pd.DataFrame(pt.fit_transform(dataset[['Fare']].to_numpy()), columns=['Fare_YJ'])
print('skew', ptdata['Fare_YJ'].skew())
sns.histplot(ptdata['Fare_YJ'], kde=True)

In [None]:
#Assign the Yj transformed values to dataset
dataset['Fare_transformed']=ptdata['Fare_YJ']
dataset.drop(labels = ["Fare"], axis = 1, inplace = True) #drop actual Fare feature from the dataset

In [None]:
dataset.drop(labels = ['norm_fare','exp_fare','sq_fare'], axis = 1, inplace = True) #drop transformed column

In [None]:
dataset['Fare_transformed'].min(),dataset['Fare_transformed'].max()

### Age

Age is approximately normally distributed and has a very low skew, so there is no need to transform it.

In [None]:
sns.histplot(dataset['Age'],kde='True')
print ('Skew_Age',dataset['Age'].skew())  # Age have very low skew value so not applying the transformation

### SibSp

SibSp has a relatively high skew, so it needs to be transformed to reduce the skew and bring the data closer to a normal distribution.

In [None]:
sns.histplot(dataset['SibSp'],kde='True')
print ('Skew_Sibp',dataset['SibSp'].skew()) 

In [None]:
# Log transform of SibSp.
# log(0) is -inf and makes the skew undefined, so log(1 + SibSp) is used to
# keep the transformed column finite when SibSp is 0.
dataset['log_Sib'] = np.log1p(dataset.SibSp)
sns.histplot(dataset['log_Sib'], kde=True)
print('Skew:', dataset['log_Sib'].skew())

In [None]:
ptdata=pd.DataFrame(pt.fit_transform(dataset['SibSp'].values.reshape(-1,1)),columns=['sib_YJ'])
print('skew',ptdata['sib_YJ'].skew())
sns.histplot(ptdata['sib_YJ'],kde='True')

In [None]:
dataset['Sib_norm']=ptdata['sib_YJ'] #Added Yeo-joh transformed Sibsp to the dataset
dataset.drop(labels = ["SibSp"], axis = 1, inplace = True) #drop actual SibSp columns
dataset.drop(labels = ["log_Sib"], axis = 1, inplace = True) #a log transformation created earlier , decicded not to use so deleting it from the dataset

### Parch

In [None]:
sns.histplot(dataset['Parch'],kde='True')
print ('Skew_Parch',dataset['Parch'].skew()) 

In [None]:
# Log transform of Parch, only to inspect its skew.
# log(0) is -inf and makes the skew undefined, so log(1 + Parch) is used to
# keep the transformed column finite when Parch is 0.
dataset['norm_Par'] = np.log1p(dataset.Parch)
sns.histplot(dataset['norm_Par'], kde=True)
print('Skew:', dataset['norm_Par'].skew())
# The transformed column is not used any further, so remove it again.
dataset.drop(labels=["norm_Par"], axis=1, inplace=True)

CONVERT CATEGORICAL FEATURES INTO NUMERICAL ONES WITH ONE-HOT ENCODING

In [None]:
dataset.info()

## Categorical Data to One-Hot Encoding

In [None]:
dataset["Pclass"] = dataset["Pclass"].astype("category")
dataset = pd.get_dummies(dataset, columns = ["Pclass"],drop_first=True,prefix="Pcl")

In [None]:
dataset["Family"] = dataset["Family"].astype("category")
dataset = pd.get_dummies(dataset, columns = ["Family"],drop_first=True,prefix="Fam")

In [None]:
dataset = pd.get_dummies(dataset, columns = ["Cabin"],drop_first=True,prefix="Cabin")
dataset = pd.get_dummies(dataset, columns = ["Ticket"],drop_first=True, prefix="T")

In [None]:
# One-hot encode Title, Embarked and Sex; the first level of each is dropped.
dataset = pd.get_dummies(dataset, columns=["Title"], drop_first=True, prefix="Title")
dataset = pd.get_dummies(dataset, columns=["Embarked"], drop_first=True, prefix="Em")
# The Sex dummies used to carry the copy-pasted prefix "Emb", which made them
# look like Embarked columns; they are now prefixed with "Sex".
dataset = pd.get_dummies(dataset, columns=["Sex"], drop_first=True, prefix="Sex")

In [None]:
dataset.info()

## Further Normalization of Few Features

CHECK THE SUMMARY STATISTICS OF THE DATA AND DECIDE WHETHER ANY FEATURES NEED TO BE NORMALIZED

In [None]:
pd.set_option('display.max_columns',80) #this is nothing but to display all the columns in visualization

In [None]:
dataset.describe()
#Mean std of Age is differnet from the remaining all the features

In [None]:
# Standardize Age with StandardScaler.
# StandardScaler is already imported in the import cells at the top of the
# notebook, so the duplicate import that stood here was removed.
sc = StandardScaler()
# The scaler takes and returns a 2-D array with a single column; flatten it to
# 1-D before writing it back as the Age column.
dataset['Age'] = np.ravel(sc.fit_transform(dataset[['Age']].to_numpy()))

In [None]:
dataset.describe()

# 3. FEATURE SELECTION 

## Drop Irrelevant Data

In [None]:
dataset.drop(labels = ["PassengerId"], axis = 1, inplace = True) #drop irrelevent passenger id

### PCA

PCA can be done to reduce the dimensions or number of features

# 4.MODEL CREATION

"""
1. The steps we follow here: select 10 classification algorithms and run them with their default values.
2. Select the best 5 models by comparing their cross-validation scores.
3. Then tune the best 5 models by selecting the best hyperparameters with GridSearchCV.
4. Use the soft voting technique to combine the outputs from all the models (ensemble model creation). """

In [None]:

"""Split train and test datas from dataset and delete survived col from test data , 
   PN:Here test is the data we need to predict for submission , to the test data we used for regular validation purpose.
"""
# The first len(train) rows of `dataset` are the training rows, the rest are
# the rows to predict for the submission.
n_train = len(train)

# Independent copies instead of slices of `dataset`, so that later column
# assignments on them do not raise SettingWithCopyWarning.
train = dataset.iloc[:n_train].copy()

# 'Survived' is unknown for the submission rows, so it is removed. `drop`
# returns a new frame instead of modifying a slice in place.
test = dataset.iloc[n_train:].drop(columns=["Survived"])

In [None]:
# Store the target as integers. `assign` builds a new frame, so nothing is
# written into a slice of `dataset`.
train = train.assign(Survived=train['Survived'].astype(int))

# Target vector and feature matrix used by every model below.
Y_train = train['Survived']
X_train = train.drop(columns=['Survived'])

## Base line model selection

In [None]:
# Cross validate model with Kfold stratified cross val
kfold = StratifiedKFold(n_splits=10)

In [None]:
# Baseline candidates: ten classifiers with default settings. The same seed is
# given to every classifier that is constructed with a random_state.
random_state = 2
classifiers = [
    SVC(random_state=random_state),
    DecisionTreeClassifier(random_state=random_state),
    AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),random_state=random_state,learning_rate=0.1),
    RandomForestClassifier(random_state=random_state),
    ExtraTreesClassifier(random_state=random_state),
    GradientBoostingClassifier(random_state=random_state),
    MLPClassifier(random_state=random_state),
    KNeighborsClassifier(),
    LogisticRegression(random_state = random_state),
    LinearDiscriminantAnalysis(),
]

In [None]:
# Accuracy of each baseline classifier on every fold of `kfold`.
cv_results = [
    cross_val_score(model, X_train, y=Y_train, scoring="accuracy", cv=kfold, n_jobs=4)
    for model in classifiers
]

In [None]:
# Mean and standard deviation of the fold scores of each classifier.
cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

In [None]:

cv_result = pd.DataFrame({"CrossValMeans":cv_means,"CrossValerrors": cv_std,"Algorithm":["SVC","DecisionTree","AdaBoost",
"RandomForest","ExtraTrees","GradientBoosting","MultipleLayerPerceptron","KNeighboors","LogisticRegression","LinearDiscriminantAnalysis"]})
cv_result.sort_values(by='CrossValerrors',ascending=False)


In [None]:
sns.barplot(x='CrossValerrors',y='Algorithm',data=cv_result.sort_values(by='CrossValerrors',ascending=False))
plt.show()
sns.barplot(x='CrossValMeans',y='Algorithm',data=cv_result.sort_values(by='CrossValMeans',ascending=False),)
plt.show()

"""Based on the above result we selected the best five models as base line models for further tuning or  hyperparameter optimization and creating Ensemble model for prediction """

## Hyper Parameter Optimization

#### Ada Boost

In [None]:
# AdaBoost over decision trees, tuned with a grid search.
DTC = DecisionTreeClassifier()

adaDTC = AdaBoostClassifier(DTC, random_state=7)

# The parameter holding the wrapped tree is named `estimator` in recent
# scikit-learn releases and `base_estimator` in older ones. Build the nested
# grid keys from whichever name the installed version exposes, so the search
# does not fail with an invalid-parameter error.
base_param = "estimator" if "estimator" in adaDTC.get_params(deep=False) else "base_estimator"

# NOTE(review): "SAMME.R" is deprecated in newer scikit-learn releases; confirm
# it is still accepted by the installed version.
ada_param_grid = {base_param + "__criterion": ["gini", "entropy"],
                  base_param + "__splitter": ["best", "random"],
                  "algorithm": ["SAMME", "SAMME.R"],
                  "n_estimators": [1, 2],
                  "learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 1.5]}

gsadaDTC = GridSearchCV(adaDTC, param_grid=ada_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)

gsadaDTC.fit(X_train, Y_train)

# Best estimator found by the search, and its cross-validated accuracy.
ada_best = gsadaDTC.best_estimator_
gsadaDTC.best_score_

#### Extra tree Classifier

In [None]:
# Extra-trees classifier, tuned with a grid search.
# The estimator is seeded with the notebook's `random_state`, as the baseline
# classifiers are, so that the search gives the same result on every run.
ExtC = ExtraTreesClassifier(random_state=random_state)

# Search grid for optimal parameters
ex_param_grid = {"max_depth": [None],
                 "max_features": [1, 3, 10],
                 "min_samples_split": [2, 3, 10],
                 "min_samples_leaf": [1, 3, 10],
                 "bootstrap": [False],
                 "n_estimators": [100, 300],
                 "criterion": ["gini"]}

gsExtC = GridSearchCV(ExtC, param_grid=ex_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)

gsExtC.fit(X_train, Y_train)

# Best estimator found by the search, and its cross-validated accuracy.
ExtC_best = gsExtC.best_estimator_
gsExtC.best_score_

#### SVC

In [None]:
# RBF support-vector classifier, tuned with a grid search.
# probability=True is required by the soft-voting ensemble built later. The
# estimator is seeded with the notebook's `random_state`, as the baseline
# classifiers are, so that the search gives the same result on every run.
SVMC = SVC(probability=True, random_state=random_state)
svc_param_grid = {'kernel': ['rbf'],
                  'gamma': [0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100, 200, 300, 1000]}

gsSVMC = GridSearchCV(SVMC, param_grid=svc_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)

gsSVMC.fit(X_train, Y_train)

# Best estimator found by the search, and its cross-validated accuracy.
SVMC_best = gsSVMC.best_estimator_
gsSVMC.best_score_

#### RFC

In [None]:

# Random forest, tuned with a grid search.
# The estimator is seeded with the notebook's `random_state`, as the baseline
# classifiers are, so that the search gives the same result on every run.
RFC = RandomForestClassifier(random_state=random_state)

# Search grid for optimal parameters
rf_param_grid = {"max_depth": [None],
                 "max_features": [1, 3, 10],
                 "min_samples_split": [2, 3, 10],
                 "min_samples_leaf": [1, 3, 10],
                 "bootstrap": [False],
                 "n_estimators": [100, 300],
                 "criterion": ["gini"]}

gsRFC = GridSearchCV(RFC, param_grid=rf_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)

gsRFC.fit(X_train, Y_train)

# Best estimator found by the search, and its cross-validated accuracy
# (displayed like in the other tuning cells).
RFC_best = gsRFC.best_estimator_
gsRFC.best_score_

#### Gradient Boosting

In [None]:
# Gradient boosting, tuned with a grid search.
# The estimator is seeded with the notebook's `random_state`, as the baseline
# classifiers are, so that the search gives the same result on every run.
GBC = GradientBoostingClassifier(random_state=random_state)

# 'loss' is no longer part of the grid: its only candidate, "deviance", is the
# old name of the classifier's default loss and is rejected by newer
# scikit-learn releases. Leaving the default in place searches the same space.
gb_param_grid = {'n_estimators': [100, 200, 300],
                 'learning_rate': [0.1, 0.05, 0.01],
                 'max_depth': [4, 8],
                 'min_samples_leaf': [100, 150],
                 'max_features': [0.3, 0.1]
                 }

gsGBC = GridSearchCV(GBC, param_grid=gb_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)

gsGBC.fit(X_train, Y_train)

# Best estimator found by the search, and its cross-validated accuracy.
GBC_best = gsGBC.best_estimator_
gsGBC.best_score_

## Ensemble Model Creation

In [None]:
# Class predictions of every tuned model on the submission data, each stored
# as a Series named after its model.
(test_Survived_RFC,
 test_Survived_ExtC,
 test_Survived_SVMC,
 test_Survived_AdaC,
 test_Survived_GBC) = [
    pd.Series(tuned_model.predict(test), name=label)
    for tuned_model, label in (
        (RFC_best, "RFC"),
        (ExtC_best, "ExtC"),
        (SVMC_best, "SVC"),
        (ada_best, "Ada"),
        (GBC_best, "GBC"),
    )
]

# Put the predictions of all classifiers side by side, one column per model.
prediction_columns = [test_Survived_RFC, test_Survived_ExtC, test_Survived_AdaC,
                      test_Survived_GBC, test_Survived_SVMC]
ensemble_results = pd.concat(prediction_columns, axis=1)
ensemble_results.head(3)

In [None]:
sns.heatmap(ensemble_results.corr(),annot=True,cbar=True)

In [None]:
# Soft-voting ensemble built from the five tuned models.
tuned_estimators = [
    ('rfc', RFC_best),
    ('extc', ExtC_best),
    ('svc', SVMC_best),
    ('adac', ada_best),
    ('gbc', GBC_best),
]
votingC = VotingClassifier(estimators=tuned_estimators, voting='soft', n_jobs=4)

# Fit the ensemble on the full training data.
Ensemble_train = votingC.fit(X_train, Y_train)

# SUBMISSION.

In [None]:
# Build the submission table: the PassengerId values saved earlier in IDtest,
# next to the ensemble's prediction for each row of the test data.
output = pd.DataFrame({'PassengerId': IDtest, 'Survived': Ensemble_train.predict(test)})
# Write the table to submission.csv without the index column.
output.to_csv('submission.csv',index=False)