In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Please upvote if you like it~

# 1.Loading dataset and modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler,RobustScaler,StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer,IterativeImputer
from sklearn.feature_selection import SelectKBest,chi2,mutual_info_classif

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier

sns.set_style('whitegrid')
from sklearn.metrics import accuracy_score

**Dataset loading**

In [None]:
# Read the Titanic training and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv')
)

# Keep the test passenger ids in their own frame; the submission is built on it.
submit = test['PassengerId'].to_frame()

# 2. Data Manipulation

**2.1. Label Encoder**

In [None]:
# Encode the two string categoricals as integer codes.
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on `train[col]`: that form mutates a temporary
# Series (pandas warns about it as chained assignment, and under copy-on-write
# `train` would be left unchanged).
# `codes.get(value, value)` keeps the pass-through behaviour of `replace`:
# values without a code, including NaN, are left as they are, so re-running
# this cell is harmless.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
train['Sex'] = train['Sex'].map(lambda value: sex_codes.get(value, value))
train['Embarked'] = train['Embarked'].map(lambda value: embarked_codes.get(value, value))


**2.2. Deriving the title from the Name column**

In [None]:
# Title codes: the common titles get codes 1-5 ('Miss' and 'Mrs' share code 2),
# every listed rare title shares code 6.
title_codes = {'Mr': 1, 'Miss': 2, 'Mrs': 2, 'Master': 3, 'Dr': 4, 'Rev': 5}
title_codes.update(dict.fromkeys(
    ['Major', 'Mlle', 'Col', 'Don', 'the Countess', 'Sir', 'Capt', 'Mme', 'Lady', 'Jonkheer', 'Ms'],
    6,
))

# Vectorised form of `name.split(',')[1].split('.')[0][1:]`: take the piece after
# the first comma, cut it at the first period, and drop its leading character.
# This replaces the per-row `.loc[i, ...]` loop, which wrote strings into an
# integer column one cell at a time and only worked with a default 0..n-1 index.
titles = train['Name'].str.split(',').str[1].str.split('.').str[0].str[1:]

# Titles without a code pass through unchanged, as they did with `replace`.
train['title'] = titles.map(lambda title: title_codes.get(title, title))

**2.3. Defining a column that represents the family-size group**

In [None]:
# Raw family size: SibSp + Parch + 1 (the +1 presumably counts the passenger — confirm).
train['family'] = train['SibSp'].add(train['Parch']).add(1)
def family(size):
    """Bucket a family size into an ordinal group code.

    Returns 1 for size <= 1 (alone), 2 for size <= 2 (couple), 3 for size <= 4
    (small family), 4 for size <= 6 (medium family) and 5 for anything else
    (large family).
    """
    if size <= 1:
        return 1
    if size <= 2:
        return 2
    if size <= 4:
        return 3
    if size <= 6:
        return 4
    return 5
# Replace each raw family size with its group code from `family`.
train['family'] = train['family'].apply(family)





**2.4. Extracting the cabin letter and transforming Fare**

In [None]:
# Reduce every cabin string to its first character and encode that letter.
# `.str[0]` leaves missing cabins as NaN, so no per-row null check is needed;
# the original loop mixed positional (`.iloc[i]`) and label (`.loc[i]`) access
# and therefore only worked with a default 0..n-1 index.
# The encoded column is assigned back rather than replaced in place on
# `train['Cabin']`, which would act on a temporary Series.
deck_codes = {'C': 1, 'B': 2, 'D': 3, 'E': 4, 'A': 5, 'F': 6, 'G': 7, 'T': 8}
train['Cabin'] = train['Cabin'].str[0].map(lambda deck: deck_codes.get(deck, deck))

# Square-root transform of the fare.
# NOTE: re-running this cell applies the square root a second time.
train['Fare'] = np.sqrt(train['Fare'])

**2.5. Dropping unnecessary columns**

In [None]:
# Remove the columns that are not used as model features.
unused_columns = ['Name', 'SibSp', 'Parch', 'Ticket', 'PassengerId', 'Cabin']
train = train.drop(columns=unused_columns)



# 3. Data Visualization

**3.1. Histogram**

In [None]:
# Histogram grid for the prepared training frame.
hist_figsize = (15, 10)
hist_axes = train.hist(figsize=hist_figsize)
plt.show()

**3.2. Correlation**

In [None]:
# Compare three correlation measures on the prepared training frame,
# one annotated heatmap per method.
corr_methods = ('spearman', 'kendall', 'pearson')
fig, ax = plt.subplots(len(corr_methods), 1, figsize=(15, 13))
for panel, method in zip(ax, corr_methods):
    sns.heatmap(train.corr(method), annot=True, ax=panel)
    # A `label=` keyword on heatmap only tags the underlying mesh artist and is
    # not drawn, so each panel is identified with an axes title instead.
    panel.set_title(method)
fig.tight_layout()
plt.show()

**3.3. Embarked**

In [None]:
# Survival counts per embarkation port, one panel per sex.
# Codes come from the label-encoding step: Embarked 1=S, 2=C, 3=Q; Sex 0=male, 1=female.
sns.catplot(data=train, x='Embarked', hue='Survived', col='Sex', kind='count')
# Author's observations:
# -----> Males who embarked at Southampton (code 1) have a lower chance of survival.
# -----> Females, mostly from 1st and 2nd class, have a much higher chance of survival
#        (NOTE(review): passenger class is not shown in this plot — confirm).
                         

In [None]:
# Survival counts per family-size group code.
# Author's observation: small families (the original note says "size 2 and 3") have a better chance of survival.
sns.countplot(data=train, x='family', hue='Survived')

In [None]:
# Survival counts per title code.
# Author's observation: judging by title, women have a much higher chance of survival.
sns.countplot(data=train, x='title', hue='Survived')

In [None]:
# Empirical CDF of Age, split by survival, with an arrow pointing at the low-age region.
age_axes = sns.ecdfplot(data=train, x='Age', hue='Survived')
age_axes.annotate(
    'The plot has a little up showing young children to survive',
    xy=(13, 0.17),
    xytext=(60, 0.3),
    arrowprops=dict(color='gray'),
)
plt.show()

In [None]:
# One-hot encode the categorical codes; drop_first removes one level per feature.
train = pd.get_dummies(train, columns=['Pclass', 'Embarked', 'title', 'family'], drop_first=True)

# Fill the remaining missing values from the 13 nearest rows.
# Only the feature columns go through the imputer: leaving `Survived` in the
# matrix would let the target take part in the neighbour distances and so
# influence the imputed feature values.
impute = KNNImputer(n_neighbors=13)
train_features = train.drop(columns='Survived')
train_features = pd.DataFrame(
    impute.fit_transform(train_features),
    columns=train_features.columns,
    index=train_features.index,
)
train = pd.concat([train[['Survived']], train_features], axis=1)


# 4.Model Selection

**Different types of classification algorithms**

In [None]:
# Seed for every stochastic step in this cell, so the hold-out split and the
# tree-based candidates give the same result on each run (21 is the seed
# already used for StratifiedKFold in this notebook).
SEED = 21

# Candidate classifiers as (display name, estimator) pairs.
model = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('SVC', SVC(kernel='rbf')),
    ('DTC', DecisionTreeClassifier(random_state=SEED)),
    ('GBC', GradientBoostingClassifier(random_state=SEED)),
    ('RFC', RandomForestClassifier(random_state=SEED)),
    ('Kneig', KNeighborsClassifier()),
]

# Feature matrix, target, and a 70/30 hold-out split.
# `stratify=y` keeps the class proportions equal in both parts.
x = train.drop('Survived', axis=1)
y = train['Survived']
xtrain, xvalid, ytrain, yvalid = train_test_split(
    x, y, test_size=0.3, random_state=SEED, stratify=y
)

In [None]:

# Mean 10-fold stratified CV accuracy for every candidate. Each estimator is
# wrapped in a pipeline with a MinMaxScaler so scaling is fit inside each fold.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

scores = []
for name, candidate in model:
    pipeline = Pipeline(steps=[('scale', MinMaxScaler()), ('model', candidate)])
    fold_accuracy = cross_val_score(pipeline, x, y, cv=cv, scoring='accuracy', n_jobs=-1)
    scores.append((name, fold_accuracy.mean()))

scores

**Classification report**

In [None]:
from sklearn.metrics import classification_report

# Hold-out check for logistic regression: fit on the training split, report on the validation split.
# NOTE: this rebinds `model`, which previously held the list of candidate estimators.
model = LogisticRegression(max_iter=3000).fit(xtrain, ytrain)
ypred = model.predict(xvalid)
print(classification_report(yvalid, ypred))

In [None]:
# Hold-out check for a default random forest: fit on the training split, report on the validation split.
model = RandomForestClassifier().fit(xtrain, ytrain)
ypred = model.predict(xvalid)
print(classification_report(yvalid, ypred))

# 7. Classification using a voting classifier

In [None]:
# Base learners of the ensemble as (label, estimator) pairs.
# Each label names the estimator it is attached to, so fitted members can be
# looked up by a meaningful key.
estimator = [
    ('GBC', GradientBoostingClassifier()),
    ('RFC', RandomForestClassifier()),
    ('LR', LogisticRegression(max_iter=3000)),
]

# Voting classifier with hard voting: fit on the training split and
# report on the validation split.
vot_hard = VotingClassifier(estimators=estimator, voting='hard')
vot_hard.fit(xtrain, ytrain)
ypred = vot_hard.predict(xvalid)
print(classification_report(yvalid, ypred))

In [None]:
# Final model: robust scaling followed by the hard-voting ensemble.
pipeline = Pipeline(steps=[
    ('scale', RobustScaler()),
    ('model', VotingClassifier(estimators=estimator, voting='hard')),
])
cv = StratifiedKFold(n_splits=10, random_state=21, shuffle=True)

# Cross-validated accuracy on all labelled rows; every fold fits its own copy
# of the pipeline, so each score comes from rows that copy has not seen.
cv_accuracy = cross_val_score(pipeline, x, y, cv=cv, scoring='accuracy', n_jobs=-1)
print('10-fold CV accuracy: %.3f' % cv_accuracy.mean())

# Hold-out report: fit on the training split only. Fitting on all of `x, y`
# first would put the validation rows in the training data and inflate the report.
pipeline.fit(xtrain, ytrain)
ypred = pipeline.predict(xvalid)
print(classification_report(yvalid, ypred))

# Refit on every labelled row so later predictions use the full training set.
pipeline.fit(x, y)

# 8. Hyperparameter tuning 

**8.1. KNeighborsClassifier**

In [None]:

# Disabled experiment: the code below is wrapped in a string literal, so it is
# not executed (the cell only evaluates the string). It describes a grid search
# over the KNN distance metric and n_neighbors in 1..15, with 10-fold CV on x, y.
"""from sklearn.model_selection import GridSearchCV

metrics = ['euclidean','manhattan'] 
neighbors = np.arange(1, 16)
param_grid  = dict(metric=metrics, n_neighbors=neighbors)
knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=10,scoring='accuracy', refit=True)
grid_search.fit(x, y)
print(grid_search.best_params_)"""

**8.2. Random Forest Classifier**

In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so it is
# not executed. It describes a randomized search (10 draws, 5-fold CV) over
# random-forest hyperparameters on x, y.
# NOTE(review): the `iid` argument was removed from RandomizedSearchCV in
# scikit-learn 0.24 — drop it before re-enabling this code.
"""from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint


param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, x.shape[1]),
              "min_samples_split": sp_randint(2, 11),
              "bootstrap": [True, False],
              "n_estimators": sp_randint(100, 500)}

random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist,
                                   n_iter=10, cv=5, iid=False, random_state=42)
random_search.fit(x,y)
print(random_search.best_params_)"""

In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so it is
# not executed. It describes a 2-fold grid search over gradient-boosting
# hyperparameters (600 estimators) on x, y.
# NOTE(review): GridSearchCV is only imported inside the disabled KNN search
# string elsewhere in this notebook, so it must be imported before this code
# is re-enabled.
"""gb_grid_params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6, 8],
              'min_samples_leaf': [20, 50,100,150],
              'max_features': [1.0, 0.3, 0.1] 
              }
print(gb_grid_params)

gb_gs = GradientBoostingClassifier(n_estimators = 600)

clf =GridSearchCV(gb_gs,
                               gb_grid_params,
                               cv=2,
                               scoring='accuracy', 
                               n_jobs=10);
clf.fit(x,y)
print(clf.best_params_)"""

# 9.Testing

In [None]:

# --- Apply the training-time feature engineering to the test frame ---

def _recode(series, codes):
    """Return `series` with values found in `codes` replaced by their code.

    Values without a code (including NaN) pass through unchanged. The caller
    assigns the result back to the column; replacing in place on `test[col]`
    would act on a temporary Series.
    """
    return series.map(lambda value: codes.get(value, value))


def _dummy_level(text):
    """Parse the level part of a dummy column name ('2' or '2.0' -> 2.0); keep non-numeric text."""
    try:
        return float(text)
    except ValueError:
        return text


test['Sex'] = _recode(test['Sex'], {'male': 0, 'female': 1})
test['Embarked'] = _recode(test['Embarked'], {'S': 1, 'C': 2, 'Q': 3})
test['Fare'] = np.sqrt(test['Fare'])

# Rare titles share code 6, the same code the training-side title encoding
# uses (this cell previously used 7, which produced a dummy column the model
# never saw). 'Dona' is the extra entry carried by the test-side list.
test_title_codes = {'Mr': 1, 'Miss': 2, 'Mrs': 2, 'Master': 3, 'Dr': 4, 'Rev': 5}
test_title_codes.update(dict.fromkeys(
    ['Major', 'Mlle', 'Col', 'Don', 'the Countess', 'Sir', 'Capt', 'Mme', 'Lady', 'Jonkheer', 'Ms', 'Dona'],
    6,
))
# Vectorised form of `name.split(',')[1].split('.')[0][1:]`.
test_titles = test['Name'].str.split(',').str[1].str.split('.').str[0].str[1:]
test['title'] = _recode(test_titles, test_title_codes)

test['family'] = (test['SibSp'] + test['Parch'] + 1).map(family)

# Cabin is dropped here without being used, so no cabin-letter extraction is done.
test = test.drop(columns=['Name', 'SibSp', 'Parch', 'Ticket', 'PassengerId', 'Cabin'])

# Rebuild exactly the dummy columns the model was trained on (`x.columns`).
# Running get_dummies on the test frame independently can yield different
# column names (a level missing from one frame, or 'Embarked_2.0' vs
# 'Embarked_2' when only one frame holds NaN), so each training dummy
# '<feature>_<level>' is recomputed from the test feature directly.
categorical = ['Pclass', 'Embarked', 'title', 'family']
encoded = test.drop(columns=categorical)
for column in x.columns:
    if column in encoded.columns:
        continue
    source, _sep, level = column.rpartition('_')
    if source not in test.columns:
        raise KeyError(f"training column {column!r} cannot be derived from the test frame")
    encoded[column] = (test[source] == _dummy_level(level)).astype(float)
test = encoded[x.columns]

# Impute with neighbours taken from the training features instead of refitting
# the imputer on the test rows.
test_imputer = KNNImputer(n_neighbors=13).fit(x)
test = pd.DataFrame(test_imputer.transform(test), columns=test.columns)

In [None]:
# Predict with the fitted pipeline, attach the integer labels to the passenger ids,
# write the submission file and display it.
test_labels = pipeline.predict(test).astype(int)
submit = submit.assign(Survived=test_labels)
submit.to_csv('ver.csv', index=False)
submit