In [1]:
# import libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# set working directory

path = "S:/Kaggle Projects/kaggle-titanic"
os.chdir(path)

In [3]:
# read the dataset

data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# check for missing values

data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Data Cleaning

In [5]:
# Replace missing values from`Age` with the median Age value

data['Age'] = data['Age'].fillna(data['Age'].median())

In [6]:
# Replace missing values of `Embarked` with the mode value

data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

In [7]:
 # remove columns
    
data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [8]:
data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [11]:
# Label encoding categorical columns

# import the module
from sklearn.preprocessing import LabelEncoder

# create object for encoder class
encoder = LabelEncoder()

In [12]:
# encode `Sex` column

data['Sex'] = encoder.fit_transform(data['Sex'])

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,S
1,2,1,1,0,38.0,1,0,71.2833,C
2,3,1,3,0,26.0,0,0,7.925,S
3,4,1,1,0,35.0,1,0,53.1,S
4,5,0,3,1,35.0,0,0,8.05,S


In [15]:
# encode `Embarked` column

data['Embarked'] = encoder.fit_transform(data['Embarked'])

In [16]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


## Train classifier model

In [29]:
# import modules

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier

In [43]:
# define the pipeline

model_pipeline = Pipeline([
    ('scaler_obj', StandardScaler()),
    ('feature_selector', VarianceThreshold()),
    ('classifier_kn', KNeighborsClassifier())
])

In [44]:
# define parameters to tune

parameters = {
    'scaler_obj': [StandardScaler(), Normalizer()],
    'feature_selector__threshold': [0, 0.01, 0.001],
    'classifier_kn__n_neighbors': [5, 7, 10, 15],
    'classifier_kn__p': [1, 2]
}

In [45]:
# split data into train and test data

X = data.iloc[:, [0, 2, 3, 4, 5, 6, 7, 8]]
y = data.iloc[:, 1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(712, 8) (179, 8) (712,) (179,)


In [46]:
# apply gridSearchCV to data

grid_cv = GridSearchCV(model_pipeline, parameters, cv = 5).fit(X_train, y_train)

In [47]:
print(grid_cv.best_params_)

{'classifier_kn__n_neighbors': 5, 'classifier_kn__p': 1, 'feature_selector__threshold': 0, 'scaler_obj': StandardScaler()}


In [48]:
print(grid_cv.score(X_train, y_train))
print(grid_cv.score(X_test, y_test))

0.8497191011235955
0.8379888268156425


In [49]:
estimators = [
    ('kn', KNeighborsClassifier(n_neighbors=5, p=1))
]

In [50]:
stack_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

In [51]:
stack_clf.fit(X_train, y_train).score(X_test, y_test)

0.6871508379888268