In [18]:
## importing the libraries and data requires
import pandas as pd
import numpy as np

In [50]:
# import the data
train  = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
passID = test.PassengerId

In [51]:
# we will combine both the dataset for feature engineering
data = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)

In [52]:
# checking for the null values
data.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [53]:
# having a look at the data
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [54]:
# embarked has two missing values hence filling them with the highest frequency of embarked port
print("The highest appearing embarked post in the dataset is \n", train["Embarked"].mode())
data["Embarked"] = data["Embarked"].fillna("S")

The highest appearing embarked post in the dataset is 
 0    S
dtype: object


In [55]:
data.groupby(["Survived", "Pclass"])["Age"].agg("median")

Survived  Pclass
0.00      1        45.25
          2        30.50
          3        25.00
1.00      1        35.00
          2        28.00
          3        22.00
Name: Age, dtype: float64

In [56]:
# to fill the null values present in the age
# since from above we can see that pclass 1 has people who are age higher than the other classes, it would be wise to fill ages
# according to Pclass
median_ages = data.groupby(["Pclass"])["Age"].median().round(0);median_ages

Pclass
1   39.00
2   29.00
3   24.00
Name: Age, dtype: float64

In [57]:
# creating a function to impute the age missing values 
def imputeAge(cols):
    Age = cols[0]
    Pclass = cols[1]
    if pd.isnull(Age):
        if Pclass == 1:
            return median_ages[1]
        elif Pclass == 2:
            return median_ages[2]
        else:
            return median_ages[3]
    else:
        return Age
data["Age"] = data[["Age", "Pclass"]].apply(imputeAge, axis =1)

In [58]:
# creating a new column that comtain no of people from a family travelling on the ship and categoring if a passenger was travelling alone
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
data['IsAlone'] = 0
data.loc[data['FamilySize'] == 1, 'IsAlone'] = 1

In [59]:
# dropping PassengerId as it is different for all the passenegers and cabin and ticket as it contains a lot of missing values
drop_elements = ['PassengerId', 'Ticket', 'Cabin']
data.drop(drop_elements, inplace = True , axis = 1)


In [60]:
# extracting title from the name columns
data["Title"] = data["Name"].str.split(", ", expand = True)[1].str.split(".", expand = True)[0]

In [61]:
# categorizing fare on basis of quantiles
data["FareBin"] = pd.qcut(data["Fare"],4, labels = [0,1,2,3])     

In [62]:
# categorizing age on basis on equal intervals
data["AgeBin"] = pd.cut(data["Age"].astype(int),5, labels = [0,1,2,3,4])

In [63]:
# creating misc category if the count of the category is less than 10
title_names = (data["Title"].value_counts()<10) ## creating a true false series if the count of value of title is less than 10 and title name as index
data["Title"] = data["Title"].apply(lambda x: "Misc" if title_names.loc[x] == True else x)
print(data["Title"].value_counts())

Mr        757
Miss      260
Mrs       197
Master     61
Misc       34
Name: Title, dtype: int64


In [64]:
# converting objects into categories using label encoder from sklearn
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
data["Sex_code"] = label.fit_transform(data["Sex"])
data["Embarked_code"] = label.fit_transform(data["Embarked"])
data["Title_code"] = label.fit_transform(data["Title"])

In [65]:
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin,Sex_code,Embarked_code,Title_code
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,2,0,Mr,0,1,1,2,3
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.28,C,2,0,Mrs,3,2,0,0,4
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.92,S,1,1,Miss,1,1,0,2,2
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2,0,Mrs,3,2,0,2,4
4,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,1,Mr,1,2,1,2,3


In [66]:
data= pd.get_dummies(data,columns=["Sex_code", "AgeBin","Embarked_code" ,"FareBin", "IsAlone", "Title_code"])

In [67]:
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,...,FareBin_1,FareBin_2,FareBin_3,IsAlone_0,IsAlone_1,Title_code_0,Title_code_1,Title_code_2,Title_code_3,Title_code_4
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,2,...,0,0,0,1,0,0,0,0,1,0
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.28,C,2,...,0,0,1,1,0,0,0,0,0,1
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.92,S,1,...,1,0,0,0,1,0,0,1,0,0
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2,...,0,0,1,1,0,0,0,0,0,1
4,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,...,1,0,0,0,1,0,0,0,1,0


In [68]:
# dropping values that are no longer needed because they are categorized
data.drop(['Name', 'Sex', 'Age' ,'Fare', 'Embarked',
       'FamilySize', 'Title'], axis = 1, inplace = True)

In [69]:
train= data.iloc[:891,]
test = data.iloc[891:, ]

In [70]:
xtrain, xtest, ytrain, ytest = train_test_split(train.drop("Survived",axis = 1), train.Survived.astype("int"), random_state = 0)

#xtrain, ytrain = train.drop("Survived", axis = 1), train.Survived.astype("int")
#xtest = test.drop("Survived", axis = 1)

<IPython.core.display.Javascript object>

In [71]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

In [72]:
# using lazy prdict to compare accuracy of all the models
clf = LazyClassifier(verbose = 0 , ignore_warnings=True)


In [73]:
clf.fit(xtrain, xtest, ytrain, ytest)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 13.50it/s]


(                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
 Model                                                                           
 KNeighborsClassifier               0.84               0.83     0.83      0.84   
 BaggingClassifier                  0.84               0.83     0.83      0.84   
 SGDClassifier                      0.82               0.82     0.82      0.82   
 XGBClassifier                      0.83               0.82     0.82      0.83   
 LogisticRegression                 0.82               0.82     0.82      0.82   
 DecisionTreeClassifier             0.83               0.82     0.82      0.83   
 ExtraTreesClassifier               0.83               0.81     0.81      0.83   
 RandomForestClassifier             0.83               0.81     0.81      0.82   
 LabelSpreading                     0.82               0.81     0.81      0.82   
 LinearSVC                          0.82               0.81     0.81      0.82   
 CalibratedClass

In [74]:
# using knn classifier as it gives the highest accuracy

xtrain, ytrain = train.drop("Survived", axis = 1), train.Survived.astype("int")
xtest = test.drop("Survived", axis = 1)

In [75]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 9 , weights = "uniform", algorithm="ball_tree", leaf_size=30, p=1)
knn.fit(xtrain, ytrain)
preds = knn.predict(xtest)
knn.score(xtrain,ytrain)

0.8496071829405163

In [76]:
from pprint import pprint
# Look at parameters used by our current classifier
print('Parameters currently in use:\n')
pprint(knn.get_params())

Parameters currently in use:

{'algorithm': 'ball_tree',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 9,
 'p': 1,
 'weights': 'uniform'}


In [None]:
# performing hyperparameter tuning on knn to get some better accuracy

In [80]:
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()
k_range = list(range(1,20))
weights_options = ['uniform','distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
leaf_size = list(range(10,101,10))
p = [1,2]

k_grid = dict(n_neighbors=k_range, weights = weights_options, algorithm = algorithm, leaf_size = leaf_size, p = p)
grid = GridSearchCV(knn, k_grid, cv=10, scoring = 'precision', verbose=2, n_jobs = -1)
grid.fit(xtrain, ytrain)

Fitting 10 folds for each of 3040 candidates, totalling 30400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 1196 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 2328 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 3788 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 5568 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done 7676 tasks      | elapsed:   42.3s
[Parallel(n_jobs=-1)]: Done 10104 tasks      | elapsed:   52.1s
[Parallel(n_jobs=-1)]: Done 12860 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 15936 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 19340 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 23064 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 27116 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 30385 out of 30400 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs

GridSearchCV(cv=10, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19],
                         'p': [1, 2], 'weights': ['uniform', 'distance']},
             scoring='precision', verbose=2)

In [81]:
# printing the best score
print ("Best Score: ",str(grid.best_score_))

Best Score:  0.8360133982602894


In [82]:
# getting the best parameters
grid.best_params_

{'algorithm': 'brute',
 'leaf_size': 10,
 'n_neighbors': 4,
 'p': 2,
 'weights': 'uniform'}

In [89]:
# using new paramters for prediction
knn = KNeighborsClassifier(n_neighbors = 9 , weights = "uniform", algorithm="brute", leaf_size=10, p=2)
knn.fit(xtrain, ytrain)
preds = knn.predict(xtest)
knn.score(xtrain,ytrain)

0.8518518518518519

In [90]:
# as we can see our acurracy has been decreased a bit but it doesnt mean that the ytest data will be not better than the predicted
# values that were precticted before the hyperparameter tuning. Hyperparameter tuning saves our data from being over/underfitted
# we will continue with the parameter we got above

In [91]:
# saving the data
data = {"PassengerId":passID, "Survived":preds}
submission = pd.DataFrame(data)