In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

All set, let us now import all the required libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
from pandas.plotting import scatter_matrix

Let us import the data set into the data variable

In [None]:
data = pd.read_csv('/kaggle/input/titanic/train.csv')

In [None]:
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')

The first 5 rows of the data are

In [None]:
data.head()

Let us take a look at the shape of the data set

In [None]:
data.shape

Ok, so it has 891 rows and 12 columns

In [None]:
# Save test predictions to file
#output = pd.DataFrame({'PassengerId': data.PassengerId,'Survived': preds})
#output.to_csv('submission.csv', index=False)

Now let us take a look at some information regarding this dataset

Data Analysis

In [None]:
# Take an independent snapshot of the raw training frame. A plain assignment
# would only alias `data`, so the in-place column drops performed on `data`
# further down would silently alter this "copy" as well.
copy_data = data.copy()

In [None]:
# Take an independent snapshot of the raw test frame. A plain assignment would
# only alias `test_data`, so adding the `Survived` column to `copy_test_data`
# later on would also modify `test_data`.
copy_test_data = test_data.copy()

In [None]:
data.info()

Let us see the different types of passenger classes in this dataset

In [None]:
# Distinct passenger classes in ascending order, and how many of them exist.
classes = np.sort(data["Pclass"].unique())
len(classes)

In [None]:
classes

So three different classes

Let us now describe the data

In [None]:
data.describe()

In [None]:
# Report how many passenger rows the training set holds.
print(f"Number of Passengers is equal to {len(data)}")

In [None]:
data.groupby('Pclass').size()

In [None]:
data.groupby('Sex').size()

In [None]:
data.groupby('Survived').size()

In [None]:
data.columns

In [None]:
dataset = data[['Sex', 'Age', 'SibSp','Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]

In [None]:
dataset.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
plt.show()

In [None]:
scatter_matrix(dataset)
plt.show()

In [None]:
# Survival counts split by sex.
sns.set_theme(style="darkgrid")
# Pin both the bar order and the hue order explicitly so that the tick labels
# and the legend labels set below are guaranteed to match the bars they
# describe, instead of depending on the order in which values appear in the data.
sns.countplot(
    x="Survived",
    hue="Sex",
    order=[1, 0],
    hue_order=["male", "female"],
    data=data,
).set(xticklabels=["Survived", "Not Survived"])
plt.xlabel("Survivors vs Non-Survivors Comparison")
plt.ylabel("Number of Passengers")
plt.title("Comparison")
plt.legend(labels=["Male", "Female"])
plt.show()

In [None]:
# Survival counts split by passenger class.
sns.set_theme(style="whitegrid")
# order=[1, 0] draws the survivors first. Without it the bars are drawn in
# ascending order (0, then 1) and the tick labels below end up swapped, i.e.
# the non-survivor bars would be labelled "Survived".
sns.countplot(x="Survived", hue="Pclass", order=[1, 0], data=data).set(
    xticklabels=["Survived", "Not Survived"]
)
plt.xlabel("Survivors vs Non-Survivors Comparison")
plt.ylabel("Number of Passengers")
plt.title("Comparison")
plt.legend(title='Passenger Class', loc='upper right', labels=["First Class", "Second Class", "Third Class"])
plt.show()

In [None]:
dataset.hist(figsize=(15,7.5))
plt.show()

In [None]:
data["Age"].hist(figsize=(10,5))
plt.show()

In [None]:
data["Fare"].plot.hist(figsize=(10,5), bins = 40)
plt.show()

Data Wrangling

In [None]:
data.isnull()

In [None]:
data.isnull().sum()

In [None]:
sns.heatmap(data.isnull(), yticklabels = False, cmap = "plasma")
plt.show()

In [None]:
sns.set_theme(style="darkgrid")
sns.boxplot(x= "Pclass", y="Age", data = data)
plt.show()

In [None]:
data.head()

In [None]:
# Remove the Cabin column from `data` in place.
# NOTE: the drop mutates `data`, so running this cell a second time raises a
# KeyError because the column no longer exists.
data.drop("Cabin", axis =1, inplace =True)

In [None]:
data.head()

In [None]:
len(data.columns)

The Cabin column has been dropped. Any missing values left in the other columns are filled in further below with `fillna`.

In [None]:
sex = pd.get_dummies(data["Sex"], drop_first = True)
sex.head(5)

In [None]:
embarked = pd.get_dummies(data["Embarked"], drop_first = True)
embarked.head(5)

In [None]:
Pcl= pd.get_dummies(data["Pclass"], drop_first = True)
Pcl.head(5)

In [None]:
data.head()

In [None]:
len(data.columns)

In [None]:
data = pd.concat([data, sex, embarked, Pcl], axis = 1)
data.head()

In [None]:
len(data.columns)

In [None]:
data.drop(["Sex", "Pclass", "PassengerId", "Embarked", "Name"], axis =1, inplace = True)

In [None]:
data.drop(["Ticket"], axis =1, inplace = True)

In [None]:
len(data.columns)

In [None]:
# Replace every remaining missing value with the mean of its own column.
# The text columns were dropped in the cells above, so `data.mean()` is taken
# over numeric/dummy columns only.
data=data.fillna(data.mean())

In [None]:
sns.heatmap(data.isnull(), yticklabels = False, cmap = "plasma")
plt.show()

In [None]:
data.head(5)

Applying K Nearest Neighbours Algorithm

In [None]:
data.shape

In [None]:
# Separate the predictors (X) from the target (y).
# The Pclass dummy columns are labelled with the integers 2 and 3 while every
# other column label is a string. scikit-learn 1.2+ raises a TypeError for
# frames whose column labels mix types, so cast every label to str.
X = data.drop("Survived", axis=1).rename(columns=str)
y = data["Survived"]

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Score every feature of the DataFrame against the target with SelectKBest.
# k='all' keeps all columns, so this step only ranks features; none are removed.
test = SelectKBest(f_classif, k='all')
test_fit = test.fit(X, y)
feat_score = test_fit.scores_.round(3)
# p-values are reported on a -log10 scale, so larger means more significant.
p_values = -np.log10(test_fit.pvalues_).round(3)
feature_list = list(X.columns.values)
# Request integer column positions explicitly. The previous call passed a list
# as the boolean `indices` flag, which only worked because a non-empty list is
# truthy.
selected_features = test.get_support(indices=True)
selected_features

In [None]:
# Build one record per selected feature, then rank the features by score
# (highest first) and index the table by feature name.
feat_select = (
    pd.DataFrame(
        [
            {'Feature': feature_list[idx], 'P_Value': p_values[idx], 'Score': feat_score[idx]}
            for idx in selected_features
        ]
    )
    .sort_values(by='Score', ascending=False)
    .set_index('Feature')
)
feat_select

Based on the scores above, I have kept the top-3 features - Sex, Pclass and Fare - as my final features

In [None]:
### Dropping the Embarked dummies (Q, S), the family columns (SibSp, Parch) and Age;
### only Fare, the sex dummy and the Pclass dummies remain in X.

X = X.drop(['S', 'Parch', 'Age', 'SibSp', 'Q'], axis = 1)

In [None]:
X.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(X, y, random_state=42, test_size=0.3)
features_train.shape

In [None]:
sns.heatmap(test_data.isnull(), yticklabels = False, cmap = "plasma")
plt.show()

In [None]:
features_test.shape

In [None]:
labels_train.shape

In [None]:
labels_test.shape

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Tune the number of neighbours (1-9) and the weighting scheme with 10-fold
# cross-validation. Candidates are ranked by precision, so `grid.best_score_`
# is a precision value, not an accuracy.
knn = KNeighborsClassifier()
k_range = [*range(1, 10)]
weights_options = ['uniform', 'distance']
k_grid = {'n_neighbors': k_range, 'weights': weights_options}
grid = GridSearchCV(estimator=knn, param_grid=k_grid, scoring='precision', cv=10)
grid.fit(features_train, labels_train)

In [None]:
# Best mean cross-validated score found by the grid search.
print("Best Score: ", grid.best_score_)

In [None]:
# Hyper-parameter combination that achieved the best score.
print("Best Parameters: ", grid.best_params_)

In [None]:
# Estimator refitted with the best hyper-parameters.
print("Best Estimators: ", grid.best_estimator_)

In [None]:
# predicting scores

label_pred = grid.predict(features_test)

In [None]:
from sklearn.metrics import accuracy_score
# Calculating Accuracy

# Fraction of held-out passengers whose label was predicted correctly.
acc_clf = accuracy_score(y_true=labels_test, y_pred=label_pred)
print("classifier's accuracy: ", acc_clf)

In [None]:
test_set = copy_test_data[['Pclass', 'Sex', 'Fare']].copy()

In [None]:
test_set.head()

In [None]:
test_set.isnull().sum()

In [None]:
# Fill missing numeric values with the column mean.
# numeric_only=True is required because "Sex" still holds strings at this
# point, and pandas 2.0+ raises a TypeError when asked to average a
# non-numeric column.
test_set = test_set.fillna(test_set.mean(numeric_only=True))
test_set.isnull().sum()

In [None]:
# Encode column "Sex" as an integer flag: male -> 1, female -> 0.
# One mapped assignment replaces the chained-indexing writes
# (test_set["Sex"][mask] = ...), which trigger SettingWithCopyWarning and are
# silently ignored when pandas copy-on-write is enabled.
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run.
test_set["Sex"] = test_set["Sex"].map({"male": 1, "female": 0, 1: 1, 0: 0})

In [None]:
test_set.head()

In [None]:
Pcl= pd.get_dummies(test_set["Pclass"], drop_first = True)
Pcl.head(5)

In [None]:
test_set = test_set.drop(["Pclass"], axis = 1)

In [None]:
test_set.rename(columns={'Sex': 'male'}, inplace=True)

In [None]:
cols = ["Fare", "male"]
test_set = test_set[cols]

In [None]:
test_set = pd.concat([test_set, Pcl], axis = 1)

In [None]:
test_set.head()

In [None]:
# Predicting survivals on the test set.
# The Pclass dummy columns are labelled with the integers 2 and 3 while the
# other columns have string labels. scikit-learn 1.2+ raises a TypeError for
# frames whose column labels mix types, so predict on a copy whose labels are
# all cast to str (`test_set` itself is left unchanged).
final_pred = grid.predict(test_set.rename(columns=str))
final_pred

In [None]:
copy_test_data['Survived'] = pd.Series(final_pred, index=copy_test_data.index)
copy_test_data.head()

In [None]:
output = pd.DataFrame({'PassengerId': copy_test_data.PassengerId, 'Survived': final_pred})
output.to_csv('submission2.csv', index=False)

In [None]:
sub = pd.read_csv('./submission2.csv')
sub.shape

In [None]:
sub.head()