In [None]:
#importing necessary libraries
import os
import numpy as np
import pandas as pd
import pandas_profiling as pp
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline

# Reading the Datasets

In [None]:
# Load the three Kaggle Titanic CSV files in one pass (train, test, sample submission).
train_df, test_df, gender_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/titanic/gender_submission.csv',
    )
)
# Untouched snapshot of the test set; test_df itself is modified in place by later cells.
test2 = test_df.copy()

# Dataset overview

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Column names, dtypes and non-null counts of the training data.
train_df.info()

In [None]:
# Column names, dtypes and non-null counts of the test data.
test_df.info()

In [None]:
# Boolean mask of the training data: True where a cell is missing, False otherwise.
# NOTE(review): this displays the whole mask; the per-column counts in the next cells are easier to read.
train_df.isnull()

In [None]:
# Number of missing entries in each column of the training data.
missing_per_column = train_df.isna().sum()
print('Missing values: ', missing_per_column)

In [None]:
# Total number of missing cells across the whole training frame.
total_missing = train_df.isna().sum().sum()
print('Missing values: ', total_missing)

# Visualizing the training dataset to build intuition

In [None]:
# Seaborn heatmap of the null mask: one row per passenger, one column per feature,
# with missing cells drawn in a contrasting colour.
null_mask = train_df.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# missingno bar chart of the training data (one bar per column).
msno.bar(train_df, color='dodgerblue', fontsize=10, figsize=(10, 5))

In [None]:
# missingno nullity matrix of the training data (sparkline disabled).
matrix_options = dict(figsize=(10, 5), fontsize=12, color=(0.101, 0.67, 0.33), sparkline=False)
msno.matrix(train_df, **matrix_options)

In [None]:
# missingno heatmap of the training data, with value labels and a colour bar.
heatmap_options = dict(figsize=(10, 5), fontsize=14, labels=True, cmap='RdYlGn', cbar=True)
msno.heatmap(train_df, **heatmap_options)

In [None]:
# missingno dendrogram of the training data; the trailing ';' suppresses the returned object's repr.
msno.dendrogram(train_df, fontsize=12, figsize=(10, 5));

In [None]:
# Count of passengers per 'Survived' value.
sns.set_style('whitegrid')
sns.countplot(data=train_df, x='Survived')

In [None]:
# Count of passengers per 'Survived' value, split by 'Sex'.
sns.set_style('whitegrid')
sns.countplot(data=train_df, x='Survived', hue='Sex')

In [None]:
# Number of male and female passengers in the training data.
sex_counts = train_df['Sex'].value_counts()
print('Total Male Passengers: ', sex_counts['male'])
print('Total Female Passengers: ', sex_counts['female'])

In [None]:
# Count of passengers per 'Survived' value, split by passenger class.
sns.set_style('whitegrid')
sns.countplot(data=train_df, x='Survived', hue='Pclass')

In [None]:
# Histogram of the known ages (rows with a missing 'Age' are left out).
known_ages = train_df['Age'].dropna()
sns.displot(known_ages, bins=30, color='darkgreen', kde=False)

In [None]:
# Count of passengers per number of siblings/spouses aboard ('SibSp').
sns.countplot(data=train_df, x='SibSp')

# Organizing Datasets / Dealing with missing values

In [None]:
# Merging two dataframes
#X = train_df[["Survived","Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"]]
# Y = train_df["Survived"]
#X_test = test_df[["Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"]]
#frames = [X,X_test]
#final_frame = pd.concat(frames)

In [None]:
# Keep both frames in one list so the same feature engineering can be applied to each.
# The list stores references (not copies), so in-place changes made through it are
# also visible through train_df / test_df.
full_data = [train_df, test_df]

# Extract the title from 'Name' into a new 'Prefix' column: the run of letters that
# immediately precedes a '.' (the first such match in each name).
# The pattern is a raw string because '\.' in a normal literal is an invalid escape
# sequence, which emits a SyntaxWarning on recent Python versions.
for data in full_data:
    data['Prefix'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
# Map each extracted title to a numeric group:
#   0 = Mr / Master, 1 = Miss / Mlle / Ms, 2 = Mrs / Mme, 3 = every other listed title.
# Titles that are not in this dict become NaN after .map().
# NOTE(review): "Dona" (feminine of "Don") is added because the Kaggle test split appears to
# contain it; without the entry that row becomes NaN and is only patched by a later forward fill.
# This cell must run once per kernel: mapping an already-mapped column turns every value into NaN.
Prefix_mapping = {
    "Mr": 0, "Master": 0,
    "Miss": 1, "Mlle": 1, "Ms": 1,
    "Mrs": 2, "Mme": 2,
    "Dr": 3, "Rev": 3, "Major": 3, "Col": 3, "Capt": 3, "Sir": 3, "Lady": 3,
    "Countess": 3, "Jonkheer": 3, "Don": 3, "Dona": 3,
}
for data in full_data:
    data["Prefix"] = data["Prefix"].map(Prefix_mapping)

In [None]:
# Remove the raw 'Name' column from both datasets now that 'Prefix' has been derived from it.
# inplace=True is deliberate: full_data holds references to these same DataFrame objects.
for frame in (train_df, test_df):
    frame.drop(columns="Name", inplace=True)

In [None]:
# Imputing missing Data using KNN
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Single-column ('Age') copies of both datasets, used as input for scaling and imputation.
df_knn1, df_knn2 = (
    frame.filter(items=['Age'], axis=1).copy()
    for frame in (train_df, test_df)
)

In [None]:
# Rescale 'Age' to the [0, 1] range in both subsets.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_train_age = scaler.fit_transform(df_knn1)
df_knn1 = pd.DataFrame(scaled_train_age, columns=df_knn1.columns)
# NOTE(review): fit_transform re-fits the same scaler on the test subset, so each dataset is
# scaled with its own min/max and `scaler` afterwards only holds the test-set statistics.
scaled_test_age = scaler.fit_transform(df_knn2)
df_knn2 = pd.DataFrame(scaled_test_age, columns=df_knn2.columns)

In [None]:
# Fill the missing (scaled) ages with a KNN imputer, separately for each dataset.
# NOTE(review): the imputer only sees the single 'Age' column, so rows with a missing age have
# no other feature to measure distance on — confirm this gives the intended neighbours.
knn_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
imputed_train_age = knn_imputer.fit_transform(df_knn1)
df_knn_imputed1 = pd.DataFrame(imputed_train_age, columns=df_knn1.columns)
imputed_test_age = knn_imputer.fit_transform(df_knn2)
df_knn_imputed2 = pd.DataFrame(imputed_test_age, columns=df_knn2.columns)

In [None]:
# Convert the scaled ages back to the original scale (both datasets).
# MinMaxScaler with feature_range=(0, 1) maps x -> (x - min) / (max - min), so the inverse is
# x_scaled * (max - min) + min; multiplying by 100 does not undo the scaling.
# Each dataset was scaled with its own min/max, so each is inverted with its own statistics.
# train_df / test_df still hold the un-imputed 'Age' column at this point, and min()/max() skip NaN.
# Run once per kernel: re-running this cell alone would rescale the already restored ages again.
for original, imputed in ((train_df, df_knn_imputed1), (test_df, df_knn_imputed2)):
    age_min, age_max = original['Age'].min(), original['Age'].max()
    imputed['Age'] = imputed['Age'] * (age_max - age_min) + age_min

In [None]:
# Summary of the imputed training 'Age' frame (dtype and non-null count).
df_knn_imputed1.info()

In [None]:
# Summary of the imputed test 'Age' frame (dtype and non-null count).
df_knn_imputed2.info()

In [None]:
# Histogram showing the spread of the imputed training ages.
imputed_ages = df_knn_imputed1['Age'].dropna()
sns.displot(imputed_ages, bins=30, color='darkgreen', kde=False)

In [None]:
# Overwrite the 'Age' column of each dataset with the imputed values from its sub-frame.
# Column assignment aligns on the index; this assumes train_df / test_df still carry the
# default 0..n-1 index that the imputed frames have — TODO confirm.
for frame, imputed_frame in ((train_df, df_knn_imputed1), (test_df, df_knn_imputed2)):
    frame['Age'] = imputed_frame['Age']

In [None]:
# Ffill imputation for filling text missing data (Cabin Column)
#train_df['Cabin'] = train_df['Cabin'].fillna(method='ffill')

In [None]:
# Encode 'Sex' as an indicator column in both datasets. The 'female' dummy column is the one
# kept, so the result is truthy (1/True) for female passengers and falsy (0/False) for male.
dummy_sex1, dummy_sex2 = (
    pd.get_dummies(frame["Sex"]) for frame in (train_df, test_df)
)
train_df["Sex"] = dummy_sex1["female"]
test_df["Sex"] = dummy_sex2["female"]

In [None]:
# Encode 'Cabin' as a binary flag in both datasets: 0 = missing, 1 = a cabin is recorded.
# notna() is used instead of an element-wise comparison with 0 because NaN != 0 is True,
# which would mark every row (missing or not) as 1 and leave the feature constant.
# Being vectorised, it also works for any number of rows (no hard-coded 891 / 418).
# Run once per kernel: after encoding no value is missing, so a re-run would yield all 1s.
train_df["Cabin"] = train_df["Cabin"].notna().astype(int)
test_df["Cabin"] = test_df["Cabin"].notna().astype(int)

In [None]:
# One-hot encode 'Embarked' into the indicator columns 'C', 'Q' and 'S' for both datasets,
# then drop the original text column.
emb_dummies1 = pd.get_dummies(train_df["Embarked"])
for port in ("C", "Q", "S"):
    train_df[port] = emb_dummies1[port]
train_df.drop(columns="Embarked", inplace=True)

emb_dummies2 = pd.get_dummies(test_df["Embarked"])
for port in ("C", "Q", "S"):
    test_df[port] = emb_dummies2[port]
test_df.drop(columns="Embarked", inplace=True)

In [None]:
# Check the training data after the encoding steps above.
train_df.head()

In [None]:
# Check the test data after the encoding steps above.
test_df.head()

In [None]:
# Drop the 'PassengerId' and 'Ticket' columns from both datasets
# (test2 still holds the original PassengerId values for the submission files).
for frame in (train_df, test_df):
    frame.drop(columns=["PassengerId", "Ticket"], inplace=True)

In [None]:
# Check the training data after dropping 'PassengerId' and 'Ticket'.
train_df.head()

In [None]:
# Check the test data after dropping 'PassengerId' and 'Ticket'.
test_df.head()

In [None]:
# Combine 'SibSp' and 'Parch' into a single 'Family' column (SibSp + Parch + 1) and drop the
# two source columns. The +1 presumably counts the passenger themself, so a passenger
# travelling alone gets Family == 1 rather than 0.
for data in full_data:
    data["Family"] = data["SibSp"] + data["Parch"] + 1
    data.drop(columns=["SibSp", "Parch"], inplace=True)

In [None]:
# Bfill imputation
#fbfill_imputation = train_df.fillna(method='bfill')

In [None]:
# Remaining missing values per column in the training data.
train_df.isna().sum()

In [None]:
# Remaining missing values per column in the test data.
test_df.isna().sum()

In [None]:
# Forward-fill the remaining gaps in the test set's 'Fare' and 'Prefix' columns:
# each missing value takes the value from the closest preceding row that has one.
# Series.ffill() is used because fillna(method='ffill') is deprecated in recent pandas.
test_df['Fare'] = test_df['Fare'].ffill()
test_df['Prefix'] = test_df['Prefix'].ffill()

In [None]:
# Re-check the test data for missing values after the forward fill.
test_df.isna().sum()

# Classification Tasks

In [None]:
# Importing Necessary Machine Learning Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
# Select the model inputs (same columns, same order, for both datasets) and the target.
feature_columns = ["Pclass","Sex","Age","Fare","Cabin","Prefix","C","Q","S","Family"]
X_trainee = train_df[feature_columns]
X_testiee = test_df[feature_columns]
# Y stays a one-column DataFrame; later cells flatten it with .values.ravel() where needed.
Y = train_df[["Survived"]]

In [None]:
# Dtypes and non-null counts of the selected training features.
X_trainee.info()

In [None]:
# Dtype and non-null count of the target column.
Y.info()

In [None]:
# Standardise the features: the scaler is fitted on the training features only and the
# same fitted scaler is then applied to both the training and the Kaggle test features.
# NOTE(review): the scaler is fitted before the train/validation split in the next cell,
# so the hold-out rows contribute to the scaling statistics.
sc = StandardScaler()
sc.fit(X_trainee)
X_trainee = sc.transform(X_trainee)
X_testiee = sc.transform(X_testiee)

In [None]:
# Hold out 20% of the labelled data (fixed seed) for evaluating the models below.
X_train, X_test, y_train, y_test = train_test_split(
    X_trainee,
    Y,
    random_state=1,
    test_size=0.2,
)

In [None]:
# Classification using KNN (author-reported accuracy: 77%).
# Fit one model per k in 1..19 and record its accuracy on the hold-out split.
acc = []

for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train.values.ravel())
    yhat = knn.predict(X_test)
    score = accuracy_score(y_test, yhat)
    acc.append(score)
    print("For k = ", k, " : ", score)

In [None]:
# Line graph of hold-out accuracy against k, used to pick k (the author selected k = 15).
k_values = range(1, 20)
plt.figure(figsize=(8, 6))
plt.plot(k_values, acc, marker="o")
plt.title("Finding the right k")
plt.xlabel("Value of k")
plt.ylabel("Accuracy Score")
plt.xticks(k_values)
plt.show()

In [None]:
# Refit KNN (k = 15) on all labelled data, predict the Kaggle test set and arrange the
# predictions in the competition's submission format (PassengerId, Survived).
KNN = KNeighborsClassifier(n_neighbors=15)
KNN.fit(X_trainee, Y.values.ravel())
y_pred = KNN.predict(X_testiee)
df_KNN = pd.DataFrame({"PassengerId": test2["PassengerId"], "Survived": y_pred})
df_KNN.head(10)

In [None]:
# Classification using Decision Tree (author-reported accuracy: 75%).
# Fit one entropy-based tree per max_depth in 1..7 and record its hold-out accuracy.
depth = []

for max_depth in range(1, 8):
    clf_tree = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=max_depth)
    clf_tree.fit(X_train, y_train)
    yhat = clf_tree.predict(X_test)
    score = accuracy_score(y_test, yhat)
    depth.append(score)
    print("For max depth = ", max_depth, " : ", score)

In [None]:
# Line graph of hold-out accuracy against tree depth (the author found depths 3 and 5 best).
depth_values = range(1, 8)
plt.figure(figsize=(8, 6))
plt.plot(depth_values, depth, color="red", marker="o")
plt.title("Finding the right depth with highest accuracy")
plt.xlabel("Depth of Tree")
plt.ylabel("Accuracy Score")
plt.xticks(depth_values)
plt.show()

In [None]:
# Refit the decision tree (max_depth = 3) on all labelled data, predict the Kaggle test set
# and arrange the predictions in the competition's submission format.
clf_tr = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=3)
clf_tr.fit(X_trainee, Y)
pred_tree = clf_tr.predict(X_testiee)
df_TREE = pd.DataFrame({"PassengerId": test2["PassengerId"], "Survived": pred_tree})
df_TREE.head(10)

In [None]:
# Classification using Random Forest Classifier (author-reported accuracy: 78%)

clf_forest = RandomForestClassifier(random_state=0)
clf_forest.fit(X_train,y_train.values.ravel())
yhat = clf_forest.predict(X_test)
# X_test / y_test are the 20% hold-out split, so this is validation accuracy,
# not accuracy on the data the model was trained on.
print("Accuracy for validation data : ",accuracy_score(y_test,yhat))

In [None]:
# Refit the random forest on all labelled data, predict the Kaggle test set and arrange
# the predictions in the competition's submission format.
clf_for = RandomForestClassifier(random_state=0)
clf_for.fit(X_trainee, Y.values.ravel())
y_forest = clf_for.predict(X_testiee)
df_FOREST = pd.DataFrame({"PassengerId": test2["PassengerId"], "Survived": y_forest})
df_FOREST.head(10)

In [None]:
# Classification using Support Vector Machine (author-reported accuracy: 79%)

clf_svm = SVC(gamma='auto')
clf_svm.fit(X_train, y_train.values.ravel())
yhat = clf_svm.predict(X_test)
# X_test / y_test are the 20% hold-out split, so this is validation accuracy,
# not accuracy on the data the model was trained on.
print("Accuracy for validation data : ",accuracy_score(y_test,yhat))

In [None]:
# Refit the SVM on all labelled data, predict the Kaggle test set and arrange the
# predictions in the competition's submission format.
clf_SVM = SVC(gamma='auto')
clf_SVM.fit(X_trainee, Y.values.ravel())
pred_svm = clf_SVM.predict(X_testiee)
df_SVM = pd.DataFrame({"PassengerId": test2["PassengerId"], "Survived": pred_svm})
df_SVM.head(10)

In [None]:
# Classification using Logistic Regression (author-reported accuracy: 79%)

regr = LogisticRegression(solver='liblinear', random_state=1)
regr.fit(X_train,y_train.values.ravel())
yhat = regr.predict(X_test)
# X_test / y_test are the 20% hold-out split, so this is validation accuracy,
# not accuracy on the data the model was trained on.
print("Accuracy for validation data : ",accuracy_score(y_test,yhat))

In [None]:
# Refit logistic regression on all labelled data, predict the Kaggle test set and arrange
# the predictions in the competition's submission format.
reg = LogisticRegression(solver='liblinear', random_state=1)
reg.fit(X_trainee, Y.values.ravel())
y_LR = reg.predict(X_testiee)
df_LR = pd.DataFrame({"PassengerId": test2["PassengerId"], "Survived": y_LR})
df_LR.head(10)

In [None]:
# Check the submission frame before writing it (row count, dtypes, no missing values).
df_LR.info()

In [None]:
# Write the logistic-regression predictions to 'submission.csv' in the working directory,
# without the DataFrame index, for submission to the Kaggle competition.
df_LR.to_csv('submission.csv', index=False)