In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')

train_df.head(10)

In [1]:
# Report (rows, columns) for both raw frames.
print(f"Train Set shape: {train_df.shape}")
print(f"Test Set shape: {test_df.shape}")

In [1]:
train_df.info()

In [1]:
train_df.describe()

We are predicting the `Survived` label.

In [1]:
train_df['Survived'].value_counts()

# Problem Definition
This is a binary classification problem, so we will prepare the data, train the following classification models on it, and pick the best:

* Logistic Regression.
* k-Nearest Neighbors.
* Decision Trees.
* Support Vector Machine.
* Naive Bayes

# Missing Data

We will inspect the data for missing values and try to rectify where possible

In [1]:
missing_data_train = train_df.isnull().sum()

print(missing_data_train[missing_data_train > 0])

In [1]:
missing_data_test = test_df.isnull().sum()

print(missing_data_test[missing_data_test > 0])

In [1]:
import missingno as msno
msno.matrix(train_df)

In the train set, cabin has a lot of missing values, so we are going to drop it

In [1]:
msno.matrix(test_df)

In the test set, Cabin has a lot of missing values as well.

In [1]:
#creating copies of training and test datasets
train_data = train_df.copy()
test_data = test_df.copy()

In [1]:
def drop_columns(df, columns):
    """Remove ``columns`` from ``df`` in place and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place, so callers that ignore the
        return value still see the columns removed.
    columns : label or list of labels
        Column name(s) to drop.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object, now without ``columns``.

    Raises
    ------
    KeyError
        If any of ``columns`` is not present in ``df``.
    """
    # ``drop(..., inplace=True)`` returns None; do not rebind ``df`` to that
    # result, otherwise the function would hand None back to the caller.
    df.drop(columns=columns, inplace=True)
    return df

columns = ['Cabin']
drop_columns(train_df, columns)
drop_columns(test_df, columns)

train_df.shape

# Exploratory Data Analysis
Let's do some EDA to see if we can gain insights that might help us in filling missing data and in feature selection.

In [1]:
#Age distribution
import seaborn as sns

sns.displot(train_df, x="Age", hue= "Survived", multiple="stack")

There is almost a 50% death rate per age group, save for age group 65 - 75 with no fatalities. Total fatalities for age group 75 - 80.

In [1]:
sns.boxplot(x=train_df["Age"])

In [1]:
#we have some outliers so we will use median to fill in missing values in Age.

def fill_missing(df, column):
    """Fill missing values of ``df[column]`` with that column's median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is replaced on ``df`` itself.
    column : str
        Name of a numeric column whose NaN entries should be filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``column`` filled.
    """
    # Assign the filled Series back to the frame instead of calling
    # ``fillna(inplace=True)`` on ``df[column]``: the latter mutates an
    # intermediate object and is not guaranteed to update ``df``.
    df[column] = df[column].fillna(df[column].median())
    return df

column = "Age"

fill_missing(train_df, column)
fill_missing(test_df, column)

missing_data_test = test_df.isnull().sum()

print(missing_data_test[missing_data_test > 0])

In [1]:
missing_data_train = train_df.isnull().sum()

print(missing_data_train[missing_data_train > 0])

In [1]:
train_df['Fare'].value_counts()

In [1]:
sns.kdeplot(data=train_df, x="Fare", hue="Survived")

We observe that those that paid lower fares had a higher probability of dying

In [1]:
sns.boxplot(x=test_df["Fare"])

In [1]:
#we will use median again to fill in missing values for Fare
fill_missing(test_df, "Fare")

test_df.isnull().sum()

In [1]:
#relationship between Fare and Embarked
sns.stripplot(x="Embarked", y="Fare", data=train_df)

In [1]:
#relationship between Fare and Age
sns.scatterplot(data=train_df, x="Age", y="Fare")

In [1]:
sns.countplot(x="Embarked", hue="Survived", data=train_df)

We seem to have more passengers embarking at Southampton port, than others. More of the passengers who embarked at that port died, than those who survived.

In [1]:
sns.countplot(x="Pclass", hue="Survived", data=train_df)

More passengers were in Pclass 3, where there was a significantly lower survival rate than the other two classes.

In [1]:
#Relationship between Age, Sex and Survived
sns.violinplot(x="Sex", y="Age", hue="Survived",
                    data=train_df, palette="muted")

In [1]:
sns.countplot(x='Sex', hue='Survived', data=train_df)

More males died as compared to females

In [1]:
train_df.info()

In [1]:
train_df['Ticket'].value_counts()

Ticket has too many distinct values to be useful as a categorical feature, so we will drop it, together with PassengerId.

In [1]:
columns = ['PassengerId', 'Ticket']

drop_columns(train_df, columns)
drop_columns(test_df, columns)

print(test_df.shape)
print(train_df.shape)

In [1]:
columns = ['Name']

drop_columns(train_df, columns)
drop_columns(test_df, columns)

In [1]:
train_df['SibSp'].value_counts()

In [1]:
sns.countplot(x='SibSp', hue='Survived', data=train_df)

In [1]:
train_df['Parch'].value_counts()

In [1]:
#Is there a relationship between SibSp and Parch
sns.jointplot(data=train_df, x="Parch", y="SibSp")

There is no visible relationship between SibSp and Parch

# Categorical Feature Encoding

In [1]:
# Encode the Sex feature as an integer flag: male -> 0, female -> 1.
# One shared mapping keeps the train and test encodings in sync.
sex_codes = {'male': 0, 'female': 1}

train_df['Sex'] = train_df['Sex'].replace(sex_codes)
test_df['Sex'] = test_df['Sex'].replace(sex_codes)

In [1]:
train_df.info()

In [1]:
train_df = pd.get_dummies(train_df, columns=['Embarked'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Embarked'], drop_first=True)

In [1]:
train_df.isnull().sum()

In [1]:
train_df.head()

In [1]:
test_df.head()

We split features and labels on our train data set

In [1]:
# Select the target by name rather than by position, so the split stays
# correct even if the column order of train_df changes upstream.
X = train_df.drop(columns=['Survived'])
y = train_df['Survived']

In [1]:
# Min-max scale the feature matrix.
# NOTE(review): the scaler is fitted on every row of X, before the train/test
# split in a later cell, so the held-out rows also shape the fitted min/max.
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
minmax.fit(X)
train_minmax = minmax.transform(X)

X = pd.DataFrame(data=train_minmax, columns=X.columns)
X.head()

# Train/test split

In [1]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
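We train the following classifiers:

* Logistic Regression.
* k-Nearest Neighbors.
* Decision Trees.
* Support Vector Machine.
* Random Forest.
* XGBoost.

Naive Bayes is listed in the problem definition but is not trained in this notebook.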

In [1]:
from sklearn.linear_model import LogisticRegression

log_reg_model = LogisticRegression()

log_reg_model.fit(X_train, y_train)

In [1]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()

knn_model.fit(X_train, y_train)

In [1]:
from sklearn.tree import DecisionTreeClassifier

# Seed the estimator so repeated runs of the notebook build the same tree
# (same seed value as the train/test split).
tree_model = DecisionTreeClassifier(random_state=42)

tree_model.fit(X_train, y_train)

In [1]:
from sklearn.svm import SVC

svm_model = SVC()

svm_model.fit(X_train, y_train)

In [1]:
from sklearn.ensemble import RandomForestClassifier

# Seed the estimator so bootstrap sampling and feature selection are
# repeatable across runs (same seed value as the train/test split).
forest_model = RandomForestClassifier(random_state=42)

forest_model.fit(X_train, y_train)

In [1]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(use_label_encoder=False)

xgb_model.fit(X_train, y_train)

# Model Evaluation



In [1]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def evaluate_model(model, X_eval=None, y_eval=None):
    """Print and return classification metrics for a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict``.
    X_eval, y_eval : optional
        Features and true labels to score against. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used, which matches the
        original single-argument call.

    Returns
    -------
    dict
        Scores under the keys ``accuracy``, ``f1``, ``precision`` and
        ``recall`` (the last three macro-averaged).
    """
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    y_pred = model.predict(X_eval)
    scores = {
        "accuracy": accuracy_score(y_eval, y_pred),
        "f1": f1_score(y_eval, y_pred, average="macro"),
        "precision": precision_score(y_eval, y_pred, average="macro"),
        "recall": recall_score(y_eval, y_pred, average="macro"),
    }

    print("Metrics for {} model are: ".format(model) + "\n")
    # accuracy_score was imported alongside the other metrics but never shown.
    print("accuracy score: {}".format(scores["accuracy"]))
    print("f1 score: {}".format(scores["f1"]))
    print("precision score : {}".format(scores["precision"]))
    print("recall score : {}".format(scores["recall"]))
    print("\n")
    return scores

model_list = [xgb_model, tree_model, log_reg_model, forest_model, svm_model, knn_model]

for model in model_list:
    evaluate_model(model)   

In [1]:
#Using the KNN model
knn_model.fit(X, y)

In [1]:
# The model was fitted on MinMax-scaled features, so the test features must go
# through the same fitted scaler, in the same column order, before predicting.
test_features = pd.DataFrame(minmax.transform(test_df[X.columns]),
                             columns=X.columns, index=test_df.index)
y_preds = knn_model.predict(test_features)

In [1]:
# Save predictions in format used for competition scoring
output = pd.DataFrame({'PassengerId': test_data.PassengerId,
                       'Survived': y_preds})
output.to_csv('submission.csv', index=False)