In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Introduction

This is my first ever kernel on Kaggle and also my first Machine Learning project.
This is based on the 'Tabular Playground Series - Apr 2021' (synthetic Titanic) competition.
I have done Exploratory data analysis of the given data, cleansed the data, applied some feature engineering techniques and then fit the data to a Logistic Regression model.
I am open to feedback and evaluation on this kernel.
Below are the steps followed in the analysis
1. Import Packages and Read data
2. Data Preprocessing
     2.1. Explore shape and attributes
     2.2. Describe data
     2.3. Check for missing values
         2.3.1. Cabin
         2.3.2. Ticket
         2.3.3. Age
         2.3.4. Fare
         2.3.5. Embarked
     2.4. Handle Outliers     
3. Exploratory Data Analysis
4. Feature Engineering
    4.1. Handle missing values for continuous features
        4.1.1. Impute null values in Age
        4.1.2. Impute missing values in Fare
    4.2. Handle missing values for categorical features
        4.2.1. Impute null values in Embarked
        4.2.2. Create new feature from SibSp and Parch
        4.2.3. One-Hot Encoding for Pclass, Sex, Embarked
    4.3. Drop unnecessary features
5. Fit data to Logistic Regression model
6. Prediction for Kaggle test data


## 1. Import Packages and Read data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's default theme together with the "Set2" palette.
# sns.color_palette() only *returns* a palette; on its own it changes nothing,
# so the palette has to be handed to sns.set() to become the default.
sns.set(palette=sns.color_palette("Set2", 10))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

### Read Training dataset

In [None]:
# Load the competition's training split into a DataFrame.
train_csv = '../input/tabular-playground-series-apr-2021/train.csv'
train_df = pd.read_csv(train_csv)

In [None]:
# preview data
train_df.head()

### Read Test dataset

In [None]:
# Load the competition's test split (the rows to predict for the submission).
test_csv = '../input/tabular-playground-series-apr-2021/test.csv'
test_df = pd.read_csv(test_csv)

In [None]:
# preview test data
test_df.head()

## 2. Data Preprocessing

### 2.1 Explore data and its attributes

In [None]:
train_df.info()

In [None]:
# Number of rows (passengers) in the training data.
n_train_rows = len(train_df)
print("Total number of passenger records in the training set is {}".format(n_train_rows))

In [None]:
test_df.info()

'Survived' is the attribute to be predicted. Hence it's not part of the test dataset

In [None]:
# Number of rows (passengers) in the Kaggle test data.
# The count comes from test_df, so the message must say "test set".
print("Total number of passenger records in the test set is {}".format(test_df.shape[0]))

### 2.2 Describe data

In [None]:
train_df.describe()

The above table gives a brief description of the numerical attributes in the training data

### 2.3 Check for missing values

Let's check the data for missing values, so that we will handle them in the Feature Engineering section

In [None]:
train_df.isnull().sum().sort_values(ascending=False)

##### 2.3.1 Cabin

In [None]:
# Share of training rows with a missing Cabin value, as a percentage.
cabin_null_pct = train_df['Cabin'].isna().mean() * 100
print("Percentage of null-values in Cabin %.2f%% " % cabin_null_pct)

Cabin has more than 67.87% of null values, and it doesn't make much sense to impute this attribute. So we'll drop this attribute during Feature engineering process

##### 2.3.2 Ticket

In [None]:
# Share of training rows with a missing Ticket value, as a percentage.
ticket_null_pct = train_df['Ticket'].isna().mean() * 100
print("Percentage of null-values in Ticket %.2f%% " % ticket_null_pct)

The Ticket column references the ticket number and we won't use it for our analysis. So we'll drop this attribute as well

##### 2.3.3 Age

In [None]:
# Share of training rows with a missing Age value, as a percentage.
age_null_pct = train_df['Age'].isna().mean() * 100
print("Percentage of null-values in Age %.2f%% " % age_null_pct)

Age will be one of the most important attributes for predicting survival.

We will impute the missing age values with either the mean or the median, depending on the data.

Let's look at the distribution of Age

In [None]:
# Histogram of passenger ages (20 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(
    data=train_df['Age'],
    bins=20,
    kde=True,
    color='coral',
    element="step",
    alpha=0.4,
    ax=ax,
)
ax.set_title('Distribution of Age')
ax.set_ylabel('No of Passengers')
plt.show()

In [None]:
# Compare the two candidate fill values for missing ages.
mean_age = train_df['Age'].mean()
median_age = train_df['Age'].median()
print('The mean Age is', mean_age)
print('The median Age is', median_age)

In [None]:
# Horizontal boxplot of Age to show its spread and any outliers.
sns.boxplot(data=train_df, x='Age', color='turquoise')

As we can see Age is slightly left skewed, so we will consider median for imputing null values.

##### 2.3.4 Fare

In [None]:
# Share of training rows with a missing Fare value, as a percentage.
fare_null_pct = train_df['Fare'].isna().mean() * 100
print("Percentage of null-values in Fare %.2f%% " % fare_null_pct)

Fare is the ticket fare and its based on the class of the Passenger. So we'll group passenger records by Passenger class, calculate the average fare and impute missing fare values based on this.

Let's look at the distribution of Fare attribute

In [None]:
# Histogram of ticket fares (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(
    data=train_df['Fare'],
    bins=50,
    kde=True,
    color='green',
    element="step",
    alpha=0.3,
    ax=ax,
)
ax.set_title('Distribution of Fare')
plt.show()

Clearly Fare is right skewed (a long tail towards high fares) and also has a lot of outliers at the upper extreme. We will remove some of the outliers using quartiles and the IQR.

Let's look at the outliers more clearly using boxplot

In [None]:
# Fare spread within each passenger class, one box per Pclass.
sns.boxplot(data=train_df, x='Pclass', y='Fare', palette='Set2')

We will group the passenger records by Passenger Class and then remove the Fare outliers from each group. We will perform this in the next section

##### 2.3.5 Embarked

In [None]:
# Share of training rows with a missing Embarked value, as a percentage.
embarked_null_pct = train_df['Embarked'].isna().mean() * 100
print("Percentage of null-values in Embarked %.2f%% " % embarked_null_pct)

There are only 0.25% null values in Port of Embarkation. So we'll impute this with the most frequent port embarked.

In [None]:
# Bar chart of passenger counts per port of embarkation.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(x='Embarked', data=train_df, palette='Set2', ax=ax)
ax.set_title('Distribution of Embarked')
ax.set_ylabel('No of passengers')
plt.show()

All of the attributes with null values have been explored. We will look at the distribution of all the attributes after handling outliers and null-values

### 2.4 Handle Outliers

Fare column has outliers.

1. Because Fare is associated with PClass, We will group the records by Pclass 
2. We can calculate IQR --> Inter Quartile Range and Quantile(0.75).
3. Compute Quantile(0.75) + (1.5 * IQR)
4. Any value above this can be treated as outlier and dropped.

In [None]:
# Split the training data into one frame per passenger class (1, 2, 3).
train_pclass1, train_pclass2, train_pclass3 = (
    train_df[train_df['Pclass'] == pclass] for pclass in (1, 2, 3)
)

In [None]:
# Upper outlier fence for first-class fares: Q3 + 1.5 * IQR.
q1, q3 = train_pclass1['Fare'].quantile([0.25, 0.75])
IQR = q3 - q1
m = q3 + 1.5 * IQR

In [None]:
# Keep only first-class rows whose Fare lies strictly below the fence `m`.
# Rows with a missing Fare fail the comparison too, so they are dropped as well.
train_pclass1 = train_pclass1[train_pclass1['Fare'] < m]

In [None]:
# Upper outlier fence for second-class fares: Q3 + 1.5 * IQR.
q1, q3 = train_pclass2['Fare'].quantile([0.25, 0.75])
IQR = q3 - q1
m = q3 + 1.5 * IQR

# Keep rows strictly below the fence; rows with a missing Fare fail the
# comparison and are dropped as well.
train_pclass2 = train_pclass2[train_pclass2['Fare'] < m]

In [None]:
# Upper outlier fence for third-class fares: Q3 + 1.5 * IQR.
q1, q3 = train_pclass3['Fare'].quantile([0.25, 0.75])
IQR = q3 - q1
m = q3 + 1.5 * IQR

# Keep rows strictly below the fence; rows with a missing Fare fail the
# comparison and are dropped as well.
train_pclass3 = train_pclass3[train_pclass3['Fare'] < m]

In [None]:
# Stack the three filtered per-class frames back into one training frame.
filtered_parts = [train_pclass1, train_pclass2, train_pclass3]
train_df = pd.concat(filtered_parts)

In [None]:
train_df.shape

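6777 records are dropped as outliers in the Fare attribute (this count also includes any rows whose Fare was missing, because a missing value fails the `Fare < m` comparison)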

Let's draw the boxplot of Fare to check if the outliers are handled well

In [None]:
# Re-draw the per-class Fare boxplot on the filtered data.
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(data=train_df, x='Pclass', y='Fare', palette='Set2', ax=ax)
ax.set_title("Checking for outliers")
plt.show()

As we have already handled the extreme outliers, we can leave the outliers still present in the data untreated. 

## 3. Exploratory data analysis

In this section, we will explore some more features and look for relationship with 'Survived'

Let's draw some plots and gather insights

Let's see if there is any relationship between PClass and Survived passengers

In [None]:
# Survival rate (in %) within each passenger class.
is_upper = train_df['Pclass'].eq(1)
upper = train_df.loc[is_upper, 'Survived']
upper_percent = (sum(upper)/len(upper))*100
print(" %.2f%% of upper class passengers survived" % upper_percent)

is_middle = train_df['Pclass'].eq(2)
middle = train_df.loc[is_middle, 'Survived']
middle_percent = (sum(middle)/len(middle))*100
print(" %.2f%% of middle class passengers survived" % middle_percent)

is_lower = train_df['Pclass'].eq(3)
lower = train_df.loc[is_lower, 'Survived']
lower_percent = (sum(lower)/len(lower))*100
print(" %.2f%% of lower class passengers survived" % lower_percent)

In [None]:
# Counts of deceased (0) vs survived (1) passengers, split by passenger class.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(x='Survived', hue='Pclass', data=train_df, palette='Set2', ax=ax)
ax.set_title('Analysing Survived passengers by Pclass')
ax.set_ylabel('No of passengers')
plt.show()

From the graph, its obvious that passengers belonging to upper class have higher chances of survival.

Also most passengers who didnt survive are from lower class

#### Did more women survive than men?

In [None]:
# Survival rate (in %) for men and for women.
is_male = train_df['Sex'].eq('male')
male = train_df.loc[is_male, 'Survived']
male_percent = (sum(male)/len(male))*100
print("Percentage of men who survived %.2f%% " % male_percent)

is_female = train_df['Sex'].eq('female')
female = train_df.loc[is_female, 'Survived']
female_percent = (sum(female)/len(female))*100
print("Percentage of women who survived %.2f%%" % female_percent)

It is true that being a woman increased the chances of being on a lifeboat and of survival

71 % of women survived when compared to 20 % of men who survived the disaster.

In [None]:
# Counts of deceased (0) vs survived (1) passengers, split by sex.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(x='Survived', hue='Sex', data=train_df, palette='Set2', ax=ax)
ax.set_title('Analysing Survived by Sex')
ax.set_ylabel('No of Passengers')
plt.show()

#### Is there any relationship between Fare and survival?

In [None]:
# Overlaid Fare densities for survivors and non-survivors.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
plt.figure(figsize=(7,5))
ax = sns.kdeplot(train_df[train_df['Survived'] == 1]['Fare'], fill=True, legend=True, color='coral')
ax = sns.kdeplot(train_df[train_df['Survived'] == 0]['Fare'], fill=True, legend=True, color='teal')
plt.title('Relationship between Fare and Survival')
# Legend labels follow the drawing order above: survivors first, then deceased.
ax.legend(['Survived','Deceased'])
plt.show()

From the plot, we can infer that people who paid less fare have low probability of survival.

## 4. Feature Engineering
1. Handle missing values for continuous features
2. One Hot Encoding for categorical features

### 4.1 Handle missing values for continuous features

As we have already explored the different features, we have a fair idea of how to impute missing values for different features

We will impute missing values in our training data and then apply the same logic to impute missing values in the final test data (Kaggle) as well.

#### 4.1.1 Impute null values in Age

In [None]:
# Fill missing ages in the training data with the training-set median age.
age_median = train_df['Age'].median()
train_df.loc[train_df['Age'].isna(), 'Age'] = age_median

In [None]:
# Fill missing ages in the test data with the median age learned from the training data.
test_df.loc[test_df['Age'].isna(), 'Age'] = age_median

#### 4.1.2 Impute missing values in Fare

In [None]:
# Impute train data
# Fill each missing Fare with the mean Fare of the passenger's own Pclass.
train_mean_fare = lambda x: x.fillna(x.mean())
# transform() returns a result indexed exactly like train_df, so the assignment
# stays row-aligned; apply() may prepend the group keys to the index, which
# breaks the column assignment.
train_df['Fare'] = train_df.groupby(['Pclass'])['Fare'].transform(train_mean_fare)

In [None]:
# Mean training Fare per passenger class, kept for filling gaps in the test data.
impute_fare = train_df['Fare'].groupby(train_df['Pclass']).mean()
impute_fare

In [None]:
# Pull the per-class mean fares out by position (first, second, third entry).
pclass1_mean_fare, pclass2_mean_fare, pclass3_mean_fare = (
    impute_fare.iloc[position] for position in range(3)
)

In [None]:
# Impute missing values in test data
# Fill only the missing Fare values, using the training mean Fare of the
# passenger's own Pclass. The fill must be applied to the Fare column itself:
# calling fillna on Pclass would copy the class number (1/2/3) into Fare for
# every row and destroy the real fares.
fare_by_pclass = {
    1: pclass1_mean_fare,
    2: pclass2_mean_fare,
    3: pclass3_mean_fare,
}
test_df['Fare'] = test_df['Fare'].fillna(test_df['Pclass'].map(fare_by_pclass))

In [None]:
test_df['Fare'].isnull().sum()

### 4.2 Handle missing values for categorical features

There are 3 categorical features to be imputed.
* Embarked
* Pclass
* SibSp & Parch --> combine to create new attribute

We will perform One-Hot encoding for the Pclass, Sex and Embarked features.

SibSp --> Presence of Sibling/Spouse on board
Parch --> Presence of Parent/Child on board
We will create a new feature based on these two, to indicate the presence of Family on board

We will impute missing values in our training data and then apply the same logic to impute missing values in the final test data (Kaggle) as well.

#### 4.2.1 Impute null values in Embarked
* We will impute missing values with the most frequent port of Embarkment

In [None]:
# Fill missing ports in the training data with the most frequent training port.
freq_embarked = train_df['Embarked'].mode().iloc[0]
train_df.loc[train_df['Embarked'].isna(), 'Embarked'] = freq_embarked

In [None]:
# Fill missing ports in the test data with the most frequent port from the training data.
test_df.loc[test_df['Embarked'].isna(), 'Embarked'] = freq_embarked

#### 4.2.2 Create new feature from SibSp and Parch

In [None]:
# Training data: Family is 1 when SibSp + Parch is positive, otherwise 0.
relatives_aboard = train_df['SibSp'] + train_df['Parch']
train_df['Family'] = relatives_aboard.gt(0).astype(int)

In [None]:
# Test data: Family is 1 when SibSp + Parch is positive, otherwise 0.
relatives_aboard_test = test_df['SibSp'] + test_df['Parch']
test_df['Family'] = relatives_aboard_test.gt(0).astype(int)

#### 4.2.3 One-Hot Encoding for Pclass, Sex, Embarked

In [None]:
# One-hot encode the categorical training columns.
# Pclass is cast to string first so get_dummies treats it as a category;
# drop_first=True omits the first level of each feature.
categorical_cols = ['Pclass', 'Sex', 'Embarked']
train_df['Pclass'] = train_df['Pclass'].astype(str)
train_onehot = pd.get_dummies(train_df[categorical_cols], drop_first=True)

In [None]:
train_onehot.columns

In [None]:
# Append the one-hot columns to the training frame (axis=1 concatenates column-wise).
train_df = pd.concat([train_df,train_onehot], axis=1)

In [None]:
# One-hot encode the categorical test columns the same way as the training data.
# Pclass is cast to string first so get_dummies treats it as a category;
# drop_first=True omits the first level of each feature.
categorical_cols = ['Pclass', 'Sex', 'Embarked']
test_df['Pclass'] = test_df['Pclass'].astype(str)
test_onehot = pd.get_dummies(test_df[categorical_cols], drop_first=True)

In [None]:
test_onehot.columns

In [None]:
# Append the one-hot columns to the test frame (axis=1 concatenates column-wise).
test_df = pd.concat([test_df,test_onehot], axis=1)

### 4.3. Drop unnecessary features
Let's drop the features that are no longer needed after one hot encoding.

In [None]:
train_df.head()

In [None]:
# Remove the raw columns that were encoded or combined earlier, plus the
# identifier and free-text columns that are not used as model inputs.
unused_train_cols = [
    'Sex', 'Embarked', 'SibSp', 'Parch', 'Pclass',
    'PassengerId', 'Name', 'Cabin', 'Ticket',
]
train_df.drop(columns=unused_train_cols, inplace=True)


In [None]:
train_df.columns

In [None]:
# Remove the same raw columns from the test data. PassengerId is kept here
# because it is still needed for the submission file.
unused_test_cols = [
    'Sex', 'Embarked', 'SibSp', 'Parch', 'Pclass',
    'Name', 'Cabin', 'Ticket',
]
test_df.drop(columns=unused_test_cols, inplace=True)

In [None]:
test_df.columns

## 5. Fit data to Logistic Regression model


In [None]:
# Feature matrix: every remaining column except the target.
X = train_df.drop(columns=['Survived'])

In [None]:
# Target vector: the Survived column of the training data.
y = train_df['Survived']

Since the attributes are of different scales, we will scale the data using standard scaler

Then we have to check for multicollinearity among the independent variables (the predictors) using VIF (Variance Inflation Factor)

In [None]:
# Standardise every feature to zero mean and unit variance.
# Note: the scaler is fitted on all of X, i.e. before the train/validation split.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
X_scaled

This is transformed data after scaling

In [None]:
# Variance inflation factor for each scaled feature column, labelled with the
# matching column name from X.
n_features = X_scaled.shape[1]
vif = pd.DataFrame({
    'vif': [variance_inflation_factor(X_scaled, col_idx) for col_idx in range(n_features)],
    'Features': X.columns,
})

In [None]:
vif

All Vif values are very low --> no multicollinearity

We will split the training data into train and test data, so that we can check the accuracy of prediction

The test data provided by kaggle is for final prediction and submission

So, let's split the training data into train (80%) and test(20%)

In [None]:
# Hold out 20% of the scaled training data for evaluation; the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.20,
    random_state=355,
)


In [None]:
# Fit a logistic regression with default settings on the training split,
# then display the fitted estimator.
log_reg = LogisticRegression().fit(X_train, y_train)
log_reg

In [None]:
# Predicted class labels for the held-out split produced by train_test_split.
y_pred = log_reg.predict(X_test)

In [None]:
log_reg.score(X_test,y_test)

In [None]:
# Confusion matrix of actual vs predicted labels on the held-out split.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_matrix

In [None]:
# Plot confusion matrix on heatmap
# scikit-learn's confusion_matrix puts the actual classes on the rows and the
# predicted classes on the columns; a heatmap draws rows along the y-axis and
# columns along the x-axis, so the labels must follow that layout.
plt.figure(figsize=(4,4))
sns.heatmap(confusion_matrix(y_test,y_pred), annot=True, fmt='.0f', cbar=False, cmap='Greys')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()

In [None]:
# Fraction of held-out passengers whose label was predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of the logistic regression model is %2.3f" % (accuracy,))

In [None]:
# scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows = actual class, columns = predicted class, labels in sorted order 0, 1),
# so ravel() yields the four counts in exactly this order. With "Survived" = 1
# as the positive class, the true positives sit in the bottom-right cell.
true_negative, false_positive, false_negative, true_positive = conf_matrix.ravel()

In [None]:
# Metrics derived from the four confusion-matrix counts.
Precision = true_positive / (true_positive + false_positive)
Recall = true_positive / (true_positive + false_negative)
Specificity = true_negative / (true_negative + false_positive)
False_positive_rate = 1-Specificity
# ROC AUC measures how well the model *ranks* passengers, so it needs a
# continuous score. Hard 0/1 predictions collapse the ROC curve to a single
# operating point; use the predicted probability of class 1 instead.
auc = roc_auc_score(y_test, log_reg.predict_proba(X_test)[:, 1])

In [None]:
# Print every evaluation metric, one per line, to three decimals.
metric_lines = [
    ("Accuracy of the logistic regression model is %2.3f", accuracy),
    ("Precision of the logistic regression model is %2.3f", Precision),
    ("Recall of the logistic regression model is %2.3f", Recall),
    ("Specificity of the logistic regression model is %2.3f", Specificity),
    ("False positive rate of the logistic regression model is %2.3f", False_positive_rate),
    ("AUC of the logistic regression model is %2.3f", auc),
]
for template, value in metric_lines:
    print(template % value)

## 6. Prediction for Kaggle Test data

In [None]:
test_df.head()

In [None]:
# Start the submission frame with the PassengerId column of the test data.
test_result = test_df[['PassengerId']].copy()

In [None]:
# PassengerId is an identifier, not a feature: remove it before predicting,
# then show the resulting shape.
test_df.drop(columns='PassengerId', inplace=True)
test_df.shape

In [None]:
# The model was trained on standardised features (X_scaled), so the Kaggle test
# features must go through the same fitted scaler before prediction; feeding
# raw values would put Age/Fare on a completely different scale from training.
# Selecting X.columns guarantees the same feature order as in training.
test_features_scaled = scaler.transform(test_df[X.columns])
test_result['Survived'] = log_reg.predict(test_features_scaled)

In [None]:
test_result

In [None]:
# Write the submission file with a header row and without the index column.
test_result.to_csv('submission.csv', index=False)