In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Steps
* Problem Definition
* Exploratory Data Analysis
* Data Preprocessing
* Model & Predictions

# Problem Definition
Use machine learning to create a model that predicts which passengers survived the Titanic shipwreck

Submission:

* PassengerId
* Survived (predictions: 1 for survived, 0 for deceased)


# Exploratory Data Analysis

In [None]:
# Importing libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Kaggle Titanic train and test splits into DataFrames.
TRAIN_CSV = '/kaggle/input/titanic/train.csv'
TEST_CSV = '/kaggle/input/titanic/test.csv'

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [None]:
# Report (rows, columns) for both splits.
train_shape, test_shape = train_data.shape, test_data.shape
print(f'Training data shape: {train_shape}')
print(f'Test data shape: {test_shape}')

In [None]:
# Training data overview: column dtypes and non-null counts.
# The non-null counts show which columns contain missing values.
train_data.info()

* Sex and Embarked to be converted to category or numerical based on library consideration
* PassengerId and Name column can be dropped
* Cabin to be dropped because most of its values are missing
* Ticket column also can be dropped
* Age column will be imputed with median/mean values

In [None]:
# Basic summary statistics of the numeric columns, transposed so that
# each feature is a row and each statistic is a column.
train_data.describe().T

In [None]:
# Column names of the training data (features plus the Survived target).
train_data.columns

In [None]:
# Survivors data
# 1: survived, 0: deceased
# Left panel: share of each class as a pie chart; right panel: absolute counts.

survived_counts = train_data['Survived'].value_counts()

fig, (pie_ax, count_ax) = plt.subplots(1, 2, figsize=(9, 5))

pie_ax.pie(survived_counts, labels=survived_counts.index, autopct='%.1f%%')
pie_ax.set_title('Pie Chart of Survived')

sns.countplot(data=train_data, x='Survived', ax=count_ax)
# Annotate each bar with its count.
count_ax.bar_label(count_ax.containers[0])
count_ax.set_title('Count plot of Survived')

plt.show()

In [None]:
# Count plots of the categorical / discrete features, split by survival.
countplot_list = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

fig = plt.figure(figsize=(14, 12))
for position, feature in enumerate(countplot_list, start=1):
    # Five panels placed on a 2x3 grid; the sixth slot is left unused.
    panel = fig.add_subplot(2, 3, position)
    sns.countplot(data=train_data, x=feature, hue='Survived', ax=panel)
    # One bar container per hue level; label every bar with its count.
    for bars in panel.containers:
        panel.bar_label(bars)
    panel.set_title(f'{feature} Plot')

plt.show()

* More class 1 passengers survived than passengers of the other classes, and class 3 passengers died the most
* Female passengers survived far more often than male passengers

In [None]:
# Survival by Age and by Fare: side-by-side histograms with KDE overlays.
histogram_panels = (('Age', 'Age Histogram'), ('Fare', 'Fare Histogram'))

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for panel, (feature, panel_title) in zip(axes, histogram_panels):
    sns.histplot(data=train_data, x=feature, hue='Survived', kde=True, ax=panel)
    panel.set_title(panel_title)

plt.show()

In [None]:
# Survival in the Age / Fare plane: one point per passenger, coloured by outcome.
fig, scatter_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=train_data, x='Age', y='Fare', hue='Survived', ax=scatter_ax)
scatter_ax.set_title('Age vs Fare')
plt.show()

* Passengers who travelled on higher-fare tickets survived more often
* The survival percentage of children up to age 10 also seems good
* Most passengers above age 20 with fares below 100 died
* Except for a few cases, age does not seem to be a deciding factor

In [None]:
# Correlations
# Pairwise correlation between the target and the numeric features.

numeric_columns = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
correlation_matrix = train_data[numeric_columns].corr()

fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, ax=heatmap_ax)
plt.show()

* Pclass, Age and SibSp are negatively correlated with Survived
* Fare and Parch are positively correlated with Survived

# Data Preprocessing

* Separating features and target
* Deletion of PassengerId, Name, Cabin and Ticket features
* Encoding of Sex and Embarked features
* Imputation of values
* Scaling of features

In [None]:
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Split the training data into features and target, and drop the
# identifier / free-text columns from both splits.
unused_columns = ['PassengerId', 'Name', 'Cabin', 'Ticket']

y_train = train_data['Survived']
X_train = train_data.drop(columns=['Survived'] + unused_columns)

X_test = test_data.drop(columns=unused_columns)

In [None]:
# Ordinal-encode the two string-valued features.
# The encoder learns its categories from the training data and the same
# mapping is then applied to the test data.
categorical_columns = ['Sex', 'Embarked']

encoder = OrdinalEncoder()
encoded_train_values = encoder.fit_transform(X_train[categorical_columns])
encoded_test_values = encoder.transform(X_test[categorical_columns])

encoder_train = pd.DataFrame(encoded_train_values, columns=categorical_columns)
encoder_test = pd.DataFrame(encoded_test_values, columns=categorical_columns)

# Swap the raw string columns for their encoded versions (appended at the end).
X_train = pd.concat([X_train.drop(columns=categorical_columns), encoder_train], axis=1)
X_test = pd.concat([X_test.drop(columns=categorical_columns), encoder_test], axis=1)

In [None]:
# Remember the feature names: the sklearn transformers below return plain
# arrays, so the names are needed to rebuild DataFrames.
feature_columns = list(X_train.columns)

In [None]:
# Imputation of values
# SimpleImputer defaults to the mean strategy and is applied to every feature
# column, so any missing value in the ordinal-encoded columns is also filled
# with a (possibly fractional) column mean.
# The fill values are learned from the training data only and reused for the
# test data: re-fitting on the test set would impute the two splits with
# different statistics and leak test-set information into preprocessing.
imputer = SimpleImputer(missing_values=np.nan)
X_train = pd.DataFrame(imputer.fit_transform(X_train),columns=feature_columns)
X_test = pd.DataFrame(imputer.transform(X_test),columns=feature_columns)

In [None]:
# Standardise the features: the scaler is fitted on the training data and
# the same transformation is then applied to the test data.
scaler = StandardScaler()
scaled_train_values = scaler.fit_transform(X_train)
scaled_test_values = scaler.transform(X_test)

X_train = pd.DataFrame(scaled_train_values, columns=feature_columns)
X_test = pd.DataFrame(scaled_test_values, columns=feature_columns)

# Model and Predictions

In [None]:
# Model definition
# Only a random forest classifier is considered (prior experience with RFC
# classification). random_state is fixed so that runs are reproducible.

rfc_params = {'n_estimators': 100, 'max_depth': 8, 'random_state': 0}
rfc = RandomForestClassifier(**rfc_params)

In [None]:
# Fit the final model on the preprocessed training features and labels.
rfc.fit(X_train,y_train)

In [None]:
# Predictions on the training data, i.e. the same rows the model was fitted on.
# Scores computed from these are in-sample and therefore optimistic.
y_pred_train = rfc.predict(X_train)

In [None]:
# Precision / recall / F1 per class on the training set.
train_report = classification_report(y_train, y_pred_train)
print('Classification Report on Train Data', '\n\n', train_report)

In [None]:
# Predict survival for the test passengers and wrap the result in a
# single-column DataFrame named after the target.
test_predictions = rfc.predict(X_test)
y_pred = pd.DataFrame({'Survived': test_predictions})

In [None]:
# Build the submission file: PassengerId next to the predicted Survived
# value, written without the DataFrame index.
submission_parts = [test_data['PassengerId'], y_pred]
submissions = pd.concat(submission_parts, axis=1)
submissions.to_csv('submission.csv', index=False)