In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Basic Data Inference

In [1]:
# Read the training and test CSVs; both paths are relative to the notebook's working directory.
train_csv = '../input/titanic/train.csv'
test_csv = '../input/titanic/test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [1]:
# Preview the first rows of the training frame to see the available columns and how values are formatted.
df_train.head()

In [1]:
# Column dtypes and non-null counts for both frames; a non-null count below the
# frame's row count means that column has missing values.
df_train.info()
df_test.info()

In [1]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
df_train.describe()

In [1]:
# Summary of the object-dtype (string) columns: count, number of unique values,
# the most frequent value and how often it occurs.
df_train.describe(include ='O')

## Inference from the dataframes:
1. Age, Cabin, Embarked have missing values in training set
2. Age, Cabin, Fare have missing values in test set
3. The DataFrames contain columns of several types (numeric and string), so the non-numeric columns will have to be converted depending on the model we decide to use.
4. PassengerId, Name and Cabin do not have any significance for our prediction objective
5. Most of the people are male and embarked at S i.e. Southampton

# Exploratory Data Analysis

In [1]:
# Subset of the training rows whose Survived value is not 0; overlaid on the full data in the plots.
survived_mask = df_train['Survived'] != 0
df_survived = df_train[survived_mask]

# Sex

In [1]:
# Draw the survivors first and then all passengers on the same axes, with identical bar settings.
sex_hist_kwargs = dict(x='Sex', shrink=.8)
sns.histplot(data=df_survived, **sex_hist_kwargs)
sns.histplot(data=df_train, **sex_hist_kwargs)

##### **Majority of the females survived while majority of males did not survive.**
* Sex must be included in classification

# Passenger Class

In [1]:
# Number of survivors in each passenger class (sum of the Survived column per Pclass).
df_train.groupby('Pclass')[['Survived']].sum()

In [1]:
# Survival rate per passenger class, computed from the data instead of from
# hand-typed counts so the numbers stay correct if the input changes.
# The mean of Survived within a class is survivors / passengers in that class.
class_survival_rate = df_train.groupby('Pclass')['Survived'].mean()
for pclass, class_name in ((1, 'First'), (2, 'Second'), (3, 'Third')):
    # Format as a real percentage: the label says "percentage", a bare fraction would be misleading.
    print(f"percentage of {class_name} class passangers surviving: {class_survival_rate.loc[pclass]:.1%}")

##### **The higher the passenger class, the higher the chance of surviving**
* Passenger Class must be included in classification

# Age

In [1]:
# Five-year age buckets covering 0 to 100, shared by both layers so the bars line up.
age_bins = list(range(0, 101, 5))
survivors_ax = sns.histplot(data=df_survived, x='Age', bins=age_bins, color='green')
survivors_ax.set_title('Total people(Blue) vs people surviving(Green)')
sns.histplot(data=df_train, x='Age', bins=age_bins)

##### **Passengers in the middle of the age range have lower survival rates than younger and older passengers**

# Siblings and Parents

In [1]:
# Bar plot of Survived against SibSp: with seaborn's default estimator each bar is the
# mean of Survived for that SibSp value.
g = sns.catplot(x='SibSp', y='Survived', kind='bar', data=df_train)

In [1]:
# Same view for Parch: each bar is the mean of Survived for that Parch value
# (seaborn's default estimator).
g = sns.catplot(x='Parch', y='Survived', kind='bar', data=df_train)

In [1]:
# Family = SibSp + Parch, added to both frames so train and test keep the same feature set.
for frame in (df_train, df_test):
    frame['Family'] = frame['SibSp'] + frame['Parch']

In [1]:
# Mean of Survived for each value of the combined Family feature (seaborn's default estimator).
g = sns.catplot(x='Family', y='Survived', kind='bar', data=df_train)

##### **Passengers with fewer family members have better survival rates; we may drop SibSp and Parch and keep Family as a feature**

# Fare

In [1]:
# Compute the 20 bin edges once, from the full training data, and use them for both layers.
# With bins=20 passed to each call, seaborn derives the edges from each subset's own
# min/max, so the survivor bars and the total bars only line up when both subsets
# happen to span the same fare range.
fare_bins = np.histogram_bin_edges(df_train['Fare'].dropna(), bins=20)
sns.histplot(data=df_survived, x='Fare', bins=fare_bins, color='green')
sns.histplot(data=df_train, x='Fare', bins=fare_bins)

##### **It is apparent that passengers who paid higher fares have better survival rates.**

# Embarked

In [1]:
# Survivors per embarkation port: one row per Embarked value, with a single Survived column.
df_sur = df_survived.groupby('Embarked')[['Survived']].sum()

In [1]:
# Survival rate per embarkation port = survivors from that port / all passengers from that port.
# Ports are looked up by label rather than by row position, so the result does not
# depend on the order in which groupby happens to sort the index.
embarked_totals = df_train['Embarked'].value_counts()
print("Percentage of people survived embarked from:")
for port in ('S', 'C', 'Q'):
    port_rate = df_sur.loc[port, 'Survived'] / embarked_totals.loc[port]
    # Printed as a percentage to match the heading above.
    print(f"{port}: {port_rate:.1%}")

##### **The port of embarkation has some effect on survival rates, but the effect is not clearly apparent**

# Filling in missing data

In [1]:
# Age distribution per passenger class; showmeans=True additionally marks each class's mean age.
sns.boxplot(x='Pclass', y='Age', data=df_train, showmeans=True)

### We can see that older people tend to buy tickets of a higher class, so we can use Pclass to fill in missing Age values (using the approximate mean age of each Pclass)

In [1]:
def getmean(cols):
    """Return the age for one passenger, imputing it from the class when missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two positional values: the age first, the passenger class second.
        This is the row shape produced by ``df[['Age', 'Pclass']].apply(getmean, axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1, 29 for
    class 2 and 24 for any other class.
    """
    # Row Series from DataFrame.apply(axis=1) are indexed by column label, and integer
    # keys in plain [] on such a Series are deprecated positional fallback in pandas.
    # Go through .iloc for Series and keep ordinary indexing for lists/tuples.
    positional = cols.iloc if hasattr(cols, 'iloc') else cols

    age = positional[0]
    if not pd.isna(age):
        return age

    # The class is only read when the age actually needs imputing.
    pclass = positional[1]
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [1]:
# Fill the missing ages row by row in both frames, using each passenger's class via getmean.
for frame in (df_train, df_test):
    frame['Age'] = frame[['Age', 'Pclass']].apply(getmean, axis=1)

In [1]:
# Re-check the non-null counts: Age should now be fully populated in both frames.
df_train.info()
df_test.info()

### Dropping the non-useful features

In [1]:
# Columns removed from both frames before modelling.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Parch', 'SibSp']

# Keep the test PassengerId values before the column is dropped; they are needed for the output file.
pid = df_test['PassengerId']

df_train.drop(columns=unused_columns, inplace=True)
df_test.drop(columns=unused_columns, inplace=True)

### Filling missing Embarked and Fare data using most common entries i.e. mode

In [1]:
# Most frequent value of each column (first entry of .mode()), used as the fill value.
most_common_port = df_train['Embarked'].mode()[0]
most_common_fare = df_test['Fare'].mode()[0]

df_train['Embarked'] = df_train['Embarked'].fillna(most_common_port)
df_test['Fare'] = df_test['Fare'].fillna(most_common_fare)

In [1]:
# Check whether any missing values remain in either frame before encoding and modelling.
df_train.info()
df_test.info()

In [1]:
# Look at the cleaned training frame before the string columns are encoded.
df_train.head()

# Converting labelled data to numeric data for model processing

In [1]:
from sklearn import preprocessing

# Encode the string columns as integers.
# Each encoder is fitted on the TRAINING column only and then reused for the test column,
# so a given label always maps to the same integer in both frames. Re-fitting on the
# test column would produce a different mapping whenever the two frames do not contain
# exactly the same set of labels; with a shared encoder an unseen test label raises
# an error instead of being silently mis-coded.
for column in ('Sex', 'Embarked'):
    le = preprocessing.LabelEncoder()
    df_train[column] = le.fit_transform(df_train[column])
    df_test[column] = le.transform(df_test[column])
df_test.head()

In [1]:
# Target is the Survived column; the features are every remaining column.
y_train = df_train['Survived']
X_train = df_train.drop(columns='Survived')

# Using Random Forest to fit our data and prediction

In [1]:
# Hyper-parameters collected in one place; random_state is fixed so the fit is reproducible.
forest_params = {
    'criterion': 'entropy',
    'max_depth': 2,
    'max_features': 'sqrt',
    'min_samples_leaf': 10,
    'min_samples_split': 9,
    'n_estimators': 1200,
    'random_state': 42,
}
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

In [1]:
# Accuracy measured on the rows the model was fitted on is an optimistic estimate,
# so report a 5-fold cross-validated accuracy next to it.
# cross_val_score fits clones of the estimator, so the fitted clf is left untouched.
train_accuracy = clf.score(X_train, y_train)
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
pd.Series({
    'train accuracy': train_accuracy,
    'cv accuracy (mean of 5 folds)': cv_scores.mean(),
    'cv accuracy (std)': cv_scores.std(),
})

In [1]:
# Select the test columns in exactly the order the model was trained on, so each
# feature lines up with its training counterpart regardless of df_test's column order.
y_pred = clf.predict(df_test[X_train.columns])

In [1]:
# Pair each saved test PassengerId with its prediction and write the result without the index column.
df_out = pid.to_frame(name='PassengerId').assign(Survived=y_pred)
df_out.to_csv('titanic_out.csv', index=False)