In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
# NOTE(review): presumably switches cufflinks/plotly to offline rendering;
# no other cell visible here uses `cf` — confirm it is still needed.
cf.go_offline()

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show which data files are available in the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the Titanic training data and preview the first rows.
train_csv_path = '../input/titanic_train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# The dataset contains missing values. Count them per column instead of
# displaying the full boolean mask, which is too large to read.
train.isnull().sum()

In [None]:
# Visualise where values are missing: each highlighted cell is a null entry.
missing_mask = train.isnull()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

The heatmap shows that most of the Cabin values are missing, and that some values are missing in the Age column.

In [None]:
# Use seaborn's 'whitegrid' theme for the plots drawn after this cell.
sns.set_style('whitegrid')

In [None]:
# Number of passengers per value of the Survived column.
sns.countplot(data=train, x='Survived')

The count is higher for 0, which means that more passengers died than survived. Next, we can look at the distribution by sex.

In [None]:
# Survival counts split by the Sex column.
sns.countplot(
    data=train,
    x='Survived',
    hue='Sex',
    palette='RdBu_r',
)

From the data it looks like there are more female survivors than male survivors. The number of non-survivors is **very high** for males compared to females.

In [None]:
# Survival counts split by passenger class.
sns.countplot(
    data=train,
    x='Survived',
    hue='Pclass',
)

When the counts are split by passenger class, the number of people who did not survive is much higher in class 3. Passengers in the higher classes seem to have a lower death rate. Another possible reason is that there are noticeably fewer class 1 and class 2 passengers in total.

In [None]:
# Histogram of the known passenger ages (missing ages are dropped first).
# `distplot` has been deprecated since seaborn 0.11, so use `histplot` when
# the installed seaborn provides it and fall back to `distplot` otherwise.
known_ages = train['Age'].dropna()
if hasattr(sns, 'histplot'):
    sns.histplot(known_ages, kde=False, bins=30)
else:
    sns.distplot(known_ages, kde=False, bins=30)

It looks like there were a lot of young passengers on the Titanic.

In [None]:
# Same age histogram, this time drawn with pandas' built-in plotting.
age_column = train['Age']
age_column.plot.hist(bins=20)

In [None]:
# Summarise column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Number of passengers per value of the SibSp column.
sns.countplot(data=train, x='SibSp')

From the chart it looks like most passengers travelled without any siblings or a spouse on board (SibSp = 0). Note that SibSp counts siblings and spouses only; children are counted in the Parch column. The second most common value is 1, which most likely corresponds to passengers travelling with a spouse or a single sibling.

In [None]:
# Distribution of ticket fares.
fare_column = train['Fare']
fare_column.hist(figsize=(10, 4), bins=40)

Most passengers paid low fares, which is consistent with the majority travelling in the cheaper class 3. This could be one of the reasons why we saw more non-survivors in class 3 in the bar charts displayed earlier.

In [None]:
# Age distribution per passenger class, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=train, x='Pclass', y='Age', ax=ax)

In [None]:
def impute_age(cols, class_ages=None, default_age=24):
    """Return a passenger's age, filling a missing value from the passenger class.

    Parameters
    ----------
    cols : sequence
        Row-like object whose first two items are the age and the passenger
        class, e.g. one row of ``train[['Age', 'Pclass']]``.
    class_ages : dict, optional
        Maps a passenger class to the age used when the age is missing.
        Defaults to ``{1: 37, 2: 29}``.
    default_age : number, optional
        Age used when the age is missing and the class is not in
        ``class_ages``. Defaults to 24.

    Returns
    -------
    The original age when it is present, otherwise the fallback age for the
    passenger's class.
    """
    # NOTE(review): the default ages presumably come from the per-class
    # boxplot of Age — confirm before changing them.
    if class_ages is None:
        class_ages = {1: 37, 2: 29}

    # Unpack by iteration instead of `cols[0]` / `cols[1]`: integer keys on a
    # label-indexed Series are deprecated as positions (pandas 2.1) and are
    # treated as labels in later releases.
    age, pclass, *_ = cols

    if pd.isnull(age):
        return class_ages.get(pclass, default_age)
    return age

In [None]:
# Fill missing ages row by row from each passenger's class.
age_and_class = train[['Age', 'Pclass']]
train['Age'] = age_and_class.apply(impute_age, axis=1)

In [None]:
# Re-draw the missing-value map to check the Age column after imputation.
null_mask = train.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Most of the Cabin values are missing, so the column is dropped entirely.
# Re-binding with errors='ignore' keeps this cell safe to run more than once.
train = train.drop('Cabin', axis=1, errors='ignore')

In [None]:
# Preview the frame after removing the Cabin column.
train.head()

In [None]:
# Larger missing-value map of the remaining columns.
plt.figure(figsize=(10, 7))
remaining_nulls = train.isnull()
sns.heatmap(remaining_nulls, cmap='viridis', cbar=False, yticklabels=False)

Now that the Cabin column has been removed, we can check whether anything else is missing. The chart above shows that the Embarked column still has a few missing values. We will call `train.dropna()` to drop the remaining rows that contain missing values.

In [None]:
# Drop every row that still contains a missing value.
train = train.dropna()

In [None]:
# Final check: the map should no longer show any missing cells.
plt.figure(figsize=(10, 7))
final_nulls = train.isnull()
sns.heatmap(final_nulls, cmap='viridis', cbar=False, yticklabels=False)

When we draw the heatmap again, we can see that there is no missing data left.

In [None]:
# One-hot encode Sex; drop_first=True omits the first category's column.
sex = pd.get_dummies(train['Sex'], drop_first=True)

In [None]:
# One-hot encode Embarked; drop_first=True omits the first category's column.
embark = pd.get_dummies(train['Embarked'], drop_first=True)

In [None]:
# Append the Sex and Embarked indicator columns to the training frame.
frames_to_join = [train, sex, embark]
train = pd.concat(frames_to_join, axis=1)

In [None]:
# Preview the frame with the new indicator columns appended.
train.head()

Now that we have created new columns that the machine learning model can understand, we can drop the columns that were converted to dummy columns, along with the free-text Name and Ticket columns. This leaves a dataset that the machine learning algorithm can work with directly.

In [None]:
# Remove the columns replaced by dummy columns, plus the free-text columns.
# Re-binding with errors='ignore' keeps this cell safe to run more than once.
train = train.drop(['Sex', 'Embarked', 'Name', 'Ticket'], axis=1, errors='ignore')

In [None]:
# Preview the frame after dropping Sex, Embarked, Name and Ticket.
train.head()

We can drop the PassengerId column as well, since it would not give any insight.

In [None]:
# PassengerId is only a row identifier, so it is removed from the features.
# Re-binding with errors='ignore' keeps this cell safe to run more than once.
train = train.drop('PassengerId', axis=1, errors='ignore')

In [None]:
# Preview the frame after dropping PassengerId.
train.head()

In [None]:
# Separate the target column from the feature columns.
target_column = 'Survived'
y = train[target_column]
X = train.drop(target_column, axis=1)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier. The features are not scaled, so the default
# iteration cap can stop before the solver converges; allow more iterations.
logmodel = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training split.
logmodel.fit(X_train, y_train)

In [None]:
# Predict the Survived label for the held-out test rows.
predictions = logmodel.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# classification_report returns plain text, so it is printed rather than displayed.
print(classification_report(y_test, predictions))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# In scikit-learn's convention, rows are the true classes and columns the predicted classes.
confusion_matrix(y_test, predictions)

There is another column, Pclass, which holds the numeric values 1 to 3. These numbers are category labels rather than quantities, so this column is also a candidate for conversion into dummy columns.

In [None]:
# Preview the current frame; Pclass is still a single numeric column here.
train.head()

In [None]:
# One-hot encode Pclass. The prefix gives the new columns string names
# (Pclass_2, Pclass_3) instead of the integers 2 and 3; scikit-learn rejects
# frames whose column names mix strings and integers.
Pclass = pd.get_dummies(train['Pclass'], prefix='Pclass', drop_first=True)
Pclass.head(20)

In [None]:
# Append the Pclass indicator columns to the training frame.
frames_with_pclass = [train, Pclass]
train = pd.concat(frames_with_pclass, axis=1)

In [None]:
# Preview the first 20 rows with the Pclass indicator columns appended.
train.head(20)

Now we can drop the Pclass column and see how the data looks. 

In [None]:
# The original Pclass column is now redundant with its indicator columns.
# Re-binding with errors='ignore' keeps this cell safe to run more than once.
train = train.drop(['Pclass'], axis=1, errors='ignore')

In [None]:
# Preview the first 20 rows after removing the original Pclass column.
train.head(20)

We will run the logmodel fit once again

In [None]:
# X_train / X_test were created before the Pclass indicator columns were added
# to `train`, so fitting on them again would reproduce the first model.
# Rebuild the features and the split from the current `train` frame first.
X = train.drop('Survived', axis=1)
# Some indicator columns may have integer names; scikit-learn requires all
# column names to be strings.
X.columns = X.columns.astype(str)
y = train['Survived']
# Same test_size and random_state as the first split, so the two models are
# evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
logmodel.fit(X_train, y_train)

We can run the predictions once again

In [None]:
# Predict the Survived label for the held-out rows with the refitted model.
predictions = logmodel.predict(X_test)

In [None]:
# Report precision, recall and F1 for the refitted model's predictions.
print(classification_report(y_test, predictions))