In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Shell out to `ls` on the input directory and print the decoded listing.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

#WE WILL PREDICT WHAT SORT OF PEOPLE SURVIVED.

In [None]:
# Read the training CSV into a DataFrame called df.
# Nothing is displayed here; the first 10 rows are previewed in a later cell.
df = pd.read_csv("../input/train.csv")


1. **Preparation**
>     1.1 **Variable Identification**
> We want to start with identifying what our **predictor** and **target** variables are.

In [None]:
# Preview the first 10 rows of the training data.
df.head(10)

In this project I will use a model to predict whether a passenger survived. Our target variable, the thing we are trying to predict, is therefore *Survived*. Our input key will be *PassengerId*, which identifies each passenger.
The predictor variables are:
* Pclass: Ticket class
* Sex: sex 
* Age: age in years
* SibSp: # of siblings / spouses aboard the Titanic
* Parch: # of parents / children aboard the Titanic
* Ticket: Ticket number
* Fare: Passenger fare 
* Cabin: Cabin number
* Embarked: Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
* Name


> 1.2 Univariable analysis

Let's have a closer look at the train data, starting with summary statistics for each variable.

In [None]:
# Show pandas' default summary statistics for df.
df.describe()

In [None]:
# Count the missing (NaN) values in each column.
# isnull().sum() is vectorized, unlike running Python's sum() per column via apply().
df.isnull().sum()

In [None]:
# Remove only the rows whose Age is missing and store the result in a new DataFrame.
# A bare dropna() would also discard every row that has a NaN in any other column,
# which shrinks the sample far more than an age analysis requires.
# NOTE: figures quoted in the narrative that were read off the old output should be re-checked.
df_1 = df.dropna(subset=['Age'])
df_1.head()

In [None]:
# Plot how many passengers there are at each recorded age.
# The column is passed by keyword (x=) because recent seaborn releases no longer
# accept the data vector as a bare positional argument.
plt.figure(figsize=(30, 20))
ax = sns.countplot(x=df_1.Age)


In [None]:
# Histogram of the Age column of df_1, using 40 bins.
df_1['Age'].hist(bins=40)

The most common age for a passenger is 36 years old (11 people), followed by 24 years old (9 people). Many ages are bunched together. These would be our so-called age groups: **Young Adult** range 18-25, **Adult** range 27-50, **Senior** range 55-61, **Adolescent** range 14-18, **Young Child/Baby** range 0-5.

In [None]:
# Distribution of the ticket classes sold (one bar per Pclass value).
# The column is passed by keyword (x=) because recent seaborn releases no longer
# accept the data vector as a bare positional argument.
ax = sns.countplot(x=df.Pclass)

Most tickets were sold to third-class passengers. As expected, the bulk of passengers travelled in the lowest class. However, a respectable number of first-class tickets were sold too (around 200).

In [None]:
# Box plot of the Fare column to show the spread of ticket prices.
df.boxplot(column='Fare')

While the majority of tickets sold were 3rd class, many tickets were sold at a price above the upper whisker of the box plot. These outliers could be the 1st class tickets.

In [None]:
# Upper outlier fence for Fare: Q3 + 1.5 * IQR, where IQR = Q3 - Q1.
# The quartiles are computed from the data rather than copied by hand from the
# describe() output, so the fence stays correct if the data changes.
fare_q1, fare_q3 = df['Fare'].quantile([0.25, 0.75])
IQR = fare_q3 - fare_q1
outlier_fence = (IQR * 1.5) + fare_q3
print(outlier_fence)

Most outlier fares lie above roughly 65 pounds sterling. Let's see how many belonged to females and how many to males.

In [None]:
# Keep only the rows whose Fare exceeds 65 and report the shape of that subset.
is_expensive = df['Fare'] > 65.0
expnsiv_fare = df[is_expensive]
expnsiv_fare.shape

In [None]:
# NOTE: only the last expression of a cell is rendered, so this sorted table is
# computed but never shown; move it to its own cell to inspect it.
expnsiv_fare.sort_values(['Fare'],ascending=False).head(30)
# Count plot of the expensive fares by sex. The column is passed by keyword (x=)
# because recent seaborn releases no longer accept it as a bare positional argument.
expnsiv_sex = expnsiv_fare.Sex
ax = sns.countplot(x=expnsiv_sex)

In [None]:
# Count the expensive-fare tickets per sex, then print the female-minus-male gap
# as a percentage of all expensive-fare tickets. The counts are taken from the
# data instead of being typed in by hand.
# Assumes the Sex column uses the labels 'female' and 'male' - TODO confirm.
sex_counts = expnsiv_sex.value_counts()
n_female = int(sex_counts.get('female', 0))
n_male = int(sex_counts.get('male', 0))
print(((n_female - n_male) / (n_female + n_male)) * 100)

Females bought about 60% of the expensive fare tickets and males about 40%, a gap of around **21** percentage points.

Let's take a closer look at the type of people that were on board. 

In [None]:
# Build a single-column DataFrame holding just the passenger names,
# then show its last 100 rows.
names = df['Name'].to_frame()
names.tail(100)

In [None]:
# Count the missing (NaN) values in the names frame.
# isnull().sum() is vectorized, unlike running Python's sum() per column via apply().
names.isnull().sum()

No values are unknown in the name entries. This is good. 

In [None]:
# define a function to get the suffixes. 
def get_suffix(name):
    """Return the words found between the first comma and the next period.

    The text after the first comma is cut at its first period and split on
    whitespace, so the result is a list of words, e.g.
    ``"Braund, Mr. Owen Harris"`` gives ``['Mr']``.

    Raises IndexError when ``name`` contains no comma.
    """
    after_comma = name.split(',')[1]
    before_period = after_comma.split('.')[0]
    return before_period.split()
# Apply get_suffix to every name and collect the results in a list.
# Iterating over the values avoids label-based lookups (names.Name[i]), which
# only work while the index is exactly 0..len(names)-1.
name_sffx = [get_suffix(full_name) for full_name in names.Name]


In [None]:
# Tabulate the word lists produced by get_suffix: the first word of each entry
# lands in 'Suffixes' and a second word, if any, in the column literally named 'None'.
suffix_df = pd.DataFrame(name_sffx, columns=['Suffixes', 'None'])
suffix_col = suffix_df['Suffixes']
# Distinct first words, and how many times each one occurs.
suffix_list = suffix_col.unique()
freq = suffix_col.value_counts()


In [None]:
# Build the frequency table directly from `freq`, whose index already holds the
# suffix labels. The previous version paired `suffix_list` (order of first
# appearance) with `freq` (ordered by count) positionally, discarded the result
# of set_index(), and then deleted the misaligned column again.
suffix_count = freq.rename_axis('Suffixes').to_frame(name='Frequency')
suffix_count.index



In [None]:
# Bar chart of how often each suffix occurs, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=suffix_count.index, y=freq, ax=ax)

# Rotate the tick labels so they do not overlap.
plt.xticks(rotation=90)
ax.set_xlabel('Suffixes')
ax.set_ylabel('Frequency')

Most of the passengers were men, and most of them carried ordinary titles. Besides the Mr and Miss/Mrs, we do have a respectable number of Masters on board.

1.3 Bivariate Analysis

**Survived-Age, Survived-Pclass, Survived-SibSp,Survived-Parch, Survived-Fare**

In [None]:
# Select the two columns needed for the age/survival comparison into a new
# frame and preview its first 10 rows.
ages = df.loc[:, ['Survived', 'Age']]
ages.head(10)

In [None]:
# (rows, columns) of the Survived/Age frame before missing values are dropped.
ages.shape

So out of the 891 passengers documented, only 714 have age data. Let's sort this out.

In [None]:
# Drop the rows of `ages` that contain a missing value, then preview the
# first 10 remaining rows.
ages = ages.dropna()
ages.head(10)

In [None]:
# Order the rows by ascending age. Sorting the frame directly replaces the
# manual "sort one column, then reindex the frame" round trip.
sorted_age = ages.sort_values('Age', ascending=True)
# Kept for any later cell that still refers to the sorted row labels.
new_index = sorted_age.index.values

In [None]:
# Preview the 10 rows with the lowest ages.
sorted_age.head(10)

In [None]:
# Bar chart of Survived against Age, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(35, 20))
sns.barplot(x=sorted_age['Age'], y=sorted_age['Survived'], ax=ax)
ax.set_title('Age versus Survival')
ax.set_xlabel('Age')
ax.set_ylabel('Survival')