# Titanic: Machine Learning from Disaster

In [248]:
#import python libraries for data analysis and visualization
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns




In [249]:
# Load the training data from CSV into a pandas DataFrame.
# NOTE(review): the relative path assumes a Kaggle-style ../input/ directory
# next to the notebook's working directory — adjust when running elsewhere.
titanic_df = pd.read_csv("../input/train.csv")

In [250]:
# Preview the first five rows to see which columns are present and what
# their values look like.
titanic_df.head()

In [251]:
# Summarise the frame: column dtypes and non-null counts, which shows
# which columns contain missing values.
titanic_df.info()

## The following questions will be asked of the data:

### 1) Who were the passengers on the Titanic?




In [252]:
# Count the number of passengers by gender.
# `x` is passed by keyword: seaborn >= 0.12 makes every argument after `data`
# keyword-only, so a positional column name raises
# "TypeError: countplot() got multiple values for argument 'data'".
sns.countplot(x="Sex", data=titanic_df)


More male than female passengers boarded the Titanic.


In [253]:
# Count passengers in each class, with bars split by gender.
# `x` is passed by keyword because seaborn >= 0.12 no longer accepts the
# column name as a positional argument.
sns.countplot(x="Pclass", hue="Sex", data=titanic_df)

Pclass 3 had more male than female passengers, mirroring the overall male majority on the Titanic.

In [254]:
# Classify a passenger as "child" (under 16) or by their recorded sex.
def person_cat(passenger):
    """Return ``"child"`` if the passenger's Age is below 16, else their Sex.

    Parameters
    ----------
    passenger : object with ``Age`` and ``Sex`` attributes
        Used below with ``DataFrame.apply(..., axis=1)``, so each call
        receives one row of the frame.

    Returns
    -------
    The string ``"child"``, or whatever value ``passenger.Sex`` holds.

    A missing (NaN) Age compares False against 16, so such passengers fall
    through to their Sex value.
    """
    is_child = passenger.Age < 16
    return "child" if is_child else passenger.Sex
# Label every row (axis=1 passes one passenger at a time to person_cat) and
# store the result in a new "Person" column.
person_labels = titanic_df.apply(person_cat, axis=1)
titanic_df["Person"] = person_labels

# Preview the first ten rows, including the new column.
titanic_df.head(10)


In [255]:
# Count passengers in each class, with bars split by the Person category.
# `x` is passed by keyword because seaborn >= 0.12 no longer accepts the
# column name as a positional argument.
sns.countplot(x="Pclass", hue="Person", data=titanic_df)

After splitting children out into their own category, most child passengers were found in Pclass 3, just like the males and females.

In [256]:
# Histogram of passenger ages using 70 bins; the x-axis label is set on the
# Axes object that the plot call returns.
age_ax = titanic_df["Age"].hist(bins=70)
age_ax.set_xlabel("Age")

In [257]:
# Mean age of the passengers; pandas skips missing (NaN) ages by default,
# so this is the mean over passengers whose age is recorded.
titanic_df["Age"].mean()


In [258]:
# Count how many passengers fall into each Person category
# ("child" or the passenger's Sex value, as assigned by person_cat).
titanic_df.Person.value_counts()

In [259]:
# Kernel density estimate of Age, one curve per Sex on a shared axis.
# `height` replaces the old `size` argument, which was renamed in
# seaborn 0.9 and is not accepted by current releases.
g = sns.FacetGrid(titanic_df, hue="Sex", height=5, aspect=3)
g = g.map(sns.kdeplot, "Age")

# Clip the x-axis to the observed age range; the KDE tails would otherwise
# extend below 0 and beyond the oldest passenger.
g.set(xlim=(0, titanic_df["Age"].max()))
g.add_legend()

This kernel density plot shows the distribution of Age by Sex: the maximum age is 80 for males, while all females are younger than 80.

### 2) What deck were the passengers on and how does that relate to their class?


In [260]:
# Keep only the passengers with a recorded cabin: take the "Cabin" column
# and drop its missing values. The result is a Series, not a DataFrame.
deck = titanic_df['Cabin'].dropna()


In [261]:
# Extract the deck letter, i.e. the first character of each cabin string.
# (The original indexed `i[0][0]`; the second `[0]` was a no-op on a
# one-character string.)
deck_array = deck.values
levels = [cabin[0] for cabin in deck_array]

# Build a one-column frame and sort it so the bars appear in alphabetical
# deck order. Reassigning instead of `inplace=True` keeps the cell easy to
# re-run and chain.
levels_df = pd.DataFrame(levels, columns=["CabinLevel"]).sort_values(by="CabinLevel")

# Count the passengers on each deck level.
# `x` is passed by keyword because seaborn >= 0.12 no longer accepts the
# column name as a positional argument.
sns.countplot(x="CabinLevel", data=levels_df)

    
  


###  3) Did the deck have an effect on the passengers survival rate?

In [262]:
# Build a frame of survival outcome and cabin for passengers with no missing
# value in either column, then add the deck letter (first character of the
# cabin string) as "DeckLevel".
cabin_df = (
    titanic_df[["Survived", "Cabin"]]
    .dropna()
    .assign(DeckLevel=lambda frame: frame["Cabin"].str[0])
)

In [263]:
# Preview the first five rows of the cabin frame to check the derived
# DeckLevel column.
cabin_df.head()

In [264]:
# Count passengers for every (Survived, DeckLevel) pair, then pivot the deck
# levels into columns so each Survived value gets one group of bars.
deck_survival_counts = cabin_df.groupby(["Survived", "DeckLevel"]).size().unstack()
deck_survival_counts.plot.bar()

Among passengers with a recorded cabin, most of both the survivors and the non-survivors were on deck levels B and C, followed by D and E.

### 4) Where did the passengers come from and how does that relate to their class?

In [265]:
# List the distinct values in the Embarked column (the port codes);
# unique() also reports NaN if any rows are missing a value.
titanic_df.Embarked.unique()


The Embarked column has three unique values: C, Q, and S. The project description on Kaggle shows that these stand for Cherbourg, Queenstown, and Southampton.

In [266]:
# Count passengers by port of embarkation, with bars split by class.
# `x` is passed by keyword because seaborn >= 0.12 no longer accepts the
# column name as a positional argument.
sns.countplot(x="Embarked", hue="Pclass", data=titanic_df)

Most passengers embarked at Southampton, and the largest group of them travelled in Pclass 3, while almost all of the passengers who boarded at Queenstown were 3rd class.

### 5) Who was alone and who was with family?

In [267]:
# Family size on board = parents/children (Parch) + siblings/spouses (SibSp).
# A value of 0 means the passenger travelled alone.
family_size = titanic_df["Parch"].add(titanic_df["SibSp"])
titanic_df["PassengerFamily"] = family_size

# Preview the first ten family sizes.
titanic_df["PassengerFamily"].head(10)

In [268]:
# Replace the numeric family size with a label: "Family" when the passenger
# has at least one relative on board (Parch + SibSp > 0), otherwise "Alone".
#
# The labels are computed from the source columns in one vectorised step
# rather than by overwriting parts of the integer column with strings:
#   * writing strings into an int64 column is deprecated upcasting in
#     pandas >= 2.1 and is slated to become an error;
#   * the original cell failed on a second run, because `> 0` was then
#     evaluated against a column that already held strings.
# Assumes Parch and SibSp contain no missing values (a NaN sum would be
# labelled "Alone") — TODO confirm against titanic_df.info().
relatives_aboard = titanic_df["Parch"] + titanic_df["SibSp"]
titanic_df["PassengerFamily"] = np.where(relatives_aboard > 0, "Family", "Alone")



In [269]:
# Display the first five rows again to confirm the PassengerFamily column
# now holds the "Family" / "Alone" labels.
titanic_df.head()

In [270]:
# Count how many passengers travelled alone versus with family.
# `x` is passed by keyword because seaborn >= 0.12 no longer accepts the
# column name as a positional argument.
sns.countplot(x="PassengerFamily", data=titanic_df)

These were some really great insights into how gender, age, and class relate to a passenger's survival, obtained using Pandas and Seaborn.