# Titanic

## Constants

In [1]:
DATA_PATH = 'datasets/'
TITANIC_PATH = DATA_PATH + 'titanic/'

In [2]:
TITANIC_TRAIN_PATH = TITANIC_PATH + 'train.csv'
TITANIC_TEST_PATH = TITANIC_PATH + 'test.csv'

## Load datasets

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv(TITANIC_TRAIN_PATH)

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [8]:
df.hist(figsize=(15,10))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1077d6d68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10c836978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10c8747b8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10c87eb38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10c8de898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10c8de8d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10c94e978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10c988978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10c9bb358>]], dtype=object)

### Looking for correlation

In [9]:
corr_matrix = df.corr()

In [10]:
corr_matrix.Survived.sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

## Clean data

### Remove unimportant features

Hypothesis : PassengerId, Name, Ticket and Cabin don't influence the fact of survived or not. I will see if it's true or not.

In [11]:
titanic_features = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
titanic_labels = df[['Survived']]

In [12]:
titanic_features.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [13]:
titanic_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB


### Remove NaN

In [14]:
from sklearn.preprocessing import Imputer

In [15]:
imputer = Imputer(strategy='median')

In [17]:
titanic_num = titanic_features.drop(['Sex', 'Embarked'], axis=1)
titanic_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
Pclass    891 non-null int64
Age       714 non-null float64
SibSp     891 non-null int64
Parch     891 non-null int64
Fare      891 non-null float64
dtypes: float64(2), int64(3)
memory usage: 34.9 KB


In [18]:
imputer.fit(titanic_num)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

In [19]:
imputer.statistics_

array([  3.    ,  28.    ,   0.    ,   0.    ,  14.4542])

In [20]:
titanic_num.median().values

array([  3.    ,  28.    ,   0.    ,   0.    ,  14.4542])

In [21]:
X = imputer.transform(titanic_num)

### Convert Categorical to numbers

In [23]:
titanic_features.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

### Pipeline

In [24]:
from sklearn.base import BaseEstimator, TransformerMixin

In [25]:
# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [26]:
from sklearn.pipeline import Pipeline

In [27]:
num_attribs = ['Age', 'SibSp', 'Parch', 'Fare']
cat_attribs = ['Pclass', 'Sex', 'Embarked']
available_attribs = num_attribs + cat_attribs

In [28]:
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy='median'))
])

In [31]:
titanic_prepared = num_pipeline.fit_transform(titanic_features)

In [32]:
titanic_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB


In [33]:
titanic_prepared.shape

(891, 4)