In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import re as re

train = pd.read_csv('train.csv', header = 0, dtype={'Age': np.float64})
test  = pd.read_csv('test.csv' , header = 0, dtype={'Age': np.float64})
full_data = [train, test]

print (train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB
None


# Feature Engineering


## Pclass


In [3]:
train[["Pclass", "Survived"]].groupby("Pclass").mean()

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


## Sex

In [4]:
train[["Sex", "Survived"]].groupby("Sex").mean()

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


## Parch and SibSp

In [5]:
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
print (train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean())


   FamilySize  Survived
0           1  0.303538
1           2  0.552795
2           3  0.578431
3           4  0.724138
4           5  0.200000
5           6  0.136364
6           7  0.333333
7           8  0.000000
8          11  0.000000


In [6]:


for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
print (train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean())



   IsAlone  Survived
0        0  0.505650
1        1  0.303538


## Embarked

In [7]:
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
print (train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())

  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009


## Fare
Fill the missing values with median

In [8]:
for dataset in full_data:
    dataset["Fare"] = dataset["Fare"].fillna(dataset[dataset["Fare"].notnull()].median())
train["CategoricalFare"] = pd.qcut(train['Fare'],4)
print (train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean())

  CategoricalFare  Survived
0       [0, 7.91]  0.197309
1  (7.91, 14.454]  0.303571
2    (14.454, 31]  0.454955
3   (31, 512.329]  0.581081


## Age


In [9]:
train["Age"][train["Age"].isnull()].size

177

There are many missing values for age so we can fill it with random values

In [10]:
for dataset in full_data:
    age_avg = dataset["Age"][dataset["Age"].notnull()].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()

    age_rand_list = np.random.randint(age_avg-age_std, age_avg+age_std, size=age_null_count)
    dataset["Age"][dataset["Age"].isnull()] = age_rand_list
    dataset['Age'] = dataset['Age'].astype(int)
    dataset["CategoricalAge"] = pd.qcut(dataset["Age"], 5)
#     print(age_null_count)
print (train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())

  CategoricalAge  Survived
0        [0, 19]  0.442211
1       (19, 25]  0.308140
2       (25, 32]  0.421622
3       (32, 40]  0.385542
4       (40, 80]  0.349112


A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


## Name


In [11]:
import re as re
def get_title(name):
	title_search = re.search(' ([A-Za-z]+)\.', name)
	# If the title exists, extract and return it.
	if title_search:
		return title_search.group(1)
	return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)

print(pd.crosstab(train['Title'], train['Sex']))

Sex       female  male
Title                 
Capt           0     1
Col            0     2
Countess       1     0
Don            0     1
Dr             1     6
Jonkheer       0     1
Lady           1     0
Major          0     2
Master         0    40
Miss         182     0
Mlle           2     0
Mme            1     0
Mr             0   517
Mrs          125     0
Ms             1     0
Rev            0     6
Sir            0     1


In [12]:
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

print (train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean())


    Title  Survived
0  Master  0.575000
1    Miss  0.702703
2      Mr  0.156673
3     Mrs  0.793651
4    Rare  0.347826


# Data cleaning


In [13]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'FamilySize', 'IsAlone', 'CategoricalFare', 'CategoricalAge', 'Title'], dtype='object')

In [16]:
for dataset in full_data:
    # mapping Sex
    dataset["Sex"] = dataset["Sex"].map({"male":1, "female":0}).astype(int)
    # mapping Age
    dataset.loc[dataset["Age"] <=19, "Age"] =0
    dataset.loc[(dataset["Age"] <=25) & (dataset["Age"] >19), "Age"] =1
    dataset.loc[(dataset["Age"] <=31) & (dataset["Age"] >25), "Age"] =2
    dataset.loc[(dataset["Age"] <=40) & (dataset["Age"] >31), "Age"] =3
    dataset.loc[(dataset["Age"] <=80) & (dataset["Age"] >40), "Age"] =4
    dataset['Age'] = dataset['Age'].astype(int)
    # mapping Fare
    dataset.loc[dataset["Fare"] <=7.91, "Fare"] = 0
    dataset.loc[(dataset["Fare"] <=14.454) & (dataset["Fare"] >7.91), "Fare"] =1
    dataset.loc[(dataset["Fare"] <=31) & (dataset["Fare"] >14.454), "Fare"] =2
    dataset.loc[(dataset["Fare"] <=512.329) & (dataset["Fare"] >31), "Fare"] =3
    dataset['Fare'] = dataset['Fare'].astype(int)
    # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

#      CategoricalFare  Survived
# 0       [0, 7.91]  0.197309
# 1  (7.91, 14.454]  0.303571
# 2    (14.454, 31]  0.454955
# 3   (31, 512.329]  0.581081

NameError: name 'title_mappin' is not defined