# Titanic Feature Analysis

In [1]:
import csv
import numpy as np
import pandas as pd

## Read Train Data

In [2]:
train = pd.read_csv('data/train.csv', header=0)
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
# Information of Data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## Missing Data Processing
* Cabin, Name, Ticket은 Attribute 자체를 제거
* Age와 Embarked는 없는 행을 제거

In [4]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
train = train.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1)
train = train.dropna()
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 8 columns):
Survived    712 non-null int64
Pclass      712 non-null int64
Sex         712 non-null object
Age         712 non-null float64
SibSp       712 non-null int64
Parch       712 non-null int64
Fare        712 non-null float64
Embarked    712 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 50.1+ KB


# Feature Analysis
## 1. Pclass
* Ticket class
* 1 = 1st, 2 = 2nd, 3 = 3rd

In [6]:
train[['Pclass','Survived']].groupby(['Pclass'], as_index=False).mean()

Unnamed: 0,Pclass,Survived
0,1,0.652174
1,2,0.479769
2,3,0.239437


## 2. Sex

In [7]:
train[['Sex','Survived']].groupby(['Sex'], as_index=False).mean()

Unnamed: 0,Sex,Survived
0,female,0.752896
1,male,0.205298


## 3. SibSp
* The number of siblings / spouses aboard the Titanic

In [8]:
train[['SibSp','Survived']].groupby(['SibSp'], as_index=False).sum()

Unnamed: 0,SibSp,Survived
0,0,173
1,1,97
2,2,11
3,3,4
4,4,3
5,5,0


In [9]:
train[['SibSp','Survived']].groupby(['SibSp'], as_index=False).mean()

Unnamed: 0,SibSp,Survived
0,0,0.36887
1,1,0.530055
2,2,0.44
3,3,0.333333
4,4,0.166667
5,5,0.0


## 4. Parch
* The number of parents / children aboard the Titanic

In [10]:
train[['Parch','Survived']].groupby(['Parch'], as_index=False).sum()

Unnamed: 0,Parch,Survived
0,0,184
1,1,61
2,2,39
3,3,3
4,4,0
5,5,1
6,6,0


In [11]:
train[['Parch','Survived']].groupby(['Parch'], as_index=False).mean()

Unnamed: 0,Parch,Survived
0,0,0.354528
1,1,0.554545
2,2,0.573529
3,3,0.6
4,4,0.0
5,5,0.2
6,6,0.0


## 5. Alone
* 'Alone' means that the number of Family(SibSp+Parch) is 0

In [12]:
train['Alone'] = 0
train.loc[train['SibSp'] + train['Parch'] == 0, 'Alone'] = 1

train[['Alone','Survived']].groupby(['Alone'], as_index=False).mean()

Unnamed: 0,Alone,Survived
0,0,0.516129
1,1,0.318408


## 6. Embarked
* Port of Embarkation
* C = Cherbourg, Q = Queenstown, S = Southampton

In [13]:
train[['Embarked','Survived']].groupby(['Embarked'], as_index=False).mean()

Unnamed: 0,Embarked,Survived
0,C,0.607692
1,Q,0.285714
2,S,0.362816


## 7. Fare
* Fare is categorized into 4 range

In [14]:
train['CategoricalFare'] = pd.qcut(train['Fare'], 4, labels=['Fare1_low','Fare2_mid','Fare3_high','Fare4_highest'])
train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean()

Unnamed: 0,CategoricalFare,Survived
0,Fare1_low,0.204188
1,Fare2_mid,0.321212
2,Fare3_high,0.486034
3,Fare4_highest,0.615819


## 8. Age
* Age is categorized into 5 range

In [15]:
train['CategoricalAge'] = pd.cut(train['Age'], 5, labels=['Age1_teenage', 'Age2_young', 'Age3_middle', 'Age4_elder', 'Age5_old'])
train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean()

Unnamed: 0,CategoricalAge,Survived
0,Age1_teenage,0.55
1,Age2_young,0.369942
2,Age3_middle,0.40107
3,Age4_elder,0.426471
4,Age5_old,0.090909


# Pandas의 get_dummies()를 이용한 One Hot Encoding
* String으로 된 value를 Number(int)로 인코딩

In [16]:
train.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Alone,CategoricalFare,CategoricalAge
0,0,3,male,22.0,1,0,7.25,S,0,Fare1_low,Age2_young
1,1,1,female,38.0,1,0,71.2833,C,0,Fare4_highest,Age3_middle
2,1,3,female,26.0,0,0,7.925,S,1,Fare1_low,Age2_young
3,1,1,female,35.0,1,0,53.1,S,0,Fare4_highest,Age3_middle
4,0,3,male,35.0,0,0,8.05,S,1,Fare1_low,Age3_middle


In [17]:
pd.get_dummies(train['Sex']).head(5)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [18]:
train = train.join(pd.get_dummies(train['Sex']))
train = train.drop('Sex', axis=1)
train.head(5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Alone,CategoricalFare,CategoricalAge,female,male
0,0,3,22.0,1,0,7.25,S,0,Fare1_low,Age2_young,0,1
1,1,1,38.0,1,0,71.2833,C,0,Fare4_highest,Age3_middle,1,0
2,1,3,26.0,0,0,7.925,S,1,Fare1_low,Age2_young,1,0
3,1,1,35.0,1,0,53.1,S,0,Fare4_highest,Age3_middle,1,0
4,0,3,35.0,0,0,8.05,S,1,Fare1_low,Age3_middle,0,1


In [19]:
train = train.join(pd.get_dummies(train['CategoricalAge']))
train = train.drop(['CategoricalAge','Age'], axis=1)

In [20]:
train = train.join(pd.get_dummies(train['CategoricalFare']))
train = train.drop(['CategoricalFare','Fare'], axis=1)

In [21]:
train = train.join(pd.get_dummies(train['Embarked']))
train = train.drop(['Embarked'], axis=1)

In [22]:
train.head(5)

Unnamed: 0,Survived,Pclass,SibSp,Parch,Alone,female,male,Age1_teenage,Age2_young,Age3_middle,Age4_elder,Age5_old,Fare1_low,Fare2_mid,Fare3_high,Fare4_highest,C,Q,S
0,0,3,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1
1,1,1,1,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0
2,1,3,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,1
3,1,1,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1
4,0,3,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1


## Learning Decision Tree Classifier

In [35]:
from sklearn import tree, cross_validation
from sklearn.ensemble import RandomForestClassifier

In [24]:
train_y = train['Survived']
train_X = train.drop('Survived', axis=1)

In [45]:
clf = RandomForestClassifier(max_depth=7, n_estimators=10)
clf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

## Classification for test data

In [46]:
test = pd.read_csv('data/test.csv', header=0)
test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Missing Data Processing
* Cabin, Name, Ticket은 Attribute 자체를 제거
* Age와 Embarked는 없는 행을 제거

In [47]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [48]:
test = test.drop(['Cabin', 'Name', 'Ticket'], axis=1)
test['Age'] = test['Age'].interpolate()
test['Fare'] = test['Fare'].interpolate()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 26.2+ KB


In [49]:
test['Alone'] = 0
test.loc[train['SibSp'] + test['Parch'] == 0, 'Alone'] = 1

test['CategoricalFare'] = pd.qcut(test['Fare'], 4, labels=['Fare1_low','Fare2_mid','Fare3_high','Fare4_highest'])

test['CategoricalAge'] = pd.cut(test['Age'], 5, labels=['Age1_teenage', 'Age2_young', 'Age3_middle', 'Age4_elder', 'Age5_old'])

test = test.join(pd.get_dummies(test['Sex']))
test = test.drop('Sex', axis=1)

test = test.join(pd.get_dummies(test['CategoricalAge']))
test = test.drop(['CategoricalAge','Age'], axis=1)

test = test.join(pd.get_dummies(test['CategoricalFare']))
test = test.drop(['CategoricalFare','Fare'], axis=1)

test = test.join(pd.get_dummies(test['Embarked']))
test = test.drop(['Embarked'], axis=1)

test_X = test.drop('PassengerId', axis=1)
passengerId = test['PassengerId']

In [50]:
result = clf.predict(test_X)

In [51]:
result = pd.DataFrame(result, columns=['Survived'])

In [52]:
output = pd.concat((passengerId, result), axis=1)
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [54]:
output.to_csv('submission/submission_RandomForest.csv', index=False)