In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# Apply the 'ggplot' style sheet to any matplotlib figures drawn later in the notebook.
matplotlib.style.use('ggplot')

In [38]:
# Load both splits from the working directory. The training frame is read here
# because its medians are used below to impute missing values in the test frame.
traindata = pd.read_csv("train.csv")
testdata = pd.read_csv("test.csv")

In [39]:
# Summary statistics of the numeric test columns; a `count` lower than the
# number of rows indicates missing values in that column.
testdata.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


## Index
* [fillna in age](#fillna_age)
* [fillna in fare](#fillna_fare)
* [transform Sex attribute](#transform_sex)
* [transform Embarked attribute](#transform_embarked)
* [delete useless features](#delete_useless)
* [concatenate and save](#concate_save)

<a id="fillna_age"></a>
### fillna in Age
There are missing values in the Age attribute, so we fill them with the median Age computed from the training data.

In [40]:
# Impute missing test-set ages with the median age of the *training* set, so
# the test data is filled with a statistic taken from the training data only.
train_age_median = traindata["Age"].median()

# A parenthesised single-argument print produces the same text under
# Python 2 and Python 3 (the bare print statement is a SyntaxError on Python 3).
print("median age in train data: %3.2f" % train_age_median)

# Bracket assignment is the documented way to overwrite an existing column.
testdata["Age"] = testdata["Age"].fillna(train_age_median)

median age in train data: 28.00


<a id="fillna_fare"></a>
### fillna in Fare
There is 1 missing value in the Fare attribute; it is also filled with the median from the training data.

In [41]:
# Fill the missing test fare(s) with the median fare of the training set.
testdata["Fare"] = testdata["Fare"].fillna(traindata["Fare"].median())

<a id="transform_sex"></a>
### transform Sex attribute

In [42]:
# Binary indicator column: 1 where Sex is exactly "male", 0 otherwise.
testdata["IsMale"] = testdata["Sex"].eq("male").astype(int)

<a id="transform_embarked"></a>
### transform Embarked attribute

In [43]:
# Report how many Embarked entries are missing before encoding the column.
# Written as a single-argument print call so it runs on Python 2 and Python 3.
print("there are %d missing value in Embarked" % testdata["Embarked"].isnull().sum())

there are 0 missing value in Embarked


In [44]:
# Frequency of each Embarked value, ordered by the value itself.
testdata["Embarked"].groupby(testdata["Embarked"]).size()

Embarked
C    102
Q     46
S    270
dtype: int64

In [45]:
def encode_category(category_values, nameprefix, categories=None):
    """One-hot encode a categorical pandas Series into 0/1 indicator columns.

    Parameters
    ----------
    category_values : pandas.Series
        The categorical column to encode.
    nameprefix : str
        Prefix for each generated column name; the column for category ``v``
        is named ``nameprefix`` followed by the text form of ``v``.
    categories : iterable, optional
        Explicit categories to emit, in column order. Pass the same list for
        the train and test sets to guarantee identical columns even when a
        category is absent from one of them. When omitted, the sorted unique
        non-missing values of ``category_values`` are used.

    Returns
    -------
    pandas.DataFrame
        One integer column per category, sharing the index of
        ``category_values``. Rows whose value is missing, or is not one of
        the categories, are 0 in every column.
    """
    if categories is None:
        # Drop missing values first: NaN must not become a category of its
        # own, and sorting a mix of strings and NaN fails on Python 3.
        categories = np.unique(category_values.dropna())

    datas = []
    names = []
    for onevalue in categories:
        datas.append((category_values == onevalue).astype(int))
        # %-formatting (rather than `+`) also accepts non-string categories
        # such as integer class labels.
        names.append("%s%s" % (nameprefix, onevalue))

    if not datas:
        # pd.concat rejects an empty list; return a frame with no columns.
        return pd.DataFrame(index=category_values.index)

    return pd.concat(datas, axis=1, keys=names)

# One indicator column per port of embarkation; the column sums displayed
# below should match the per-port counts of the Embarked column.
embarks = encode_category(testdata["Embarked"], "Embark")
embarks.sum()

EmbarkC    102
EmbarkQ     46
EmbarkS    270
dtype: int64

<a id="delete_useless"></a>
### delete useless features

In [46]:
# Drop columns that will not be used as features. Each `del` removes the
# column from `testdata` in place, so re-running this cell on the same frame
# raises KeyError.

# Cabin: more than 50% of its values are missing, so the column is dropped.
del testdata["Cabin"]
# PassengerId: an identifier, not useful for classification.
# NOTE(review): if predictions must later be mapped back to passengers, confirm
# the id can be recovered downstream.
del testdata["PassengerId"]
# Ticket: no interpretation for this feature was found; treated as not useful.
del testdata["Ticket"]

<a id="concate_save"></a>
### concatenate and save

In [47]:
# Append the Embarked indicator columns to the test frame column-wise.
# NOTE(review): the raw Name, Sex and Embarked columns are still present in the
# result (see the column list displayed below) - confirm that is intended.
testdata = pd.concat([ testdata,embarks],axis=1)
testdata.columns

Index([u'Pclass', u'Name', u'Sex', u'Age', u'SibSp', u'Parch', u'Fare',
       u'Embarked', u'IsMale', u'EmbarkC', u'EmbarkQ', u'EmbarkS'],
      dtype='object')

In [48]:
# Persist the processed test set. `index=False` is not passed, so the row
# index is written as the first column of the CSV.
testdata.to_csv("test_processed.csv")