In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')

In [24]:
# Read both raw splits. In the cells visible here, the training split is only
# used as the source of fill values (medians) for the test split.
traindata, testdata = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

In [36]:
# Summary statistics of the numeric columns of the test split.
# NOTE(review): the captured output below appears to come from a run made
# AFTER the later cells (execution count 36; it already lists IsMale and the
# Embark* columns). On a fresh top-to-bottom run this cell would describe the
# raw CSV columns instead -- consider moving it to the end of the notebook.
testdata.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,IsMale,EmbarkC,EmbarkQ,EmbarkS
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,29.805024,0.447368,0.392344,35.576535,0.636364,0.244019,0.110048,0.645933
std,0.841838,12.667969,0.89676,0.981429,55.850103,0.481622,0.430019,0.313324,0.478803
min,1.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,23.0,0.0,0.0,7.8958,0.0,0.0,0.0,0.0
50%,3.0,28.0,0.0,0.0,14.4542,1.0,0.0,0.0,1.0
75%,3.0,35.75,1.0,0.0,31.471875,1.0,0.0,0.0,1.0
max,3.0,76.0,8.0,9.0,512.3292,1.0,1.0,1.0,1.0


## Index
* [fillna in Age](#fillna_age)
* [fillna in Fare](#fillna_fare)
* [transform Sex attribute](#transform_sex)
* [transform Embarked attribute](#transform_embarked)
* [delete useless features](#delete_useless)

<a id="fillna_age"></a>
### fillna in Age
There are missing values in the Age attribute, so we fill them with the median Age of the train data.

In [35]:
# Missing test ages are filled with the median Age of the *training* split
# (not the median of the test split itself).
train_age_median = traindata["Age"].median()
# print(...) with a single pre-formatted argument emits the same text on
# Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
print("median age in train data: %3.2f" % train_age_median)
# Bracket indexing avoids the pitfalls of attribute-style column assignment.
testdata["Age"] = testdata["Age"].fillna(train_age_median)

median age in train data: 28.00


<a id="fillna_fare"></a>
### fillna in Fare
There is 1 missing value in the Fare attribute; it is also filled with the median from the train data.

In [27]:
# Fill missing test fares with the median Fare of the training split.
testdata["Fare"] = testdata["Fare"].fillna(traindata["Fare"].median())

<a id="transform_sex"></a>
### transform Sex attribute

In [28]:
# Binary indicator: 1 where Sex is exactly "male", 0 for every other value.
testdata["IsMale"] = testdata["Sex"].eq("male").astype(int)

<a id="transform_embarked"></a>
### transform Embarked attribute

In [29]:
# Count missing Embarked values before encoding the column.
# print(...) with a single pre-formatted argument works on Python 2 and 3.
print("there are %d missing value in Embarked" % testdata["Embarked"].isnull().sum())

there are 0 missing value in Embarked


In [30]:
# Number of passengers per distinct Embarked value.
embarked_column = testdata["Embarked"]
embarked_column.groupby(embarked_column).size()

Embarked
C    102
Q     46
S    270
dtype: int64

In [31]:
def encode_category(category_values,nameprefix):
    """One-hot encode a categorical Series.

    Parameters
    ----------
    category_values : pandas.Series
        The categorical column to encode.
    nameprefix : str
        Prefix for the generated column names; each column is named
        ``nameprefix`` followed by the text form of the category value.

    Returns
    -------
    pandas.DataFrame
        One 0/1 integer column per distinct non-missing value, in sorted
        order of the values, sharing the index of ``category_values``.
        Rows whose value is missing are 0 in every column.
    """
    # Missing values are not a category: drop them before enumerating, since
    # a NaN mixed with strings cannot be sorted by np.unique on Python 3 and
    # cannot be appended to the prefix on Python 2.
    unique_values = np.unique(category_values.dropna())

    datas = []
    names = []
    for onevalue in unique_values:
        datas.append( (category_values == onevalue).astype(int) )
        # %-formatting instead of "+" so non-string categories (e.g. integer
        # class codes) get a column name too; string values format unchanged.
        names.append("%s%s" % (nameprefix, onevalue))

    return pd.concat(datas,axis=1,keys=names)

# One indicator column per Embarked value, prefixed "Embark". The column sums
# displayed below are the number of passengers per value.
embarks = encode_category(testdata["Embarked"], "Embark")
embarks.sum()

EmbarkC    102
EmbarkQ     46
EmbarkS    270
dtype: int64

<a id="delete_useless"></a>
### delete useless features

In [32]:
# Drop columns that are not used as features:
#   Cabin       - more than 50% of its values are missing
#   PassengerId - useless for classification
#   Ticket      - meaning unclear, assumed useless
# drop(..., errors="ignore") replaces `del` so that re-running this cell after
# the columns are already gone does not raise KeyError.
testdata = testdata.drop(["Cabin", "PassengerId", "Ticket"], axis=1, errors="ignore")

<a id="concat_save"></a>
### concatenate and save

In [33]:
# Append the Embark* indicator columns to the test frame. Copies left over
# from an earlier run of this cell are dropped first, so re-running the cell
# does not produce duplicate Embark* columns.
testdata = pd.concat(
    [testdata.drop(embarks.columns, axis=1, errors="ignore"), embarks],
    axis=1
)
testdata.columns

Index([u'Pclass', u'Name', u'Sex', u'Age', u'SibSp', u'Parch', u'Fare',
       u'Embarked', u'IsMale', u'EmbarkC', u'EmbarkQ', u'EmbarkS'],
      dtype='object')

In [34]:
# index=True is the pandas default, spelled out here: the row index is written
# as the first, unnamed column of the CSV.
testdata.to_csv("test_processed.csv", index=True)