In [29]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

### 載入資料與前處理
訓練資料，驗證資料與答案

In [2]:
train_path = 'titanicTrain.csv'
verify_path = 'titanicQuestion.csv'
train_df = pd.read_csv(train_path)
verify_df = pd.read_csv(verify_path)


In [25]:
answer_df = pd.read_excel('titanic3.xls',3)

清理nan值與未用到的欄位

In [6]:
train_df.dropna(axis='rows', how='all', inplace=True)
train_df.head()

verify_df.dropna(axis='rows', how='all', inplace=True)
verify_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3,,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q,,,
1,3,,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q,16.0,,
2,3,,"McCoy, Miss. Alicia",female,,2,0,367226,23.25,,Q,16.0,,
3,3,,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q,16.0,,
4,3,,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q,13.0,,


In [7]:
train_df.drop(columns=['home.dest', 'body','embarked','cabin','name','ticket'], inplace=True)
train_df.head()

verify_df.drop(columns=['home.dest', 'body','embarked','cabin','name','ticket'], inplace=True)
verify_df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,boat
0,3,,male,,0,0,7.75,
1,3,,female,,2,0,23.25,16.0
2,3,,female,,2,0,23.25,16.0
3,3,,male,,2,0,23.25,16.0
4,3,,female,,0,0,7.7875,13.0


In [8]:
# sex->sex_number M=1,F=0
# boat->on_boat  if row('boat') is not NaN
train_df['sex_number'] = train_df.apply(lambda row: 0 if row['sex'] == 'male' else 1, axis=1)
train_df['on_boat'] = train_df.apply(lambda row: 0 if type(row['boat']) == type(1.0) and np.isnan(row['boat']) else 1, axis=1)
train_df.drop(columns=['sex', 'boat'], inplace=True)
train_df.head()

verify_df['sex_number'] = verify_df.apply(lambda row: 0 if row['sex'] == 'male' else 1, axis=1)
verify_df['on_boat'] = verify_df.apply(lambda row: 0 if type(row['boat']) == type(1.0) and np.isnan(row['boat']) else 1, axis=1)
verify_df.drop(columns=['sex', 'boat'], inplace=True)
verify_df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_number,on_boat
0,3,,,0,0,7.75,0,0
1,3,,,2,0,23.25,1,1
2,3,,,2,0,23.25,1,1
3,3,,,2,0,23.25,0,1
4,3,,,0,0,7.7875,1,1


所會用到的features及轉換成機器學習所要的格式

In [9]:
features = ['sex_number', 'age', 'pclass', 'fare', 'sibsp', 'parch', 'on_boat']

In [10]:


# missing value preprocessing
#     age:float
# impute missing value
imputer = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
imputer = imputer.fit(train_df.loc[:, features])
train_arr_imputed = imputer.fit_transform(train_df.loc[:, features])
verify_arr_imputed = imputer.fit_transform(verify_df.loc[:, features])
#train_df_imputed = pd.DataFrame()



### 機器學習
一開始先嘗試了SVM

In [11]:
# 訓練模型
svc = SVC()
svc_trained = svc.fit(train_arr_imputed, train_df['survived'])

In [28]:
# 計算測資
verify_df['survived'] = svc_trained.predict(verify_arr_imputed)
verify_df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_number,on_boat,rf_survived
0,3,0.0,,0,0,7.75,0,0,0.0
1,3,0.0,,2,0,23.25,1,1,1.0
2,3,0.0,,2,0,23.25,1,1,1.0
3,3,0.0,,2,0,23.25,0,1,1.0
4,3,1.0,,0,0,7.7875,1,1,1.0


在訓練資料上準確度96% >> 在驗證資料上準確度81%
<br>有overfit的情況 可能需要調參數才能改善表現

In [32]:
accuracy_score(svc_trained.predict(train_arr_imputed),train_df['survived'])

0.966

In [31]:
accuracy_score(verify_df['survived'],answer_df['survived'])

0.8122977346278317

再用RandomForest試試看

In [18]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_arr_imputed, train_df['survived']);


In [20]:
verify_df['rf_survived'] = rf.predict(verify_arr_imputed)
verify_df

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_number,on_boat,rf_survived
0,3,0.0,,0,0,7.7500,0,0,0.0
1,3,0.0,,2,0,23.2500,1,1,1.0
2,3,0.0,,2,0,23.2500,1,1,1.0
3,3,0.0,,2,0,23.2500,0,1,1.0
4,3,1.0,,0,0,7.7875,1,1,1.0
5,3,0.0,,0,0,15.5000,0,0,0.0
6,3,1.0,,0,0,7.8792,1,1,1.0
7,3,1.0,15.0,0,0,8.0292,1,0,0.0
8,3,0.0,35.0,0,0,7.7500,1,0,0.0
9,3,0.0,,0,0,7.7500,0,0,0.0


In [27]:
answer_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3,1,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q,,,
1,3,1,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q,16.0,,
2,3,1,"McCoy, Miss. Alicia",female,,2,0,367226,23.25,,Q,16.0,,
3,3,1,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q,16.0,,
4,3,1,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q,13.0,,


在驗證資料上展現了96%的準確率
<br> 另外該演算法包含隨機過程 故隨機狀態的改變會讓表現稍有浮動

In [30]:
accuracy_score(verify_df['rf_survived'],answer_df['survived'])

0.9611650485436893