In [1]:
import pandas as pd
import numpy as np

In [2]:
df1 = pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')

In [3]:
df = pd.concat([df1,df2])

In [4]:
df.info()  #檢查 遺失值 可知Cabin 缺失很多  Age也有一部分沒有 Embarked 與Fare 缺失一些

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


## 資料清理與特徵工程

In [5]:
## 處理遺失值
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna('S')
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

In [6]:
#將同屬性值的數值 加在一起 例如Parch + SibSp = Family

df['Family'] = df['Parch'] + df['SibSp']

In [7]:
#每個特徵 針對 生存 的關係，將有相關的特徵與後面建模相關性進行匹配 

df = df[['Age','Embarked','Fare','Pclass','Sex','Survived','Family']]
df.head()

Unnamed: 0,Age,Embarked,Fare,Pclass,Sex,Survived,Family
0,22.0,S,7.25,3,male,0.0,1
1,38.0,C,71.2833,1,female,1.0,1
2,26.0,S,7.925,3,female,1.0,0
3,35.0,S,53.1,1,female,1.0,1
4,35.0,S,8.05,3,male,0.0,0


In [8]:
# 將不是數值類型的 作one hot encoding
df = pd.get_dummies(df,columns = ['Sex'])
df = pd.get_dummies(df,columns = ['Embarked'])
df.head(5)

Unnamed: 0,Age,Fare,Pclass,Survived,Family,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,7.25,3,0.0,1,0,1,0,0,1
1,38.0,71.2833,1,1.0,1,1,0,1,0,0
2,26.0,7.925,3,1.0,0,1,0,0,0,1
3,35.0,53.1,1,1.0,1,1,0,0,0,1
4,35.0,8.05,3,0.0,0,0,1,0,0,1


## 選定特徵 及 目標

In [9]:
# 將沒有預測目標的train資料拿出來
df_train = df.loc[pd.notnull(df.Survived)]
df_test = df.loc[pd.isnull(df.Survived)]
df_train

Unnamed: 0,Age,Fare,Pclass,Survived,Family,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,22.000000,7.2500,3,0.0,1,0,1,0,0,1
1,38.000000,71.2833,1,1.0,1,1,0,1,0,0
2,26.000000,7.9250,3,1.0,0,1,0,0,0,1
3,35.000000,53.1000,1,1.0,1,1,0,0,0,1
4,35.000000,8.0500,3,0.0,0,0,1,0,0,1
5,29.881138,8.4583,3,0.0,0,0,1,0,1,0
6,54.000000,51.8625,1,0.0,0,0,1,0,0,1
7,2.000000,21.0750,3,0.0,4,0,1,0,0,1
8,27.000000,11.1333,3,1.0,2,1,0,0,0,1
9,14.000000,30.0708,2,1.0,1,1,0,1,0,0


In [10]:
x_test = df_test.drop(['Survived'],axis=1)
x = df_train.drop(['Survived'],axis=1)
y = df_train['Survived']

In [11]:
# 建立訓練與驗證資料 (數據分割)
from sklearn.cross_validation import train_test_split
x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.2,random_state=33)



In [12]:
# 使用scikit_learn.feature_extration中特徵轉換器，進行特徵抽取
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)

In [13]:
# 轉換特徵後，類別型的特徵都單獨撥離出來，單獨成一列特徵，樹值型的保持不變。
x_train = vec.fit_transform(x_train.to_dict(orient='record'))

In [14]:
# 同樣需要對測試數據的特徵進行轉換。
x_val = vec.transform(x_val.to_dict(orient='record'))

## 訓練模型

In [15]:
# 導入模型
from sklearn.tree import DecisionTreeClassifier
# 使用默認配置初始化決策樹分類器
dtc = DecisionTreeClassifier()
# 使用分割到的訓練模型進行模型學習
dtc.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [31]:
# 利用訓練資料進行交叉驗證
from sklearn.model_selection import StratifiedKFold #交叉驗證
from sklearn.model_selection import cross_val_score #交叉驗證
from sklearn.metrics import accuracy_score #準確率
kfold = StratifiedKFold(n_splits=10,random_state=0)
cross_val_score(dtc,x_train,y_train,scoring = "accuracy",cv=kfold).mean()

0.7694444444444445

In [21]:
# 用訓練好的決策模型 對策是特徵數據進行預測
y_predict = dtc.predict(x_val)
y_predict

array([1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0.,
       0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1.,
       1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0.])

In [22]:
# 從sklearn.metrics導入classification_report
from sklearn.metrics import classification_report

In [23]:
# 輸出預測準確性
dtc.score(x_val,y_val)

0.8100558659217877

In [25]:
# 輸出更加詳細的分類性能
print(classification_report(y_predict,y_val,target_names = ['died','survived']))

             precision    recall  f1-score   support

       died       0.93      0.79      0.85       125
   survived       0.64      0.85      0.73        54

avg / total       0.84      0.81      0.82       179



In [None]:
# 左邊分類的標籤名 support 每個標籤出限次數 ,avg/total行 各列的均值 precision recall f1-score 分別為各類別的精準度 

## 預測

In [26]:
y_test = dtc.predict(x_test)

In [27]:
df2['Survived'] = list(y_test)
df3 = df2[['PassengerId','Survived']]
df3=df3.astype('int64')
df3.head()

Unnamed: 0,PassengerId,Survived
0,892,1
1,893,0
2,894,0
3,895,0
4,896,1


In [28]:
df3.to_csv('鐵達尼號生存預測_決策樹.csv',index=False)
