In [74]:
#匯入套件與模組
import numpy
import pandas as pd
from sklearn import preprocessing
numpy.random.seed(10)

In [75]:
#下載資料集
import urllib.request 
import os
url="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath="titanic3.xls"
if not os.path.isfile(filepath):
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)

In [76]:
#讀取資料集
all_df = pd.read_excel("titanic3.xls")

In [77]:
all_df[:2]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [78]:
cols=['survived','name','pclass' ,'sex', 'age', 'sibsp',
      'parch', 'fare', 'embarked']
all_df=all_df[cols]

In [79]:
# 依8:2比例將資料分成訓練資料與測試資料
msk = numpy.random.rand(len(all_df)) < 0.8
train_df = all_df[msk]
test_df = all_df[~msk]
print(msk)

[ True  True  True ...  True  True  True]


In [80]:
print('total:',len(all_df),
      'train:',len(train_df),
      'test:',len(test_df),)

total: 1309 train: 1034 test: 275


In [81]:
#資料前處理
def PreprocessData(all_df):
    df=all_df.drop(['name'], axis=1)
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex'] = df['sex'].map({'female':0, 'male':1}).astype(int)
    x_OneHot_df = pd.get_dummies(data=df,columns=["embarked"])
    
    ndarray = x_OneHot_df.values
    Features = ndarray[:,1:]
    Label = ndarray[:,0]
    
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))
    scaledFeatures=minmax_scale.fit_transform(Features)
    
    return scaledFeatures,Label
    

In [82]:
train_Features,train_Label = PreprocessData(train_df)
test_Features,test_Label = PreprocessData(test_df)
print(train_Features)

[[0.         0.         0.38844819 ... 0.         0.         1.        ]
 [0.         1.         0.00679502 ... 0.         0.         1.        ]
 [0.         0.         0.02151711 ... 0.         0.         1.        ]
 ...
 [1.         1.         0.35447309 ... 1.         0.         0.        ]
 [1.         1.         0.36126811 ... 1.         0.         0.        ]
 [1.         1.         0.38844819 ... 0.         0.         1.        ]]


# Build Model

In [83]:
#匯入keras模組
from keras.models import Sequential
from keras.layers import Dense,Dropout

In [84]:
#建立Keras Sequentail模型
model = Sequential()

In [85]:
#建立Dense層
#輸出的神經元個數為40
#輸入的神經元個數為9(9個features)
#設定kernel_initializer 為 uniform distibution設定之亂數，初始化weight及bias
#設定Activation Function 為relu
model.add(Dense(units=40*9,input_dim=9,
               kernel_initializer='uniform',
               activation='relu'))

In [86]:
#建立Dense層
#輸出的神經元個數為30
#輸入的神經元個數為 上一層之輸出層（預設）
#設定kernel_initializer 為 uniform distibution設定之亂數，初始化weight及bias
#設定Activation Function 為relu
model.add(Dense(units=100,
                kernel_initializer='uniform',
               activation='relu'))

In [87]:
#建立Dense層
#輸出的神經元個數為1(最後的結果)
#輸入的神經元個數為 上一層之輸出層（預設）
#設定kernel_initializer 為 uniform distibution設定之亂數，初始化weight及bias
#設定Activation Function 為 sigmoid
model.add(Dense(units=1,
                kernel_initializer='uniform',
               activation='sigmoid'))

In [88]:
#定義模型訓練方式：
#設定loss function
#optimizer 設定優化器
#設定評估模型的方式為 accuracy
model.compile(loss = 'binary_crossentropy',
             optimizer='adam',metrics= ['accuracy'])

In [52]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 360)               3600      
_________________________________________________________________
dense_7 (Dense)              (None, 100)               36100     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 101       
Total params: 39,801
Trainable params: 39,801
Non-trainable params: 0
_________________________________________________________________


In [89]:
#設定訓練資料參數
#設定訓言資料與驗證資料比例
#設定epoch訓練週期與batch_size每一批次訓練筆數
#verbose設定顯示訓練過程
train_history = model.fit(
    x=train_Features,
    y=train_Label,
    validation_split = 0.1,
    epochs = 500,
    batch_size = 20,
    verbose=2)

Epoch 1/500
47/47 - 1s - loss: 0.6345 - accuracy: 0.6344 - val_loss: 0.4874 - val_accuracy: 0.7885
Epoch 2/500
47/47 - 0s - loss: 0.5126 - accuracy: 0.7656 - val_loss: 0.4390 - val_accuracy: 0.7885
Epoch 3/500
47/47 - 0s - loss: 0.4785 - accuracy: 0.7785 - val_loss: 0.4229 - val_accuracy: 0.8173
Epoch 4/500
47/47 - 0s - loss: 0.4795 - accuracy: 0.7677 - val_loss: 0.4206 - val_accuracy: 0.8173
Epoch 5/500
47/47 - 0s - loss: 0.4679 - accuracy: 0.7796 - val_loss: 0.4165 - val_accuracy: 0.8077
Epoch 6/500
47/47 - 0s - loss: 0.4596 - accuracy: 0.7839 - val_loss: 0.4151 - val_accuracy: 0.8077
Epoch 7/500
47/47 - 0s - loss: 0.4582 - accuracy: 0.7860 - val_loss: 0.4172 - val_accuracy: 0.8269
Epoch 8/500
47/47 - 0s - loss: 0.4549 - accuracy: 0.7882 - val_loss: 0.4181 - val_accuracy: 0.8173
Epoch 9/500
47/47 - 0s - loss: 0.4583 - accuracy: 0.7892 - val_loss: 0.4151 - val_accuracy: 0.8269
Epoch 10/500
47/47 - 0s - loss: 0.4523 - accuracy: 0.7968 - val_loss: 0.4239 - val_accuracy: 0.8269
Epoch 11/

47/47 - 0s - loss: 0.3842 - accuracy: 0.8323 - val_loss: 0.4181 - val_accuracy: 0.8173
Epoch 84/500
47/47 - 0s - loss: 0.3905 - accuracy: 0.8312 - val_loss: 0.4293 - val_accuracy: 0.8269
Epoch 85/500
47/47 - 0s - loss: 0.3821 - accuracy: 0.8333 - val_loss: 0.4888 - val_accuracy: 0.8077
Epoch 86/500
47/47 - 0s - loss: 0.3882 - accuracy: 0.8269 - val_loss: 0.4031 - val_accuracy: 0.8269
Epoch 87/500
47/47 - 0s - loss: 0.3810 - accuracy: 0.8344 - val_loss: 0.4151 - val_accuracy: 0.8365
Epoch 88/500
47/47 - 0s - loss: 0.3802 - accuracy: 0.8323 - val_loss: 0.4127 - val_accuracy: 0.8269
Epoch 89/500
47/47 - 0s - loss: 0.3827 - accuracy: 0.8301 - val_loss: 0.4103 - val_accuracy: 0.8173
Epoch 90/500
47/47 - 0s - loss: 0.3861 - accuracy: 0.8258 - val_loss: 0.4380 - val_accuracy: 0.8173
Epoch 91/500
47/47 - 0s - loss: 0.3818 - accuracy: 0.8323 - val_loss: 0.4123 - val_accuracy: 0.8173
Epoch 92/500
47/47 - 0s - loss: 0.3809 - accuracy: 0.8247 - val_loss: 0.4121 - val_accuracy: 0.8269
Epoch 93/500


Epoch 165/500
47/47 - 0s - loss: 0.3499 - accuracy: 0.8441 - val_loss: 0.4791 - val_accuracy: 0.7981
Epoch 166/500
47/47 - 0s - loss: 0.3482 - accuracy: 0.8387 - val_loss: 0.4717 - val_accuracy: 0.8077
Epoch 167/500
47/47 - 0s - loss: 0.3529 - accuracy: 0.8366 - val_loss: 0.5017 - val_accuracy: 0.7981
Epoch 168/500
47/47 - 0s - loss: 0.3463 - accuracy: 0.8473 - val_loss: 0.4962 - val_accuracy: 0.7981
Epoch 169/500
47/47 - 0s - loss: 0.3522 - accuracy: 0.8387 - val_loss: 0.4974 - val_accuracy: 0.8077
Epoch 170/500
47/47 - 0s - loss: 0.3490 - accuracy: 0.8452 - val_loss: 0.4701 - val_accuracy: 0.8077
Epoch 171/500
47/47 - 0s - loss: 0.3498 - accuracy: 0.8484 - val_loss: 0.5008 - val_accuracy: 0.7981
Epoch 172/500
47/47 - 0s - loss: 0.3431 - accuracy: 0.8527 - val_loss: 0.4881 - val_accuracy: 0.7885
Epoch 173/500
47/47 - 0s - loss: 0.3470 - accuracy: 0.8387 - val_loss: 0.4579 - val_accuracy: 0.8462
Epoch 174/500
47/47 - 0s - loss: 0.3495 - accuracy: 0.8484 - val_loss: 0.4937 - val_accurac

47/47 - 0s - loss: 0.3360 - accuracy: 0.8484 - val_loss: 0.5117 - val_accuracy: 0.7788
Epoch 247/500
47/47 - 0s - loss: 0.3297 - accuracy: 0.8505 - val_loss: 0.5307 - val_accuracy: 0.7981
Epoch 248/500
47/47 - 0s - loss: 0.3301 - accuracy: 0.8559 - val_loss: 0.5750 - val_accuracy: 0.7692
Epoch 249/500
47/47 - 0s - loss: 0.3285 - accuracy: 0.8548 - val_loss: 0.5586 - val_accuracy: 0.7885
Epoch 250/500
47/47 - 0s - loss: 0.3289 - accuracy: 0.8570 - val_loss: 0.5584 - val_accuracy: 0.7981
Epoch 251/500
47/47 - 0s - loss: 0.3295 - accuracy: 0.8581 - val_loss: 0.5283 - val_accuracy: 0.7981
Epoch 252/500
47/47 - 0s - loss: 0.3261 - accuracy: 0.8473 - val_loss: 0.5198 - val_accuracy: 0.7788
Epoch 253/500
47/47 - 0s - loss: 0.3270 - accuracy: 0.8505 - val_loss: 0.5200 - val_accuracy: 0.8077
Epoch 254/500
47/47 - 0s - loss: 0.3231 - accuracy: 0.8602 - val_loss: 0.5610 - val_accuracy: 0.7788
Epoch 255/500
47/47 - 0s - loss: 0.3275 - accuracy: 0.8591 - val_loss: 0.5164 - val_accuracy: 0.8173
Epoc

Epoch 328/500
47/47 - 0s - loss: 0.3137 - accuracy: 0.8591 - val_loss: 0.5712 - val_accuracy: 0.7788
Epoch 329/500
47/47 - 0s - loss: 0.3183 - accuracy: 0.8527 - val_loss: 0.6347 - val_accuracy: 0.7788
Epoch 330/500
47/47 - 0s - loss: 0.3174 - accuracy: 0.8581 - val_loss: 0.5987 - val_accuracy: 0.7788
Epoch 331/500
47/47 - 0s - loss: 0.3157 - accuracy: 0.8613 - val_loss: 0.6819 - val_accuracy: 0.7788
Epoch 332/500
47/47 - 0s - loss: 0.3166 - accuracy: 0.8602 - val_loss: 0.5951 - val_accuracy: 0.7885
Epoch 333/500
47/47 - 0s - loss: 0.3191 - accuracy: 0.8495 - val_loss: 0.5592 - val_accuracy: 0.7885
Epoch 334/500
47/47 - 0s - loss: 0.3139 - accuracy: 0.8602 - val_loss: 0.6024 - val_accuracy: 0.7788
Epoch 335/500
47/47 - 0s - loss: 0.3088 - accuracy: 0.8591 - val_loss: 0.5223 - val_accuracy: 0.8269
Epoch 336/500
47/47 - 0s - loss: 0.3242 - accuracy: 0.8538 - val_loss: 0.5947 - val_accuracy: 0.7981
Epoch 337/500
47/47 - 0s - loss: 0.3177 - accuracy: 0.8581 - val_loss: 0.5693 - val_accurac

47/47 - 0s - loss: 0.3027 - accuracy: 0.8667 - val_loss: 0.6314 - val_accuracy: 0.7885
Epoch 410/500
47/47 - 0s - loss: 0.2998 - accuracy: 0.8699 - val_loss: 0.6071 - val_accuracy: 0.7981
Epoch 411/500
47/47 - 0s - loss: 0.3044 - accuracy: 0.8645 - val_loss: 0.6309 - val_accuracy: 0.7788
Epoch 412/500
47/47 - 0s - loss: 0.3056 - accuracy: 0.8602 - val_loss: 0.6609 - val_accuracy: 0.7981
Epoch 413/500
47/47 - 0s - loss: 0.3013 - accuracy: 0.8699 - val_loss: 0.6296 - val_accuracy: 0.7788
Epoch 414/500
47/47 - 0s - loss: 0.3041 - accuracy: 0.8634 - val_loss: 0.6328 - val_accuracy: 0.7885
Epoch 415/500
47/47 - 0s - loss: 0.3060 - accuracy: 0.8667 - val_loss: 0.6189 - val_accuracy: 0.7885
Epoch 416/500
47/47 - 0s - loss: 0.2991 - accuracy: 0.8591 - val_loss: 0.6247 - val_accuracy: 0.7885
Epoch 417/500
47/47 - 0s - loss: 0.2981 - accuracy: 0.8677 - val_loss: 0.6375 - val_accuracy: 0.7692
Epoch 418/500
47/47 - 0s - loss: 0.3061 - accuracy: 0.8613 - val_loss: 0.6075 - val_accuracy: 0.7981
Epoc

Epoch 491/500
47/47 - 0s - loss: 0.2958 - accuracy: 0.8645 - val_loss: 0.6797 - val_accuracy: 0.7981
Epoch 492/500
47/47 - 0s - loss: 0.2947 - accuracy: 0.8656 - val_loss: 0.6636 - val_accuracy: 0.8077
Epoch 493/500
47/47 - 0s - loss: 0.2890 - accuracy: 0.8634 - val_loss: 0.6825 - val_accuracy: 0.7981
Epoch 494/500
47/47 - 0s - loss: 0.2897 - accuracy: 0.8753 - val_loss: 0.6779 - val_accuracy: 0.7981
Epoch 495/500
47/47 - 0s - loss: 0.2937 - accuracy: 0.8699 - val_loss: 0.7228 - val_accuracy: 0.7692
Epoch 496/500
47/47 - 0s - loss: 0.3047 - accuracy: 0.8581 - val_loss: 0.7527 - val_accuracy: 0.7788
Epoch 497/500
47/47 - 0s - loss: 0.2961 - accuracy: 0.8624 - val_loss: 0.7455 - val_accuracy: 0.7788
Epoch 498/500
47/47 - 0s - loss: 0.3029 - accuracy: 0.8624 - val_loss: 0.6549 - val_accuracy: 0.7981
Epoch 499/500
47/47 - 0s - loss: 0.3014 - accuracy: 0.8624 - val_loss: 0.6537 - val_accuracy: 0.8077
Epoch 500/500
47/47 - 0s - loss: 0.2936 - accuracy: 0.8677 - val_loss: 0.6935 - val_accurac

In [90]:
#評估模型準確率
scores = model.evaluate(x = test_Features,y = test_Label)



In [91]:
scores[1]

0.7927272915840149

# Prediction

In [61]:
#匯入自定義的人物data
Jack = pd.Series([0,'Jack',3,'male',23,1,0,5.0000,'S'])
Rose = pd.Series([1,'Rose',1,'female',20,1,0,100.0000,'S'])
JR_df = pd.DataFrame([list(Jack),list(Rose)],
                    columns=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked'])

In [62]:
all_df = pd.concat([all_df,JR_df])

In [63]:
all_df[-2:]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,0,Jack,3,male,23.0,1,0,5.0,S
1,1,Rose,1,female,20.0,1,0,100.0,S


In [64]:
all_Features,Label = PreprocessData(all_df)

In [65]:
#透過剛剛訓練好的模型預測其存活率
all_probability = model.predict(all_Features)

In [66]:
all_probability[:10]

array([[1.0000000e+00],
       [9.9959457e-01],
       [8.6387098e-03],
       [1.7884374e-04],
       [2.1437407e-02],
       [3.6689243e-01],
       [9.9999970e-01],
       [3.6443093e-01],
       [9.9999928e-01],
       [3.5551190e-04]], dtype=float32)

In [67]:
pd=all_df

In [68]:
pd.insert(len(all_df.columns),
          'probability',all_probability)

In [69]:
#最後一欄為其成果
pd[-2:]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked,probability
0,0,Jack,3,male,23.0,1,0,5.0,S,0.149474
1,1,Rose,1,female,20.0,1,0,100.0,S,1.0
