In [1]:
#Import packages and modules
import numpy
import pandas as pd
from sklearn import preprocessing
# Seed NumPy's global random generator so the random train/test mask
# drawn with numpy.random.rand later in the notebook is repeatable.
numpy.random.seed(10)

In [2]:
#Download the dataset (skipped when the file is already present)
import urllib.request 
import os
url="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath="titanic3.xls"
if not os.path.isfile(filepath):
    # Download under a temporary name and rename only after the transfer
    # has finished. Writing straight to `filepath` would leave a truncated
    # file behind on an interrupted download, and the isfile() guard above
    # would then skip the download on every later run.
    partial_path = filepath + ".part"
    saved_path, headers = urllib.request.urlretrieve(url, partial_path)
    os.replace(saved_path, filepath)
    # Same (path, headers) pair that urlretrieve reports, with the final path.
    result = (filepath, headers)
    print('downloaded:',result)

downloaded: ('titanic3.xls', <http.client.HTTPMessage object at 0x7fa6bf86f7b8>)


In [3]:
#Read the dataset
# Reuse the `filepath` defined in the download cell so the file name is
# declared in exactly one place.
all_df = pd.read_excel(filepath)

In [4]:
# Preview the first two rows of the raw data
all_df.head(2)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [5]:
# Keep only the label ('survived', first) and the columns used as features;
# 'name' is kept for display and dropped again during preprocessing.
cols = [
    'survived',
    'name',
    'pclass',
    'sex',
    'age',
    'sibsp',
    'parch',
    'fare',
    'embarked',
]
all_df = all_df[cols]

In [6]:
# Split the data into training and test sets at roughly 8:2.
# One uniform random number is drawn per row; rows whose draw is below 0.8
# go to the training set, the remaining rows to the test set.
row_count = len(all_df)
msk = numpy.random.rand(row_count) < 0.8
train_df, test_df = all_df[msk], all_df[~msk]

In [7]:
# Report how many rows ended up in each split
n_total, n_train, n_test = len(all_df), len(train_df), len(test_df)
print('total:', n_total, 'train:', n_train, 'test:', n_test)

total: 1309 train: 1034 test: 275


In [8]:
#Data preprocessing
def PreprocessData(all_df, embarked_categories=('C', 'Q', 'S')):
    """Turn a passenger frame into min-max scaled features and a label vector.

    Steps: drop 'name', fill missing 'age' and 'fare' with the column mean
    of the given frame, encode 'sex' as 0 (female) / 1 (male), one-hot
    encode 'embarked', then scale every feature column to [0, 1].

    A new MinMaxScaler is fitted on every call, so each frame passed in is
    scaled by its own minimum and maximum.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Frame with 'name', 'sex', 'age', 'fare' and 'embarked' columns.
        After 'name' is dropped, column 0 is taken as the label and all
        remaining columns as features. The input frame is not modified.
    embarked_categories : sequence of str, optional
        Port codes that each get a one-hot column, in this order. Fixing the
        list keeps the number and order of feature columns identical for
        every subset, even when a subset lacks one of the ports. The default
        is presumably the full set of codes in titanic3 (verify against the
        data); a value outside the list, like a missing value, is encoded as
        all zeros.

    Returns
    -------
    scaledFeatures : numpy.ndarray
        2-D float array of the scaled feature columns.
    Label : numpy.ndarray
        1-D float array holding column 0 of the processed frame.
    """
    df = all_df.drop(['name'], axis=1)

    # Replace missing numeric values with the mean of this frame's column.
    for column in ('age', 'fare'):
        df[column] = df[column].fillna(df[column].mean())

    df['sex'] = df['sex'].map({'female':0, 'male':1}).astype(int)

    # A categorical with an explicit category list makes get_dummies emit one
    # column per listed category, whether or not it occurs in this frame.
    df['embarked'] = pd.Categorical(df['embarked'],
                                    categories=list(embarked_categories))
    x_OneHot_df = pd.get_dummies(data=df, columns=["embarked"])

    # Cast explicitly: newer pandas returns boolean dummy columns, which
    # would otherwise turn .values into an object-dtype array.
    ndarray = x_OneHot_df.values.astype(float)
    Features = ndarray[:,1:]
    Label = ndarray[:,0]

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))
    scaledFeatures = minmax_scale.fit_transform(Features)

    return scaledFeatures,Label
    

In [9]:
# Preprocess the training and the test subset, each with its own call
train_Features, train_Label = PreprocessData(all_df=train_df)
test_Features, test_Label = PreprocessData(all_df=test_df)

# Build Model

In [10]:
#Import the Keras model and layer classes
# NOTE(review): Dropout is not used in the cells visible here.
from keras.models import Sequential
from keras.layers import Dense,Dropout

Using TensorFlow backend.


In [11]:
#Create an empty Keras Sequential model; layers are added in the next cells
model = Sequential()

W1029 17:04:36.811871 4479512000 deprecation_wrapper.py:119] From /Users/jerry/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [12]:
#Add the first Dense (hidden) layer
# - 40 output units
# - the input size is read from the prepared training features (9 columns
#   for this data set) instead of being hard-coded, so the model always
#   matches what PreprocessData produced
# - kernel_initializer='uniform' draws the initial kernel weights from a
#   uniform distribution (the bias initializer is left at its default)
# - ReLU activation
model.add(Dense(units=40,input_dim=train_Features.shape[1],
               kernel_initializer='uniform',
               activation='relu'))

W1029 17:04:38.441642 4479512000 deprecation_wrapper.py:119] From /Users/jerry/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:541: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1029 17:04:38.445658 4479512000 deprecation_wrapper.py:119] From /Users/jerry/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4432: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [13]:
#Add the second Dense (hidden) layer
# - 30 output units
# - no input size given: it follows from the previous layer's output
# - initial kernel weights drawn from a uniform distribution
# - ReLU activation
hidden_layer = Dense(30, activation='relu', kernel_initializer='uniform')
model.add(hidden_layer)

In [14]:
#Add the output Dense layer
# - a single output unit (the final result)
# - no input size given: it follows from the previous layer's output
# - initial kernel weights drawn from a uniform distribution
# - sigmoid activation
output_layer = Dense(1, activation='sigmoid', kernel_initializer='uniform')
model.add(output_layer)

In [17]:
#Configure how the model is trained:
# - optimizer: adam
# - loss function: binary cross-entropy
# - evaluation metric: accuracy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

W1029 17:05:29.079699 4479512000 deprecation_wrapper.py:119] From /Users/jerry/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W1029 17:05:29.113965 4479512000 deprecation_wrapper.py:119] From /Users/jerry/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3657: The name tf.log is deprecated. Please use tf.math.log instead.

W1029 17:05:29.125296 4479512000 deprecation.py:323] From /Users/jerry/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [18]:
# Print the layer-by-layer summary of the model (output shapes and parameter counts)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 40)                400       
_________________________________________________________________
dense_2 (Dense)              (None, 30)                1230      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 31        
Total params: 1,661
Trainable params: 1,661
Non-trainable params: 0
_________________________________________________________________


In [19]:
#Train the model
# - validation_split=0.1: share of the training data held out for validation
# - epochs=30: number of training passes; batch_size=30: samples per batch
# - verbose=2: show the training progress (one log line per epoch)
train_history = model.fit(
    train_Features,
    train_Label,
    batch_size=30,
    epochs=30,
    verbose=2,
    validation_split=0.1)

W1029 17:05:33.244222 4479512000 deprecation_wrapper.py:119] From /Users/jerry/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:1033: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 930 samples, validate on 104 samples
Epoch 1/30
 - 1s - loss: 0.6901 - acc: 0.5774 - val_loss: 0.6713 - val_acc: 0.7885
Epoch 2/30
 - 0s - loss: 0.6681 - acc: 0.5957 - val_loss: 0.5916 - val_acc: 0.7885
Epoch 3/30
 - 0s - loss: 0.6102 - acc: 0.6677 - val_loss: 0.4943 - val_acc: 0.8173
Epoch 4/30
 - 0s - loss: 0.5456 - acc: 0.7624 - val_loss: 0.4626 - val_acc: 0.7692
Epoch 5/30
 - 0s - loss: 0.5064 - acc: 0.7602 - val_loss: 0.4547 - val_acc: 0.7885
Epoch 6/30
 - 0s - loss: 0.4910 - acc: 0.7645 - val_loss: 0.4399 - val_acc: 0.7788
Epoch 7/30
 - 0s - loss: 0.4822 - acc: 0.7591 - val_loss: 0.4425 - val_acc: 0.7885
Epoch 8/30
 - 0s - loss: 0.4788 - acc: 0.7581 - val_loss: 0.4350 - val_acc: 0.7981
Epoch 9/30
 - 0s - loss: 0.4730 - acc: 0.7645 - val_loss: 0.4273 - val_acc: 0.7981
Epoch 10/30
 - 0s - loss: 0.4696 - acc: 0.7753 - val_loss: 0.4254 - val_acc: 0.7981
Epoch 11/30
 - 0s - loss: 0.4655 - acc: 0.7624 - val_loss: 0.4224 - val_acc: 0.8173
Epoch 12/30
 - 0s - loss: 0.4636 - acc:

In [20]:
#Evaluate the trained model on the held-out test set
scores = model.evaluate(test_Features, test_Label)



In [21]:
# Index 1 holds the accuracy metric requested in model.compile (index 0 is the loss)
scores[1]

0.8109090913425793

# Prediction

In [22]:
#Define two custom passengers, in the same column order as all_df
Jack = pd.Series([0,'Jack',3,'male',23,1,0,5.0000,'S'])
Rose = pd.Series([1,'Rose',1,'female',20,1,0,100.0000,'S'])
# One row per passenger
JR_df = pd.DataFrame(
    [passenger.tolist() for passenger in (Jack, Rose)],
    columns=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked'],
)

In [23]:
# Append the two custom rows to the end of the data. Their index labels
# (0 and 1) are kept as-is, so they repeat labels already present in all_df.
all_df = pd.concat([all_df, JR_df], axis=0)

In [24]:
# Show the last two rows (the custom passengers just appended)
all_df.tail(2)

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,0,Jack,3,male,23.0,1,0,5.0,S
1,1,Rose,1,female,20.0,1,0,100.0,S


In [25]:
# Preprocess the full data set, including the two custom passengers
all_Features, Label = PreprocessData(all_df=all_df)

In [26]:
#Use the model trained above to predict the survival probability of every row
all_probability = model.predict(x=all_Features)

In [27]:
# Show the first ten predicted probabilities
all_probability[:10]

array([[0.97751796],
       [0.61642814],
       [0.9731523 ],
       [0.4123049 ],
       [0.9720175 ],
       [0.27562404],
       [0.9490433 ],
       [0.3278759 ],
       [0.9478191 ],
       [0.29229587]], dtype=float32)

In [28]:
# Add the predictions as a 'probability' column on all_df.
# The frame is used under its own name: binding it to `pd` would shadow the
# pandas module and break every later `pd.` call. assign() returns a new
# frame and simply overwrites the column if it already exists, so this cell
# can be re-run safely.
# model.predict returned one column per row; flatten it to one value per row.
all_df = all_df.assign(probability=all_probability.ravel())
# The last column is the result; the final two rows are Jack and Rose
all_df[-2:]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked,probability
0,0,Jack,3,male,23.0,1,0,5.0,S,0.149248
1,1,Rose,1,female,20.0,1,0,100.0,S,0.970775
