# Data Processing

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Replace Type to Number {Female, Male} = {0, 1}
def ReplaceTypeToNum(data):
    """Label-encode one column and return the resulting integer codes.

    The distinct values of ``data`` are printed first, then a fresh
    ``LabelEncoder`` is fitted on the column's underlying values.

    Parameters
    ----------
    data : pandas.Series
        Column to encode (``.unique()`` and ``.values`` are called on it).

    Returns
    -------
    The array returned by ``LabelEncoder.fit_transform``.
    """
    distinct = data.unique()
    print('Class：', distinct)
    encoder = LabelEncoder()
    return encoder.fit_transform(data.values)

# Select Feature
def SelectFeature(data, data2, n_estimators=500, random_state=None):
    """Keep only the features an ExtraTreesClassifier considers important.

    An ExtraTreesClassifier (extremely randomized trees) is fitted on every
    column of ``data`` except 'PassengerId' and 'Survived', using 'Survived'
    as the target. ``SelectFromModel`` then decides which columns to keep.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame; must contain 'PassengerId' and 'Survived'.
    data2 : pandas.DataFrame
        Second frame that is reduced to the same columns.
    n_estimators : int, optional
        Number of trees in the ensemble (default 500, the previous fixed value).
    random_state : int or None, optional
        Seed forwarded to the classifier. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for a repeatable selection.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``data`` reduced to the selected features plus 'PassengerId' and
        'Survived' (appended in that order), and ``data2`` reduced to those of
        its columns that are selected features or 'PassengerId'.
    """
    PassengerId = data['PassengerId']
    Survived = data['Survived']
    features = data.drop(columns=['PassengerId', 'Survived'])

    # Fit extremely randomized trees to obtain per-feature importances.
    ETC = ExtraTreesClassifier(n_estimators=n_estimators, random_state=random_state)
    ETC.fit(features.values, Survived.values)
    print('Feature Importances：', ETC.feature_importances_)
    ETCModel = SelectFromModel(ETC, prefit=True)

    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice (avoids SettingWithCopyWarning).
    selected = features.loc[:, ETCModel.get_support()].copy()
    selected['PassengerId'] = PassengerId

    # The mask is computed before 'Survived' is re-attached, so data2 never
    # keeps a 'Survived' column.
    data2 = data2.loc[:, data2.columns.isin(selected.columns)].copy()

    selected['Survived'] = Survived
    return selected, data2

# Compare Association Between Type
def CompareAssBetType(data, keyword1, keyword2):
    """Return the mean of ``keyword2`` per ``keyword1`` group, highest first.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.
    keyword1 : str
        Column to group by.
    keyword2 : str
        Column that is averaged within each group and used for sorting.

    Returns
    -------
    pandas.DataFrame
        Two columns (``keyword1``, ``keyword2``), sorted by ``keyword2`` in
        descending order.
    """
    pair = data[[keyword1, keyword2]]
    group_means = pair.groupby([keyword1], as_index=False).mean()
    return group_means.sort_values(by=keyword2, ascending=False)


In [2]:
if __name__ == "__main__":
    # Input and output locations.
    readPath = './Data/train.csv'
    readPath2 = './Data/test.csv'
    writePath = './Data/new_train.csv'
    writePath2 = './Data/new_test.csv'
    data = pd.read_csv(readPath)
    data2 = pd.read_csv(readPath2)
    # Tag every row with its origin so the merged frame can be split again later.
    data['Type'] = 'Train'
    data2['Type'] = 'Test'

    # ------------------------------------------------------------------------------------
    # Training-set preprocessing
    # Drop every training row that has a missing value in ANY column.
    # NOTE(review): this also checks columns that are removed further down
    # ('Name', 'Ticket', 'Cabin'), so rows are discarded for gaps in data that
    # is never used -- confirm this is intended.
    data = data.dropna()
    # ------------------------------------------------------------------------------------
    # Processing on the merged data
    # Merge train and test so both receive identical binning and encoding.
    dataMerge = pd.concat([data, data2], sort=True)

    # Missing ages become 0, so they fall into the lowest age bin below.
    dataMerge['Age'] = dataMerge['Age'].fillna(0)

    # Keep only the title part of the name: the word followed by a period.
    # Raw string: '\.' is not a valid escape sequence in a normal string literal.
    dataMerge['NameTitle'] = dataMerge['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Bin Age into 5 intervals with pd.cut (equal-width bins, not the
    # equal-count bins that qcut would give).
    dataMerge['Age'] = pd.cut(dataMerge['Age'], 5)

    # Bin Fare into 4 equal-width intervals; missing fares are treated as 0.
    dataMerge['Fare'] = dataMerge['Fare'].fillna(0)
    dataMerge['Fare'] = pd.cut(dataMerge['Fare'], 4)

    # Merge title spellings that denote the same title.
    dataMerge['NameTitle'] = dataMerge['NameTitle'].replace(
        {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Reassign rare titles to two catch-all groups.
    officerTitles = ['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev']
    royaltyTitles = ['Dona', 'Lady', 'Countess', 'Sir', 'Jonkheer']
    dataMerge.loc[dataMerge['NameTitle'].isin(officerTitles), 'NameTitle'] = 'officer'
    dataMerge.loc[dataMerge['NameTitle'].isin(royaltyTitles), 'NameTitle'] = 'royalty'

    # Show the mean of Survived for each value of every categorical feature.
    print('----------------------------------------')
    print('Compare Association Between Type')
    for column in ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'NameTitle']:
        print(CompareAssBetType(dataMerge, column, 'Survived'), '\n')
    print('----------------------------------------')

    # Replace categories with integer codes.
    print('----------------------------------------')
    print('Replace Type to Number：{Female, Male} = {0, 1}')
    for column in ['Age', 'Embarked', 'Fare', 'Sex', 'NameTitle']:
        dataMerge[column] = ReplaceTypeToNum(dataMerge[column])
    print('----------------------------------------')

    # Remove columns that are not used as features.
    dataMerge = dataMerge.drop(columns=['Name', 'Ticket', 'Cabin'])
    # ------------------------------------------------------------------------------------
    # Split the merged frame back into train and test.
    data = dataMerge[dataMerge['Type'] == 'Train']
    data2 = dataMerge[dataMerge['Type'] == 'Test']
    data = data.drop(columns=['Type'])
    data2 = data2.drop(columns=['Type', 'Survived'])

    # Optional feature selection (disabled):
    #     print('----------------------------------------')
    #     print('Select Feature')
    #     data, data2 = SelectFeature(data, data2)
    #     print('----------------------------------------')

    # Preview and write the processed data.
    print('----------------------------------------')
    print('write csv')
    print('new_train：')
    print(data.head(10))
    print('\ndtypes：')
    print(data.dtypes)
    print('\nnew_test：')
    print(data2.head(10))
    print('\ndtypes：')
    print(data2.dtypes)

    data.to_csv(writePath, index=False)
    data2.to_csv(writePath2, index=False)
    print('----------------------------------------')

----------------------------------------
Compare Association Between Type
   Pclass  Survived
1       2  0.800000
0       1  0.670886
2       3  0.500000 

      Sex  Survived
0  female  0.931818
1    male  0.431579 

             Age  Survived
0  (-0.08, 16.0]  0.875000
1   (16.0, 32.0]  0.737705
2   (32.0, 48.0]  0.661538
3   (48.0, 64.0]  0.555556
4   (64.0, 80.0]  0.200000 

                 Fare  Survived
3  (384.247, 512.329]  1.000000
1  (128.082, 256.165]  0.750000
2  (256.165, 384.247]  0.666667
0   (-0.512, 128.082]  0.658065 

  Embarked  Survived
0        C  0.738462
2        S  0.637931
1        Q  0.500000 

  NameTitle  Survived
0    Master  1.000000
5   royalty  1.000000
1      Miss  0.934783
3       Mrs  0.923077
4   officer  0.571429
2        Mr  0.370370 

----------------------------------------
----------------------------------------
Replace Type to Number：{Female, Male} = {0, 1}
Class： [(32.0, 48.0], (48.0, 64.0], (-0.08, 16.0], (16.0, 32.0], (64.0, 80.0]]
Catego

# DNN訓練模型

In [3]:
import os
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import regularizers

In [4]:
def buildModel(width, num_classes):
    """Build and compile a small fully connected classifier.

    Architecture: Dense(128, relu, L2 0.01) -> Dropout(0.4) ->
    Dense(256, relu, L2 0.01) -> Dropout(0.4) -> Dense(num_classes, sigmoid).
    The model summary is printed before compilation.

    Parameters
    ----------
    width : int
        Number of input features (passed as ``input_dim`` of the first layer).
    num_classes : int
        Number of units in the sigmoid output layer.

    Returns
    -------
    The compiled ``Sequential`` model (binary cross-entropy loss, Adam with a
    learning rate of 0.001, accuracy metric).
    """
    model = Sequential()
    model.add(Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01), input_dim=width))
    model.add(Dropout(0.4))
    model.add(Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dropout(0.4))
    model.add(Dense(num_classes, activation='sigmoid'))

    model.summary()
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras releases reject.
    model.compile(loss=binary_crossentropy,
                  optimizer=Adam(learning_rate=0.001),
                  metrics=['accuracy'])
    return model

def saveTrainModels(model, saveModelPath, saveTensorBoardPath, epochs, batch_size,
                    x_train, y_train, x_val, y_val):
    """Train ``model`` with checkpointing, TensorBoard logging, LR decay and early stopping.

    Parameters
    ----------
    model : compiled Keras model
        Trained in place via ``model.fit``.
    saveModelPath : str
        Filename prefix for checkpoints; epoch number and metrics are appended.
    saveTensorBoardPath : str
        Directory for TensorBoard logs.
    epochs : int
        Maximum number of epochs.
    batch_size : int
        Mini-batch size.
    x_train, y_train :
        Training features and labels.
    x_val, y_val :
        Validation features and labels; every callback monitors ``val_loss``.
    """
    # TensorBoard logging. The former `embeddings_layer_names` and
    # `embeddings_metadata=None` arguments had no effect and
    # `embeddings_layer_names` is not accepted by newer Keras releases.
    tbCallBack = TensorBoard(log_dir=saveTensorBoardPath, write_images=True,
                             embeddings_freq=0)

    # Workaround for a TF2 TensorBoard bug: the profiler directory must exist
    # beforehand. makedirs creates all missing parents in one call and is a
    # no-op when the directory already exists.
    os.makedirs(os.path.join(saveTensorBoardPath, 'train', 'plugins', 'profile'), exist_ok=True)

    # Save a checkpoint whenever val_loss reaches a new minimum.
    checkpoint = ModelCheckpoint(
        monitor='val_loss', verbose=1,
        save_best_only=True, mode='min',
        filepath=('%s_{epoch:02d}_{accuracy:.4f}_{loss:.4f}_{val_accuracy:.4f}_{val_loss:.4f}.h5' % (saveModelPath)))

    # Multiply the learning rate by 0.9 after 5 epochs without val_loss improvement.
    Reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=5, cooldown=1, verbose=1)

    # Stop training after 15 epochs without val_loss improvement.
    Early = EarlyStopping(monitor='val_loss', patience=15, verbose=1)

    callbacks_list = [checkpoint, tbCallBack, Reduce, Early]

    # Train the model.
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              shuffle=True,
              validation_data=(x_val, y_val),
              callbacks=callbacks_list)
    
if __name__ == "__main__":
    DataSplitRatio = 0.8  # fraction of rows used for training; the rest is validation
    NumClasses = 1
    Epochs = 2000
    BatchSize = 512
    SaveModelPath = "./Model/Train"
    SaveTensorBoardPath = "./Model/Tensorboard"

    # Create the output directories; no error if they already exist.
    os.makedirs("./Model", exist_ok=True)
    os.makedirs(SaveTensorBoardPath, exist_ok=True)

    # Load the preprocessed training data.
    readPath = './Data/new_train.csv'
    data = pd.read_csv(readPath)
    data = data.drop(columns=['PassengerId'])
    x_train = data.drop(columns=['Survived']).values
    y_train = data['Survived'].values
    Width = x_train.shape[1]

    # Shuffle the row order.
    num_example = x_train.shape[0]
    arr = np.arange(num_example)
    np.random.shuffle(arr)
    x_train = x_train[arr]
    y_train = y_train[arr]

    # Split into training and validation parts.
    # Built-in int(): the np.int alias was removed in NumPy 1.24.
    s = int(num_example * DataSplitRatio)
    x_val = x_train[s:]
    y_val = y_train[s:]
    x_train = x_train[:s]
    y_train = y_train[:s]

    # Print the dataset sizes.
    print('x_train：', x_train.shape)
    print('y_train：', y_train.shape)
    print('x_val：', x_val.shape)
    print('y_val：', y_val.shape)

    # Build the model. Lower-case name so the imported keras `Model` class is
    # not shadowed.
    model = buildModel(Width, NumClasses)

    # Train the model and save checkpoints.
    saveTrainModels(model, SaveModelPath, SaveTensorBoardPath, Epochs, BatchSize, x_train, y_train, x_val, y_val)
    
    

x_train： (146, 8)
y_train： (146,)
x_val： (37, 8)
y_val： (37,)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1152      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 34,433
Trainable params: 34,433
Non-trainable params: 0
_________________________________________________________________
Train on 146 samples, validate on 37 samples
Epoch 1/200

Epoch 27/2000

Epoch 00027: val_loss improved from 1.71573 to 1.69090, saving model to ./Model/Train_27_0.7192_1.6931_0.5676_1.6909.h5
Epoch 28/2000

Epoch 00028: val_loss improved from 1.69090 to 1.66606, saving model to ./Model/Train_28_0.7123_1.6365_0.5676_1.6661.h5
Epoch 29/2000

Epoch 00029: val_loss improved from 1.66606 to 1.64155, saving model to ./Model/Train_29_0.7397_1.6265_0.5946_1.6415.h5
Epoch 30/2000

Epoch 00030: val_loss improved from 1.64155 to 1.61629, saving model to ./Model/Train_30_0.7055_1.6063_0.6486_1.6163.h5
Epoch 31/2000

Epoch 00031: val_loss improved from 1.61629 to 1.59173, saving model to ./Model/Train_31_0.7534_1.5988_0.6486_1.5917.h5
Epoch 32/2000

Epoch 00032: val_loss improved from 1.59173 to 1.56709, saving model to ./Model/Train_32_0.7055_1.5635_0.6486_1.5671.h5
Epoch 33/2000

Epoch 00033: val_loss improved from 1.56709 to 1.54221, saving model to ./Model/Train_33_0.7123_1.5279_0.6486_1.5422.h5
Epoch 34/2000

Epoch 00034: val_loss improved from 1.54

Epoch 58/2000

Epoch 00058: val_loss improved from 1.11125 to 1.09778, saving model to ./Model/Train_58_0.7671_1.0680_0.8108_1.0978.h5
Epoch 59/2000

Epoch 00059: val_loss improved from 1.09778 to 1.08543, saving model to ./Model/Train_59_0.7534_1.0793_0.7838_1.0854.h5
Epoch 60/2000

Epoch 00060: val_loss improved from 1.08543 to 1.07358, saving model to ./Model/Train_60_0.7329_1.0696_0.7838_1.0736.h5
Epoch 61/2000

Epoch 00061: val_loss improved from 1.07358 to 1.06235, saving model to ./Model/Train_61_0.7123_1.0734_0.7838_1.0623.h5
Epoch 62/2000

Epoch 00062: val_loss improved from 1.06235 to 1.05174, saving model to ./Model/Train_62_0.7260_1.0387_0.7568_1.0517.h5
Epoch 63/2000

Epoch 00063: val_loss improved from 1.05174 to 1.04134, saving model to ./Model/Train_63_0.7466_1.0220_0.7297_1.0413.h5
Epoch 64/2000

Epoch 00064: val_loss improved from 1.04134 to 1.03082, saving model to ./Model/Train_64_0.7534_0.9780_0.7297_1.0308.h5
Epoch 65/2000

Epoch 00065: val_loss improved from 1.03

Epoch 88/2000

Epoch 00088: val_loss improved from 0.83314 to 0.82630, saving model to ./Model/Train_88_0.7671_0.8178_0.7838_0.8263.h5
Epoch 89/2000

Epoch 00089: val_loss improved from 0.82630 to 0.81984, saving model to ./Model/Train_89_0.7877_0.7770_0.7838_0.8198.h5
Epoch 90/2000

Epoch 00090: val_loss improved from 0.81984 to 0.81326, saving model to ./Model/Train_90_0.7603_0.7536_0.7568_0.8133.h5
Epoch 91/2000

Epoch 00091: val_loss improved from 0.81326 to 0.80686, saving model to ./Model/Train_91_0.7671_0.7886_0.7568_0.8069.h5
Epoch 92/2000

Epoch 00092: val_loss improved from 0.80686 to 0.80143, saving model to ./Model/Train_92_0.7808_0.7637_0.7568_0.8014.h5
Epoch 93/2000

Epoch 00093: val_loss improved from 0.80143 to 0.79721, saving model to ./Model/Train_93_0.7534_0.7619_0.7838_0.7972.h5
Epoch 94/2000

Epoch 00094: val_loss improved from 0.79721 to 0.79304, saving model to ./Model/Train_94_0.8151_0.7400_0.7838_0.7930.h5
Epoch 95/2000

Epoch 00095: val_loss improved from 0.79

Epoch 119/2000

Epoch 00119: val_loss improved from 0.69297 to 0.69050, saving model to ./Model/Train_119_0.7808_0.6498_0.7568_0.6905.h5
Epoch 120/2000

Epoch 00120: val_loss improved from 0.69050 to 0.68899, saving model to ./Model/Train_120_0.8014_0.6128_0.7568_0.6890.h5
Epoch 121/2000

Epoch 00121: val_loss improved from 0.68899 to 0.68786, saving model to ./Model/Train_121_0.8014_0.6154_0.7568_0.6879.h5
Epoch 122/2000

Epoch 00122: val_loss improved from 0.68786 to 0.68618, saving model to ./Model/Train_122_0.7671_0.6297_0.7568_0.6862.h5
Epoch 123/2000

Epoch 00123: val_loss improved from 0.68618 to 0.68435, saving model to ./Model/Train_123_0.8151_0.6093_0.7568_0.6844.h5
Epoch 124/2000

Epoch 00124: val_loss improved from 0.68435 to 0.68131, saving model to ./Model/Train_124_0.7671_0.6198_0.7568_0.6813.h5
Epoch 125/2000

Epoch 00125: val_loss improved from 0.68131 to 0.67757, saving model to ./Model/Train_125_0.7603_0.6182_0.7297_0.6776.h5
Epoch 126/2000

Epoch 00126: val_loss imp

Epoch 150/2000

Epoch 00150: val_loss improved from 0.62435 to 0.62241, saving model to ./Model/Train_150_0.7329_0.5857_0.7838_0.6224.h5
Epoch 151/2000

Epoch 00151: val_loss improved from 0.62241 to 0.62007, saving model to ./Model/Train_151_0.8151_0.5418_0.7568_0.6201.h5
Epoch 152/2000

Epoch 00152: val_loss improved from 0.62007 to 0.61669, saving model to ./Model/Train_152_0.8014_0.5461_0.7568_0.6167.h5
Epoch 153/2000

Epoch 00153: val_loss improved from 0.61669 to 0.61349, saving model to ./Model/Train_153_0.7671_0.5390_0.7568_0.6135.h5
Epoch 154/2000

Epoch 00154: val_loss improved from 0.61349 to 0.61094, saving model to ./Model/Train_154_0.7740_0.5395_0.7568_0.6109.h5
Epoch 155/2000

Epoch 00155: val_loss improved from 0.61094 to 0.60867, saving model to ./Model/Train_155_0.8082_0.5240_0.7568_0.6087.h5
Epoch 156/2000

Epoch 00156: val_loss improved from 0.60867 to 0.60660, saving model to ./Model/Train_156_0.7877_0.5503_0.7568_0.6066.h5
Epoch 157/2000

Epoch 00157: val_loss imp

Epoch 180/2000

Epoch 00180: val_loss improved from 0.57721 to 0.57621, saving model to ./Model/Train_180_0.8014_0.5184_0.7568_0.5762.h5
Epoch 181/2000

Epoch 00181: val_loss did not improve from 0.57621
Epoch 182/2000

Epoch 00182: val_loss did not improve from 0.57621
Epoch 183/2000

Epoch 00183: val_loss did not improve from 0.57621
Epoch 184/2000

Epoch 00184: val_loss did not improve from 0.57621
Epoch 185/2000

Epoch 00185: val_loss improved from 0.57621 to 0.57584, saving model to ./Model/Train_185_0.7945_0.5172_0.7838_0.5758.h5
Epoch 186/2000

Epoch 00186: val_loss improved from 0.57584 to 0.57386, saving model to ./Model/Train_186_0.8014_0.5034_0.7838_0.5739.h5
Epoch 187/2000

Epoch 00187: val_loss improved from 0.57386 to 0.57198, saving model to ./Model/Train_187_0.7740_0.5139_0.7568_0.5720.h5
Epoch 188/2000

Epoch 00188: val_loss improved from 0.57198 to 0.56978, saving model to ./Model/Train_188_0.7740_0.5143_0.7568_0.5698.h5
Epoch 189/2000

Epoch 00189: val_loss improved 

Epoch 214/2000

Epoch 00214: val_loss did not improve from 0.55963
Epoch 215/2000

Epoch 00215: val_loss did not improve from 0.55963
Epoch 216/2000

Epoch 00216: val_loss did not improve from 0.55963

Epoch 00216: ReduceLROnPlateau reducing learning rate to 0.0007290000503417104.
Epoch 217/2000

Epoch 00217: val_loss did not improve from 0.55963
Epoch 218/2000

Epoch 00218: val_loss did not improve from 0.55963
Epoch 219/2000

Epoch 00219: val_loss improved from 0.55963 to 0.55863, saving model to ./Model/Train_219_0.7808_0.4808_0.7568_0.5586.h5
Epoch 220/2000

Epoch 00220: val_loss improved from 0.55863 to 0.55671, saving model to ./Model/Train_220_0.7945_0.4898_0.7568_0.5567.h5
Epoch 221/2000

Epoch 00221: val_loss improved from 0.55671 to 0.55502, saving model to ./Model/Train_221_0.8082_0.4618_0.7568_0.5550.h5
Epoch 222/2000

Epoch 00222: val_loss improved from 0.55502 to 0.55332, saving model to ./Model/Train_222_0.7808_0.4718_0.7568_0.5533.h5
Epoch 223/2000

Epoch 00223: val_los

Epoch 249/2000

Epoch 00249: val_loss did not improve from 0.54617

Epoch 00249: ReduceLROnPlateau reducing learning rate to 0.0005314410547725857.
Epoch 250/2000

Epoch 00250: val_loss improved from 0.54617 to 0.54601, saving model to ./Model/Train_250_0.8219_0.4536_0.7838_0.5460.h5
Epoch 251/2000

Epoch 00251: val_loss improved from 0.54601 to 0.54427, saving model to ./Model/Train_251_0.8014_0.4497_0.7838_0.5443.h5
Epoch 252/2000

Epoch 00252: val_loss improved from 0.54427 to 0.54235, saving model to ./Model/Train_252_0.8014_0.4810_0.7568_0.5423.h5
Epoch 253/2000

Epoch 00253: val_loss improved from 0.54235 to 0.54118, saving model to ./Model/Train_253_0.8151_0.4642_0.7568_0.5412.h5
Epoch 254/2000

Epoch 00254: val_loss improved from 0.54118 to 0.54094, saving model to ./Model/Train_254_0.7945_0.4629_0.7568_0.5409.h5
Epoch 255/2000

Epoch 00255: val_loss did not improve from 0.54094
Epoch 256/2000

Epoch 00256: val_loss did not improve from 0.54094
Epoch 257/2000

Epoch 00257: val_

Epoch 283/2000

Epoch 00283: val_loss did not improve from 0.53700
Epoch 284/2000

Epoch 00284: val_loss did not improve from 0.53700

Epoch 00284: ReduceLROnPlateau reducing learning rate to 0.0003486784757114947.
Epoch 00284: early stopping


# DNN預測模型

In [5]:
from tensorflow.keras.models import load_model
if __name__ == "__main__":
    # Alternative checkpoint:
    #     ModelPath = "./Model_WithoutSelection/Train_182_0.8288_0.5042_0.7838_0.6408.h5"
    ModelPath = "./Model_WithSelection/Train_269_0.8288_0.4570_0.7568_0.5370.h5"
    readPath = './Data/new_test.csv'
    WritePath = "./gender_submission.csv"

    # Load the preprocessed test data; keep the ids aside for the submission file.
    data = pd.read_csv(readPath)
    data2 = data['PassengerId']
    data = data.drop(columns=['PassengerId'])
    print('x_test：', data.shape)

    # Load the trained model.
    Model = load_model(ModelPath)

    # Predict one score per test row.
    pred = Model.predict(data)

    # Write the submission file. The context manager closes the file even if
    # a write fails.
    with open(WritePath, "w") as fw:
        fw.write('PassengerId,Survived\n')
        for passengerId, score in zip(data2, pred):
            # Scores above 0.5 are labelled 1, everything else 0.
            Temp = 1 if score > 0.5 else 0
            fw.write('%s,%s\n' % (passengerId, Temp))
    print('Complate.')

x_test： (418, 8)
Complate.


# Random Forest訓練模型

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from xgboost.sklearn import XGBClassifier
import pickle
import os
if __name__ == "__main__":
    DataSplitRatio = 0.8  # fraction of rows used for training; the rest is validation
    num_tree = 50
    SaveModelPath = "./Model/RFModel.pickle"

    # Create the output directory; no error if it already exists.
    os.makedirs("./Model", exist_ok=True)

    # Load the preprocessed training data.
    readPath = './Data/new_train.csv'
    data = pd.read_csv(readPath)
    data = data.drop(columns=['PassengerId'])
    x_train = data.drop(columns=['Survived']).values
    y_train = data['Survived'].values

    # Shuffle the row order.
    num_example = x_train.shape[0]
    arr = np.arange(num_example)
    np.random.shuffle(arr)
    x_train = x_train[arr]
    y_train = y_train[arr]

    # Split into training and validation parts.
    # Built-in int(): the np.int alias was removed in NumPy 1.24.
    s = int(num_example * DataSplitRatio)
    x_val = x_train[s:]
    y_val = y_train[s:]
    x_train = x_train[:s]
    y_train = y_train[:s]

    # Print the dataset sizes.
    print('x_train：', x_train.shape)
    print('y_train：', y_train.shape)
    print('x_val：', x_val.shape)
    print('y_val：', y_val.shape)

    # Build the model. Alternative estimators:
    #     RFModel = DecisionTreeClassifier()
    #     RFModel = XGBClassifier(n_estimators=num_tree)
    RFModel = RandomForestClassifier(n_estimators=num_tree)

    # Train, report training and validation accuracy, then save the model.
    RFModel.fit(x_train, y_train)
    y_train_pred = RFModel.predict(x_train)
    y_val_pred = RFModel.predict(x_val)
    # accuracy_score takes (y_true, y_pred).
    print(accuracy_score(y_train, y_train_pred))
    print(accuracy_score(y_val, y_val_pred))
    with open(SaveModelPath, 'wb') as f:
        pickle.dump(RFModel, f)
        

x_train： (146, 8)
y_train： (146,)
x_val： (37, 8)
y_val： (37,)
0.9178082191780822
0.6486486486486487


# Random Forest預測模型

In [7]:
if __name__ == "__main__":
    ModelPath = "./Model_RF/RFModel.pickle"
    WritePath = "./gender_submission.csv"

    # Load the preprocessed test data; keep the ids aside for the submission file.
    readPath = './Data/new_test.csv'
    data = pd.read_csv(readPath)
    data2 = data['PassengerId']
    data = data.drop(columns=['PassengerId'])
    print('x_test：', data.shape)

    # Rename the feature columns to positional names f0, f1, ... The list is
    # generated from the actual column count, so it also works when the
    # number of features is not 8.
    # NOTE(review): presumably required by estimators fitted on unnamed arrays
    # (e.g. XGBoost's default feature names) -- confirm.
    data.columns = ['f%d' % i for i in range(data.shape[1])]

    # Load the trained model.
    # pickle.load can execute arbitrary code: only load files you created yourself.
    with open(ModelPath, 'rb') as f:
        RFModel = pickle.load(f)

    # Predict one label per test row.
    pred = RFModel.predict(data)

    # Write the submission file. The context manager closes the file even if
    # a write fails.
    with open(WritePath, "w") as fw:
        fw.write('PassengerId,Survived\n')
        for passengerId, label in zip(data2, pred):
            fw.write('%s,%d\n' % (passengerId, int(label)))

x_test： (418, 8)
