In [42]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Raw data loading
# `train2` stays an untouched copy of the training file because the label
# column (`Survived`) is read from it when the data is split later on.
train2 = pd.read_csv('./data/titanic/train.csv')
test_raw = pd.read_csv('./data/titanic/test.csv')

# Categorical text -> integer codes
sex_mapping = {'male': 0, 'female': 1}
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}


def _prepare_titanic_features(raw, age_fill):
    """Build the five model features from a raw Titanic frame.

    The same steps are applied to the training and the test file so the two
    can never drift apart. `raw` itself is not modified.

    Args:
        raw: DataFrame as read from train.csv or test.csv.
        age_fill: value used to replace missing `Age` entries.

    Returns:
        A new DataFrame with the columns Pclass, Sex, Age, Embarked, Family
        (in that order).
    """
    # Columns that are not used as features; only the training file has the label.
    unused = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']
    if 'Survived' in raw.columns:
        unused = ['Survived'] + unused
    features = raw.drop(columns=unused)

    # Sex: text -> 0/1
    features['Sex'] = features['Sex'].map(sex_mapping)

    # Family size = siblings/spouses + parents/children
    features['Family'] = features['SibSp'] + features['Parch']
    features = features.drop(columns=['SibSp', 'Parch'])

    # Embarked: missing values become 'Q', then text -> integer code
    features['Embarked'] = features['Embarked'].fillna('Q').map(embarked_mapping)

    # Age: fill missing values, then bin into 4 categories
    #   [..., 8) -> 0, [8, 20) -> 1, [20, 65) -> 2, [65, ...) -> 3
    # A single pd.cut replaces the step-by-step in-place overwrites, which
    # only worked because the bin codes happen to be smaller than every edge.
    age = features['Age'].fillna(age_fill)
    features['Age'] = pd.cut(age,
                             bins=[-np.inf, 8, 20, 65, np.inf],
                             right=False,
                             labels=False).astype('float64')
    return features


# Missing ages in BOTH frames are filled with the training-set mean, so the
# test set is transformed only with statistics learned from training data.
train_age_mean = train2['Age'].mean()
train = _prepare_titanic_features(train2, age_fill=train_age_mean)
test = _prepare_titanic_features(test_raw, age_fill=train_age_mean)

# Clear the default graph before the network is (re)built in this cell.
tf.reset_default_graph()

# Data split: 70% for training, 30% held out; labels come from the raw frame.
(x_data_train,
 x_data_test,
 t_data_train,
 t_data_test) = train_test_split(train, train2['Survived'],
                                 test_size=0.3, random_state=0)

# Min-max normalisation: the scaler learns its range from the training split
# only and is then applied to both splits.
scaler = MinMaxScaler()   # alternative: scaler = StandardScaler()
x_data_train_norm = scaler.fit(x_data_train).transform(x_data_train)
x_data_test_norm = scaler.transform(x_data_test)

# Drop the un-normalised frames so they cannot be used by mistake.
del x_data_train, x_data_test

# TensorFlow (1.x graph/session API): network definition

# One session is used for the one-hot conversion here and for all later
# training / inference calls.
sess = tf.Session()

# One-hot encode the label vectors with 2 classes; sess.run turns the
# resulting tensors into concrete arrays right away.
t_data_train_onehot = sess.run(tf.one_hot(t_data_train,depth=2))
t_data_test_onehot = sess.run(tf.one_hot(t_data_test,depth=2))

# Placeholder
#   X         : feature batch, 5 columns
#   T         : one-hot target batch, 2 columns
#   drop_rate : dropout rate, fed on every sess.run call
X = tf.placeholder(shape=[None,5], dtype=tf.float32)
T = tf.placeholder(shape=[None,2], dtype=tf.float32)
drop_rate = tf.placeholder(dtype=tf.float32)

# Weight & bias
# Hidden layer 1: 5 -> 4 units, ReLU, followed by dropout.
W2 = tf.get_variable('weight2', shape=[5,4],
                     initializer=tf.contrib.layers.variance_scaling_initializer())

b2 = tf.Variable(tf.random.normal([4]), name='bias2')
_layer2 = tf.nn.relu(tf.matmul(X,W2) + b2)
layer2 = tf.nn.dropout(_layer2, rate=drop_rate)

# Hidden layer 2: 4 -> 3 units, ReLU, followed by dropout.
W3 = tf.get_variable('weight3', shape=[4,3],
                     initializer=tf.contrib.layers.variance_scaling_initializer())

b3 = tf.Variable(tf.random.normal([3]), name='bias3')
_layer3 = tf.nn.relu(tf.matmul(layer2,W3) + b3)
layer3 = tf.nn.dropout(_layer3, rate=drop_rate)

# Output layer: 3 -> 2 units (one logit per class).
W4 = tf.get_variable('weight4', shape=[3,2],
                     initializer=tf.contrib.layers.variance_scaling_initializer())
b4 = tf.Variable(tf.random.normal([2]), name='bias4')

# Hypothesis
# `logit` holds the raw class scores; `H` is their softmax.
logit = tf.matmul(layer3,W4) + b4
H = tf.nn.softmax(logit)

# loss
# Mean softmax cross-entropy over the batch, computed from the raw logits.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logit, 
                                                                 labels=T))

# train
# NOTE: this rebinds the name `train`, which earlier in this cell referred to
# the feature DataFrame; from here on `train` is the optimizer step op.
train = tf.train.GradientDescentOptimizer(learning_rate=1e-2).minimize(loss)

# parameter
# Read as module-level settings by run_train.
num_of_epoch = 2000
batch_size = 100

# Training
def run_train(sess, train_x, train_t):
    """Initialise the graph variables and fit the network with mini-batches.

    Reads the module-level names `num_of_epoch`, `batch_size`, `train`,
    `loss`, `X`, `T` and `drop_rate`. Every sample is used in each epoch:
    when the number of rows is not a multiple of `batch_size`, the last
    (shorter) batch is trained on as well instead of being dropped.

    Args:
        sess: session whose `run` method executes the graph ops.
        train_x: training inputs; must expose `.shape[0]` and support slicing.
        train_t: training targets, sliced with the same ranges as `train_x`.

    Raises:
        ValueError: if `train_x` contains no rows.
    """
    num_samples = train_x.shape[0]
    if num_samples == 0:
        raise ValueError('run_train needs at least one training sample')

    print('### Starting Training ###')
    # Initialization
    sess.run(tf.global_variables_initializer())

    for step in range(num_of_epoch):
        # Walk through the data in strides of batch_size; the final slice is
        # simply shorter when num_samples is not an exact multiple.
        for start in range(0, num_samples, batch_size):
            stop = start + batch_size
            _, loss_val = sess.run([train, loss],
                                   feed_dict={X: train_x[start:stop],
                                              T: train_t[start:stop],
                                              drop_rate: 0.3})

        # Report the loss of the last batch of the epoch every 1000 epochs.
        if step % 1000 == 0:
            print('Loss : {}'.format(loss_val))
    print('### End Training ###')

    
# Accuracy
# Predicted class = index of the largest softmax output; it is compared with
# the index of the 1 in the one-hot target.
predict = tf.argmax(H, axis=1)
correct = tf.equal(predict, tf.argmax(T, axis=1))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# Testing
run_train(sess, x_data_train_norm, t_data_train_onehot)  # training
print('### Test Set으로 Accuracy 측정 ###')
# Dropout is switched off (rate 0) while predicting on the held-out split.
eval_feed = {X: x_data_test_norm, drop_rate: 0}
result = sess.run(predict, feed_dict=eval_feed)
print(classification_report(t_data_test, result.ravel()))


### Starting Training ###
Loss : 1.5366599559783936
Loss : 0.5853803753852844
### End Training ###
### Test Set으로 Accuracy 측정 ###
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       168
           1       0.76      0.68      0.72       100

    accuracy                           0.80       268
   macro avg       0.79      0.77      0.78       268
weighted avg       0.80      0.80      0.80       268



In [43]:
# Scale the test features with the scaler fitted earlier.
test_scaled = scaler.transform(test)

# Class probabilities with dropout disabled; the arg-max is the predicted label.
class_probs = sess.run(H, feed_dict={X: test_scaled, drop_rate: 0})
result = pd.Series(np.argmax(class_probs, axis=1), name='Survived')

print(result)

# Re-read the raw test file for its PassengerId column, attach the
# predictions (aligned on the row index) and keep only the two columns
# written to the submission file.
submission = (
    pd.read_csv('./data/titanic/test.csv')
    .assign(Survived=result)
    .loc[:, ['PassengerId', 'Survived']]
)

submission.to_csv('./submission/titanic_deep4.csv', index=False)

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64


In [14]:
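# Multinomial Classification
# Train on BMI data. BMI is an index that uses height and weight to classify
# a person as underweight, normal, overweight or obese.
# BMI = weight (kg) / (height (m))^2
#      18.5 or below => underweight
#      18.5 ~ 23     => normal
#      23 ~ 25       => overweight
#      25 and above  => obese
# Here we do not apply the formula; instead we have a dataset of surveyed BMI categories.
# We train on that data and then predict my own BMI category.
# Note: the provided data uses 3 classes, not the 4 listed above.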

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# NOTE: this cell rebinds `scaler`, `x_data_train_norm`, `x_data_test_norm`,
# `t_data_train` and `t_data_test`, names that the Titanic cells also use.
df = pd.read_csv('./data/bmi/bmi.csv', skiprows=3)

# display(df)

# Missing-value check
# df.isnull().sum()        # no missing values

# Outlier check
zscore = 1.8   # |z-score| threshold for the (commented-out) checks below

# Inspect outliers
# df.loc[np.abs(stats.zscore(df['height'])) >= zscore, :] # no outliers in height
# df.loc[np.abs(stats.zscore(df['weight'])) >= zscore, :] # no outliers in weight
# df.loc[np.abs(stats.zscore(df['label'])) >= zscore, :] # no outliers in label

# Data split
# Split into train and test parts with a 7:3 ratio.
# The train part is later used for k-fold cross validation.
feature_cols = ['height', 'weight']
x_data_train, x_data_test, t_data_train, t_data_test = \
train_test_split(df[feature_cols], df['label'], test_size=0.3, random_state=0)  # 14000 / 6000

# Normalization
scaler = MinMaxScaler()  # create the scaler object
scaler.fit(x_data_train) # fitting stores the per-column min/max in the scaler

x_data_train_norm = scaler.transform(x_data_train)
x_data_test_norm = scaler.transform(x_data_test)

del x_data_train       # delete the raw splits to avoid confusion
del x_data_test

# The sklearn implementation is very short: create the model and fit it.
model = LogisticRegression()
model.fit(x_data_train_norm, t_data_train)

# Measure the accuracy of the model.
# cross validation
kfold = 10
kfold_score = cross_val_score(model, x_data_train_norm, t_data_train, cv=kfold)
print('### cross validation ###')
print('score : {}'.format(kfold_score))
print('평균: {}'.format(kfold_score.mean()))

# Final model evaluation
predict_val = model.predict(x_data_test_norm)  # predictions for the test split
# accuracy_score expects (y_true, y_pred); the value is the same either way,
# but the documented order keeps the call correct if the metric is swapped.
acc = accuracy_score(t_data_test, predict_val)

print('우리 Model의 최종 Accuracy : {}'.format(acc))

# Predict

height = 188
weight = 78
my_state = [[height, weight]]
# The scaler was fitted on a DataFrame with named columns, so the single
# sample is wrapped in a frame with the same column names before scaling.
my_state_df = pd.DataFrame(my_state, columns=feature_cols)
my_state_val = model.predict(scaler.transform(my_state_df))
print(my_state_val)

### cross validation ###
score : [0.98       0.98642857 0.985      0.97642857 0.98642857 0.98428571
 0.98714286 0.97714286 0.97714286 0.98642857]
평균: 0.9826428571428572
우리 Model의 최종 Accuracy : 0.9845
[1]
