In [249]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dropout

In [2]:
#데이터 불러오기 및 크기 확인
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

print("훈련용 데이터 크기:", train_data.shape)
print("테스트용 데이터 크기:", test_data.shape)

훈련용 데이터 크기: (17480, 16)
테스트용 데이터 크기: (15081, 15)


In [3]:
#결측치 제거 함수

def check_missing_col(dataframe):
    missing_col = []
    for col in dataframe.columns:
        missing_values = sum(dataframe[col].isna())
        is_missing = True if missing_values >= 1 else False
        if is_missing:
            print(f'결측치가 있는 컬럼은: {col} 입니다')
            print(f'해당 컬럼에 총 {missing_values} 개의 결측치가 존재합니다.')
            missing_col.append([col, dataframe[col].dtype])
    if missing_col == []:
        print('결측치가 존재하지 않습니다')
    return missing_col


missing_col = check_missing_col(train_data)
print(missing_col)

train_data.dropna(axis=0,subset=['workclass','occupation','native.country'],inplace = True)

결측치가 있는 컬럼은: workclass 입니다
해당 컬럼에 총 1836 개의 결측치가 존재합니다.
결측치가 있는 컬럼은: occupation 입니다
해당 컬럼에 총 1843 개의 결측치가 존재합니다.
결측치가 있는 컬럼은: native.country 입니다
해당 컬럼에 총 583 개의 결측치가 존재합니다.
[['workclass', dtype('O')], ['occupation', dtype('O')], ['native.country', dtype('O')]]


In [4]:

x_train = train_data.drop(['id', 'target'], axis=1)
y_train = train_data['target']

test_data=test_data.drop(['id'], axis=1)


print("훈련용 데이터:", x_train.shape)
print("테스트용 라벨:", y_train.shape)

훈련용 데이터: (15081, 14)
테스트용 라벨: (15081,)


In [5]:
#라벨인코딩을 하기 위함 dictionary map 생성 함수
def make_label_map(dataframe):
    label_maps = {}
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            label_map = {'unknown':0}
            for i, key in enumerate(dataframe[col].unique()):
                label_map[key] = i  #새로 등장하는 유니크 값들에 대해 1부터 1씩 증가시켜 키값을 부여해줍니다.
            label_maps[col] = label_map
    return label_maps

# 각 범주형 변수에 인코딩 값을 부여하는 함수
def label_encoder(dataframe, label_map):
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            dataframe[col] = dataframe[col].map(label_map[col])
            #dataframe[col] = dataframe[col].fillna(label_map[col]['unknown']) #혹시 모를 결측값은 unknown의 값(0)으로 채워줍니다.
    return dataframe

x_train = label_encoder(x_train, make_label_map(x_train))

#제출용 라벨링 데이터 변환
test_data = label_encoder(test_data, make_label_map(test_data))

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train,Y_test = train_test_split(x_train, y_train, random_state=22)

In [8]:
mean_vals = np.mean(X_train, axis=0)
std_val = np.std(X_train)

X_train_centered = (X_train  - mean_vals) / std_val

mean_vals = np.mean(X_test, axis=0)
std_val = np.std(X_test)

X_test_centered = (X_test- mean_vals) / std_val

In [10]:
#원핫 인코딩 생성
y_train_onehot = tf.keras.utils.to_categorical(Y_train)
print(y_train_onehot[:3])

[[1. 0.]
 [1. 0.]
 [0. 1.]]


In [268]:
model = tf.keras.models.Sequential()

model.add(
    tf.keras.layers.Dense(
    units=50,
    input_dim=X_train_centered.shape[1],
    kernel_initializer='glorot_uniform',
    bias_initializer='zeros',
    activation='relu'
    )
)

model.add(
    tf.keras.layers.Dense(
    units=30,
    input_dim=50,
    kernel_initializer='glorot_uniform',
    bias_initializer='zeros',
    activation='tanh'
    )
)
model.add(Dropout(0.3))


"""

model.add(
    tf.keras.layers.Dense(
    units=100,
    input_dim=150,
    kernel_initializer='glorot_uniform',
    bias_initializer='zeros',
    activation='tanh'
    )
)

model.add(
    tf.keras.layers.Dense(
    units=50,
    input_dim=100,
    kernel_initializer='glorot_uniform',
    bias_initializer='zeros',
    activation='tanh'
    )
)

model.add(
    tf.keras.layers.Dense(
    units=30,
    input_dim=50,
    kernel_initializer='glorot_uniform',
    bias_initializer='zeros',
    activation='tanh'
    )
)

model.add(
    tf.keras.layers.Dense(
    units=20,
    input_dim=30,
    kernel_initializer='glorot_uniform',
    bias_initializer='zeros',
    activation='tanh'
    )
)
"""

model.add(
    tf.keras.layers.Dense(
    units=y_train_onehot.shape[1],
    input_dim=30,
    kernel_initializer='glorot_uniform',
    bias_initializer='zeros',
    activation='softmax'
    )
)


In [263]:
model.summary()

Model: "sequential_32"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_158 (Dense)            (None, 50)                750       
_________________________________________________________________
dense_159 (Dense)            (None, 50)                2550      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_160 (Dense)            (None, 2)                 102       
Total params: 3,402
Trainable params: 3,402
Non-trainable params: 0
_________________________________________________________________


In [238]:
sgd_optimizer = tf.keras.optimizers.SGD(lr=0.001, decay=1e-7, momentum=.9)



In [269]:
#model.compile(optimizer=sgd_optimizer,metrics=['accuracy'], loss='categorical_crossentropy')
model.compile(optimizer='adam',metrics=['accuracy'], loss="categorical_crossentropy")


In [270]:
from keras.callbacks import EarlyStopping

#early_stopping = EarlyStopping()
history = model.fit(X_train_centered, y_train_onehot, batch_size=32, epochs=10, verbose=1, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [271]:
y_train_pred= model.predict_classes(X_train_centered, verbose=0)
correct_preds = np.sum(Y_train == y_train_pred, axis=0)

train_acc = correct_preds / Y_train.shape[0]

y_test_pred= model.predict_classes(X_test_centered, verbose=0)
correct_preds = np.sum(Y_test == y_test_pred, axis=0)

test_acc = correct_preds / Y_test.shape[0]

print("train 최종 결과", train_acc)
print("test 최종 결과", test_acc)



train 최종 결과 0.8511936339522547
test 최종 결과 0.8350570140546274




In [193]:
#훈련용 데이터 전부 사용
mean_vals = np.mean(x_train, axis=0)
std_val = np.std(x_train)

result_centered = (x_train  - mean_vals) / std_val

result_onehot = tf.keras.utils.to_categorical(y_train)



In [242]:
history = model.fit(result_centered, result_onehot, batch_size=48, epochs=20, verbose=1, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [151]:
#초기 신경망 데이터 제출용

mean_vals = np.mean(test_data, axis=0)
std_val = np.std(test_data)

test_centered = (test_data  - mean_vals) / std_val
test_pred= model.predict_classes(test_centered, verbose=0)




In [152]:
result = pd.DataFrame({'target': test_pred})
result.to_csv('day3_result.csv')

In [75]:
#첫 신경망 결과 0.85

print(train_acc)

0.8502210433244916
