# 딥러닝 기반 상품 카테고리 자동 분류 서버 예제

### 파일에서 학습 데이터를 읽는다.

In [23]:
import os
import sys
import json
import gensim
import requests

import numpy
from numpy import array, argmax
from scipy import sparse

from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

from konlpy.tag import Kkma

import keras
import keras.preprocessing.text
from keras.utils import to_categorical

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [24]:
# Load the labelled training set: one JSON object per line carrying the
# product id ('pid'), product name ('name') and category name ('cate').
x_text_list = []  # (pid, name) pairs
y_text_list = []  # category names, aligned index-for-index with x_text_list
# Kept as a module-level name because the evaluation-data cell reuses it.
enc = sys.getdefaultencoding()
with open("refined_category_dataset.dat", encoding=enc) as fin:
    # Stream the file line by line instead of materialising it with
    # readlines(), and skip blank lines (e.g. an extra empty line at the end
    # of the file), which would otherwise make json.loads raise.
    for line in fin:
        line = line.strip()
        if not line:
            continue
        info = json.loads(line)
        x_text_list.append((info['pid'], info['name']))
        y_text_list.append(info['cate'])

In [25]:
#joblib.dump(y_name_id_dict,"y_name_id_dict.dat")

### text 형식으로 되어 있는 카테고리 명을 숫자 id 형태로 변환한다.

In [26]:
# Load the previously saved mapping from category name to integer id.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load "y_name_id_dict.dat" from a trusted source.
y_name_id_dict = joblib.load("y_name_id_dict.dat")

In [27]:
# Show the loaded category-name -> id mapping for a quick sanity check.
print(y_name_id_dict)

{'가구/인테리어': 11, '반려동물': 4, '도서/문구': 8, '의류': 2, '여행/e쿠폰': 15, '식품': 6, '건강': 7, '뷰티': 0, '디지털': 10, '출산/육아': 16, '스포츠/레저': 9, '잡화': 14, '컴퓨터': 3, '자동차/공구': 1, '생필품/주방': 13, '가전': 12, '취미': 5}


In [28]:
# The name -> id mapping used to be derived here from the distinct labels
# (zip of the label set with range(len(label set))); it is loaded from disk
# instead, so only the lookup remains.
# Translate every category name into its integer id (KeyError on an unknown
# category name).
y_list = list(map(y_name_id_dict.__getitem__, y_text_list))

### train test 분리하는 방법 

In [29]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_text_list,
    y_list,
    test_size=0.2,
    random_state=42,
)

## 딥러닝 기반 text 분류에 필요한 모듈 로드

#### 모델 파일을 이미 만들어 두었다면 아래와 같이 로드할 수 있다.

In [None]:
# Reload the Word2Vec model that was trained and saved earlier.
word2vec = Word2Vec.load('word_models.model')
# Precompute normalised vectors; replace=True asks gensim to keep only the
# normalised copies (see the gensim documentation for init_sims).
word2vec.init_sims(replace=True)

# Alternative (disabled): use externally trained vectors instead.
# model = KeyedVectors.load('ko.bin')
# word2vec = model
# word2vec.init_sims(replace=True)

## Train, Test 데이터 새로 만들기

In [None]:
kkma = Kkma()

# Punctuation removed from product names before sentence detection.
_STRIP_CHARS = str.maketrans('', '', '/()#[]')


def _first_kkma_sentence(name):
    """Return the first sentence Kkma detects in the cleaned product name.

    The characters ``/ ( ) # [ ]`` are deleted first.  If Kkma detects no
    sentence at all (e.g. the name is empty after cleaning), the cleaned
    name itself is returned instead of raising IndexError.
    """
    cleaned = name.translate(_STRIP_CHARS)
    sentences = kkma.sentences(cleaned)
    return sentences[0] if sentences else cleaned


# Train and test names go through the SAME preprocessing.  Previously only
# the test (and evaluation) names had the punctuation stripped, so the model
# was trained on differently prepared text than it was validated on.
kkma_x_train = [_first_kkma_sentence(item[1]) for item in x_train]
kkma_x_test = [_first_kkma_sentence(item[1]) for item in x_test]

### text 데이터를 word-id 형태로 변환한다.

In [None]:
# Build the word -> id vocabulary from the training texts only; the listed
# punctuation and whitespace characters are filtered out by the tokenizer.
sequence_tokenizer = keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
sequence_tokenizer.fit_on_texts(kkma_x_train)

# Number of distinct words seen while fitting.
max_features = len(sequence_tokenizer.word_index)


def texts_to_sequences2(d_list, tokenizer, maxlen=300):
    """Convert texts to fixed-length word-id sequences.

    Prints the mean, standard deviation, median and maximum of the
    sequence lengths before padding, then pads/truncates every sequence at
    its end to ``maxlen``.

    Args:
        d_list: texts, in any form ``tokenizer.texts_to_sequences`` accepts.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
        maxlen: length every returned sequence is padded or truncated to.

    Returns:
        The value returned by ``keras.preprocessing.sequence.pad_sequences``
        with ``padding='post'`` and ``truncating='post'``.
    """
    seq = tokenizer.texts_to_sequences(d_list)

    # Compute the lengths once instead of once per statistic.
    lengths = [len(s) for s in seq]
    # numpy.max raises ValueError on an empty list, so the statistics are
    # only reported when there is at least one sequence.
    if lengths:
        print('mean:', numpy.mean(lengths))
        print('std:', numpy.std(lengths))
        print('median:', numpy.median(lengths))
        print('max:', numpy.max(lengths))

    seq = keras.preprocessing.sequence.pad_sequences(
        seq, maxlen=maxlen, padding='post', truncating='post')
    return seq

In [None]:
# Encode both splits with the tokenizer that was fitted on the training
# texts, using the default maxlen of texts_to_sequences2.
train = texts_to_sequences2(kkma_x_train, tokenizer=sequence_tokenizer)
test = texts_to_sequences2(kkma_x_test, tokenizer=sequence_tokenizer)

#### word embedding 형태의 weight를 초기화한다.

In [None]:
# Length of each padded sequence (second axis of the training matrix).
input_dim = train.shape[1]

# Network input: one row of int32 word ids per sample.
input_tensor = keras.layers.Input(dtype='int32', shape=(input_dim,))

In [None]:
# Dimensionality of the word vectors.  It has to match the loaded word2vec
# model, otherwise the row assignment below fails (a 200-dimensional model
# was used as an alternative at some point).
word_vec_dim = 100

# Number of vocabulary words that have no pre-trained vector.
not_ct = 0

# One row per word id, plus one extra row so that the largest id in
# word_index is a valid row; rows never assigned below stay all-zero.
weights = numpy.zeros((max_features + 1, word_vec_dim))
for word, index in sequence_tokenizer.word_index.items():
    if word in word2vec.wv.vocab:
        # Look the vector up through `.wv`, consistently with the membership
        # test above; indexing the model object directly is deprecated.
        weights[index, :] = word2vec.wv[word]
    else:
        # Unknown word: start from a small random vector.
        not_ct += 1
        weights[index, :] = numpy.random.uniform(-0.25, 0.25, word_vec_dim)

print(not_ct)

####  학습할 레이어를 구성한다.

In [None]:
# Embedding layer initialised with the prepared weight matrix; the weights
# remain trainable so they are updated during fitting.
embedding_layer = keras.layers.Embedding(
    input_dim=max_features + 1,
    output_dim=word_vec_dim,
    input_length=input_dim,
    weights=[weights],
    trainable=True,
)
embedded = embedding_layer(input_tensor)

In [None]:
# One convolutional branch per kernel width; each branch yields a flat
# feature vector that is collected in `tensors`.
# The layers use the Keras 2 argument names (filters / kernel_size /
# pool_size) instead of the Keras 1 names (nb_filter / filter_length /
# pool_length), matching the Keras 2 API used elsewhere in this notebook.
tensors = []
for filter_length in [3, 5]:
    tensor = keras.layers.Conv1D(filters=50, kernel_size=filter_length)(embedded)
    tensor = keras.layers.Dropout(0.5)(tensor)
    tensor = keras.layers.Activation('elu')(tensor)
    # With the convolution's default padding and stride its output has
    # input_dim - filter_length + 1 steps, so this pool spans all of them.
    tensor = keras.layers.MaxPooling1D(pool_size=input_dim - filter_length + 1)(tensor)
    tensor = keras.layers.Flatten()(tensor)
    tensors.append(tensor)

In [None]:
# Join the per-kernel-width feature vectors along the feature axis.
# keras.layers.concatenate is the Keras 2 replacement for the Keras 1
# functional `merge(..., mode='concat', concat_axis=1)`.
output_tensor = keras.layers.concatenate(tensors, axis=1)
output_tensor = keras.layers.Dropout(0.5)(output_tensor)
# Softmax output with one unit per distinct category id found in y_list.
output_tensor = keras.layers.Dense(len(set(y_list)), activation='softmax')(output_tensor)

cnn = keras.models.Model(input_tensor, output_tensor)
cnn.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would add a stray "None" line).
cnn.summary()

In [None]:
# Train for 20 epochs and validate on the held-out split after each epoch.
# (Keras 1 spelled the `epochs` argument `nb_epoch`.)
#
# The one-hot width is pinned to the model's output size.  Without
# num_classes, to_categorical infers the width from the largest id present,
# so the train and test label matrices could end up with different widths
# whenever the highest category id is missing from one of the splits.
num_classes = cnn.output_shape[-1]
cnn.fit(train, to_categorical(y_train, num_classes=num_classes),
        batch_size=30, epochs=20,
        validation_data=(test, to_categorical(y_test, num_classes=num_classes)))

## 제출

In [None]:
# Which unlabelled data set to score: 'test' reads soma8_test_data.dat,
# any other value (e.g. 'eval') reads soma8_eval_data.dat.
mode = 'test'
# mode = 'eval'

eval_path = "soma8_test_data.dat" if mode == 'test' else "soma8_eval_data.dat"

# (pid, name) pairs, in file order.  A single loader replaces the two
# copy-pasted branches; the file is streamed line by line and blank lines
# are skipped so that json.loads is never handed an empty string.
eval_x_text_list = []
with open(eval_path, encoding=enc) as fin:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        info = json.loads(line)
        eval_x_text_list.append((info['pid'], info['name']))

# Delete the characters / ( ) [ ] # in one pass, then keep the first sentence
# Kkma detects.  If Kkma detects none, fall back to the cleaned name so that
# the output stays aligned one-to-one with eval_x_text_list instead of
# raising IndexError.
_eval_strip_chars = str.maketrans('', '', '/()[]#')
kkma_eval_x_text_list = []
for _eval_item in eval_x_text_list:
    _eval_cleaned = _eval_item[1].translate(_eval_strip_chars)
    _eval_sentences = kkma.sentences(_eval_cleaned)
    kkma_eval_x_text_list.append(
        _eval_sentences[0] if _eval_sentences else _eval_cleaned)

In [None]:
# Encode the preprocessed evaluation texts with the tokenizer fitted on the
# training data (earlier experiments fed the raw product names instead).
eval_x_list = texts_to_sequences2(kkma_eval_x_text_list, tokenizer=sequence_tokenizer)

In [None]:
# Score every evaluation sample with the trained CNN (an earlier version
# used a separate classifier, `clf`, on vectorised raw names).
pred = cnn.predict(eval_x_list)

# Keep, for each sample, the index of its highest-scoring output unit.
pred_list = list(map(argmax, pred))

In [None]:
# Submitter identity sent along with the predictions.
name = '하지윤'
nickname = 'punk_zzang3'

# Form payload: predicted ids as one comma-separated string, plus the
# submitter identity and the data-set mode that was scored.
param = {
    'pred_list': ",".join(str(int(i)) for i in pred_list),
    'name': name,
    'nickname': nickname,
    'mode': mode,
}

# Without a timeout, requests waits indefinitely on an unresponsive server.
d = requests.post('http://eval.buzzni.net:20001/eval', data=param, timeout=30)
# Surface HTTP errors explicitly instead of failing later inside .json().
d.raise_for_status()
print(d.json())