In [4]:
import os 
import json
import pandas as pd 
import numpy as np
from konlpy.tag import Okt
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import time

In [5]:
def categoryToTarget(text):
    """Translate a listing category label into its integer class id.

    The five known labels map to 1..5 in the order listed below; every other
    value (unknown label, empty string, non-string) maps to 100.
    """
    # Ordered (label, id) pairs. Matching uses == rather than a dict lookup so
    # that unhashable inputs still end up at the default instead of raising.
    known_labels = (
        ("판매", 1),
        ("교환", 2),
        ("구입", 3),
        ("거래완료", 4),
        ("그냥드림", 5),
    )
    for label, target in known_labels:
        if text == label:
            return target
    return 100


In [6]:
# Load the raw documents
DATA_IN_PATH = './data/clienSample/'
# os.listdir returns entries in arbitrary order; sort so the row order of
# `data` is the same on every run.
fileList = sorted(os.listdir(DATA_IN_PATH))

# Collect one plain dict per document and build the DataFrame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration, and DataFrame.append is not available in pandas 2.x.
records = []

for fileName in fileList:
    if fileName == ".DS_Store":
        # macOS Finder metadata, not a data file
        continue
    print(fileName)
    # Be explicit about the encoding so the platform default is never used
    # for the Korean text in these JSON files.
    with open(os.path.join(DATA_IN_PATH, fileName), 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    for document in json_data:
        records.append({
            "title": document['title'],
            "article": document['article'],
            "category": document['category'],
            "category_to_target": categoryToTarget(document['category']),
            "collectDate": document['collectDate'],
            "registerDate": document['registerDate'],
        })

# Passing `columns` keeps the schema stable even when no document was loaded.
# Rows get a unique 0..n-1 index.
data = pd.DataFrame(
    records,
    columns=[
        "title",
        "article",
        "category",
        "category_to_target",
        "collectDate",
        "registerDate",
    ],
)


20210701_clien
20210702_clien
20210703_clien
20210704_clien
20210705_clien


In [7]:
# Morphological analyser instance used by the cells that follow.
okt = Okt()

# Tokenise the first loaded article; the result is kept for manual inspection.
article_text = okt.morphs(data.iloc[0]['article'])
# print(article_text)

# Tokens to drop when stop-word removal is requested.
# NOTE(review): preprocessing() calls okt.morphs(..., stem=True); if stemming
# yields dictionary forms such as '하다' or '있다', bare stems like '하' and
# '있' in this list would never match — confirm against real output.
stopWords = [
    '은', '는', '이', '가', '하', '아', '것', '들',
    '의', '있', '되', '수', '보', '주', '등', '한',
]

In [8]:
def preprocessing(text, okt, remove_stopwords=False, stop_words=()):
    """Tokenise one document into a list of stemmed morphemes.

    Args:
        text: Raw document text. Any value that is not a ``str`` (for example
            ``None`` or NaN coming from a missing field) is treated as an
            empty document.
        okt: Analyser object exposing ``morphs(text, stem=True)``.
        remove_stopwords: When true, tokens contained in ``stop_words`` are
            removed from the result.
        stop_words: Iterable of tokens to remove; only consulted when
            ``remove_stopwords`` is true.

    Returns:
        list: The tokens produced by ``okt.morphs``, optionally filtered.
        An empty list for non-string input.
    """
    if not isinstance(text, str):
        return []

    # Turn line breaks into spaces instead of deleting them, so the last word
    # of one line is not glued to the first word of the next line.
    text = text.replace("\n", " ")
    wordText = okt.morphs(text, stem=True)

    if remove_stopwords:
        # Build the lookup set once instead of scanning a list for every token.
        blocked = set(stop_words)
        wordText = [token for token in wordText if token not in blocked]

    return wordText

In [9]:
# def nTextToWord(rawData,resultArray):
    
#     for article in rawData:
#         if type(article) == str:
#             resultArray.append(preprocessing(article, okt, remove_stopwords=True, stop_words=stopWords))
#         else:
#             resultArray.append([])

In [10]:
def rawdataToRefinedata(rawdata):
    """Split a document frame into tokenised articles and their labels.

    Each row's 'article' value is run through ``preprocessing`` (with the
    notebook-level ``okt`` analyser and ``stopWords`` list, stop-word removal
    enabled); each row's 'category_to_target' value is collected as the label.

    Returns:
        tuple: ``(token_lists, labels)`` — two lists in row order.
    """
    rows = [row for _, row in rawdata.iterrows()]

    token_lists = [
        preprocessing(row['article'], okt, remove_stopwords=True, stop_words=stopWords)
        for row in rows
    ]
    labels = [row['category_to_target'] for row in rows]

    return token_lists, labels
        

In [11]:
# Run the preprocessing over every loaded document and report the wall-clock
# time it took.
startTime = time.time()
# Not used by the active code in this cell; only the commented-out
# nTextToWord calls at the bottom refer to them.
cleanTrainArticle = []
cleanTrainTitle = []
print('시작')
# xData: one token list per document, yData: the matching integer labels.
xData, yData = rawdataToRefinedata(data)
print('완료 : ', time.time()- startTime)
# nTextToWord(data['article'],cleanTrainArticle)
# nTextToWord(data['title'],cleanTrainTitle)

시작
완료 :  10.246124505996704


In [12]:
# Sanity check: show which container type holds the tokenised documents.
print(type(xData))
# print(xData[:1])
# print(yData[:100])


<class 'list'>


In [13]:
# Hold out 30% of the documents for evaluation. A fixed random_state makes the
# split identical on every Restart & Run All.
xTrain, xTest, yTrain, yTest = train_test_split(
    xData, yData, test_size=0.3, random_state=42
)

print(len(xTrain), len(xTest))
print(len(yTrain), len(yTest))

# print(xTrain[:3])
# First few labels of each split: train first, then test.
print(yTrain[:3])
print(yTest[:3])

826 355
826 355
[1, 1, 1]
[1, 1, 1]


In [14]:
# Build the vocabulary from the training documents only, so nothing from the
# held-out set influences the word index. Test tokens that never occur in the
# training set are skipped by texts_to_sequences; pass oov_token=... to
# Tokenizer if a placeholder index is preferred.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(xTrain)
trainSequences = tokenizer.texts_to_sequences(xTrain)
testSequences = tokenizer.texts_to_sequences(xTest)

wordVocab = tokenizer.word_index

# Every document is padded or cut to this many token ids.
MAX_SEQUENCE_LENGTH = 100

# padding='post' appends the fill value after the real tokens.
# NOTE(review): `truncating` is left at its default, which per the Keras docs
# is 'pre' (over-long documents lose their leading tokens) — confirm that is
# intended alongside post-padding.
trainInputs = pad_sequences(trainSequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
trainLabels = np.array(yTrain)
testInputs = pad_sequences(testSequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
testLabels = np.array(yTest)


In [15]:
# Output location and file names for the prepared arrays and vocabulary.
DATA_IN_PATH = './data_in/'
TRAIN_INPUT_DATA = 'train_input.npy'
TRAIN_LABEL_DATA = 'train_label.npy'
TEST_INPUT_DATA = 'test_input.npy'
TEST_LABEL_DATA = 'test_label.npy'
DATA_CONFIGS = 'data_configs.json'

data_configs = {}

data_configs['vocab'] = wordVocab
# One more than the number of known words, leaving room for index 0.
data_configs['vocab_size'] = len(wordVocab) + 1

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(DATA_IN_PATH, exist_ok=True)

# Hand np.save a path instead of an open file object so numpy opens and closes
# the file itself; the names already end in .npy, so nothing is appended.
np.save(os.path.join(DATA_IN_PATH, TRAIN_INPUT_DATA), trainInputs)
np.save(os.path.join(DATA_IN_PATH, TRAIN_LABEL_DATA), trainLabels)
np.save(os.path.join(DATA_IN_PATH, TEST_INPUT_DATA), testInputs)
np.save(os.path.join(DATA_IN_PATH, TEST_LABEL_DATA), testLabels)

# ensure_ascii=False writes the Korean vocabulary as-is, so the file must be
# opened as UTF-8 explicitly rather than with the platform default encoding.
with open(os.path.join(DATA_IN_PATH, DATA_CONFIGS), 'w', encoding='utf-8') as f:
    json.dump(data_configs, f, ensure_ascii=False)

