In [5]:
# -*- coding:utf-8 -*-

import pickle
with open('product_tokens.txt', 'rb') as f:
    product_tokens = pickle.load(f)

In [7]:
product_tokens[0]

['265', 'mm', '나이키', '인터내셔널', '리스트', '운동화', 'n', '26', '1', 'p']

In [24]:
# -*- coding:utf-8 -*-

import pickle
with open('product_names.txt', 'rb') as f:
    product_names = pickle.load(f)

product_names[0]

['▁265', 'mm', '▁나이키', '▁인터내셔널', '▁리스트', '▁운동화', '▁n', '26', '1', 'p']

In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from itertools import chain
from nltk import word_tokenize

Using TensorFlow backend.


In [8]:
def labeling(label, tokens):
    return [(word, label) for word in tokens]

def branding(brand, tokens):
    if brand in tokens:
        br_idx = tokens.index(brand)
        return labeling('0', tokens[:br_idx]) + labeling('BRAND', [tokens[br_idx]]) + labeling('0', tokens[br_idx+1:])
    else:
        return labeling('0', tokens)

def one_hot_encoding(x, y):
    y = np.array(y)
    max_sentence_len = max(map(len, x))
    all_tags = set(chain(*y))
    NUM_TAGS = len(all_tags)
    TAGS_MAP = {"0": 0, "BRAND": 1, "NIL": 2}
    y = list(map(lambda x: [TAGS_MAP[t] for t in x], y))
    y = pad_sequences(y, max_sentence_len, padding='pre')
    y = np.array([to_categorical(t, NUM_TAGS) for t in y])
    return x, y

In [11]:
df = pd.read_csv('df.csv')

In [12]:
df.head()

Unnamed: 0,id,name,brand,preprocessed,tokens
0,99566976,265mm 나이키 인터내셔널 리스트 운동화 / N261P,나이키,265mm 나이키 인터내셔널 리스트 운동화 n261p,"['265', 'mm', '나이키', '인터내셔널', '리스트', '운동화', 'n..."
1,100268480,☀️나이키 후드트레이닝세트☀️,나이키,나이키 후드트레이닝세트,"['나이키', '후드', '트레이닝세트']"
2,888258,나이키 여성운동화 240,나이키,나이키 여성운동화 240,"['나이키', '여성운동화', '240']"
3,99987072,나이키 셀렉트 팬츠 구매합니다,나이키,나이키 셀렉트 팬츠 구매합니다,"['나이키', '', '셀렉트', '팬츠', '구매합니다']"
4,745734,급처나이키 크로스백팩,나이키,급처나이키 크로스백팩,"['급처', '나이키', '크로스백', '팩']"


In [25]:
df['branding'] = [branding(brand, tokens) for brand, tokens in zip(df.brand, product_names)]

In [26]:
raw_w, raw_t, raw_data = [], [], []
for i in df['branding']:
    for word, tag in i:
        raw_w.append(word)
        raw_t.append(tag)
    raw_data.append((tuple(raw_w), tuple(raw_t)))
    raw_w, raw_t = [], []

In [27]:
raw_data[0]

(('▁265', 'mm', '▁나이키', '▁인터내셔널', '▁리스트', '▁운동화', '▁n', '26', '1', 'p'),
 ('0', '0', '0', '0', '0', '0', '0', '0', '0', '0'))

In [22]:
import sentencepiece as sp

model_file = 'name_unigram_20000.model'
sp_processor = sp.SentencePieceProcessor(model_file)


In [46]:
all_x, all_y = [], []
max_sentence_len = 100
for words, tags in raw_data:
    encoded_words, encoded_tags = [], []
    for w, t in zip(words, tags):
        if sp_processor.encode(w):
            encoded_words.append(sp_processor.encode(w)[0])
            encoded_tags.append(t)
        else:
            encoded_words.append(np.ones(20000))
            encoded_tags.append(t)
    
    nil_x = np.zeros(20000)
    nil_y = 'NIL'
    pad_length = max_sentence_len - len(encoded_words)
    all_x.append((pad_length*[nil_x]) + encoded_words)
    all_y.append((pad_length*[nil_y]) + encoded_tags)
    

In [55]:
all_y[0]

['NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 'NIL',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']

In [None]:
all_x, all_y = [], []
max_sentence_len = 36
for words, tags in raw_data:
    encoded_words, encoded_tags = [], []
    for w, t in zip(words, tags):
        if w.lower() in word_to_idx:
            encoded_words.append(word_to_idx[w.lower()])
            encoded_tags.append(t)
        else:
            encoded_words.append(np.ones(50))
            encoded_tags.append(t)

    nil_x = np.zeros(50)
    nil_y = 'NIL'
    pad_length = max_sentence_len - len(encoded_words)
    all_x.append((pad_length * [nil_x]) + encoded_words)
    all_y.append((pad_length * [nil_y]) + encoded_tags)
all_x, all_y = one_hot_encoding(all_x, all_y)
all_x, all_y = np.array(all_x), np.array(all_y)