In [2]:
import ndjson
df = None
with open('./data/product_brand_ner_data.ndjson', 'r', encoding='utf8') as f:
    df = ndjson.load(f)

In [3]:
df[0]

{'brands': ['나이키'], 'spans': [[6, 9]], 'text': '(새상품) 나이키 농구화 척포짓 240~300'}

In [4]:
brands = []
texts = []
spans = []
for i in df:
    brands.append(i['brands'])
    texts.append(i['text'])
    spans.append(i['spans'])

In [5]:
print(len(brands) == len(spans) == len(texts))

True


In [6]:
import pandas as pd
data = pd.DataFrame.from_dict({
    'brand': brands,
    'text': texts,
    'span': spans
})

In [7]:
data.sample(n=20)

Unnamed: 0,brand,text,span
390609,[노스페이스],95 노스페이스 플러피 리모핏 후리스 집업,"[[3, 8]]"
343758,[아디다스],아디다스 키즈 운동화 샌들 아사히 포켓몬 실내화 140 150,"[[0, 4]]"
16297,[나이키],나이키 여성 에어로로프트조끼,"[[0, 3]]"
893105,[오프화이트],오프화이트 반팔티 오버핏 l사이즈 100,"[[0, 5]]"
1665832,[디스커버리],디스커버리 패딩 95size 베이지색,"[[0, 5]]"
1124172,[폴로],[usxl] 폴로 랄프로렌 면 100% 케이블 니트 남녀공용,"[[7, 9]]"
1225402,[balenciaga],벨렌시아가 귀걸이 balenciaga st,"[[10, 20]]"
682879,[gucci],정 19ss gucci 볼캡 m,"[[7, 12]]"
1215866,[발렌시아가],[xs] 발렌시아가 크라운로고 후드티 팝니다,"[[5, 10]]"
76373,[나이키],나이키 p6000 택달린 새제품 판매 (240),"[[0, 3]]"


In [8]:
# def labeling(label, text):
#     return [(char, label) for char in text]

def branding(text, spans):
    length = len(text)
    labels = [0] * length
    if len(spans) > 0:
        for span in spans:
            b = span[0]
            e = span[1]
            labels[b:e] = [1] * (e-b)
    return [(text[idx], '1') if value == 1 else (text[idx], '0') for idx, value in enumerate(labels)]
        

In [9]:
branding('cos 코스 레더패치 울 크루넥 스웨터 / 라운드 니트', [[4, 6], [0, 3]])

[('c', '1'),
 ('o', '1'),
 ('s', '1'),
 (' ', '0'),
 ('코', '1'),
 ('스', '1'),
 (' ', '0'),
 ('레', '0'),
 ('더', '0'),
 ('패', '0'),
 ('치', '0'),
 (' ', '0'),
 ('울', '0'),
 (' ', '0'),
 ('크', '0'),
 ('루', '0'),
 ('넥', '0'),
 (' ', '0'),
 ('스', '0'),
 ('웨', '0'),
 ('터', '0'),
 (' ', '0'),
 ('/', '0'),
 (' ', '0'),
 ('라', '0'),
 ('운', '0'),
 ('드', '0'),
 (' ', '0'),
 ('니', '0'),
 ('트', '0')]

In [10]:
def generate_templates(begin=-2, end=2, min_range_length=3, max_range_length=5):
    templates = []
    for b in range(begin, end):
        for e in range(b, end+1):
            length = (e - b + 1)
            if length > max_range_length or length < min_range_length:
                continue
            if b * e > 0:
                continue
            templates.append((b, e))
    return templates

In [11]:
templates = generate_templates()
print(templates)

[(-2, 0), (-2, 1), (-2, 2), (-1, 1), (-1, 2), (0, 2)]


In [12]:
dict(zip([1,2,3], [2,3,4]))

{1: 2, 2: 3, 3: 4}

In [13]:
class CharacterFeatureTransformer:
    def __init__(self, begin=-2, end=2, min_range_length=3, max_range_length=5):
        self.templates = self._generate_templates(begin, end, min_range_length, max_range_length)
    
    def _generate_templates(self, begin=-2, end=2, min_range_length=3, max_range_length=5):
        templates = []
        for b in range(begin, end):
            for e in range(b, end+1):
                length = (e - b + 1)
                if length > max_range_length or length < min_range_length:
                    continue
                if b * e > 0:
                    continue
                templates.append((b, e))
        return templates        
    
    def get_features(self, text, i, tags=None):
        features = {
            'char_position': i,
            'char': text[i]
        }
        e_max = len(text)
        keys = []
        values = []
        for t in self.templates:
            b = i + t[0]
            e = i + t[1] + 1
            if b < 0 or e > e_max:
                continue
            keys.append('x[%d, %d]' % (t[0], t[1]))
            values.append([text[b:e]])
        features.update(dict(zip(keys, values)))
        return features

In [14]:
cft = CharacterFeatureTransformer()
cft.get_features('나이키 조던 운동화', 2)

{'char': '키',
 'char_position': 2,
 'x[-1, 1]': ['이키 '],
 'x[-1, 2]': ['이키 조'],
 'x[-2, 0]': ['나이키'],
 'x[-2, 1]': ['나이키 '],
 'x[-2, 2]': ['나이키 조'],
 'x[0, 2]': ['키 조']}

In [15]:
from tqdm import tqdm 

class DataLoader:
    def __init__(self, cft):
        self.cft = cft
    
    def get_data(self, brands, texts, spans):
        branding_list = []
        word2features_list = []
        label_list = []
        brand_list = []
        for idx, text in tqdm(enumerate(texts)):
            if len(brands[idx]) > 0:
                brand = brands[idx][0]
            else:
                brand = ''
            brand_list.append(brand)
            span = spans[idx]
            br = branding(text, span)
            branding_list.append(br)
            w2f = [self.cft.get_features(text, i) for i in range(len(text))]
            word2features_list.append(w2f)
            label = [label for char, label in br]
            label_list.append(label)
        return {
            'name': texts,
            'branding_list': branding_list, 
            'word2features_list': word2features_list, 
            'label_list': label_list,
            'brand': brand_list
           }

In [16]:
len(texts)

1765196

In [17]:
import numpy as np

idx_sample_list = np.random.choice(len(texts), 500000)

In [18]:
cft = CharacterFeatureTransformer()
dl = DataLoader(cft)
data = dl.get_data(np.array(brands)[idx_sample_list].tolist(), 
                   np.array(texts)[idx_sample_list].tolist(), 
                   np.array(spans)[idx_sample_list].tolist())

  This is separate from the ipykernel package so we can avoid doing imports until
  """
500000it [03:07, 2672.30it/s]


In [19]:
import json
import pickle

data_json = json.dumps(data)
with open('data_for_crf.pickle', 'wb') as f:
    pickle.dump(data_json, f)

In [20]:
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,name,branding_list,word2features_list,label_list,brand
0,프라다 패딩셔츠,"[(프, 1), (라, 1), (다, 1), ( , 0), (패, 0), (딩, 0...","[{'char_position': 0, 'char': '프', 'x[0, 2]': ...","[1, 1, 1, 0, 0, 0, 0, 0]",프라다
1,[50] 라프시몬스 아이솔레이티드 히어로즈 파카 블랙,"[([, 0), (5, 0), (0, 0), (], 0), ( , 0), (라, 1...","[{'char_position': 0, 'char': '[', 'x[0, 2]': ...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...",라프시몬스
2,nike s/s athletic cut cotton treign sz s,"[(n, 1), (i, 1), (k, 1), (e, 1), ( , 0), (s, 0...","[{'char_position': 0, 'char': 'n', 'x[0, 2]': ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",nike
3,나이키 후리스 반집업 긴팔 xl,"[(나, 1), (이, 1), (키, 1), ( , 0), (후, 0), (리, 0...","[{'char_position': 0, 'char': '나', 'x[0, 2]': ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",나이키
4,폴로 벙거지햇,"[(폴, 1), (로, 1), ( , 0), (벙, 0), (거, 0), (지, 0...","[{'char_position': 0, 'char': '폴', 'x[0, 2]': ...","[1, 1, 0, 0, 0, 0, 0]",폴로


In [100]:
df['brand'].value_counts()

나이키          66424
아디다스         23757
폴로           21514
구찌           19069
루이비통         13484
             ...  
필립프레인            1
키이쓰              1
삼성 라이온즈          1
onitsuka         1
vera wang        1
Name: brand, Length: 717, dtype: int64

In [104]:
from sklearn_crfsuite import metrics, CRF
from sklearn.model_selection import train_test_split
import pickle
import numpy as np
import pandas as pd

class CrfBrandDetector:
    def __init__(self):
        self.df = None
        self.test_index = None
        self.crf = None
        self.y_pred = None
    
    def train_test_split(self, df, test_size=0.2, random_state=123):
        self.df = df
        x_train, x_test, y_train, y_test = train_test_split(
            self.df['word2features_list'], self.df['label_list'],
            test_size=test_size,
            random_state=random_state
        )
        self.test_index = x_test.index
        return x_train, x_test, y_train, y_test
    
    def fit(self, x_train, y_train):
        self.crf = CRF(
            algorithm='lbfgs',
            c1=0.05,
            c2=0.05,
            max_iterations=100,
            all_possible_states=True
        )
        self.crf.fit(x_train, y_train)
    
    def save_model(self, model_file_name='crf_model.sav'):
        pickle.dump(self.crf, open(model_file_name, 'wb'))
        
    def predict(self, x):
        title = [[diction['char'] for diction in obs] for obs in x]
        self.y_pred = self.crf.predict(x)
        idx = [[True if label == '1' else False for label in obs] for obs in self.y_pred]
        preds = [''.join(np.array(title[i])[idx[i]]) for i in range(len(title))]
        df_pred = pd.concat([
            self.df[self.df.index.isin(self.test_index)].reindex(self.test_index).reset_index()['name'],
            pd.DataFrame(preds)
        ], axis=1)
        df_pred.columns = ['product_name', 'predicted_brand']
#         df_pred['predicted_brand'] = df_pred.apply(
#             lambda row: row.predicted_brand if row.predicted_brand in row['product_name'].lower()
#             else row.predicted_brand.split(), axis=1
#         )
        return df_pred

    def get_y_pred(self):
        return self.y_pred
    
    def report_classification(self, x_test, y_test):
        labels = list(self.crf.classes_)
        labels.remove('0')
        print(metrics.flat_classification_report(
            y_test,
            self.y_pred,
            labels=labels,
            digits=3
        ))

    def evaluate(self, y_test):
        acc = float(list(y_test == self.y_pred).count(True)) / len(self.y_pred)
        return acc

In [None]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)

crf_model = CrfBrandDetector()
x_train, x_test, y_train, y_test = crf_model.train_test_split(df)
crf_model.fit(x_train, y_train)

current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 09:26:37


In [94]:
x_test.head()

112430    [{'char_position': 0, 'char': '1', 'x[0, 2]': ...
338861    [{'char_position': 0, 'char': '폴', 'x[0, 2]': ...
464653    [{'char_position': 0, 'char': '프', 'x[0, 2]': ...
344237    [{'char_position': 0, 'char': '자', 'x[0, 2]': ...
356227    [{'char_position': 0, 'char': '나', 'x[0, 2]': ...
Name: word2features_list, dtype: object

In [83]:
pred = crf_model.predict(x_test)

In [87]:
y_pred = crf_model.get_y_pred()

In [97]:
y_pred[41]

['0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']

In [103]:
pred[:10]

Unnamed: 0,product_name,predicted_brand
0,100-105 나이키 ac밀란 바람막이,나이키
1,폴로 셔츠 105 남방 p-1693,폴로
2,프라다 라운드 반팔 티셔츠 [l1329],프라다
3,자크뮈스 블라우스,자크뮈스
4,나이키 덩크 미시건,나이키
5,미스치프 흑청 캡모자,미스치프
6,[260] 나이키 sb 덩크 로우 그레이트 풀데드,나이키
7,(정품) 스톤아일랜드 후드 블랙 l,스톤아일랜드
8,다이애그널 레이스 니트,다이애그널
9,구찌 레플 반바지,구찌


In [None]:
crf_model.predict()

In [91]:
crf_model.report_classification(x_test, y_test)

              precision    recall  f1-score   support

           1      0.996     0.998     0.997    393916

   micro avg      0.996     0.998     0.997    393916
   macro avg      0.996     0.998     0.997    393916
weighted avg      0.996     0.998     0.997    393916



In [92]:
crf_model.evaluate(y_test)

  return array(a, dtype, copy=False, order=order)


0.99493