In [1]:
import ndjson
df = None
with open('product_brand_ner_data.ndjson', 'r', encoding='utf8') as f:
    df = ndjson.load(f)

In [2]:
df[0]

{'brands': ['나이키'], 'spans': [[7, 10]], 'text': '(수입고퀄) 나이키 스우시 반팔티셔츠'}

In [3]:
brands = []
texts = []
spans = []
for i in df:
    brands.append(i['brands'])
    texts.append(i['text'])
    spans.append(i['spans'])

In [4]:
print(len(brands) == len(spans) == len(texts))

True


In [5]:
import pandas as pd
data = pd.DataFrame.from_dict({
    'brand': brands,
    'text': texts,
    'span': spans
})

In [6]:
data.sample(n=20)

Unnamed: 0,brand,text,span
228600,"[나이키, nike]",나이키 nike 스우시 바람막이 점퍼 자켓,"[[0, 3], [4, 8]]"
1104726,[폴로],폴로 단가라 와플티 108,"[[0, 2]]"
520293,[컨버스],컨버스 하이탑 235,"[[0, 3]]"
880942,[바버],바버 뷰포트 세이지 국내 정품 새상품 38,"[[0, 2]]"
633938,[구찌],구찌 수프림 인터로킹 남녀공용 백팩,"[[0, 2]]"
97982,"[나이키, 사카이]",# 프리미엄 # 나이키 x 사카이 ldv 와플 나일론 #,"[[9, 12], [15, 18]]"
143388,[나이키],나이키 트레비스스캇270 240사이즈,"[[0, 3]]"
740571,"[코스, cos]",cos 코스 레더패치 울 크루넥 스웨터 / 라운드 니트,"[[4, 6], [0, 3]]"
870930,[루이비통],루이비통 다미에 베로나,"[[0, 4]]"
320034,[아디다스],아디다스 트레포일 xxl 맨투맨 크루넥티 110 팝니다. ay7792,"[[0, 4]]"


In [25]:
# def labeling(label, text):
#     return [(char, label) for char in text]

def branding(text, spans):
    length = len(text)
    labels = [0] * length
    if len(spans) > 0:
        for span in spans:
            b = span[0]
            e = span[1]
            labels[b:e] = [1] * (e-b)
    return [(text[idx], '1') if value == 1 else (text[idx], '0') for idx, value in enumerate(labels)]
        

In [26]:
branding('cos 코스 레더패치 울 크루넥 스웨터 / 라운드 니트', [[4, 6], [0, 3]])

[('c', '1'),
 ('o', '1'),
 ('s', '1'),
 (' ', '0'),
 ('코', '1'),
 ('스', '1'),
 (' ', '0'),
 ('레', '0'),
 ('더', '0'),
 ('패', '0'),
 ('치', '0'),
 (' ', '0'),
 ('울', '0'),
 (' ', '0'),
 ('크', '0'),
 ('루', '0'),
 ('넥', '0'),
 (' ', '0'),
 ('스', '0'),
 ('웨', '0'),
 ('터', '0'),
 (' ', '0'),
 ('/', '0'),
 (' ', '0'),
 ('라', '0'),
 ('운', '0'),
 ('드', '0'),
 (' ', '0'),
 ('니', '0'),
 ('트', '0')]

In [27]:
def generate_templates(begin=-2, end=2, min_range_length=3, max_range_length=5):
    templates = []
    for b in range(begin, end):
        for e in range(b, end+1):
            length = (e - b + 1)
            if length > max_range_length or length < min_range_length:
                continue
            if b * e > 0:
                continue
            templates.append((b, e))
    return templates

In [29]:
templates = generate_templates()
print(templates)

[(-2, 0), (-2, 1), (-2, 2), (-1, 1), (-1, 2), (0, 2)]


In [31]:
dict(zip([1,2,3], [2,3,4]))

{1: 2, 2: 3, 3: 4}

In [63]:
class CharacterFeatureTransformer:
    def __init__(self, begin=-2, end=2, min_range_length=3, max_range_length=5):
        self.templates = self._generate_templates(begin, end, min_range_length, max_range_length)
    
    def _generate_templates(self, begin=-2, end=2, min_range_length=3, max_range_length=5):
        templates = []
        for b in range(begin, end):
            for e in range(b, end+1):
                length = (e - b + 1)
                if length > max_range_length or length < min_range_length:
                    continue
                if b * e > 0:
                    continue
                templates.append((b, e))
        return templates        
    
    def get_features(self, text, i, tags=None):
        features = {
            'char_position': i,
            'char': text[i]
        }
        e_max = len(text)
        keys = []
        values = []
        for t in self.templates:
            b = i + t[0]
            e = i + t[1] + 1
            if b < 0 or e > e_max:
                continue
            keys.append('x[%d, %d]' % (t[0], t[1]))
            values.append([text[b:e]])
        features.update(dict(zip(keys, values)))
        return features

In [65]:
cft = CharacterFeatureTransformer()
cft.get_features('나이키 조던 운동화', 2)

{'char': '키',
 'char_position': 2,
 'x[-1, 1]': ['이키 '],
 'x[-1, 2]': ['이키 조'],
 'x[-2, 0]': ['나이키'],
 'x[-2, 1]': ['나이키 '],
 'x[-2, 2]': ['나이키 조'],
 'x[0, 2]': ['키 조']}

In [66]:
from tqdm import tqdm 

class DataLoader:
    def __init__(self, cft):
        self.cft = cft
    
    def get_data(self, brands, texts, spans):
        branding_list = []
        word2features_list = []
        label_list = []
        brand_list = []
        for idx, text in tqdm(enumerate(texts)):
            if len(brands[idx]) > 0:
                brand = brands[idx][0]
            else:
                brand = ''
            brand_list.append(brand)
            span = spans[idx]
            br = branding(text, span)
            branding_list.append(br)
            w2f = [self.cft.get_features(text, i) for i in range(len(text))]
            word2features_list.append(w2f)
            label = [label for char, label in br]
            label_list.append(label)
        return {
            'name': texts,
            'branding_list': branding_list, 
            'word2features_list': word2features_list, 
            'label_list': label_list,
            'brand': brand_list
           }

In [69]:
len(texts)

1765207

In [70]:
import numpy as np

idx_sample_list = np.random.choice(len(texts), 500000)

In [77]:
cft = CharacterFeatureTransformer()
dl = DataLoader(cft)
data = dl.get_data(np.array(brands)[idx_sample_list].tolist(), 
                   np.array(texts)[idx_sample_list].tolist(), 
                   np.array(spans)[idx_sample_list].tolist())

  This is separate from the ipykernel package so we can avoid doing imports until
  """
500000it [26:08, 318.69it/s] 


In [78]:
import json
import pickle

data_json = json.dumps(data)
with open('data_for_crf.pickle', 'wb') as f:
    pickle.dump(data_json, f)

In [79]:
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,name,branding_list,word2features_list,label_list,brand
0,구찌 라이톤 275~280 삽니다!,"[(구, 1), (찌, 1), ( , 0), (라, 0), (이, 0), (톤, 0...","[{'char_position': 0, 'char': '구', 'x[0, 2]': ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",구찌
1,[나이키코리아] 나이키 드라이핏 스우시 퓨추라 맨투맨,"[([, 0), (나, 1), (이, 1), (키, 1), (코, 0), (리, 0...","[{'char_position': 0, 'char': '[', 'x[0, 2]': ...","[0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, ...",나이키
2,빈폴 남성 바람막이 105,"[(빈, 1), (폴, 1), ( , 0), (남, 0), (성, 0), ( , 0...","[{'char_position': 0, 'char': '빈', 'x[0, 2]': ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",빈폴
3,루이비통 다미에 클러치백 gm size 풀구성 상태 굿,"[(루, 1), (이, 1), (비, 1), (통, 1), ( , 0), (다, 0...","[{'char_position': 0, 'char': '루', 'x[0, 2]': ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",루이비통
4,275 나이키 에어맥스97울트라17검흰 918356-001,"[(2, 0), (7, 0), (5, 0), ( , 0), (나, 1), (이, 1...","[{'char_position': 0, 'char': '2', 'x[0, 2]': ...","[0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",나이키


In [80]:
from sklearn_crfsuite import metrics, CRF
from sklearn.model_selection import train_test_split
import pickle
import numpy as np
import pandas as pd

class CrfBrandDetector:
    def __init__(self):
        self.df = None
        self.test_index = None
        self.crf = None
        self.y_pred = None
    
    def train_test_split(self, df, test_size=0.2, random_state=123):
        self.df = df
        x_train, x_test, y_train, y_test = train_test_split(
            self.df['word2features_list'], self.df['label_list'],
            test_size=test_size,
            random_state=random_state
        )
        self.test_index = x_test.index
        return x_train, x_test, y_train, y_test
    
    def fit(self, x_train, y_train):
        self.crf = CRF(
            algorithm='lbfgs',
            c1=0.05,
            c2=0.05,
            max_iterations=100,
            all_possible_states=True
        )
        self.crf.fit(x_train, y_train)
    
    def save_model(self, model_file_name='crf_model.sav'):
        pickle.dump(self.crf, open(model_file_name, 'wb'))
        
    def predict(self, x):
        title = [[diction['char'] for diction in obs] for obs in x]
        self.y_pred = self.crf.predict(x)
        idx = [[True if label == '1' else False for label in obs] for obs in self.y_pred]
        preds = [''.join(np.array(title[i])[idx[i]]) for i in range(len(title))]
        df_pred = pd.concat([
            self.df[self.df.index.isin(self.test_index)].reindex(self.test_index).reset_index()['name'],
            pd.DataFrame(preds)
        ], axis=1)
        df_pred.columns = ['product_name', 'predicted_brand']
#         df_pred['predicted_brand'] = df_pred.apply(
#             lambda row: row.predicted_brand if row.predicted_brand in row['product_name'].lower()
#             else row.predicted_brand.split(), axis=1
#         )
        return df_pred

    def get_y_pred(self):
        return self.y_pred
    
    def report_classification(self, x_test, y_test):
        labels = list(self.crf.classes_)
        labels.remove('0')
        print(metrics.flat_classification_report(
            y_test,
            self.y_pred,
            labels=labels,
            digits=3
        ))

    def evaluate(self, y_test):
        acc = float(list(y_test == self.y_pred).count(True)) / len(self.y_pred)
        return acc

In [82]:
crf_model = CrfBrandDetector()
x_train, x_test, y_train, y_test = crf_model.train_test_split(df)
crf_model.fit(x_train, y_train)

In [83]:
pred = crf_model.predict(x_test)

In [87]:
y_pred = crf_model.get_y_pred()

In [89]:
y_pred[30]

['1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']

In [90]:
pred[30:40]

Unnamed: 0,product_name,predicted_brand
30,시스템 플리츠 스트랩 스커트,시스템
31,스튜디오 톰보이 나그랑 t,톰보이
32,구찌 스타일 타이거 니트 미듐 55-77,구찌
33,디젤 청바지 남자34사이즈,디젤
34,(미착용 새상품)나이키 스포츠웨어 스우시 셰르파 s 나이키 크롭후리스,나이키
35,발렌시아가 노스사우스 쇼퍼백(스몰),발렌시아가
36,챔피온 바람막이 팝니다,챔피온
37,나이키 스타팅 파이브 바람막이 xl,나이키
38,#당일출고 스톤아일랜드 그레이 카고바지.,스톤아일랜드
39,나이키 덩크 범고래 bg 250,나이키


In [91]:
crf_model.report_classification(x_test, y_test)

              precision    recall  f1-score   support

           1      0.996     0.998     0.997    393916

   micro avg      0.996     0.998     0.997    393916
   macro avg      0.996     0.998     0.997    393916
weighted avg      0.996     0.998     0.997    393916



In [92]:
crf_model.evaluate(y_test)

  return array(a, dtype, copy=False, order=order)


0.99493