test_sample.csv
- id
- name
- brand

In [6]:
import re

def preprocess_text(text):
    text = str(text)
    text = remove_emoji(text)
    text = remove_puctuation(text)
    text = remove_space(text)
    return text.lower().strip()

def remove_emoji(text):
    regrex_pattern = re.compile(pattern="["
                                        u"\U0001F600-\U0001F64F"  # emoticons
                                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                        "]+", flags=re.UNICODE)
    return regrex_pattern.sub(r'', text)

def remove_space(text):
    return re.sub(r'\s+', ' ', text)

def remove_puctuation(text):
    return re.sub(r'[^\w\s]', ' ', text)


In [9]:
import pandas as pd
df = pd.read_csv('test_sample.csv')
df['preprocessed'] = df['name'].map(preprocess_text)
df.to_csv('test_sample.csv', index=False)

In [10]:
df.head()

Unnamed: 0,id,name,brand,preprocessed
0,99566976,265mm 나이키 인터내셔널 리스트 운동화 / N261P,나이키,265mm 나이키 인터내셔널 리스트 운동화 n261p
1,100268480,☀️나이키 후드트레이닝세트☀️,나이키,나이키 후드트레이닝세트
2,888258,나이키 여성운동화 240,나이키,나이키 여성운동화 240
3,99987072,나이키 셀렉트 팬츠 구매합니다,나이키,나이키 셀렉트 팬츠 구매합니다
4,745734,급처나이키 크로스백팩,나이키,급처나이키 크로스백팩


In [16]:
import sentencepiece as sp

model_file = 'name_unigram_20000.model'
sp_processor = sp.SentencePieceProcessor(model_file)


In [33]:
preprocessed = df['preprocessed']
product_names = [sp_processor.encode_as_pieces(str(name)) for name in preprocessed]

In [187]:
df.to_csv('./preprocessed_df.csv', index=False)

In [34]:
print(product_names[0])
print([t[1:] if t.startswith('▁') else t for t in product_names[0]])

['▁265', 'mm', '▁나이키', '▁인터내셔널', '▁리스트', '▁운동화', '▁n', '26', '1', 'p']
['265', 'mm', '나이키', '인터내셔널', '리스트', '운동화', 'n', '26', '1', 'p']


In [35]:
from tqdm import tqdm

product_tokens = []
for idx in tqdm(range(len(product_names))):
    name = product_names[idx]
    tokens = [t[1:] if t.startswith('▁') else t for t in name]
    product_tokens.append(tokens)

100%|██████████| 515/515 [00:00<00:00, 384011.83it/s]


In [179]:
# -*- coding:utf-8 -*-

import pickle
with open('product_tokens.txt', 'wb') as f:
    pickle.dump(product_tokens, f)

In [181]:
df['tokens'] = product_tokens
df.to_csv('df.csv', index=False)

In [42]:
brand_set = df.brand.unique()
for brand in brand_set:
    print(sp_processor.encode_as_pieces(brand))

['▁나이키']
['▁스톤아일랜드']
['▁뉴발란스']
['▁아디다스']
['▁노스페이스']
['▁스투시']
['▁아식스']
['▁무신사스탠다드']
['▁디스이즈네버댓']
['▁커버낫']
['▁메종키츠네']
['▁아미']
['▁아더에러']
['▁반스']
['▁아크네']
['▁컨버스']
['▁유니클로']
['▁알파인더스트리']
['▁에센셜']
['▁내셔널지오그래픽']
['▁앤더슨벨']
['▁파타고니아']
['▁르꼬끄']
['▁칼하트']
['▁구찌']
['▁프라다']
['▁자라']
['▁메종마르지엘라']
['▁라퍼지스토어']
['▁비바스튜디오']
['▁프라이탁']
['▁커먼프로젝트']
['▁라코스테']
['▁챔피온']
['▁모드나인']
['▁슈프림']
['▁오베이']
['▁피어오브갓']
['▁루이비통']
['▁페이탈리즘']
['▁뉴에라']
['▁바버']
['▁피스워커']
['▁오프화이트']
['▁톰브라운']
['▁보테가베네타']
['▁디올']
['▁알렉산더맥퀸']
['▁무스너클']
['▁티파니']
['▁에르메스']
['▁샤넬']
['▁토템']
['▁입생로랑']
['▁자크뮈스']


In [36]:
print(product_names[3])
print(product_tokens[3])

['▁나이키', '▁', '셀렉트', '▁팬츠', '▁구매합니다']
['나이키', '', '셀렉트', '팬츠', '구매합니다']


In [64]:
import re
w = '나이키'
bool(re.compile(r'[가-힣]+').match(w))


True

In [118]:
import string

def labeling(label, tokens):
    return [(word, label) for word in tokens]

def branding(brand, tokens):
    if brand in tokens:
        br_idx = tokens.index(brand)
        return labeling('0', tokens[:br_idx]) + labeling('BRAND', [tokens[br_idx]]) + labeling('0', tokens[br_idx+1:])
    else:
        return labeling('0', tokens)

In [119]:
branding('스톤아일랜드', product_tokens[10])

[('스톤아일랜드', 'BRAND'), ('니트', '0')]

In [154]:
import re

def word2features(tokens, i):
    word = str(tokens[i])
    features = {
        'bias': 1,
        'word_position': i,
        'word[-2:]': word[-2:],
        'word_lower': word.lower(),
        'word_anyDigit': any(ch.isdigit() for ch in word),
        'word_isDigit': bool(re.compile(r'[0-9]').match(word)),
        'word_isEng': bool(re.compile(r'[a-zA-Z]+').match(word)),
        'word_isKor': bool(re.compile(r'[가-힣]+').match(word))
    }
    
    if i > 0:
        prev_word = str(tokens[i-1])
        features.update({
            'prev_word_anyDigit': any(ch.isdigit() for ch in prev_word),
            'prev_word_isDigit': bool(re.compile(r'[0-9]').match(word)),
            'prev_word_isEng': bool(re.compile(r'[a-zA-Z]+').match(prev_word)),
            'prev_word_isKor': bool(re.compile(r'[가-힣]+').match(prev_word))
        })
    
    else:
        features['BOS'] = True
    
    if i < len(tokens) - 1:
        next_word = str(tokens[i+1])
        features.update({
            'next_word_anyDigit': any(ch.isdigit() for ch in next_word),
            'next_word_isDigit': bool(re.compile(r'[0-9]').match(word)),
            'next_word_isEng': bool(re.compile(r'[a-zA-Z]+').match(next_word)),
            'next_word_isKor': bool(re.compile(r'[가-힣]+').match(next_word))
        })
    else:
        features['EOS'] = True
    
    if i > 1:
        prev1 = str(tokens[i-1])
        prev2 = str(tokens[i-2])
        features.update({
            '-2ngram': '{} {}'.format(prev1, prev2)
        })
    if i < len(tokens) - 2:
        next1 = str(tokens[i+1])
        next2 = str(tokens[i+2])
        features.update({
            '+2ngram': '{} {}'.format(next1, next2)
        })
    
    return features

In [155]:
class DataLoader:
    def get_data(self, file_name='test_sample.csv'):
        df = pd.read_csv(file_name)
        branding_list = []
        word2features_list = []
        label_list = []
        for idx, row in df.iterrows():
            tokens = product_tokens[idx]
            br = branding(row.brand, tokens)
            branding_list.append(br)
            w2f = [word2features(tokens, i) for i in range(len(tokens))]
            word2features_list.append(w2f)
            label = [label for token, label in br]
            label_list.append(label)
        return {
            'name': df.preprocessed.tolist(),
            'branding_list': branding_list, 
            'word2features_list': word2features_list, 
            'label_list': label_list,
            'brand': df.brand.tolist()
           }

In [156]:
dl = DataLoader()

In [157]:
data = dl.get_data()

In [158]:
crf_df = pd.DataFrame.from_dict(data)

In [159]:
crf_df.columns

Index(['name', 'branding_list', 'word2features_list', 'label_list', 'brand'], dtype='object')

In [160]:
crf_df.head()

Unnamed: 0,name,branding_list,word2features_list,label_list,brand
0,265mm 나이키 인터내셔널 리스트 운동화 n261p,"[(265, 0), (mm, 0), (나이키, BRAND), (인터내셔널, 0), ...","[{'bias': 1, 'word_position': 0, 'word[-2:]': ...","[0, 0, BRAND, 0, 0, 0, 0, 0, 0, 0]",나이키
1,나이키 후드트레이닝세트,"[(나이키, BRAND), (후드, 0), (트레이닝세트, 0)]","[{'bias': 1, 'word_position': 0, 'word[-2:]': ...","[BRAND, 0, 0]",나이키
2,나이키 여성운동화 240,"[(나이키, BRAND), (여성운동화, 0), (240, 0)]","[{'bias': 1, 'word_position': 0, 'word[-2:]': ...","[BRAND, 0, 0]",나이키
3,나이키 셀렉트 팬츠 구매합니다,"[(나이키, BRAND), (, 0), (셀렉트, 0), (팬츠, 0), (구매합니...","[{'bias': 1, 'word_position': 0, 'word[-2:]': ...","[BRAND, 0, 0, 0, 0]",나이키
4,급처나이키 크로스백팩,"[(급처, 0), (나이키, BRAND), (크로스백, 0), (팩, 0)]","[{'bias': 1, 'word_position': 0, 'word[-2:]': ...","[0, BRAND, 0, 0]",나이키


In [182]:
from sklearn_crfsuite import metrics, CRF
from sklearn.model_selection import train_test_split
import pickle
import numpy as np
import pandas as pd

class CrfBrandDetector:
    def __init__(self):
        self.df = None
        self.test_index = None
        self.crf = None
        self.y_pred = None
    
    def train_test_split(self, df, test_size=0.2, random_state=123):
        self.df = df
        x_train, x_test, y_train, y_test = train_test_split(
            self.df['word2features_list'], self.df['label_list'],
            test_size=test_size,
            random_state=random_state
        )
        self.test_index = x_test.index
        return x_train, x_test, y_train, y_test
    
    def fit(self, x_train, y_train):
        self.crf = CRF(
            algorithm='lbfgs',
            c1=0.05,
            c2=0.05,
            max_iterations=100,
            all_possible_states=True
        )
        self.crf.fit(x_train, y_train)
    
    def save_model(self, model_file_name='crf_model.sav'):
        pickle.dump(self.crf, open(model_file_name, 'wb'))
        
    def predict(self, x):
        title = [[diction['word_lower'] for diction in obs] for obs in x]
        self.y_pred = self.crf.predict(x)
        idx = [[True if label == 'BRAND' else False for label in obs] for obs in self.y_pred]
        preds = [' '.join(np.array(title[i])[idx[i]]) for i in range(len(title))]
        df_pred = pd.concat([
            self.df[self.df.index.isin(self.test_index)].reindex(self.test_index).reset_index()['name'],
            pd.DataFrame(preds)
        ], axis=1)
        df_pred.columns = ['product_name', 'predicted_brand']
        df_pred['predicted_brand'] = df_pred.apply(
            lambda row: row.predicted_brand if row.predicted_brand in row['product_name'].lower()
            else row.predicted_brand.split(), axis=1
        )
        return df_pred

    def get_y_pred(self):
        return self.y_pred
    
    def report_classification(self, x_test, y_test):
        labels = list(self.crf.classes_)
        labels.remove('0')
        print(metrics.flat_classification_report(
            y_test,
            self.y_pred,
            labels=labels,
            digits=3
        ))

    def evaluate(self, y_test):
        acc = float(list(y_test == self.y_pred).count(True)) / len(self.y_pred)
        return acc

In [183]:
crf_model = CrfBrandDetector()
x_train, x_test, y_train, y_test = crf_model.train_test_split(crf_df)
crf_model.fit(x_train, y_train)

In [184]:
pred = crf_model.predict(x_test)

In [189]:
crf_model.get_y_pred()[5]

['0', '0', '0', '0', '0', '0', '0']

In [172]:
pred[:20]

Unnamed: 0,product_name,predicted_brand
0,아디다스 핑크 로고 후드집업,아디다스
1,95 르꼬끄 맨투맨 무료배송,르꼬끄
2,디스이즈네버댓 티셔츠,디스이즈네버댓
3,비바스튜디오 롱코트,비바스튜디오
4,아디다스 여성 춘추 트렌치 코트 상태좋음,아디다스
5,후루룩구제 나이키 265 운동화 중고 가방 신발,
6,삽니다 프라이탁 지갑 max 삽니다 구합니다 블랙만,프라이탁
7,새상품 피스워커 mine worker st develop,피스워커
8,톰브라운 캐시 사선완장 가디건 풀라벨 출고입니다 수입 부자재 원단,톰브라운
9,디올 스타일 열쇠고리 키링,디올


In [174]:
crf_model.report_classification(x_test, y_test)

              precision    recall  f1-score   support

       BRAND      0.990     0.951     0.970       103

   micro avg      0.990     0.951     0.970       103
   macro avg      0.990     0.951     0.970       103
weighted avg      0.990     0.951     0.970       103



In [175]:
crf_model.evaluate(y_test)

  return array(a, dtype, copy=False, order=order)


0.9514563106796117