In [2]:
import pycrfsuite
import sys
import pandas as pd


In [3]:
# Load the preprocessed table (the path is resolved relative to the working directory).
# Its 'preprocessed' column supplies the training sentences for the CRF further down.
df = pd.read_csv('preprocessed_df.csv')

In [4]:
df.head()

Unnamed: 0,id,name,brand,preprocessed,tokens
0,99566976,265mm 나이키 인터내셔널 리스트 운동화 / N261P,나이키,265mm 나이키 인터내셔널 리스트 운동화 n261p,"['265', 'mm', '나이키', '인터내셔널', '리스트', '운동화', 'n..."
1,100268480,☀️나이키 후드트레이닝세트☀️,나이키,나이키 후드트레이닝세트,"['나이키', '후드', '트레이닝세트']"
2,888258,나이키 여성운동화 240,나이키,나이키 여성운동화 240,"['나이키', '여성운동화', '240']"
3,99987072,나이키 셀렉트 팬츠 구매합니다,나이키,나이키 셀렉트 팬츠 구매합니다,"['나이키', '', '셀렉트', '팬츠', '구매합니다']"
4,745734,급처나이키 크로스백팩,나이키,급처나이키 크로스백팩,"['급처', '나이키', '크로스백', '팩']"


In [5]:
def generate_templates(begin=-2, end=2, min_range_length=3, max_range_length=5):
    """Enumerate character-window templates as ``(start, stop)`` offset pairs.

    Offsets are relative to the current character and both ends are inclusive.
    A pair is kept only when

    * its width ``stop - start + 1`` lies within
      ``[min_range_length, max_range_length]``, and
    * the window touches or straddles offset 0 (``start * stop <= 0``), so
      windows lying strictly on one side of the current character are dropped.

    Args:
        begin: Smallest start offset considered.
        end: Largest stop offset considered; start offsets stop at ``end - 1``.
        min_range_length: Minimum window width, inclusive.
        max_range_length: Maximum window width, inclusive.

    Returns:
        List of ``(start, stop)`` tuples ordered by start, then by stop.
    """
    return [
        (start, stop)
        for start in range(begin, end)
        for stop in range(start, end + 1)
        if min_range_length <= stop - start + 1 <= max_range_length
        and start * stop <= 0
    ]


In [8]:
# Build the default window templates (offset pairs relative to the current
# character) and print them for inspection.
templates = generate_templates()
print(templates)

[(-2, 0), (-2, 1), (-2, 2), (-1, 1), (-1, 2), (0, 2)]


In [7]:
class CharacterFeatureTransformer:
    """Turn a character sequence into per-character window features.

    Each template is an ``(start_offset, stop_offset)`` pair, inclusive on both
    ends and relative to the character being described.
    """

    def __init__(self, templates):
        """Store the iterable of offset templates used for every call."""
        self.templates = templates

    def _window_features(self, chars, center, limit):
        """Collect the features of the character at index ``center``.

        Templates whose window would start before index 0 or end past
        ``limit`` are skipped, so characters near either edge get fewer
        features.
        """
        collected = []
        for template in self.templates:
            start = center + template[0]
            stop = center + template[1] + 1  # exclusive slice end
            if 0 <= start and stop <= limit:
                label = 'X[%d, %d]' % (template[0], template[1])
                collected.append((label, chars[start:stop]))
        return collected

    def __call__(self, chars, tags=None):
        """Return one list of ``(template_label, window)`` tuples per character.

        Args:
            chars: Sliceable sequence of characters (e.g. a str).
            tags: Accepted for call compatibility; not used.

        Returns:
            A list with ``len(chars)`` entries, each a list of
            ``('X[b, e]', chars[i+b:i+e+1])`` tuples in template order.
        """
        limit = len(chars)
        return [self._window_features(chars, center, limit) for center in range(limit)]

In [9]:
# Sanity check: extract the window features for every character of a short string.
# `transformer` is also the feature extractor used by the training cell below.
chars = '안녕하세요'
transformer = CharacterFeatureTransformer(templates)
transformer(chars)

[[('X[0, 2]', '안녕하')],
 [('X[-1, 1]', '안녕하'), ('X[-1, 2]', '안녕하세'), ('X[0, 2]', '녕하세')],
 [('X[-2, 0]', '안녕하'),
  ('X[-2, 1]', '안녕하세'),
  ('X[-2, 2]', '안녕하세요'),
  ('X[-1, 1]', '녕하세'),
  ('X[-1, 2]', '녕하세요'),
  ('X[0, 2]', '하세요')],
 [('X[-2, 0]', '녕하세'), ('X[-2, 1]', '녕하세요'), ('X[-1, 1]', '하세요')],
 [('X[-2, 0]', '하세요')]]

In [12]:
def sent_to_chartags(sent, nonspace=0, space=1):
    """Split a spaced sentence into its characters and per-character space tags.

    Args:
        sent: Input string; only the ASCII space ' ' is treated as a separator.
        nonspace: Tag for a character that is not followed by a space.
        space: Tag for a character that is followed by a space. The final
            character always receives this tag.

    Returns:
        ``(chars, tags)`` where ``chars`` is ``sent`` with every ' ' removed
        and ``tags`` has one entry per character of ``chars``. A sentence with
        no non-space characters yields ``('', [])``.
    """
    chars = sent.replace(' ', '')
    if not chars:
        return '', []

    tags = [nonspace] * len(chars)
    tags[-1] = space

    seen = 0  # number of non-space characters consumed so far
    for ch in sent:
        if ch != ' ':
            seen += 1
            continue
        # Mark the character just before this space. With seen == 0 (leading
        # space) this addresses tags[-1], which already holds `space`.
        tags[seen - 1] = space
    return chars, tags


In [13]:
sent_to_chartags('안녕 오랜만이야')

('안녕오랜만이야', [0, 1, 0, 0, 0, 0, 1])

In [16]:
def sent_to_xy(sent, cft):
    """Convert a spaced sentence into CRF inputs and labels.

    Args:
        sent: Sentence string, passed to ``sent_to_chartags``.
        cft: Callable taking ``(chars, tags)`` and returning, per character,
            a list of indexable ``(name, value)`` pairs.

    Returns:
        ``(x, y)`` where ``x`` is a list (one entry per character) of
        ``'name=value'`` feature strings and ``y`` is the list of tags
        converted to strings.
    """
    chars, tags = sent_to_chartags(sent)

    x = []
    for char_features in cft(chars, tags):
        encoded = []
        for feature in char_features:
            encoded.append('%s=%s' % (feature[0], feature[1]))
        x.append(encoded)

    y = list(map(str, tags))
    return x, y

In [17]:
x, y = sent_to_xy('안녕 오랜만이야', transformer)

In [18]:
x

[['X[0, 2]=안녕오'],
 ['X[-1, 1]=안녕오', 'X[-1, 2]=안녕오랜', 'X[0, 2]=녕오랜'],
 ['X[-2, 0]=안녕오',
  'X[-2, 1]=안녕오랜',
  'X[-2, 2]=안녕오랜만',
  'X[-1, 1]=녕오랜',
  'X[-1, 2]=녕오랜만',
  'X[0, 2]=오랜만'],
 ['X[-2, 0]=녕오랜',
  'X[-2, 1]=녕오랜만',
  'X[-2, 2]=녕오랜만이',
  'X[-1, 1]=오랜만',
  'X[-1, 2]=오랜만이',
  'X[0, 2]=랜만이'],
 ['X[-2, 0]=오랜만',
  'X[-2, 1]=오랜만이',
  'X[-2, 2]=오랜만이야',
  'X[-1, 1]=랜만이',
  'X[-1, 2]=랜만이야',
  'X[0, 2]=만이야'],
 ['X[-2, 0]=랜만이', 'X[-2, 1]=랜만이야', 'X[-1, 1]=만이야'],
 ['X[-2, 0]=만이야']]

In [19]:
y

['0', '1', '0', '0', '0', '0', '1']

In [20]:
# Training hyper-parameters, later passed to trainer.set_params().
params = {
    'max_iterations': 50,  # cap on L-BFGS iterations (CRFsuite's own default is unbounded)
    'c1': 0,  # L1 regularization coefficient (CRFsuite default: 0)
    'c2': 1,  # L2 regularization coefficient (CRFsuite default: 1)
    'feature.minfreq': 3  # frequency cut-off below which CRFsuite discards features
}
# File the trained model is written to, and that the tagger later opens.
model_fname = 'crfsuite_spacing.model'


In [21]:
# Train the spacing CRF: every value of df['preprocessed'] becomes one training sequence.
trainer = pycrfsuite.Trainer(verbose=True)
for name in df['preprocessed']:
    # x: per-character feature strings; y: per-character labels as strings
    # ('1' marks a character followed by a space, or the last character).
    # NOTE(review): a non-string value (e.g. NaN from an empty CSV field) would
    # fail inside sent_to_chartags on `.replace` -- confirm the column has none.
    x, y = sent_to_xy(name, transformer)
    trainer.append(x, y)
trainer.set_params(params)
# Fit the model and write it to model_fname.
trainer.train(model_fname)

Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 1196
Seconds required: 0.064

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 3603.973039
Feature norm: 1.000000
Error norm: 1095.764896
Active features: 1192
Line search trials: 1
Line search step: 0.000487
Seconds required for this iteration: 0.002

***** Iteration #2 *****
Loss: 3363.430585
Feature norm: 1.043656
Error norm: 516.243307
Active features: 1196
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #3 *****
Loss: 3172.116165
Feature norm: 1.479237
Error norm: 465.376426
Active features: 1196
Line search trials: 1
Line search step: 1.000000
Seconds required for this iterati

In [22]:
# Open the model file written by the training cell. `correct` below reads this
# module-level `tagger`, so this cell must run before it is called.
tagger = pycrfsuite.Tagger()
tagger.open(model_fname)

<contextlib.closing at 0x12444ac10>

In [23]:
def correct(sent, cft):
    """Re-space a sentence using the module-level ``tagger``.

    Existing spaces in ``sent`` are removed; a space is then appended after
    every character whose predicted tag is not ``'0'``.

    Args:
        sent: Sentence string to correct.
        cft: Feature transformer forwarded to ``sent_to_xy``.

    Returns:
        The re-spaced string. Because a space is appended after each tagged
        character, the result ends with a space whenever the last character is
        tagged as a boundary.
    """
    stripped = sent.replace(' ', '')
    features, _ = sent_to_xy(sent, cft)  # gold labels are not needed for prediction
    predicted = tagger.tag(features)

    pieces = []
    for ch, tag in zip(stripped, predicted):
        if tag == '0':
            pieces.append(ch)
        else:
            pieces.append(ch + ' ')
    return ''.join(pieces)

In [24]:
correct('나이키에어포스', transformer)

'나이키 에어포스 '

In [25]:
correct('샤넬루이비통프라다팝니다', transformer)

'샤넬루이비통 프라다 팝니다 '

In [26]:
# Inspect the trained model: `state_features` maps (feature string, label) pairs
# to their learned weights.
debugger = tagger.info()
weights = debugger.state_features

In [27]:
list(weights.items())[:10]

[(('X[0, 2]=265', '0'), 0.101827),
 (('X[-1, 1]=265', '0'), 0.405864),
 (('X[-2, 0]=265', '1'), 0.639093),
 (('X[0, 2]=나이키', '0'), 0.731112),
 (('X[-1, 1]=나이키', '0'), 0.742382),
 (('X[-2, 0]=나이키', '1'), 1.507208),
 (('X[0, 2]=내셔널', '0'), 0.400621),
 (('X[-1, 1]=내셔널', '0'), 0.633279),
 (('X[-2, 0]=내셔널', '0'), 0.047145),
 (('X[0, 2]=운동화', '0'), 0.617385)]

In [30]:
from pycrfsuite_spacing import TemplateGenerator, CharacterFeatureTransformer
from pprint import pprint

In [31]:
import pycrfsuite_spacing
pycrfsuite_spacing.__version__

'1.0.2'

In [32]:
# Rebuild the templates with the packaged generator, restricted to windows of
# exactly three characters (min_range_length == max_range_length == 3).
# NOTE: this rebinds the notebook-level name `templates`.
templates = TemplateGenerator(
    begin=-2,
    end=2,
    min_range_length=3,
    max_range_length=3
)

# CharacterFeatureTransformer here is the class imported from pycrfsuite_spacing,
# which shadows the class defined earlier in this notebook.
# The argument must be `templates`; the misspelled name `tmemplates` is undefined
# and raises NameError.
to_feature = CharacterFeatureTransformer(templates)

In [None]:
list(to_feature.templates)
