# Pipelining for `D2VTransformer` Evaluation

- Load & Preprocessing -> GridSearchCV

---

## Load & Preprocessing (New)

In [1]:
import pickle
import pandas as pd
import numpy as np

import itertools
from time import time

import os
from konlpy.tag import Kkma, Komoran
import ast
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from tqdm import tqdm_notebook
import warnings

from gensim.sklearn_api import D2VTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb


warnings.filterwarnings('ignore')

from nylondetector.preprocess.siu_data_preproc import *
from nylondetector.preprocess.text_handling import ma_with_ngrams
from document_embedding import *

In [2]:
# Target year-month handed to SIUDataPreprocess as `target_yyyymm`.
tgt_yyyymm = '2021-09'

# Tab-separated hospital list; its '병원명' column is later used to match doubtful hospitals.
doubtful_hospital_list = pd.read_csv('hospital_list.csv', sep='\t')

In [3]:
### Preprocessing chunk

# Patterns and keyword lists handed to the preprocessor, named once for readability.
HASHTAG_HOSPITAL_PATTERN = '.*(안과|병원|의원)$'
HOSPITAL_NAME_PATTERN = '.*(안과|병원|의원).*'
HOSPITAL_WORDS = ['병원', '의원', '외과', '안과']
NON_HOSPITAL_WORDS = ['동물', '동물병원']
# Third positional argument of match_doubtful_hospitals; presumably a similarity cut-off — TODO confirm.
DOUBTFUL_MATCH_THRESHOLD = 0.7

preprocessor = SIUDataPreprocess(
    target_yyyymm=tgt_yyyymm,
    data_directory=os.getcwd(),
    text_columns=['name', 'title', 'content'],
)

# None of these calls returns a value we capture, so they evidently work through
# the preprocessor's internal state: keep them in this order.
preprocessor.load_data_dict()
preprocessor.transform_date()
preprocessor.combine_data_dict()
preprocessor.extract_hashtags(
    content_col='content',
    hospital_re_target=HASHTAG_HOSPITAL_PATTERN,
)

# Split the combined table into hospital-authored and personal posts.
data_hospital, data_prsnl = preprocessor.hospital_prsnl_split(
    hospital_words_list=HOSPITAL_WORDS,
    non_word_list=NON_HOSPITAL_WORDS,
)

hospital_names = (
    preprocessor
    .extract_hospital_name(HOSPITAL_NAME_PATTERN)
    .rename('hospital_name')
)
doubtful_hospital_names = (
    preprocessor
    .match_doubtful_hospitals(
        hospital_names,
        doubtful_hospital_list['병원명'],
        DOUBTFUL_MATCH_THRESHOLD,
    )
    .rename('doubtful_hospital_name')
)

# Attach both name columns to the hospital-authored posts (column-wise concat).
data_hospital_2 = pd.concat(
    [data_hospital, hospital_names, doubtful_hospital_names],
    axis=1,
)

///////////////////////////////////////////////
////////// Data Loading
///////////////////////////////////////////////
백내장+부수입: (5, 7)
백내장+소개: (1050, 7)
백내장+수당: (150, 7)
백내장+숙소: (110, 7)
백내장+실비: (1050, 7)
백내장+실손: (1050, 7)
백내장+페이백: (60, 7)
백내장+할인: (1050, 7)
백내장+호텔: (720, 7)
///////////////////////////////////////////////
////////// Date column processing
///////////////////////////////////////////////
///////////////////////////////////////////////
////////// Combining to dataframe
///////////////////////////////////////////////
Raw table---------------------
(5245, 7)
After filtering---------------
(3244, 7)
///////////////////////////////////////////////
////////// Extracting hashtags
///////////////////////////////////////////////
///////////////////////////////////////////////
////////// Data seperation; by hospital or not
///////////////////////////////////////////////
Original:  (3244, 8)
Blog written by hospital:  (817, 8)
Blog written by personal:  (2410, 8)


In [5]:
%%time

## Tagging

# N-grams are requested up to this order (fifth positional argument of ma_with_ngrams).
max_ngrams = 3

# Part-of-speech tags passed to ma_with_ngrams (presumably the tags to keep — TODO confirm).
pos_list = ('NNG', 'NNP', 'NP', 'VV', 'VA')
# pos_list = ('NNG', 'NNP', 'NP', 'VV', 'VA', 'XR', 'MAG')

# KoNLPy Komoran tagger instance used for the morphological analysis.
tagger = Komoran()

# The trailing empty list is the last positional argument of ma_with_ngrams;
# its meaning is not visible from this notebook.
data_hospital_ma = ma_with_ngrams(
    data_hospital_2,
    'content',
    tagger,
    pos_list,
    max_ngrams,
    [],
)

CPU times: user 1min 6s, sys: 3.38 s, total: 1min 9s
Wall time: 44.1 s


In [6]:
## Keyword Check
# target_words_1 = ['실손', '실소', '실비', '수수료', '수당', '할인', '세일', '페이백', '소개',
#                   '부수입', '호텔', '숙소']
# tagtet_words_2 = ['수_당', '할_인', '실_손', '실_비', '페이_백']

# for word in target_words_1:
#     data_hospital_ma[word] = data_hospital_ma['content_1gram_2'].map(lambda x: x.count(word))
# for word in tagtet_words_2:
#     data_hospital_ma[word] = data_hospital_ma['content_2gram'].map(lambda x: x.count(word))

레이블은 추후에 전달받은 것이라 아래 청크처럼 추가함.
- 현재 버전의 전처리와 맞지 않아 코드 검증용으로만 활용함 (유효성 낮음)

In [7]:
# Labels were delivered separately from the crawled data; only the first
# N_LABEL_ROWS rows of the label file are used.
LABEL_PATH = 'siu_백내장_label_ver1.txt'
N_LABEL_ROWS = 816

labels = pd.read_csv(LABEL_PATH, header=None).iloc[:N_LABEL_ROWS]

# NOTE(review): assigning a DataFrame to a column aligns on the index, so any row of
# data_hospital_ma without a matching label index receives NaN — confirm both indexes line up.
data_hospital_ma['label'] = labels.astype(int)

---

---

## GridSearchCV
- 현재는 `D2VTransformer`의 분류 시 성능을 체크하는 게 목적  
- `FeatureUnion` 활용해 앞 전처리부분은 추가 및 변경 가능
- Keyword generator 등의 custom transformer를 class로 구축해 활용가능

In [8]:
# Posts mentioning animal hospitals are out of scope. Literal substring match;
# na=False treats missing content as "no match" instead of raising.
mentions_animal_hospital = data_hospital_ma['content'].str.contains('동물병원', regex=False, na=False)

# Keep only rows that carry a label. A missing label (NaN) would otherwise pass
# the "!= 2" test below and reach the classifier as a target value.
has_label = data_hospital_ma['label'].notna()

# Label 2 is excluded from the binary task, as in the original filter.
not_excluded_label = data_hospital_ma['label'].ne(2)

# All three masks are computed on the same frame, so one combined selection is
# equivalent to filtering step by step, and re-running the cell gives the same result.
data_hospital_ma = data_hospital_ma.loc[~mentions_animal_hospital & has_label & not_excluded_label]

# Class balance of the remaining rows (displayed as the cell output).
data_hospital_ma['label'].value_counts()

0.0    499
1.0    108
Name: label, dtype: int64

In [9]:
# Embedding helper configured with three n-gram column names (presumably the
# columns produced by ma_with_ngrams — TODO confirm) and their n-gram orders.
doc_embedding = DocumentEmbedding(
    ngram_names=['content_1gram_1', 'content_2gram', 'content_3gram'],
    ngram_ns=[1, 2, 3],
    how_ngram='1_2_3',
)

In [10]:
# # filtering monograms only
# [x for x in X[0] if '_' not in x]

In [11]:
# Model inputs: documents built by the embedding helper, and labels re-indexed from 0.
X = doc_embedding.make_input(data_hospital_ma)
y = data_hospital_ma['label'].reset_index(drop=True)

# Doc2Vec embedding followed by an XGBoost classifier, both constructed with defaults.
# TODO: add a TF-IDF transformer via FeatureUnion.
pipeline_steps = [
    ('embed', D2VTransformer()),
    ('clf', xgb.XGBClassifier()),
]
pipeline = Pipeline(pipeline_steps)

In [12]:
# Show every parameter of the pipeline; nested keys use the '<step>__<param>' form
# (e.g. 'embed__window'), which is the form the grid-search dictionary expects.
pipeline.get_params()

{'memory': None,
 'steps': [('embed',
   D2VTransformer(alpha=0.025, batch_words=10000, cbow_mean=1, comment=None,
                  dbow_words=0, dm=1, dm_concat=0, dm_mean=None, dm_tag_count=1,
                  docvecs=None, docvecs_mapfile=None,
                  hashfxn=<built-in function hash>, hs=0, iter=5,
                  max_vocab_size=None, min_alpha=0.0001, min_count=5, negative=5,
                  sample=0.001, seed=1, size=100, sorted_vocab=1, trim_rule=None,
                  window=5, workers=3)),
  ('clf',
   XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
                 colsample_bynode=None, colsample_bytree=None, gamma=None,
                 gpu_id=None, importance_type='gain', interaction_constraints=None,
                 learning_rate=None, max_delta_step=None, max_depth=None,
                 min_child_weight=None, missing=nan, monotone_constraints=None,
                 n_estimators=100, n_jobs=None, num_parallel_tree=None,
            

In [13]:
%%time

# Search space: only the Doc2Vec `window` parameter of the 'embed' step is varied.
parameters_dict = {
    'embed__window': [5, 6, 7],
}

# The meaning of the trailing 'y' argument is defined by DocumentEmbedding.doc2vec_cv
# and is not visible from this notebook.
best_params_1 = doc_embedding.doc2vec_cv(
    X,
    y,
    pipeline,
    parameters_dict,
    'y',
)

-------Labels: [0. 1.]

-------Confusion Matrix
[[118  12]
 [ 17   5]]

---Accuary: 0.8092105263157895
---F1 Score: 0.25641025641025644
-------Best Parameters
{'embed__window': 7}
CPU times: user 2min 22s, sys: 1.58 s, total: 2min 23s
Wall time: 1min 31s


---

## 결정된 매개변수 적용해 Embedding 진행 후 저장

In [23]:
# Translate grid-search keys ('embed__window') into plain D2VTransformer keyword
# names ('window').
# - Only parameters of the 'embed' step are kept, so classifier entries ('clf__...')
#   added to the grid later cannot leak into D2VTransformer.set_params.
# - Only the leading step prefix is removed, so nested names that themselves
#   contain '__' stay intact.
EMBED_PREFIX = 'embed__'

best_params_2 = {
    key[len(EMBED_PREFIX):]: value
    for key, value in best_params_1.items()
    if key.startswith(EMBED_PREFIX)
}

In [26]:
# Build a default transformer and apply the selected parameters; set_params
# returns the estimator itself, so the result can be bound directly.
d2v_transformer = D2VTransformer().set_params(**best_params_2)

# Display the configured transformer as the cell output.
d2v_transformer

D2VTransformer(alpha=0.025, batch_words=10000, cbow_mean=1, comment=None,
               dbow_words=0, dm=1, dm_concat=0, dm_mean=None, dm_tag_count=1,
               docvecs=None, docvecs_mapfile=None,
               hashfxn=<built-in function hash>, hs=0, iter=5,
               max_vocab_size=None, min_alpha=0.0001, min_count=5, negative=5,
               sample=0.001, seed=1, size=100, sorted_vocab=1, trim_rule=None,
               window=5, workers=3)

In [29]:
def pickle_save(filename, object_name):
    """Serialize an object to disk with pickle.

    Parameters
    ----------
    filename : path of the file to create or overwrite (opened in binary write mode).
    object_name : the Python object to serialize (despite the name, the object itself).

    Returns
    -------
    None
    """
    with open(filename, 'wb') as out_file:
        # Pickler(file).dump(obj) is the documented equivalent of pickle.dump(obj, file).
        pickle.Pickler(out_file).dump(object_name)

# Fit the configured Doc2Vec transformer on X and persist the resulting document vectors.
EMBEDDING_PATH = 'doc_embedded_vector.pkl'

doc_embedded_vector = d2v_transformer.fit_transform(X)
pickle_save(EMBEDDING_PATH, doc_embedded_vector)

NameError: name 'pickle' is not defined

---