# Pipelining for Whole Procedure after Crawling
## Ver 2: Add `TfidfTransformer` to D2V pipeline, via `FeatureUnion`

# ■Steps
## 1. Load & Preprocess
 - using `SIUDataPreprocess`
 - **MUST load labels for testing**  
 
## 2. Find hyperparameters for document embedding
 - using `D2VInputTransformer`(custom), `D2VTransformer`, `GridSearchCV`
 - Save `.best_params_` and use later
 
## 3. Pipelining other steps for modeling
#### - Keyword Generating
- using `KeywordGenerator`(custom)

#### - Classifying
- using `FeatureUnion` for concatenating two steps above (later)
    - There are issues while doing CV,   
        because of **t-test for subsetting words according to their correlation with y**
- using `xgboost` to classify

In [1]:
import pickle
import pandas as pd
import numpy as np

import itertools
from time import time

import os
from konlpy.tag import Kkma, Komoran
import ast
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from tqdm import tqdm_notebook
import warnings

from gensim.sklearn_api import D2VTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, \
                            recall_score, f1_score,roc_auc_score
import xgboost as xgb

warnings.filterwarnings('ignore')

from nylondetector.preprocess.siu_data_preproc import *
from nylondetector.preprocess.text_handling import do_text_ma, make_ngram, add_ngrams_to_ma
from document_embedding import *
from d2v_input_transformer_2 import D2VInputTransformer
from keyword_generator import KeywordGenerator

def pickle_save(filename, object_name):
    with open(filename, 'wb') as f:
        pickle.dump(object_name, f)
        
def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    AUC = roc_auc_score(y_test, y_pred)
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
    print('AUC: {:.4f}'.format(AUC))

---

# 1. Load & Preprocess

In [2]:
tgt_keyword = 'expanded'
tgt_yyyymm = '2021-11-21'
data_dir = 'data'

doubtful_hospital_list = pd.read_csv('hospital_list.csv', delimiter='\t')

In [3]:
### Preprocessing chunk

preprocessor = SIUDataPreprocess(target_keyword=tgt_keyword,
                                 target_yyyymm=tgt_yyyymm,
                                 data_directory=data_dir,
                                 text_columns=['name', 'title', 'content'])

preprocessor.load_data_dict()
preprocessor.transform_date()
preprocessor.combine_data_dict()
preprocessor.extract_hashtags(content_col='content', 
                              hospital_re_target='.*(안과|병원|의원)$')

data_hospital, data_prsnl = preprocessor.hospital_prsnl_split(hospital_words_list=['병원', '의원', '외과', '안과'],
                                                        non_word_list=['동물', '동물병원'])


# ## 병원명 관련 과정 // 'hospital_list.csv'가 없다면 생략 가능
# hospital_names = preprocessor.extract_hospital_name('.*(안과|병원|의원).*').rename('hospital_name')
# doubtful_hospital_names = preprocessor.match_doubtful_hospitals(hospital_names, 
#                                                              doubtful_hospital_list['병원명'], 
#                                                              0.7).rename('doubtful_hospital_name')

# data_hospital_2 = pd.concat([data_hospital, hospital_names, doubtful_hospital_names], axis=1)

///////////////////////////////////////////////
////////// Data Loading
///////////////////////////////////////////////
data/blog_contents_백내장+숙소_from2020-11-21_to2021-11-21_expanded.csv
백내장+숙소: (121, 6)
data/blog_contents_백내장+수당_from2020-11-21_to2021-11-21_expanded.csv
백내장+수당: (108, 6)
data/blog_contents_백내장+할인_from2020-11-21_to2021-11-21_expanded.csv
백내장+할인: (1959, 6)
data/blog_contents_백내장+호텔_from2020-11-21_to2021-11-21_expanded.csv
백내장+호텔: (554, 6)
data/blog_contents_백내장+실비_from2020-11-21_to2021-11-21_expanded.csv
백내장+실비: (1332, 6)
data/blog_contents_백내장+실손_from2020-11-21_to2021-11-21_expanded.csv
백내장+실손: (1575, 6)
data/blog_contents_백내장+페이백_from2020-11-21_to2021-11-21_expanded.csv
백내장+페이백: (58, 6)
data/blog_contents_백내장+부수입_from2020-11-21_to2021-11-21_expanded.csv
백내장+부수입: (7, 6)
data/blog_contents_백내장+소개_from2020-11-21_to2021-11-21_expanded.csv
백내장+소개: (8330, 6)
///////////////////////////////////////////////
////////// Date column processing
/////////////////////////////////////

레이블은 추후에 전달받은거라 아래 청크처럼 추가함. 
- 현재 버전의 전처리와 맞지 않아 코드 검증용으로만 활용, 유효성 낮음

In [4]:
data_hospital.shape

(2988, 7)

In [5]:
labels = pd.read_csv('siu_백내장_label_ver1.txt', header=None)[:data_hospital.shape[0]]

data_hospital['label'] = labels.astype(int)
data_hospital['label'] = data_hospital['label'].fillna(0)

data_hospital['label'].value_counts()

0.0    2853
1.0     114
2.0      21
Name: label, dtype: int64

---

# 2. Find hyperparameters for `D2VTransformer` and `TfidfTransformer`

In [6]:
data_hospital = data_hospital.loc[data_hospital['content'].map(lambda x: '동물병원' not in x)]
data_hospital = data_hospital.loc[data_hospital['label'].map(lambda x: x!=2)].reset_index(drop=True)

data_hospital['label'].value_counts()

0.0    2566
1.0     114
Name: label, dtype: int64

In [7]:
## Trimming 1% from each side w.r.t. length of content
trim_thresh = .01
quantile_lower = data_hospital['content'].apply(len).quantile(0 + trim_thresh)
quantile_upper = data_hospital['content'].apply(len).quantile(1 - trim_thresh)

data_hospital = data_hospital.loc[(data_hospital['content'].apply(len) > quantile_lower) \
                            & (data_hospital['content'].apply(len) < quantile_upper)].reset_index(drop=True)
data_hospital.shape

(2626, 8)

In [8]:
%%time

X = data_hospital['content']
y = data_hospital['label']

tagger = Komoran()
pos_list = ('NNG', 'NNP', 'NP', 'VV', 'VA')
stopwords = []

# X_ma = do_text_ma(X, tagger, pos_list, stopwords, is_morph='y')

CPU times: user 20.8 s, sys: 624 ms, total: 21.4 s
Wall time: 11.5 s


In [9]:
# d2v_input = D2VInputTransformer(ngram_names, ngram_ns, '1')
# tfidf = TfidfTransformer()

In [32]:
# aaa = d2v_input.fit_transform(X_ma)

In [33]:
# bbb = tfidf.fit_transform(aaa)

In [10]:
## Set pipeline for d2v hyperparameters
tagger = Komoran()
pos_list = ('NNG', 'NNP', 'NP', 'VV', 'VA')
stopwords = []
ngram_names = ['content_1gram_1', 'content_2gram', 'content_3gram']
ngram_ns = [1,2,3]
how_ngram = '1_2_3'
is_morph = 'y'

# pipeline_d2v = Pipeline([
#                             ('preproc', D2VInputTransformer(ngram_names, ngram_ns, how_ngram)),
#                             ('embed', D2VTransformer()),
#                             ('clf', xgb.XGBClassifier(eval_metric = 'logloss'))
#                         ]) 

name_ftr_union = 'features'
name_d2v_pplne = 'd2v'

pipeline_whole = Pipeline([
                            (name_ftr_union, FeatureUnion([
                                (name_d2v_pplne, Pipeline([
                                    ('preproc', D2VInputTransformer(tagger, pos_list, stopwords,
                                                                    ngram_names, ngram_ns, how_ngram)),
                                    ('embed', D2VTransformer())
                                ])),
                                ('tfidf', TfidfTransformer())
                            ])),
    
                            ('clf', xgb.XGBClassifier(eval_metric = 'logloss'))
                        ])

In [11]:
# pipeline_whole.get_params()

In [12]:
# aaa = do_text_ma(X, tagger, pos_list, stopwords)

In [13]:
%%time

## Set parameters and do CV

parameters_whl = {
    f'{name_ftr_union}__{name_d2v_pplne}__embed__window' : [5,6,7]
}

X_train, X_test, y_train, y_test = train_test_split(X, y)


cv_d2v = RandomizedSearchCV(pipeline_whole, parameters_whl)
cv_d2v.fit(X_train, y_train)
# cv_d2v.fit(X_train.reset_index(drop=True), y_train.reset_index(drop=True))

TypeError: No matching overloads found for constructor kr.co.shineware.nlp.komoran.core.Komoran(), options are:
	public kr.co.shineware.nlp.komoran.core.Komoran(kr.co.shineware.nlp.komoran.constant.DEFAULT_MODEL)
	public kr.co.shineware.nlp.komoran.core.Komoran(java.lang.String)



In [53]:
## Check the best parameters and save

print(cv_d2v.best_params_)
best_params_d2v = cv_d2v.best_params_

{'embed__window': 5}


### 결정된 `best_params_d2v` 적용 및 Embedding 진행 후 저장

In [54]:
pipeline_whl_final = Pipeline([
                                (name_ftr_union, FeatureUnion([
                                    (name_d2v_pplne, Pipeline([
                                        ('preproc', D2VInputTransformer(ngram_names, ngram_ns, how_ngram)),
                                        ('embed', D2VTransformer())
                                    ])),
                                    ('tfidf', TfidfTransformer())
                                ]))
                            ])

pipeline_whl_final.set_params(**best_params_d2v)

Pipeline(steps=[('preproc',
                 D2VInputTransformer(how_ngram='1_2_3',
                                     ngram_names=['content_1gram_1',
                                                  'content_2gram',
                                                  'content_3gram'],
                                     ngram_ns=[1, 2, 3])),
                ('embed', D2VTransformer())])

In [22]:
doc_embedded_vector = pipeline_d2v_final.fit_transform(X_ma)
pickle_save('doc_embedded_vector.pkl', doc_embedded_vector)

NameError: name 'pipeline_d2v_final' is not defined

---

# 3. Pipelining 3 steps for modeling 

In [None]:
%%time

keyword_gen = KeywordGenerator(ngram_names, ngram_ns, y)

keyword_based_df = keyword_gen.fit_transform(X_ma)
keyword_based_df

In [15]:
doc_embedded_df = pd.DataFrame(doc_embedded_vector)
doc_embedded_df.columns = ['embed_{}'.format(x) for x in range(doc_embedded_df.shape[1])]
doc_embedded_df

Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_90,embed_91,embed_92,embed_93,embed_94,embed_95,embed_96,embed_97,embed_98,embed_99
0,-0.130672,-0.139732,-0.280245,-0.306579,0.040433,-0.075980,-0.404651,-0.183711,-0.242848,-0.154886,...,-0.189532,0.183875,0.220398,0.134514,-0.105723,-0.442406,0.151708,-0.193907,-0.085935,0.517322
1,-0.187871,-0.085935,-0.567291,-0.476746,0.183907,-0.243452,-0.666741,-0.458038,-0.631209,-0.384837,...,-0.443228,0.172696,0.411963,0.142299,-0.177837,-0.736314,0.225144,-0.411812,-0.316324,1.240476
2,-0.163872,-0.363274,-0.417238,-0.356397,0.076937,-0.192622,-0.664016,-0.209913,-0.297889,-0.308035,...,-0.311872,0.266376,0.303512,0.308851,-0.067522,-0.717268,0.323501,-0.338088,-0.106795,0.815071
3,-0.183429,-0.427713,-0.337900,-0.331755,0.067737,-0.274567,-0.613142,-0.165823,-0.331394,-0.365816,...,-0.356037,0.194347,0.662764,0.277113,-0.036117,-1.007436,0.285898,-0.302107,-0.290115,0.812572
4,-0.149656,0.154097,-0.664767,-0.876243,0.259200,-0.140386,-0.583339,-0.518950,-0.724208,-0.088212,...,-0.436539,0.381203,0.131788,0.194018,-0.342730,-0.624315,0.086143,-0.468958,-0.037061,1.037269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602,0.001609,-0.014882,-0.012283,-0.038307,-0.011720,0.004587,-0.044771,0.007221,-0.043715,0.018086,...,0.005673,-0.008816,0.042763,-0.006269,-0.001272,-0.056176,-0.001708,0.018716,-0.027250,0.017651
603,-0.160529,-0.094069,-0.688987,-0.808263,0.285809,-0.430363,-0.826116,-0.600407,-0.809809,-0.642501,...,-0.542360,0.108734,0.508095,0.160788,-0.237811,-0.956217,0.265468,-0.450815,-0.293498,1.384349
604,-0.105781,-0.023528,-0.192938,-0.263216,0.057987,-0.030214,-0.253339,-0.071769,-0.142469,-0.052566,...,-0.150370,0.151482,0.171383,0.113609,-0.088020,-0.379990,0.055656,-0.081192,-0.045519,0.279030
605,-0.013886,-0.002686,-0.177681,-0.322883,0.034663,-0.080072,-0.259623,-0.170023,-0.234912,-0.112661,...,-0.098152,0.069423,0.031939,0.037107,-0.069648,-0.264644,0.120543,-0.150395,-0.070267,0.368271


In [16]:
## Make training and test set for classifying
X_fin = pd.concat([keyword_based_df, doc_embedded_df], axis=1)

X_fin_train, X_fin_test, y_train, y_test = train_test_split(X_fin, y)

In [17]:
pipeline_clf = Pipeline([
    ('clf', xgb.XGBClassifier())
])

pipeline_clf.get_params()

{'memory': None,
 'steps': [('clf',
   XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
                 colsample_bynode=None, colsample_bytree=None, gamma=None,
                 gpu_id=None, importance_type='gain', interaction_constraints=None,
                 learning_rate=None, max_delta_step=None, max_depth=None,
                 min_child_weight=None, missing=nan, monotone_constraints=None,
                 n_estimators=100, n_jobs=None, num_parallel_tree=None,
                 objective='binary:logistic', random_state=None, reg_alpha=None,
                 reg_lambda=None, scale_pos_weight=None, subsample=None,
                 tree_method=None, validate_parameters=None, verbosity=None))],
 'verbose': False,
 'clf': XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
               colsample_bynode=None, colsample_bytree=None, gamma=None,
               gpu_id=None, importance_type='gain', interaction_constraints=None,
               learnin

In [18]:
%%time

parameters_clf = {
    'clf__n_estimators': [100, 110, 120]
}

cv_clf = GridSearchCV(pipeline_clf, parameters_clf)
cv_clf.fit(X_fin_train, y_train)

CPU times: user 18.7 s, sys: 7.48 ms, total: 18.7 s
Wall time: 1.35 s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('clf',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                                      

In [19]:
best_model_clf = cv_clf.best_estimator_
best_model_clf.fit(X_fin_train, y_train)

Pipeline(memory=None,
         steps=[('clf',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1,
                               objective='binary:logistic', random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=None))],
         verbose=False)

In [20]:
y_pred = best_model_clf.predict(X_fin_test)

get_clf_eval(y_test, y_pred)

오차행렬:
 [[116   3]
 [ 21  12]]

정확도: 0.8421
정밀도: 0.8000
재현율: 0.3636
F1: 0.5000
AUC: 0.6692


In [21]:
# save model
joblib.dump(best_model_clf, 'siu_xgb_clf_ver1.pkl')

['siu_xgb_clf_ver1.pkl']