# Pipelining for Whole Procedure after Crawling
## Ver 1: Using `GridSearchCV`

# ■Steps
## 1. Load & Preprocess
 - using `SIUDataPreprocess`
 - **MUST load labels for testing**  
 
## 2. Find hyperparameters for document embedding
 - using `D2VInputTransformer`(custom), `D2VTransformer`, `GridSearchCV`
 - Save `.best_params_` and use later
 
## 3. Pipelining other steps for modeling
#### - Keyword Generating
- using `KeywordGenerator`(custom)

#### - Classifying
- using `FeatureUnion` for concatenating two steps above (later)
    - This causes issues when running CV,   
        because of the **t-test that subsets words according to their correlation with y**
- using `xgboost` to classify

In [2]:
import pickle
import pandas as pd
import numpy as np

import itertools
from time import time

import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname('util'))))
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname('preprocess'))))
from konlpy.tag import Kkma, Komoran
import ast
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from tqdm import tqdm_notebook
import warnings

from gensim.sklearn_api import D2VTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score,roc_auc_score
import xgboost as xgb

warnings.filterwarnings('ignore')

from preprocess.siu_data_preproc import *
from preprocess.text_handling import do_text_ma, make_ngram, add_ngrams_to_ma
from document_embedding import *
from d2v_input_transformer_2 import D2VInputTransformer
from keyword_generator import KeywordGenerator

def pickle_save(filename, object_name):
    """Serialize `object_name` with pickle and write it to `filename`.

    The target file is opened in binary write mode, so any existing file
    at that path is overwritten.
    """
    with open(filename, 'wb') as sink:
        sink.write(pickle.dumps(object_name))
        
def pickle_load(filename):
    """Read `filename` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    produced by this project (e.g. via `pickle_save`).
    """
    with open(filename, 'rb') as source:
        restored = pickle.loads(source.read())
    return restored
        
def get_clf_eval(y_test, y_pred, y_proba=None):
    """Print the confusion matrix and headline classification metrics.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted class labels; used for the confusion matrix,
            accuracy, precision, recall and F1.
        y_proba: Optional continuous scores (e.g. positive-class
            probabilities) used for ROC AUC. When omitted, AUC is computed
            from `y_pred` exactly as before. AUC from hard labels only
            reflects a single decision threshold, so pass scores whenever
            the model exposes them.

    Returns:
        None. All metrics are printed to stdout.
    """
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    # ROC AUC is a ranking metric: prefer continuous scores when available,
    # and fall back to the hard predictions to stay backward-compatible.
    auc_input = y_pred if y_proba is None else y_proba
    AUC = roc_auc_score(y_test, auc_input)
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
    print('AUC: {:.4f}'.format(AUC))

---

# 1. Load & Preprocess

레이블은 추후에 전달받은거라 아래 청크처럼 추가함. 
- 현재 버전의 전처리와 맞지 않아 코드 검증용으로만 활용, 유효성 낮음

In [3]:
# Settings handed to SIUDataPreprocess in the next cell.
# NOTE(review): judging by the loaded file names, `tgt_keyword` is the file
# suffix and `tgt_yyyymm` (a full date despite its name) is the end of the
# crawl window — confirm against SIUDataPreprocess.
tgt_keyword = 'expanded'
tgt_yyyymm = '2021-11-21'
data_dir = 'data'

# Tab-separated hospital list; within this notebook it is only referenced by
# the commented-out hospital-name matching code.
doubtful_hospital_list = pd.read_csv('preprocess/hospital_list.csv', delimiter='\t')

In [4]:
### Preprocessing chunk
# Builds the preprocessor and runs its steps in order: load, date transform,
# combine, hashtag extraction, then the hospital / personal split.
# NOTE(review): the steps presumably depend on this call order — confirm
# against SIUDataPreprocess before reordering.

preprocessor = SIUDataPreprocess(target_keyword=tgt_keyword,
                                 target_yyyymm=tgt_yyyymm,
                                 data_directory=data_dir,
                                 text_columns=['name', 'title', 'content'])

preprocessor.load_data_dict()
preprocessor.transform_date()
preprocessor.combine_data_dict()
# The regex targets strings ending in an eye-clinic / hospital / clinic suffix.
preprocessor.extract_hashtags(content_col='content', 
                              hospital_re_target='.*(안과|병원|의원)$')

# Split into hospital-authored and personal posts; `non_word_list` holds
# animal-hospital words (presumably used as exclusions — TODO confirm).
data_hospital, data_prsnl = preprocessor.hospital_prsnl_split(hospital_words_list=['병원', '의원', '외과', '안과'],
                                                        non_word_list=['동물', '동물병원'])


# ## 병원명 관련 과정 // 'hospital_list.csv'가 없다면 생략 가능
# hospital_names = preprocessor.extract_hospital_name('.*(안과|병원|의원).*').rename('hospital_name')
# doubtful_hospital_names = preprocessor.match_doubtful_hospitals(hospital_names, 
#                                                              doubtful_hospital_list['병원명'], 
#                                                              0.7).rename('doubtful_hospital_name')

# data_hospital_2 = pd.concat([data_hospital, hospital_names, doubtful_hospital_names], axis=1)

///////////////////////////////////////////////
////////// Data Loading
///////////////////////////////////////////////
data/blog_contents_백내장+숙소_from2020-11-21_to2021-11-21_expanded.csv
백내장+숙소: (121, 6)
data/blog_contents_백내장+수당_from2020-11-21_to2021-11-21_expanded.csv
백내장+수당: (108, 6)
data/blog_contents_백내장+할인_from2020-11-21_to2021-11-21_expanded.csv
백내장+할인: (1959, 6)
data/blog_contents_백내장+호텔_from2020-11-21_to2021-11-21_expanded.csv
백내장+호텔: (554, 6)
data/blog_contents_백내장+실비_from2020-11-21_to2021-11-21_expanded.csv
백내장+실비: (1332, 6)
data/blog_contents_백내장+실손_from2020-11-21_to2021-11-21_expanded.csv
백내장+실손: (1575, 6)
data/blog_contents_백내장+페이백_from2020-11-21_to2021-11-21_expanded.csv
백내장+페이백: (58, 6)
data/blog_contents_백내장+부수입_from2020-11-21_to2021-11-21_expanded.csv
백내장+부수입: (7, 6)
data/blog_contents_백내장+소개_from2020-11-21_to2021-11-21_expanded.csv
백내장+소개: (8330, 6)
///////////////////////////////////////////////
////////// Date column processing
/////////////////////////////////////

In [5]:
data_hospital.shape

(2988, 7)

In [6]:
# Read the headerless label file and keep at most as many rows as
# `data_hospital` has.
labels = pd.read_csv('siu_백내장_label_ver1.txt', header=None)[:data_hospital.shape[0]]

# NOTE(review): this assignment aligns on index labels, not on row position,
# so rows of `data_hospital` whose index is missing from `labels` become NaN
# (the value_counts output shows far fewer labelled rows than the frame has).
# Confirm that label-based alignment is what is intended.
# NOTE(review): `.fillna(0)` runs after `.astype(int)`; if the file contained
# missing values the cast would presumably fail before the fill is reached.
data_hospital['label'] = labels.astype(int).fillna(0)

In [7]:
data_hospital['label'].value_counts()

0.0    699
1.0    114
2.0     21
Name: label, dtype: int64

---

# 2. Find hyperparameters for document embedding

In [8]:
# Drop posts whose content mentions an animal hospital.
not_animal_hospital = data_hospital['content'].map(lambda text: '동물병원' not in text)
data_hospital = data_hospital[not_animal_hospital]

# Drop rows labelled 2; rows without a label (NaN) are kept.
label_is_not_two = data_hospital['label'].ne(2)
data_hospital = data_hospital[label_is_not_two].reset_index(drop=True)

# Treat the remaining unlabelled rows as class 0.
data_hospital['label'] = data_hospital['label'].fillna(0)
data_hospital['label'].value_counts()

0.0    2566
1.0     114
Name: label, dtype: int64

In [9]:
## Trim 1% from each tail of the content-length distribution
trim_thresh = .01

# Measure every post once and reuse the lengths for both bounds and the mask.
content_length = data_hospital['content'].apply(len)
quantile_lower = content_length.quantile(0 + trim_thresh)
quantile_upper = content_length.quantile(1 - trim_thresh)

# Strict inequalities: posts sitting exactly on a bound are dropped too.
longer_than_lower = content_length > quantile_lower
shorter_than_upper = content_length < quantile_upper
data_hospital = data_hospital.loc[longer_than_lower & shorter_than_upper].reset_index(drop=True)
data_hospital.shape

(2626, 8)

In [10]:
%%time

# Raw post text and its label for every remaining row.
X = data_hospital['content']
y = data_hospital['label']

# Morphological-analysis settings; within this notebook they are only passed
# to the commented-out `do_text_ma` call, whose result is loaded from
# 'X_ma.pkl' instead.
tagger = Komoran()
# NOTE(review): presumably the POS tags to keep (nouns, pronouns, verbs,
# adjectives) — confirm against do_text_ma.
pos_list = ('NNG', 'NNP', 'NP', 'VV', 'VA')
stopwords = []

# X_ma = do_text_ma(X, tagger, pos_list, stopwords, is_morph='y')

CPU times: user 15.9 s, sys: 361 ms, total: 16.2 s
Wall time: 5.79 s


In [11]:
# save
# pickle_save('X_ma.pkl', X_ma)

In [12]:
# load
X_ma = pickle_load('X_ma.pkl')

In [13]:
## Pipeline used to search the doc2vec hyperparameters
ngram_names = ['content_1gram_1', 'content_2gram', 'content_3gram']
ngram_ns = [1, 2, 3]
how_ngram = '1_2_3'

# Steps: build the doc2vec input, embed the documents, then classify, so the
# embedding settings can be scored by a downstream classifier.
d2v_steps = [
    ('preproc', D2VInputTransformer(ngram_names, ngram_ns, how_ngram)),
    ('embed', D2VTransformer()),
    ('clf', xgb.XGBClassifier(eval_metric='logloss')),
]
pipeline_d2v = Pipeline(d2v_steps)

In [14]:
data_hospital['content'].map(lambda x: x.strip()=='').sum()

0

In [15]:
# %%time

# ## Set parameters and do CV

# parameters_d2v = {
#     'embed__window' : [5,6,7]
# }

# X_train, X_test, y_train, y_test = train_test_split(X_ma, y)

# # X_train = X_train.reset_index(drop=True) 
# # y_train = y_train.reset_index(drop=True) 

# cv_d2v = GridSearchCV(pipeline_d2v, parameters_d2v)
# cv_d2v.fit(X_train, y_train)

In [16]:
# ## Check the best parameters and save

# print(cv_d2v.best_params_)
# best_params_d2v = cv_d2v.best_params_

### 결정된 `best_params_d2v` 적용 및 Embedding 진행 후 저장

In [17]:
# pipeline_d2v_final = Pipeline([
#                                 ('preproc', D2VInputTransformer(ngram_names, ngram_ns, how_ngram)),
#                                 ('embed', D2VTransformer())
#                             ])

# pipeline_d2v_final.set_params(**best_params_d2v)

In [18]:
# %%time

# doc_embedded_vector = pipeline_d2v_final.fit_transform(X_ma)
# pickle_save('doc_embedded_vector.pkl', doc_embedded_vector)

In [19]:
# load
doc_embedded_vector = pickle_load('doc_embedded_vector.pkl')

---

# 3. Pipelining 3 steps for modeling 

In [20]:
# %%time

# keyword_gen = KeywordGenerator(ngram_names, ngram_ns, y)

# keyword_based_df = keyword_gen.fit_transform(X_ma)
# keyword_based_df.head(5)

In [21]:
# # save
# pickle_save('keyword_based_df.pkl', keyword_based_df)

In [22]:
# # load
# keyword_based_df = pickle_load('keyword_based_df.pkl')

In [23]:
# doc_embedded_df = pd.DataFrame(doc_embedded_vector)
# doc_embedded_df.columns = ['embed_{}'.format(x) for x in range(doc_embedded_df.shape[1])]
# doc_embedded_df.head(5)

In [24]:
# save
# pickle_save('doc_embedded_df.pkl', doc_embedded_df)

In [25]:
# # load
# doc_embedded_df = pickle_load('doc_embedded_df.pkl')

In [26]:
# ## Make training and test set for classifying
# X_fin = pd.concat([keyword_based_df, doc_embedded_df], axis=1)

# X_fin_train, X_fin_test, y_train, y_test = train_test_split(X_fin, y)

In [27]:
# # save
# pickle_save('X_fin.pkl', X_fin)

In [28]:
# load
# Import explicitly: the top-of-notebook imports only bring in GridSearchCV and
# RandomizedSearchCV, so this name would otherwise have to come from a
# wildcard import.
from sklearn.model_selection import train_test_split

X_fin = pickle_load('X_fin.pkl')

# NOTE(review): the split is unseeded and unstratified, so metrics and the
# sampled doc numbers change between runs; with so few positive labels,
# consider `stratify=y` and a fixed `random_state`.
X_fin_train, X_fin_test, y_train, y_test = train_test_split(X_fin, y)

In [29]:
# Single-step pipeline around the XGBoost classifier; the 'clf' step name is
# what the 'clf__*' grid-search parameter keys refer to.
pipeline_clf = Pipeline([
    ('clf', xgb.XGBClassifier(eval_metric = 'logloss'))
])

# pipeline_clf.get_params()

In [30]:
%%time

# Grid over the number of boosting rounds of the 'clf' pipeline step.
parameters_clf = {
    'clf__n_estimators': [100, 110, 120]
}

# NOTE(review): no `scoring` is passed, so the estimator's default scorer is
# used (presumably accuracy); with this class imbalance a metric such as F1
# may be a better selection criterion — confirm.
cv_clf = GridSearchCV(pipeline_clf, parameters_clf)
cv_clf.fit(X_fin_train, y_train)

CPU times: user 1min 33s, sys: 104 ms, total: 1min 33s
Wall time: 23.8 s


GridSearchCV(estimator=Pipeline(steps=[('clf',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      enable_categorical=False,
                                                      eval_metric='logloss',
                                                      gamma=None, gpu_id=None,
                                                      importance_type=None,
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                     

In [31]:
# `cv_clf` was built without overriding `refit`, and GridSearchCV refits the
# best configuration on the full training data by default, so
# `best_estimator_` is already fitted on (X_fin_train, y_train). Fitting it
# again on the same data only repeated that work.
best_model_clf = cv_clf.best_estimator_
best_model_clf

Pipeline(steps=[('clf',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               eval_metric='logloss', gamma=0, gpu_id=-1,
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=120,
                               n_jobs=4, num_parallel_tree=1, predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_method='exact', validate_parameters=1,
                               verbosity=None))])

In [32]:
# Hard class predictions on the held-out split, then print the metrics.
y_pred = best_model_clf.predict(X_fin_test)

get_clf_eval(y_test, y_pred)

오차행렬:
 [[619   9]
 [ 25   4]]

정확도: 0.9482
정밀도: 0.3077
재현율: 0.1379
F1: 0.1905
AUC: 0.5618


In [33]:
# save
# pickle_save('y_pred.pkl', y_pred)

In [34]:
# # load
# y_pred = pickle_load('y_pred.pkl')

---

# Making Data for Visualization in Quick Sight

In [35]:
# Project S3 helper used by the upload cells below.
from util.S3Manager import S3Manager
s3_mng = S3Manager()

# Key prefix prepended to each uploaded visualization file name.
s3_vis_dir = 'nylon-detector/visualization/'

## 1. Related to keyword frequency

### 1-1. Simple frequency

In [36]:
# NOTE(review): 295 is presumably the number of keyword-count columns that
# precede the embedding columns in X_fin — confirm if the keyword set changes.
n_keyword_cols = 295
keyword_block = X_fin_test.iloc[:, :n_keyword_cols]
keywords = keyword_block.columns

# Keep the original row label of each test document as `doc_num`.
keyword_counts = keyword_block.reset_index(drop=False).rename(columns={'index': 'doc_num'})
predicted = pd.Series(y_pred).rename('y_pred')

# Predictions follow the row order of X_fin_test, so attach them before sorting.
data_keywords_vis = pd.concat([keyword_counts, predicted], axis=1)
data_keywords_vis = data_keywords_vis.sort_values('doc_num').reset_index(drop=True)
data_keywords_vis.head(3)

Unnamed: 0,doc_num,가격,가까이,가깝,가입,가족,감수,감언이설,감염,강하,...,해당,해외,현명,혜택,홍채,확인,활용,후기,후반,y_pred
0,3,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0.0
1,4,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0.0
2,14,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [37]:
def reshape_keywords(data, keyword_list=None):
    """Reshape wide per-keyword counts into one row per (document, keyword).

    Args:
        data: DataFrame with a 'doc_num' column, a 'y_pred' column and one
            count column per keyword.
        keyword_list: Iterable of keyword column names to unpivot. Defaults
            to the notebook-level `keywords` so existing one-argument calls
            behave as before.

    Returns:
        DataFrame with columns 'doc_num', 'term', 'term_cnt' and 'pred_yn'.
        The per-keyword slices are stacked in keyword order and each slice
        keeps the row index of `data`, so index values repeat per keyword.
        An empty keyword list yields an empty frame with those columns.
    """
    if keyword_list is None:
        keyword_list = keywords

    columns = ['doc_num', 'term', 'term_cnt', 'pred_yn']
    # Build every slice first and concatenate once; growing the result with
    # pd.concat inside the loop copies the accumulated rows on every step.
    frames = [
        pd.DataFrame({
            'doc_num': data['doc_num'],
            'term': keyword,
            'term_cnt': data[keyword],
            'pred_yn': data['y_pred'],
        })
        for keyword in keyword_list
    ]
    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, axis=0)

In [38]:
data_keywords_vis_2 = reshape_keywords(data_keywords_vis)
data_keywords_vis_2.head(5)

Unnamed: 0,doc_num,term,term_cnt,pred_yn
0,3,가격,0,0.0
1,4,가격,0,0.0
2,14,가격,1,0.0
3,22,가격,2,0.0
4,25,가격,0,0.0


In [39]:
# save to local directory
data_keywords_vis_2.to_csv('visualize_data_1_keywords.csv', encoding='utf-8-sig')

In [40]:
# upload to s3
s3_mng.upload_file('visualize_data_1_keywords.csv', f'{s3_vis_dir}visualize_data_1_keywords.csv')

2021-12-01 04:06:10,952 - root - INFO - upload : visualize_data_1_keywords.csv to Target: nylon-detector/visualization/visualize_data_1_keywords.csv Success.


### 1-2. Percentage grouped by `y_pred`

In [41]:
# Turn per-document term counts into 0/1 presence flags.
data_keywords_vis_bool = data_keywords_vis_2.copy()
term_present = data_keywords_vis_bool['term_cnt'].ne(0)
data_keywords_vis_bool['term_cnt'] = term_present.astype('int64')
data_keywords_vis_bool

Unnamed: 0,doc_num,term,term_cnt,pred_yn
0,3,가격,0,0.0
1,4,가격,0,0.0
2,14,가격,1,0.0
3,22,가격,1,0.0
4,25,가격,0,0.0
...,...,...,...,...
652,2583,후반,0,0.0
653,2597,후반,0,0.0
654,2604,후반,0,0.0
655,2614,후반,0,0.0


In [42]:
# Split the presence flags by predicted class, then group each part by term.
pred_flag = data_keywords_vis_bool['pred_yn']
grouped_y = data_keywords_vis_bool[pred_flag == 1].groupby(['term'])
grouped_n = data_keywords_vis_bool[pred_flag == 0].groupby(['term'])

In [43]:
# Share (in %) of documents containing each term, per predicted class:
# documents with the term divided by documents in the group.
y_hits = grouped_y['term_cnt'].sum()
y_docs = grouped_y['term_cnt'].count()
grouped_y_pcntg = (100 * y_hits / y_docs).round(2).rename('pred_y_avg')

n_hits = grouped_n['term_cnt'].sum()
n_docs = grouped_n['term_cnt'].count()
grouped_n_pcntg = (100 * n_hits / n_docs).round(2).rename('pred_n_avg')

# Place both classes side by side, one row per term.
total_pcntg = pd.concat([grouped_y_pcntg, grouped_n_pcntg], axis=1).reset_index()
total_pcntg

Unnamed: 0,term,pred_y_avg,pred_n_avg
0,가격,53.85,11.80
1,가까이,15.38,2.17
2,가깝,46.15,23.14
3,가입,69.23,6.37
4,가족,0.00,13.66
...,...,...,...
290,홍채,0.00,4.66
291,확인,61.54,32.30
292,활용,15.38,9.01
293,후기,7.69,5.28


In [44]:
# save to local directory
total_pcntg.to_csv('visualize_data_2_percentage.csv', encoding='utf-8-sig')

In [45]:
# upload to s3
s3_mng.upload_file('visualize_data_2_percentage.csv', f'{s3_vis_dir}visualize_data_2_percentage.csv')

2021-12-01 04:06:11,380 - root - INFO - upload : visualize_data_2_percentage.csv to Target: nylon-detector/visualization/visualize_data_2_percentage.csv Success.


---

## 2. Network

In [46]:
# Substrings that mark a string as hospital-related (hospital, clinic,
# surgery, eye clinic).
hospital_words_list=['병원', '의원', '외과', '안과']
# Regex matching any string that contains an eye-clinic / hospital / clinic word.
hospital_re_target = '.*(안과|병원|의원).*'

In [69]:
# 1. Collect hospital-like candidate strings from name, title and hashtags.
def _mentions_hospital(text):
    """Return True when `text` contains any word in `hospital_words_list`."""
    return any(word in text for word in hospital_words_list)


# Keyed by row position. Each row is examined exactly once; the previous loop
# re-filtered the entire '#s' column on every iteration, which made this cell
# quadratic in the number of rows.
hospitals_dict = {
    i: [text for text in (name, title) if _mentions_hospital(text)]
    + [tag for tag in hashtags if _mentions_hospital(tag)]
    for i, (name, title, hashtags) in enumerate(
        zip(data_hospital['name'], data_hospital['title'], data_hospital['#s'])
    )
}

# 2. Review the candidates once more and reduce them to hospital-name tokens.
hospital_pattern = re.compile(hospital_re_target)


def _hospital_tokens(candidates):
    """Return the whitespace-separated tokens that match the hospital regex."""
    tokens = []
    for candidate in candidates:
        cleaned = candidate.replace('#', '')
        # Skip candidates that contain no hospital word at all.
        if not hospital_pattern.search(cleaned):
            continue
        tokens.extend(tok for tok in cleaned.split() if hospital_pattern.match(tok))
    return tokens


# dtype=object keeps an empty input well-defined; values are lists of strings.
hospitals_series = pd.Series(hospitals_dict, dtype=object).map(_hospital_tokens)

In [70]:
hospitals_series

0                                                   [안과의]
1                                   [빛소망안과의원, 여의도안과국가유공자]
2                                         [빛소망안과의원, 보훈병원]
3       [광주안과동그라미빌딩, 광주안과, 동그라미광주안과, 광천동안과, 상무지구안과, 목포...
4                                          [김해서울안과, 서울안과]
                              ...                        
2621                                       [푸른세상안과, 안양안과]
2622        [서울대입구역안과, 청안과를, 청안과, 서울대입구역안과, 관악구안과, 봉천동안과]
2623                                      [푸른세상안과, 안양안과를]
2624                                          [우리안과소개합니다]
2625                                           [밝은세상안과에서]
Length: 2626, dtype: object

In [None]:
data_keywords_vis