# 0. 기본 설정 (필수실행, 수정 금지)

In [None]:
import os
import sys
import time
import warnings
import itertools
warnings.filterwarnings('ignore')

import pandas as pd
import ujson as json
from datetime import datetime
from dateutil.relativedelta import relativedelta

from nylondetector.modeling.get_score import *
from blog_hospital_crawler import HospitalCrawler
from cafe_crawl_srch_content import get_srch_cafe_df

---

# 1. 블로그

# 1-1. 수집

## ⭐️파라미터 설정(수정가능구역)

In [None]:
# Search keywords for the blog crawl. Each entry is passed as-is to
# hc.crawl_blog_urls; the double quotes inside the string are part of the
# keyword and are stripped later when it is used as a directory name.
# NOTE(review): the embedded quotes presumably request an exact-phrase search — confirm.
keyword_list = ['"강남아이원스안과"']

# Reference date as a YYYYMMDD integer (it is parsed with '%Y%m%d' in the cafe section).
start_date = 20220401
# Number of months handed to the crawler.
# NOTE(review): the recorded run walked backwards one month at a time from start_date — confirm.
how_many_months = 2

# Words whose occurrences in each post's 'content' are counted into a column
# named after the word. An empty list adds no count columns.
count_word_list = ['실비']

### URL 스크래핑

In [None]:
# Crawler instance shared by all blog cells below; constructed from start_date.
hc = HospitalCrawler(start_date)

In [5]:
# Collect blog post URLs for every keyword and report the wall-clock time taken.
start = time.time()
for keyword in keyword_list:
    # NOTE(review): the meaning of the third argument is defined in HospitalCrawler — confirm before changing it.
    hc.crawl_blog_urls(keyword, how_many_months, 133) # 133 is heuristic number.
# Despite its name, `end` holds the elapsed seconds, not a timestamp.
end = time.time() - start

print(f'elapsed = {end}')

"강남아이원스안과" start
From 20220301 to 20220401
From 20220201 to 20220301
"강남아이원스안과" end
elapsed = 33.53213858604431


### URL 중복제거

In [6]:
# Run the crawler's URL de-duplication step and report the wall-clock time taken.
start = time.time()
hc.deduplicate_url()
# `end` holds the elapsed seconds, not a timestamp.
end = time.time() - start

print(f'elapsed = {end}')

elapsed = 4.89771842956543


### 수집된 URL 기반으로 본문 스크래핑

In [10]:
# Keywords with their double quotes removed. Each one is passed to
# hc.crawl_blog_contents and is also the directory name used under data/blog/
# by the conversion cell that follows.
keyword_dir_list = [x.replace('"', '') for x in keyword_list]

# Scrape the post bodies for every keyword and report the wall-clock time taken.
start = time.time()
for keyword_dir in keyword_dir_list:
    hc.crawl_blog_contents(keyword_dir)
# `end` holds the elapsed seconds, not a timestamp.
end = time.time() - start

print(f'elapsed = {end}')

강남아이원스안과 start
process 강남아이원스안과 from 0 to 500
강남아이원스안과 end
elapsed = 1.7863726615905762


### 읽을 수 있도록 인코딩 변환 & 단어 카운트 후 저장

In [11]:
# Convert every crawled contents file into a readable CSV ("*_rdbl.csv", written
# as utf-8-sig) and add one occurrence-count column per word in count_word_list.
#
# The crawler output is plain text: a header line followed by one line per post,
# with fields separated by the literal token '|DELIMITER|'. The lines are read
# and split directly rather than through pd.read_csv(delimiter=...), because
# pandas interprets a multi-character separator as a regular expression (in
# which '|' is alternation), and because DataFrame.squeeze() collapses a
# one-row file to a scalar that has no .map().
FIELD_SEP = '|DELIMITER|'
MAX_LINE_LEN = 50000  # data lines of this length or longer are skipped


def read_delimited_contents(path, sep=FIELD_SEP, max_line_len=MAX_LINE_LEN):
    """Parse one crawler contents file into a DataFrame of strings.

    Args:
        path: File to read (UTF-8, with or without a BOM).
        sep: Literal field separator.
        max_line_len: Data lines whose length is >= this value are dropped.

    Returns:
        DataFrame whose columns come from the header line and whose index is
        the position of each kept line among the non-blank data lines, so
        dropped lines leave gaps in the index.

    Raises:
        ValueError: If the file contains no non-blank line, or a data line
            does not have the same number of fields as the header.
    """
    with open(path, encoding='utf-8-sig') as fp:
        # Surrounding whitespace is stripped and blank lines are ignored.
        lines = [stripped for stripped in (raw.strip() for raw in fp) if stripped]
    if not lines:
        raise ValueError(f'{path}: file is empty')

    header = lines[0].split(sep)
    index, rows = [], []
    for pos, line in enumerate(lines[1:]):
        if len(line) >= max_line_len:
            continue
        fields = line.split(sep)
        if len(fields) != len(header):
            raise ValueError(
                f'{path}: data line {pos + 1} has {len(fields)} fields, '
                f'expected {len(header)}'
            )
        index.append(pos)
        rows.append(fields)
    return pd.DataFrame(rows, index=index, columns=header)


start = time.time()

for keyword_dir in keyword_dir_list:
    cntnts_dir = f'data/blog/{keyword_dir}/contents'
    # Skip names containing 'ipynb' (e.g. notebook checkpoint folders) and the
    # '*_rdbl.csv' files written by an earlier run of this cell.
    file_paths = [f'{cntnts_dir}/{x}' for x in os.listdir(cntnts_dir)
                  if 'ipynb' not in x and 'rdbl' not in x]

    for path in file_paths:
        rslt = read_delimited_contents(path)

        # One column per word: number of occurrences in the post's content.
        for count_word in count_word_list:
            rslt[count_word] = rslt['content'].map(lambda x, w=count_word: x.count(w))

        rslt.to_csv(f"{path.split('.csv')[0]}" + "_rdbl.csv", encoding='utf-8-sig')

# `end` holds the elapsed seconds, not a timestamp.
end = time.time() - start

print(f'elapsed = {end}')

elapsed = 0.7964391708374023


# 1-2. 스코어 추가

In [9]:
# Root directory of the crawled blog data handed to load_files_df.
# NOTE(review): the crawl cells write to the relative path 'data/blog/...';
# presumably both resolve to the same folder — confirm the working directory.
dir_blog = '../crawling/data/blog'

In [10]:
# Load the crawled posts for the configured keywords, then cleanse them; the
# cleansing step returns two frames, bound here as df_hsptl and df_prsnl.
# NOTE(review): the recorded output labels them hospital-written and personal posts.
# NOTE(review): the meaning of [.1, .1] and 202109 is defined by cleanse_df in
# nylondetector — confirm before changing either value.
df = load_files_df(dir_blog, keyword_list, start_date)
df_hsptl, df_prsnl = cleanse_df(df, [.1, .1], start_date, 202109)

////////// Converting dates
////////// Extracting hashtags
////////// Data seperation; hospital and personal
Original: (212, 8)
Blog written by hospital: (173, 8)
Blog written by personal: (39, 8)


In [11]:
%%time

# Build the model input from the hospital frame: make_morph_series processes
# its 'content' column (is_save=False), and make_input_df combines that result
# with df_hsptl.
# NOTE(review): presumably morphological analysis of the post text — confirm in nylondetector.
X_ma = make_morph_series(df_hsptl['content'], is_save=False)
input_df = make_input_df(df_hsptl, X_ma)

CPU times: user 40.8 s, sys: 825 ms, total: 41.7 s
Wall time: 24.9 s


In [12]:
# Score the hospital posts using the saved model file at model_path.
# NOTE(review): the path is relative to the working directory and points
# outside this repository — confirm it exists on the machine running this.
model_path = '../../../nylon-detector-backup/modeling/siu_clf_ver2.sav'
scored_df = get_scored_df(df_hsptl, input_df, model_path)

In [18]:
# Save the scored posts under data/scored_results; the file name carries the
# run date, so a second run on the same day overwrites the earlier file.
scored_dir = 'data/scored_results'

# exist_ok=True creates the folder when missing and is a no-op otherwise,
# without a separate existence check that could race with another process.
os.makedirs(scored_dir, exist_ok=True)

# Today's date as YYYY-MM-DD.
run_date = datetime.today().date().isoformat()
to_save = f'{scored_dir}/scored_result_{run_date}.csv'
scored_df.to_csv(to_save, encoding='utf-8-sig')

- 이후 `crawling/data/scored_results/scored_result_실행일자.csv` 다운로드

---

# 2. 카페

## 파라미터 설정(수정가능구역)

In [3]:
# Search keywords for the cafe search; the search cell below iterates over them.
keywords = ['실비보험']
# Passed to get_srch_cafe_df as its page limit argument.
page_max = 15 # preferably as large as possible

# NOTE: this rebinds the how_many_months set in the blog parameter cell, so
# re-running blog cells after this one will use 10.
how_many_months = 10

In [4]:
# Derive the cafe search window from start_date (set in the blog parameter
# cell): one 'YYYYMMDD' string per month, stepping back month by month for
# how_many_months months.
base_date = datetime.strptime(str(start_date), '%Y%m%d')

date_lst = []
for months_back in range(how_many_months + 1):
    shifted = base_date - relativedelta(months=months_back)
    date_lst.append(shifted.strftime('%Y%m%d'))

# date_lst runs newest -> oldest, so the window is [last entry, first entry].
period_to, period_from = date_lst[0], date_lst[-1]

## 2-1. 각 카페 내의 특정 키워드 검색목록

In [5]:
# print(keyword, period_from, period_to, page_max)

In [6]:
# !python3 cafe_get_each_info.py run $keyword $period_from $period_to $page_max

### - `pandas` 버전 문제 존재 (2022-04-28 기준)

## 2-2. 카페 전체 검색목록

In [7]:
# Optional search words: a list of strings in the order
# include, includeAll, exclude.
optional_words = ['백내장','실비','동물']

# Create the output folder up front so to_csv does not fail on a fresh
# checkout; exist_ok=True makes this a no-op when it already exists.
cafe_dir = 'data/cafe_search'
os.makedirs(cafe_dir, exist_ok=True)

# Run the cafe-wide search once per keyword over [period_from, period_to] and
# write each result to its own CSV (utf-8-sig).
for keyword in keywords:
    result = get_srch_cafe_df(keyword,
                              period_from,
                              period_to,
                              page_max,
                              optional_words)

    save_to = f'{cafe_dir}/cafe_srch_contents_{keyword}_from{period_from}_to{period_to}.csv'
    result.to_csv(save_to, encoding='utf-8-sig')

- 이후 `crawling/data/cafe_search/cafe_srch_contents_검색키워드_~~~.csv` 다운로드

### - `pandas` 버전 문제로 인해 comments 미기재 (2022-05-01 기준)