In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook

In [None]:
# Words used to discard unwanted posts, taken from the 'filter_word' column
# of the Excel sheet.
filter_word = pd.read_excel('filter_word.xlsx').loc[:, 'filter_word'].values

# Directory that holds the crawl result folders.
path = './output/크롤링/'

In [None]:
def crawling_merge(path, verbose=True, not_all=False, sep_list=''):
    """Merge the crawled CSV files of every keyword folder under ``path``.

    For each keyword folder, all CSV files inside it are concatenated, rows
    with a duplicated ``full_text`` are dropped, and rows whose ``full_text``
    or ``title`` contains any entry of the module-level ``filter_word`` are
    removed. ``year`` and ``month`` columns are derived from ``post_dates``
    (split on ``"-"``; the first two fields are converted to ``int``).
    The cleaned frame is written to
    ``<path>/통합/{keyword}_통합_{row count}.csv``.

    Parameters
    ----------
    path : str
        Directory containing one sub-folder per crawled keyword.
    verbose : bool, default True
        Print the row counts before/after cleaning for each folder.
    not_all : bool, default False
        If True, process only the folders named in ``sep_list`` instead of
        every folder found under ``path``.
    sep_list : list of str, default ''
        Folder names to process when ``not_all`` is True. A single string is
        treated as one folder name.

    Returns
    -------
    result_df : DataFrame
        One row per processed keyword with the columns ``keyword``,
        ``before_len``, ``after_len``, ``diff`` (rows removed) and ``loss``
        (removed rows as a percentage of ``before_len``).
    """
    merged_name = '통합'
    # Keep the output folder next to the keyword folders. For the usual
    # './output/크롤링/' input this is the same location as before.
    merged_dir = os.path.join(path, merged_name)
    os.makedirs(merged_dir, exist_ok=True)

    if not_all:
        # A bare string would otherwise be iterated character by character.
        if isinstance(sep_list, str):
            folder_list = [sep_list] if sep_list else []
        else:
            folder_list = list(sep_list)
    else:
        # Skip the output folder itself and any stray files (e.g. .DS_Store).
        folder_list = [
            name for name in os.listdir(path)
            if name != merged_name and os.path.isdir(os.path.join(path, name))
        ]

    def has_filter_word(text):
        # True when any filter word occurs as a substring of the text.
        return any(word in text for word in filter_word)

    records = []
    for folder in tqdm_notebook(folder_list, desc='전체'):
        folder_path = os.path.join(path, folder)
        csv_files = [
            name for name in os.listdir(folder_path)
            if name.lower().endswith('.csv')
        ]
        if not csv_files:
            # Nothing to merge; avoid failing later on a frame without columns.
            print(f'{folder}: csv 파일 없음 (건너뜀)')
            continue

        # Read everything first and concatenate once instead of growing a
        # DataFrame inside the loop.
        all_df = pd.concat(
            [pd.read_csv(os.path.join(folder_path, name)) for name in csv_files],
            ignore_index=True,
        )
        before_len = all_df.shape[0]

        # Drop duplicated posts, then blank out missing values so the
        # substring checks below always operate on strings.
        all_df = all_df.drop_duplicates('full_text').fillna('')

        # Filter on the body text first, then on the title of what remains.
        body_hit = all_df['full_text'].apply(has_filter_word).astype(bool)
        clean_file = all_df[~body_hit]
        title_hit = clean_file['title'].apply(has_filter_word).astype(bool)
        # Explicit copy so adding columns below does not write into a slice.
        clean_file = clean_file[~title_hit].copy()
        after_len = clean_file.shape[0]

        # Add year and month columns taken from the post date.
        clean_file['year'] = clean_file['post_dates'].apply(
            lambda dates: int(dates.split("-")[0]))
        clean_file['month'] = clean_file['post_dates'].apply(
            lambda dates: int(dates.split("-")[1]))
        clean_file = clean_file[
            ['post_dates', 'year', 'month', 'title', 'full_text', 'url']]

        # Save the merged result for this keyword.
        clean_file.to_csv(
            os.path.join(merged_dir, f'{folder}_통합_{after_len}.csv'),
            index=False,
        )

        records.append({
            'keyword': folder,
            'before_len': before_len,
            'after_len': after_len,
        })

        if verbose:
            print('시작전row:', before_len)
            print('종료후row:', after_len)
            print(folder, "완료")
            print('---------------------')

    # Summary of how many rows each keyword lost.
    result_df = pd.DataFrame(
        records, columns=['keyword', 'before_len', 'after_len'])
    result_df['diff'] = result_df['before_len'] - result_df['after_len']
    # Round after scaling to percent so no float artefacts are reintroduced.
    result_df['loss'] = (
        result_df['diff'] / result_df['before_len'] * 100).round(2)

    return result_df

In [None]:
# Example: merge only selected keyword folders.
# sep_list = ["광진해변 +양양","광진해수욕장 +양양"]
# crawling_merge(path, verbose=True, not_all=True, sep_list=sep_list)

# Merge every keyword folder; the bare expression on the last line shows the
# before/after summary when run as a notebook cell.
result_df = crawling_merge(path=path, verbose=True)
result_df