In [18]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook
from glob import glob

In [2]:
filter_word = pd.read_excel('filter_word.xlsx')['filter_word'].values
path = 'new_output'

In [56]:
def crawling_merge(path, verbose=True, not_all=False):
    '''크롤링된 keyword별 폴더가 위치한 경로를 입력하면, 각 폴더에 있는 csv파일을 통합하고 통합폴더에 저장한다.
    
    Parameters
    ----------
    path(string) : 통합결과를 넣을 결과폴더 경로
    verbose(bool) : default = True, 각 폴더 통합진행과정 출력
    not_all(bool) : default = False, 전체폴더가 아닌 일부 사용시
    
    Returns
    --------
    result_df : DataFrame
        각 keyword별 전후차이 비교
    통합폴더에 각 keyword별로 저장 ex) f"{키워드}_통합_{전체 row수}.csv"
    
    '''
    #저장위치
    os.makedirs(f'./{path}/크롤링_통합/',exist_ok=True)
    
    result_dict = dict()
    result_dict['keyword'] = []
    result_dict['before_len'] = []
    result_dict['after_len'] = []
    
    
    #크롤링 내 키워드 폴더명 리스트
    folder_list = os.listdir(f'./{path}/크롤링')

    for folder in tqdm_notebook(folder_list,desc = '전체'):
        file_list = os.listdir(f'./{path}/크롤링/{folder}')
        files = glob(f'./{path}/크롤링/{folder}/*.csv')
        all_df = pd.DataFrame()
        for file in files:
            read_file = pd.read_csv(file)
            all_df = pd.concat([all_df,read_file])
        before_len = all_df.shape[0] # before

        #필터링
        all_df = all_df.fillna('')
        all_df = all_df.drop_duplicates('full_text') #중복제거
        text_filtered = all_df[~(all_df.text.apply(lambda x : any(ele in x for ele in filter_word)))] #text 필터링
        clean_file = text_filtered[~(text_filtered.title.apply(lambda x : any(ele in x for ele in filter_word)))] #title 필터링
        after_len = clean_file.shape[0] #after

        #연도 달 추가
        clean_file['year'] = clean_file.post_dates.apply(lambda dates : int(dates.split("-")[0]))
        clean_file['month'] = clean_file.post_dates.apply(lambda dates : int(dates.split("-")[1]))
        
        #저장 
        clean_file.to_csv(f'./{path}/크롤링_통합/{folder}_통합_{after_len}.csv',index = False)
        
        #결과저장
        result_dict['keyword'].append(folder)
        result_dict['before_len'].append(before_len)
        result_dict['after_len'].append(after_len)        
                                   
        if verbose == True:                        
            print('시작전row:',before_len)   
            print('종료후row:',after_len)
            print(folder,"완료")
            print('---------------------')
    #총결과 출력
    result_df = pd.DataFrame(result_dict)
    result_df['diff'] = result_df['before_len'] -result_df['after_len']
    result_df['loss'] = round(result_df['diff']/result_df['before_len'],4) *100

    return result_df

In [58]:
#전체
result_df = crawling_merge(path, verbose=True)
result_df

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='전체', max=1.0, style=ProgressStyle(description_width='init…

시작전row: 9660
종료후row: 6277
강릉 완료
---------------------



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,keyword,before_len,after_len,diff,loss
0,강릉,9660,6277,3383,35.02
