In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm, tqdm_notebook,tqdm_pandas
import itertools
import re

### sample 강릉

In [3]:
# Directory that is scanned for input files (relative to the notebook's working
# directory), and the names of every entry found directly inside it.
path = './output/token/'
file_list = os.listdir(path)

In [10]:
# Keep only the directory entries whose extension is exactly '.csv'.
csv_ls = [entry for entry in file_list if os.path.splitext(entry)[1] == '.csv']

### 연도별 gap 확인

In [3]:
def _parse_token_list(cell):
    """Convert a stringified token list such as "['a', 'b']" into ['a', 'b'].

    Brackets, single quotes and spaces are stripped and the remainder is split
    on commas. Empty pieces are dropped, so an empty cell (or the text "[]")
    becomes [] instead of [''] and no empty-string token reaches the counts.
    Values that are already lists are returned untouched, which keeps the cell
    safe to re-run.
    """
    if isinstance(cell, list):
        return cell
    # Raw string: "\[" and "\]" are not valid escapes in a normal string literal.
    cleaned = re.sub(r"[\[\]' ]", "", cell)
    return [token for token in cleaned.split(',') if token]


# Missing values become empty strings so every cell can be parsed as text.
all_df = all_df.fillna("")
for token_column in ('Noun', 'Adjective'):
    all_df[token_column] = all_df[token_column].apply(_parse_token_list)

In [59]:
def count_word(df, tag, stopword, save_name, start_year=2010, end_year=2020, gap=1, month=False, months=[], save=True):
    '''Count token occurrences per year and report each token's rank change.

    For every counted year the tokens found in ``df[tag]`` are tallied, tokens
    listed in ``stopword`` are removed, and the 15 most frequent remaining
    tokens are ranked by count (ties share the average rank). Each reported
    year is then compared with the year ``gap`` years earlier.

    Parameters
    ----------
    df : DataFrame
        Needs a ``year`` column and the ``tag`` column; a ``month`` column is
        also required when ``month`` is true. Every cell of ``df[tag]`` is
        iterated as a collection of tokens.
    tag : str
        Name of the token column to count (e.g. ``'Noun'``).
    stopword : list
        Tokens to exclude from the counts.
    save_name : str
        File name (without extension) used for the CSV when ``save`` is true.
    start_year : int
        First year to report.
    end_year : int
        End of the reported range (exclusive).
    gap : int
        Step between reported years, and the distance to the year each
        reported year is compared with.
    month : bool
        When true, only rows whose ``month`` is in ``months`` are used.
    months : list
        Months to keep when ``month`` is true. Never mutated here.
    save : bool
        When true, write the result to ``./output/연도별카운트/<save_name>.csv``
        (cp949 encoded, without the index column).

    Returns
    -------
    vertical_df : DataFrame
        One ``name`` / ``value`` / ``year`` / ``gap`` column group per reported
        year, placed side by side. ``gap`` is the previous rank minus the
        current rank (positive means the token moved up), or ``"NEW"`` when the
        token was not ranked in the comparison year. Empty when no reported
        year has any tokens.
    '''
    if month:
        df = df[df['month'].isin(months)]

    # The tally used to be fixed to 2010-2019. Keep that window as a minimum so
    # calls inside it see the same comparison years as before, but widen it to
    # cover any requested range that falls outside.
    first_counted = min(start_year, 2010)
    last_counted = max(end_year, 2020)

    yearly_frames = []
    for s_year in range(first_counted, last_counted):
        year_rows = df[df['year'] == s_year]
        tokens = list(itertools.chain.from_iterable(year_rows[tag]))
        if not tokens:
            continue

        counted = (
            pd.Series(tokens)
            .value_counts()
            .rename_axis('name')
            .reset_index(name='value')
        )
        # .copy() so the column assignments below act on an independent frame.
        counted = counted[~counted['name'].isin(stopword)].head(15).copy()
        if counted.empty:
            continue

        counted['year'] = s_year
        counted['rank'] = counted['value'].rank(ascending=False)
        yearly_frames.append(counted)

    if yearly_frames:
        main_df = pd.concat(yearly_frames)
    else:
        main_df = pd.DataFrame(columns=['name', 'value', 'year', 'rank'])

    # (year, token) -> rank, so the comparison is a dictionary lookup instead
    # of a boolean mask over the whole frame for every token.
    rank_of = {
        (year, name): rank
        for year, name, rank in zip(main_df['year'], main_df['name'], main_df['rank'])
    }

    # gap
    years = list(range(start_year, end_year, gap))
    main_df = main_df[main_df['year'].isin(years)].reset_index(drop=True)

    gaps = []
    for year, name, rank in zip(main_df['year'], main_df['name'], main_df['rank']):
        old_rank = rank_of.get((year - gap, name))
        gaps.append("NEW" if old_rank is None else float(old_rank - rank))
    # Object dtype from the start: the column mixes numbers with "NEW".
    main_df['gap'] = pd.Series(gaps, index=main_df.index, dtype=object)
    main_df = main_df.drop(columns='rank')

    # vertical: one block of columns per reported year, side by side
    per_year = [
        main_df[main_df['year'] == year].reset_index(drop=True)
        for year in main_df['year'].unique()
    ]
    vertical_df = pd.concat(per_year, axis=1) if per_year else pd.DataFrame()

    if save:
        out_path = './output/연도별카운트/{save_name}.csv'.format(save_name=save_name)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        # index=False (a real boolean): the string 'False' is truthy and made
        # pandas write the index column anyway.
        vertical_df.to_csv(out_path, encoding='cp949', index=False)

    return vertical_df

In [67]:
# Nouns, reported every second year, with the listed stopwords excluded.
noun_df = count_word(
    all_df,
    'Noun',
    ['강릉', '강릉시', '강원'],
    save_name='2년 gap명사',
    gap=2,
)
# Adjectives, reported with the function's default year step.
adj_df = count_word(
    all_df,
    'Adjective',
    ['이다'],
    save_name='1년 gap형용사',
)

In [65]:
# Summer only: restrict the rows to months 6, 7 and 8 before counting.
vertical_df = count_word(
    all_df,
    'Noun',
    ['강릉', '강릉시', '강원', '여름', '거리'],
    save_name='여름 1년gap명사',
    gap=1,
    month=True,
    months=[6, 7, 8],
)