In [2]:
# import
# standard library
import gc
import json
import os
import random
import re
from datetime import timedelta  # date/time arithmetic

# third-party
import imageio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import wordcloud
from PIL import Image
from tqdm import tqdm_notebook  # deprecated alias, kept for backward compatibility
from tqdm.notebook import tqdm  # progress bar widget

# Use a font with Hangul glyphs so Korean text in figures is not rendered as
# empty boxes, and keep the minus sign drawable with that font.
plt.rc('font',family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False

# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML('<style>.container {width:100% !important; }</style>'))

### load

In [3]:
# Common stopwords, read from the '불용어' column of the spreadsheet.
# .xlsx is a zipped XML container, so no text encoding is needed; `encoding`
# is not part of read_excel's documented signature.
sw = (
    pd.read_excel("stopword(cp949).xlsx")['불용어']
    .dropna()        # blank spreadsheet cells would otherwise come through as NaN
    .astype(str)     # WordCloud lower-cases every stopword, so they must be strings
    .tolist()
)
# Extra stopwords per keyword (dict: keyword -> list of words).
# NOTE(review): the file is opened with the platform default encoding — confirm
# this matches how custom_sw.json was written.
with open('custom_sw.json') as load_file:
    custom_sw = json.load(load_file)

In [4]:
def _tokens_from_cell(cell):
    """Parse one stringified token list (e.g. "['a', 'b']") into ['a', 'b']."""
    stripped = re.sub(r"[\[\]' ]", "", str(cell))
    # An empty cell splits into [''], so drop empty tokens.
    return [token for token in stripped.split(',') if token]


def _hue_color(word, font_size, position, orientation, random_state=None, **kwargs):
    """WordCloud color function: fully saturated color with a hue of 200-300."""
    # Draw from the generator handed over by `recolor` so that its seed is
    # honoured; fall back to the global `random` module when none is given.
    rng = random_state if hasattr(random_state, 'randint') else random
    return 'hsl(%d,100%%,50%%)' % rng.randint(200, 300)  # hue, saturation, lightness


def draw_wordcloud(df, stopword, custom_sw, keyword, options="all"):
    '''Render a word cloud from a tokenized DataFrame and save it as a PNG.

    Parameters
    ----------
    df (DataFrame) : data to analyse. Must have 'Noun' and 'Adjective' columns
        holding stringified token lists and a 'year' column. Not modified.
    stopword (list) : common stopwords. Not modified.
    custom_sw (dictionary) : extra stopwords per keyword; a missing keyword
        simply adds nothing.
    keyword (string) : search keyword of the data. It is added to the
        stopwords and used in the output path.
    options (string) : (Default = "all") "noun" uses nouns only, "adjective"
        uses adjectives only; "all" (and any other value) uses both.

    Return
    ------
    None. A 10x10 inch figure is saved to
        './new_output/워드클라우드/{keyword}_{options}/{keyword}_{time}_{options}.png'
    where {time} is the single year in `df`, or '{min}~{max}' when `df` spans
    several years.
    '''
    # Build the stopwords on a fresh set so the caller's list is left untouched;
    # the same list is typically reused for every keyword.
    stopwords = set(stopword)
    stopwords.add(keyword)
    extra = (custom_sw.get(keyword) if isinstance(custom_sw, dict) else None) or []
    if isinstance(extra, str):
        extra = [extra]  # a bare string would otherwise be added character by character
    stopwords.update(extra)

    # Flatten the per-row token lists.
    frame = df.fillna("")
    nouns = [tok for cell in frame['Noun'] for tok in _tokens_from_cell(cell)]
    adjectives = [tok for cell in frame['Adjective'] for tok in _tokens_from_cell(cell)]

    if options == 'noun':
        words = nouns
    elif options == 'adjective':
        words = adjectives
    else:
        words = nouns + adjectives

    # Label used in the title and the file name.
    years = frame['year'].unique()
    if len(years) != 1:
        period = '{s}~{l}'.format(s=years.min(), l=years.max())
    else:
        period = years[0]

    # Mask: paste the cloud shape onto a white canvas, using its own alpha
    # channel as the paste mask.
    img_path = "vector-cloud-png.png"
    with Image.open(img_path) as source_img:
        img = source_img.convert('RGBA')
    mask = Image.new("RGB", img.size, (255, 255, 255))
    mask.paste(img, img)
    mask = np.array(mask)

    text = ' '.join(words)

    wordc = wordcloud.WordCloud(background_color='White', max_words=200,
                                font_path='C:/Windows/Fonts/malgun.ttf',
                                relative_scaling=0.5,
                                stopwords=stopwords,
                                collocations=False,
                                mask=mask)
    wordc.generate(text)
    wordc.recolor(color_func=_hue_color, random_state=3)

    # Draw and save; always release the figure, even if saving fails.
    fig, ax = plt.subplots(figsize=(10, 10))
    try:
        ax.axis('off')
        ax.set_xticks([])
        ax.set_yticks([])
        fig.subplots_adjust(left=0, bottom=0, right=1, top=1)
        ax.set_title(f'{period}', fontsize=20)  # show the year only
        ax.imshow(wordc, interpolation='bilinear')

        out_dir = f'./new_output/워드클라우드/{keyword}_{options}/'
        os.makedirs(out_dir, exist_ok=True)
        fig.savefig(f'{out_dir}{keyword}_{period}_{options}.png', bbox_inches='tight')
    finally:
        plt.close(fig)

In [5]:
def make_gif(path):
    '''Combine the images of one folder into an animated GIF.

    Only .png / .jpg files (case-insensitive) are used. Frames are ordered by
    file name, so files named '{main}_{time}_{tail}.png' play in time order
    when {time} sorts lexicographically (e.g. four-digit years).

    Parameters
    ----------
    path (string) : folder that holds the frames.

    Return
    ------
    None. The GIF is written to
        './new_output/워드클라우드/{main}_{start}~{end}_{tail}.gif'
    where {main} and {tail} come from the first frame's name and
    {start} / {end} are the {time} parts of the first / last frame.

    Raises
    ------
    ValueError : the folder has no png/jpg file, or a frame name does not have
        the three '_'-separated parts.
    '''
    image_exts = ('.png', '.jpg')

    # os.listdir returns entries in arbitrary order; sort so that the frames
    # and the start/end label are chronological.
    frames = sorted(
        name for name in os.listdir(path)
        if os.path.splitext(name)[1].lower() in image_exts
    )
    if not frames:
        raise ValueError(f'no png/jpg images found in {path!r}')

    def _name_parts(name):
        # Drop the extension first so the last part is e.g. 'all', not 'all.png'.
        parts = os.path.splitext(name)[0].split('_')
        if len(parts) < 3:
            raise ValueError(
                f"unexpected file name {name!r}; expected '<main>_<time>_<tail>.<ext>'"
            )
        return parts

    first_parts = _name_parts(frames[0])
    last_parts = _name_parts(frames[-1])
    main, start, tail = first_parts[0], first_parts[1], first_parts[2]
    end = last_parts[1]

    images = []
    for name in frames:
        # Close each file as soon as its pixels are copied into an array.
        with Image.open(os.path.join(path, name)) as frame:
            images.append(np.array(frame))

    imageio.mimsave(f'./new_output/워드클라우드/{main}_{start}~{end}_{tail}.gif', images, fps=0.5)

In [6]:
# Draw one word cloud per (token file, year).
path = './new_output/token_통합/'
# Keep regular, non-hidden files only (the listing can also contain folders
# such as .ipynb_checkpoints) and sort them for a reproducible order.
file_list = sorted(
    name for name in os.listdir(path)
    if not name.startswith('.') and os.path.isfile(os.path.join(path, name))
)

for file in tqdm(file_list):
    file_df = pd.read_csv(os.path.join(path, file))
    keyword = file.split('_')[0]  # file names start with '<keyword>_'
    for year, sample in file_df.groupby('year'):
        # Pass a copy of the shared stopword list so that one keyword's
        # additions cannot carry over into the next call.
        draw_wordcloud(sample, list(sw), custom_sw, keyword)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




### gif 만들기
* 2010년부터 2019년까지의 추이 변화를 볼 수 있다는 특징이 있다

In [7]:
# Build one GIF per word-cloud folder.
path = './new_output/워드클라우드'

for folder in sorted(os.listdir(path)):
    # Test for a real directory instead of "name without extension": that
    # check skips folders whose name contains a dot and accepts hidden
    # folders such as .ipynb_checkpoints.
    if folder.startswith('.') or not os.path.isdir(os.path.join(path, folder)):
        continue
    make_gif(f'{path}/{folder}/')