In [1]:
# Start with loading all necessary libraries
import numpy as np
import pandas as pd
from os import path
from PIL import Image
import matplotlib
import matplotlib.pyplot as plt
import os
import csv
import textract
import nltk

import MeCab
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from konlpy.tag import *
from collections import Counter
from nltk.corpus import stopwords
import pyLDAvis.sklearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

%matplotlib inline

In [2]:
df = pd.read_csv('adoor_data/sns_feed.csv', encoding='UTF8')
df['created_at'] = pd.to_datetime(df['created_at'])

# df = df[df['author_id'] != 5]

start_date = pd.Timestamp(2019, 1, 28, 0)
end_date = pd.Timestamp(2019, 4, 1, 0)

mask = (df['created_at'] > start_date) & (df['created_at'] <= end_date)
df = df.loc[mask]

df.head()

Unnamed: 0,adoor,SNS,SNS 아이디,private,created_at,photo,content
0,2,facebook,김유리,False,2019-02-08,홍보,저와 팀원들이 영혼을 갈아넣은 서비스 베타버전이 드디어 나왔어요!!🤩\n밤잠을 줄여...
1,2,instagram,nnnyu_ri,False,2019-02-08,홍보,https://adoor.app @adoor.team\n저와 팀원들이 영혼을 갈아넣...
2,2,instagram,yulo_mon,True,2019-02-07,"가족, 아기, 강아지",카조꾸또 잇쇼니 토떼모 우레시깟딴데쓰
3,2,instagram,yulo_mon,True,2019-02-25,일상,🙋🏻‍♀️
4,2,instagram,yulo_mon,True,2019-03-02,나들이,필름캐머러 첫 롤 사진들1\n수라미가 준 로모그래피 심플유즈 기본 장착 컬러네거티브...


In [3]:
def getNVM_lemma(text):
    tokenizer = MeCab.Tagger()
    parsed = tokenizer.parse(text)
    word_tag = [w for w in parsed.split("\n")]
    pos = []
    tags = ['NNG','NNP','VV','VA', 'VX', 'VCP','VCN']
    for word_ in word_tag[:-2]:
        word = word_.split("\t")
        tag = word[1].split(",")
        if(len(word[0]) < 2) or ("게" in word[0]):
            continue
        if(tag[-1] != '*'):
            t = tag[-1].split('/')
            if(len(t[0]) > 1 and ('VV' in t[1] or 'VA' in t[1] or 'VX' in t[1])):
                pos.append(t[0])
        else:
            if(tag[0] in tags):
                pos.append(word[0])
    return pos

In [4]:
tf_vect = TfidfVectorizer(tokenizer=getNVM_lemma,ngram_range=(1, 2), min_df=2, max_df=20000) 
dtm = tf_vect.fit_transform(df['content'].values.astype('U'))

n_topics = 3

lda = LatentDirichletAllocation(n_components=n_topics) 
lda.fit(dtm)

LatentDirichletAllocation(n_components=3)

In [5]:
names = tf_vect.get_feature_names() 
topics = dict() 

for idx, topic in enumerate(lda.components_): 
    vocab = [] 
    for i in topic.argsort()[:-(10-1):-1]: 
        vocab.append((names[i], topic[i].round(2))) 
    print("주제 %d:" % (idx +1)) 
    print([(names[i], topic[i].round(2)) for i in topic.argsort()[:-(10-1):-1]])

주제 1:
[('날씨', 2.69), ('도어', 2.34), ('사진', 2.28), ('필름', 2.06), ('생일', 1.56), ('라이프', 1.39), ('언니', 1.36), ('통제', 1.33)]
주제 2:
[('민윤기', 2.33), ('학교', 1.82), ('가지', 1.66), ('느끼', 1.6), ('디자인', 1.53), ('세상', 1.53), ('재미', 1.49), ('사람', 1.48)]
주제 3:
[('친구', 3.09), ('최고', 2.04), ('생각', 2.02), ('서비스', 1.96), ('사람', 1.72), ('강아지', 1.67), ('만들', 1.63), ('노래', 1.62)]


In [6]:
visual = pyLDAvis.sklearn.prepare(lda_model=lda, dtm=dtm, vectorizer=tf_vect) 
pyLDAvis.save_html(visual, 'LDA_Visualization.html') 
pyLDAvis.display(visual)