In [1]:
# Start with loading all necessary libraries
import numpy as np
import pandas as pd
from os import path
from PIL import Image
import matplotlib
import matplotlib.pyplot as plt
import os
import csv
import textract
import nltk

import MeCab
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from konlpy.tag import *
from collections import Counter
from nltk.corpus import stopwords
import pyLDAvis.sklearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

%matplotlib inline

In [9]:
df = pd.read_csv('adoor_data/answers.csv', encoding='UTF8')
df['created_at'] = pd.to_datetime(df['created_at'])

# df = df[df['author_id'] != 5]

start_date = pd.Timestamp(2019, 1, 28, 0)
end_date = pd.Timestamp(2019, 4, 1, 0)

mask = (df['created_at'] > start_date) & (df['created_at'] <= end_date)
df = df.loc[mask]

df.head()

Unnamed: 0,id,author_id,question_id,content,tag_string,created_at,updated_at
4,6,4,5,진수니랑 같이 프론트 더 멋있게 만들기!!!!!\r\n,,2019-01-28 15:55:14.088330,2019-01-28 15:55:14.088330
5,7,4,236,나는 칭찬받는걸 좋아하고 칭찬하는것도 좋아한다! 칭찬은 내가 일할 수 있는 가장 큰...,,2019-01-28 15:58:52.305801,2019-01-28 15:58:52.305801
6,8,4,95,자신감 없는거,,2019-01-28 16:42:02.175275,2019-01-28 16:42:02.175275
7,9,4,103,로또 당첨 재능(이것도 재능인가여?),,2019-01-28 16:43:33.654065,2019-01-28 16:43:33.654065
8,11,3,134,나는 선택에 있어서 무섭도록 직감을 믿는다\r\n어렸을 때는 선택 전에 꽤나 많은 ...,,2019-01-29 01:05:03.014552,2019-01-29 01:05:03.014552


In [3]:
# content = df.content

# for row in content:
#     text = row
#     if not pd.isna(text):
#         morphs.append(twitter.pos(text))

In [10]:
def getNVM_lemma(text):
    tokenizer = MeCab.Tagger()
    parsed = tokenizer.parse(text)
    word_tag = [w for w in parsed.split("\n")]
    pos = []
    tags = ['NNG','NNP','VV','VA', 'VX', 'VCP','VCN']
    for word_ in word_tag[:-2]:
        word = word_.split("\t")
        tag = word[1].split(",")
        if(len(word[0]) < 2) or ("게" in word[0]):
            continue
        if(tag[-1] != '*'):
            t = tag[-1].split('/')
            if(len(t[0]) > 1 and ('VV' in t[1] or 'VA' in t[1] or 'VX' in t[1])):
                pos.append(t[0])
        else:
            if(tag[0] in tags):
                pos.append(word[0])
    return pos

In [11]:
tf_vect = TfidfVectorizer(tokenizer=getNVM_lemma,ngram_range=(1, 2), min_df=2, max_df=20000) 
dtm = tf_vect.fit_transform(df['content'].values.astype('U'))

n_topics = 5

lda = LatentDirichletAllocation(n_components=n_topics) 
lda.fit(dtm)

LatentDirichletAllocation(n_components=5)

In [12]:
names = tf_vect.get_feature_names() 
topics = dict() 

for idx, topic in enumerate(lda.components_): 
    vocab = [] 
    for i in topic.argsort()[:-(30-1):-1]: 
        vocab.append((names[i], topic[i].round(2))) 
    print("주제 %d:" % (idx +1)) 
    print([(names[i], topic[i].round(2)) for i in topic.argsort()[:-(30-1):-1]])

주제 1:
[('사랑', 9.9), ('생각', 7.66), ('사람', 7.23), ('만들', 5.44), ('학교', 5.01), ('최악', 4.96), ('행복', 4.85), ('대하', 3.9), ('정도', 3.84), ('아니', 3.71), ('경우', 3.51), ('귀찮', 3.37), ('상처', 3.32), ('느끼', 3.23), ('바뀌', 3.16), ('그렇', 2.94), ('무섭', 2.87), ('감사', 2.71), ('과일', 2.5), ('최악 경우', 2.49), ('인생', 2.39), ('공부', 2.32), ('예전', 2.3), ('개강', 2.3), ('얼굴', 2.27), ('공연', 2.21), ('얘기', 2.21), ('소리', 2.13)]
주제 2:
[('생각', 15.14), ('시간', 8.29), ('모르', 7.04), ('사람', 6.69), ('보이', 5.35), ('맛있', 4.08), ('일어나', 3.99), ('세상', 3.94), ('알리', 3.4), ('만드', 3.3), ('수업', 3.28), ('의식', 3.1), ('능력', 2.72), ('인생', 2.7), ('이야기', 2.64), ('아침', 2.6), ('가족', 2.51), ('영화', 2.27), ('아니', 2.23), ('위하', 2.2), ('필라테스', 2.14), ('멋지', 2.12), ('마음', 2.12), ('예쁘', 1.96), ('자유', 1.93), ('집중', 1.9), ('엄마', 1.9), ('맛나', 1.88)]
주제 3:
[('사람', 13.29), ('배우', 6.01), ('그러', 5.0), ('동안', 4.09), ('선택', 4.07), ('질문', 3.95), ('만나', 3.91), ('행복', 3.9), ('버리', 3.68), ('고민', 2.93), ('마음', 2.87), ('나가', 2.83), ('지나', 2.68), ('생각', 2.61), ('열정'

In [7]:
visual = pyLDAvis.sklearn.prepare(lda_model=lda, dtm=dtm, vectorizer=tf_vect) 
pyLDAvis.save_html(visual, 'LDA_Visualization.html') 
pyLDAvis.display(visual)