In [10]:
import datetime
import _pickle as pickle
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from collections import defaultdict

from Components import Comment, Post, POSPost, POSComment
from Models import MyDictionary, MyTfidf, MyLda
from helper import load_v1, load_v2

## Initialization

In [12]:
# Enable plotly's offline notebook rendering so iplot() figures display inline.
# Per plotly's offline-mode docs, connected=True loads plotly.js from a CDN
# instead of embedding it, so rendering needs network access.
init_notebook_mode(connected=True)

In [59]:
# Load the pre-trained tf-idf clustering model from disk.
# The context manager closes the file handle even if unpickling fails.
# NOTE(review): pickle.load runs code embedded in the file — only load model
# files that were produced by this project.
with open('models/tfidf/400_t_d1t', 'rb') as model_file:
    tfidf = pickle.load(model_file)

- Loaded v1: 18570


In [60]:
# Load the v1 dataset through the project helper. Later cells iterate over it
# and index it by id, reading each item's .date attribute.
v1 = load_v1()

- Loaded v1: 18570


## Define functions

In [15]:
# Plot time series for a single cluster
def plot_time_series(cluster, tfidf, v1):
    """Plot how many items of one cluster fall into each calendar month.

    Args:
        cluster: Key into ``tfidf.cluster2ids``.
        tfidf: Object exposing ``cluster2ids``, a mapping from cluster to a
            list of indices into ``v1``.
        v1: Indexable collection whose items have a ``date`` attribute with
            ``year`` and ``month``.

    Returns:
        None. The figure is rendered with ``iplot``.

    Raises:
        KeyError: If ``cluster`` is not present in ``tfidf.cluster2ids``.
    """
    # sorted() builds a new list. Sorting in place would silently reorder
    # tfidf.cluster2ids[cluster] for every later user of the model.
    ids = sorted(tfidf.cluster2ids[cluster], key=lambda i: v1[i].date)

    # ids are in chronological order, so the dict's insertion order is the
    # x-axis order of the plot.
    counts = defaultdict(int)
    for i in ids:
        date = v1[i].date
        # Zero-pad the month ("2017-02") to match the dataset-wide plot's labels.
        counts["{}-{:02}".format(date.year, date.month)] += 1

    data = [go.Scatter(x=list(counts.keys()), y=list(counts.values()))]

    iplot(data)

In [38]:
# Given a list of queries, find clusters that contain any of them as a keyword
def find_clusters(queries, tfidf, top_n=20):
    """Return the clusters whose leading keywords include any of the queries.

    Args:
        queries: Iterable of query strings. A single bare string is treated
            as one query rather than being iterated character by character.
        tfidf: Object exposing ``cluster2ids`` (a mapping keyed by cluster)
            and ``get_cluster_keywords(cluster)``, which returns a sliceable
            sequence of ``(keyword, value)`` pairs.
        top_n: How many leading keywords of each cluster to consider.
            Defaults to 20.

    Returns:
        A list of ``(cluster, keywords)`` tuples, in ``cluster2ids`` iteration
        order, where ``keywords`` is the list of up to ``top_n`` keywords that
        was searched.
    """
    if isinstance(queries, str):
        queries = [queries]
    else:
        # Materialise once: a one-shot iterator would otherwise be exhausted
        # after the first cluster and never match any later one.
        queries = list(queries)

    results = []
    for cluster in tfidf.cluster2ids.keys():
        keywords = [keyword for keyword, _ in tfidf.get_cluster_keywords(cluster)[:top_n]]
        # The generator lets any() stop at the first matching query.
        if any(query in keywords for query in queries):
            results.append((cluster, keywords))
    return results

## Experiments

In [61]:
## Plot all data in dataset
# Tally every post into a zero-padded "YYYY-MM" bucket.
counts = defaultdict(int)
for post in v1:
    date = post.date
    string = "{}-{:02}".format(date.year, date.month)
    counts[string] = counts[string] + 1

# Zero-padded labels sort lexicographically in chronological order, so
# ordering by label gives the time axis.
counts = sorted(counts.items(), key=lambda pair: pair[0])
data = [
    go.Scatter(
        x=[pair[0] for pair in counts],
        y=[pair[1] for pair in counts],
    )
]

iplot(data)

In [41]:
## Find clusters containing these queries
# '시험' is Korean for "exam" / "test".
queries = ['시험']
clusters = find_clusters(queries, tfidf)
# One line per matching cluster: its id followed by the keyword list that
# find_clusters searched.
for cluster, keywords in clusters:
    print(cluster, keywords)

28 ['것', '보다', '아니다', '사람', '수업', '너', '나', '좋다', '교수', '않다', '분', '가다', '수', '시험', '내', '말', '그렇다', '알다', '때', '거']
23 ['학점', '과목', '전공', '학기', '들다', '듣다', '졸업', '수강', '인정', '교수', '재수강', '전산', '전자', '수업', '수강신청', '시험', '학년', '성적', '영어', '받다']
3 ['밤샘', '차리다', '밤새다', '공부', '버티다', '방법', '바짝', '정신', '섬뜩하다', '취약', '시험', '자다', '졸리', '체질', '저', '스탠드', '자지', '밤', '갈수', '보다']


In [47]:
## Plot a cluster from the results above
# 28 is a cluster id copied from the find_clusters output of the previous cell.
# NOTE(review): ids are presumably specific to the loaded tfidf model — re-check
# after swapping the model file.
plot_time_series(28, tfidf, v1)

28 13615 것 보다 아니다 사람 수업 너 나 좋다 교수 않다
