In [1]:
import jieba
import pickle
import os
import re
from gensim import models
from multiprocessing import Pool as P
import time
import datetime
from math import floor
from tqdm import tqdm

In [2]:
# setting directories
data_dir = '../master_thesis_data/weibo_deduplicated'
dates_name, texts_name = 'weibo_all_dates.pickle', 'weibo_all_texts_new.pickle'

In [3]:
# loading in data
with open(os.path.join(data_dir, texts_name), 'rb') as handle:
    texts = pickle.load(handle)
with open(os.path.join(data_dir, dates_name), 'rb') as handle:
    dates = pickle.load(handle)

In [57]:
seg_list = jieba.cut(texts[0], cut_all=False)

In [58]:
sample_line = ' '.join(seg_list)

In [59]:
sample_line

'发现 我 的 微 博会 自动 取消 关注 , , , 也 是 醉 了 , , , 您 这么 自说自话 地 取消 真的 好 吗 ？ 问过 我 吗 ？ 我 很 怕 的 你 知道 吗 ？ 【 本周 刚 和 人 探讨 过 人工智能 的 觉醒 】'

In [61]:
sample_line = re.sub(r'\W+', ' ', sample_line)
sample_line

'发现 我 的 微 博会 自动 取消 关注 也 是 醉 了 您 这么 自说自话 地 取消 真的 好 吗 问过 我 吗 我 很 怕 的 你 知道 吗 本周 刚 和 人 探讨 过 人工智能 的 觉醒 '

In [62]:
sample_line.split(' ')

['发现',
 '我',
 '的',
 '微',
 '博会',
 '自动',
 '取消',
 '关注',
 '也',
 '是',
 '醉',
 '了',
 '您',
 '这么',
 '自说自话',
 '地',
 '取消',
 '真的',
 '好',
 '吗',
 '问过',
 '我',
 '吗',
 '我',
 '很',
 '怕',
 '的',
 '你',
 '知道',
 '吗',
 '本周',
 '刚',
 '和',
 '人',
 '探讨',
 '过',
 '人工智能',
 '的',
 '觉醒',
 '']

In [245]:
with open(os.path.join(data_dir, 'weibo_all_texts_new.pickle'), 'rb') as handle:
    texts = pickle.load(handle)

In [247]:
cores = multiprocessing.cpu_count() - 2

In [248]:
model = models.Word2Vec(min_count=5,
                       window = 10,
                       size = 3000,
                       sample = 6e-5,
                       alpha = 0.03,
                       min_alpha = 0.0007,
                       negative = 50,
                       workers = cores)

In [249]:
model.build_vocab(texts[:10000], progress_per = 10000)

In [99]:
begin = time.time()
model.train(texts[:10000], total_examples=model.corpus_count, epochs=30, report_delay=1)
end = time.time()

In [102]:
(end - begin)/60

44.65111419359843

In [105]:
model.wv.most_similar(positive = ['人工智能'], topn = 20)

[('的', 0.5325926542282104),
 ('', 0.5124344825744629),
 ('O', 0.4849620461463928),
 ('AI', 0.44319653511047363),
 ('未来', 0.42329198122024536),
 ('技术', 0.4019441604614258),
 ('和', 0.36998993158340454),
 ('发展', 0.36974191665649414),
 ('机器人', 0.3674817681312561),
 ('机器', 0.35526013374328613),
 ('网页', 0.35511350631713867),
 ('链接', 0.34447500109672546),
 ('科技', 0.3357526659965515),
 ('skymind', 0.33529406785964966),
 ('智能', 0.33263707160949707),
 ('了', 0.32909056544303894),
 ('是', 0.3266730308532715),
 ('时代', 0.32044821977615356),
 ('如何', 0.3176739513874054),
 ('将', 0.310677170753479)]

# let's see how we can partition the data for continous training

In [4]:
begin_date = '2016-4-17'
begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
window_size = int(356/12*6)
slider_step = int(356/12*3)
d_upper_limit = datetime.datetime.strptime('2019-4-17', '%Y-%m-%d')

In [5]:
end_date = begin_date + datetime.timedelta(days = window_size)

In [13]:
begin_date, end_date

(datetime.datetime(2016, 4, 17, 0, 0), datetime.datetime(2016, 10, 12, 0, 0))

In [17]:
sub_texts = [text for i, text in enumerate(texts) if begin_date <= dates[i] <= end_date]

In [7]:
begin_date + datetime.timedelta(days = 30)

datetime.datetime(2016, 5, 17, 0, 0)

In [8]:
n_iter = floor(((d_upper_limit - begin_date).days - slider_step)/slider_step) + 1

In [9]:
n_iter

12

In [11]:
begin = time.time()

# placeholder for the top n keywords
li_top_n = []

# iterate through for continous execution
for i in range(n_iter):
    print('iter {}'.format(i))
    # determining the sub period limits
    sub_begin_date = begin_date + datetime.timedelta(days = i * slider_step)
    print('begin date is {}'.format(sub_begin_date))
    sub_end_date = sub_begin_date + datetime.timedelta(days = window_size)
    print('end date is {}'.format(sub_end_date))
    
    # getting the corresponding texts based on date range
    sub_texts = [texts for i, text in enumerate(texts) if sub_begin_date <= dates[i] <= sub_end_date]
    print(sub_texts[0])

iter 0
begin date is 2016-04-17 00:00:00
end date is 2016-10-12 00:00:00


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




iter 4
begin date is 2017-04-08 00:00:00
end date is 2017-10-03 00:00:00


KeyboardInterrupt: 

In [None]:
    # initiate the model
    model = models.Word2Vec(min_count=3,
                       window = 10,
                       size = 3000,
                       sample = 6e-5,
                       alpha = 0.03,
                       min_alpha = 0.0007,
                       negative = 80,
                       workers = cores)
    
    # build vocab
    model.build_vocab(sub_texts)
    # train
    model.train(sub_texts, total_examples=model.corpus_count, epochs=50, report_delay=1)
    # getting the top keywords
    top_n = model.wv.most_similar(positive = ['人工智能'], topn = 50)
    li_top_n.append(top_n)
    # get time lapsed
    time_lapsed = time.time() - begin
    print('iteration {} complete, time lapsed: {}'.format(i, round(time_lapsed/60, 2)))
    print('time remaining is estimated to be {}'.format(round(((n_iter - i - 1)*time_lapsed/(i + 1))/60), 2))

# saving
with open(os.path.join(data_dir, 'top_n_list.pickle'), 'wb') as handle:
    pickle.dump(li_top_n, handle)