In [126]:
import jieba
import pickle
import os
import re
from gensim import models
import multiprocessing
import time
import datetime
from math import floor

In [137]:
# setting directories
data_dir = '../master_thesis_data/weibo_deduplicated'
dates_name, texts_name = 'weibo_all_dates.pickle', 'weibo_all_texts_new.pickle'

In [138]:
# loading in data
with open(os.path.join(data_dir, texts_name), 'rb') as handle:
    texts = pickle.load(handle)
with open(os.path.join(data_dir, dates_name), 'rb') as handle:
    dates = pickle.load(handle)

KeyboardInterrupt: 

In [57]:
seg_list = jieba.cut(texts[0], cut_all=False)

In [58]:
sample_line = ' '.join(seg_list)

In [59]:
sample_line

'发现 我 的 微 博会 自动 取消 关注 , , , 也 是 醉 了 , , , 您 这么 自说自话 地 取消 真的 好 吗 ？ 问过 我 吗 ？ 我 很 怕 的 你 知道 吗 ？ 【 本周 刚 和 人 探讨 过 人工智能 的 觉醒 】'

In [61]:
sample_line = re.sub(r'\W+', ' ', sample_line)
sample_line

'发现 我 的 微 博会 自动 取消 关注 也 是 醉 了 您 这么 自说自话 地 取消 真的 好 吗 问过 我 吗 我 很 怕 的 你 知道 吗 本周 刚 和 人 探讨 过 人工智能 的 觉醒 '

In [62]:
sample_line.split(' ')

['发现',
 '我',
 '的',
 '微',
 '博会',
 '自动',
 '取消',
 '关注',
 '也',
 '是',
 '醉',
 '了',
 '您',
 '这么',
 '自说自话',
 '地',
 '取消',
 '真的',
 '好',
 '吗',
 '问过',
 '我',
 '吗',
 '我',
 '很',
 '怕',
 '的',
 '你',
 '知道',
 '吗',
 '本周',
 '刚',
 '和',
 '人',
 '探讨',
 '过',
 '人工智能',
 '的',
 '觉醒',
 '']

In [65]:
with open(os.path.join(data_dir, 'weibo_all_texts_new.pickle'), 'rb') as handle:
    texts = pickle.load(handle)

In [72]:
cores = multiprocessing.cpu_count() - 2

In [97]:
model = models.Word2Vec(min_count=5,
                       window = 10,
                       size = 3000,
                       sample = 6e-5,
                       alpha = 0.03,
                       min_alpha = 0.0007,
                       negative = 50,
                       workers = cores)

In [98]:
model.build_vocab(texts[:100000], progress_per = 10000)

In [99]:
begin = time.time()
model.train(texts[:100000], total_examples=model.corpus_count, epochs=30, report_delay=1)
end = time.time()

In [102]:
(end - begin)/60

44.65111419359843

In [105]:
model.wv.most_similar(positive = ['人工智能'], topn = 20)

[('的', 0.5325926542282104),
 ('', 0.5124344825744629),
 ('O', 0.4849620461463928),
 ('AI', 0.44319653511047363),
 ('未来', 0.42329198122024536),
 ('技术', 0.4019441604614258),
 ('和', 0.36998993158340454),
 ('发展', 0.36974191665649414),
 ('机器人', 0.3674817681312561),
 ('机器', 0.35526013374328613),
 ('网页', 0.35511350631713867),
 ('链接', 0.34447500109672546),
 ('科技', 0.3357526659965515),
 ('skymind', 0.33529406785964966),
 ('智能', 0.33263707160949707),
 ('了', 0.32909056544303894),
 ('是', 0.3266730308532715),
 ('时代', 0.32044821977615356),
 ('如何', 0.3176739513874054),
 ('将', 0.310677170753479)]

# let's see how we can partition the data for continous training

In [122]:
begin_date = '2016-4-17'
begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
window_size = int(356/12*6)
slider_step = int(356/12*3)
d_upper_limit = datetime.datetime.strptime('2019-4-17', '%Y-%m-%d')

In [129]:
end_date = begin_date + datetime.timedelta(days = window_size)

In [130]:
begin_date, end_date

(datetime.datetime(2016, 4, 17, 0, 0), datetime.datetime(2016, 10, 12, 0, 0))

In [132]:
begin_date + datetime.timedelta(days = 30)

datetime.datetime(2016, 5, 17, 0, 0)

In [128]:
n_iter = floor(((d_upper_limit - begin_date).days - slider_step)/slider_step) + 1

In [133]:
# iterate through for continous execution
for i in range(n_iter):
    # determining the sub period limits
    begin_date = begin_date + datetime.timedelta(days = i * slider_step)
    end_date = begin_date + datetime.timedelta(days = i * window_size)
    
    # getting the indices for the correct texts based on date range
    indx = [i for i, date in enumerate(dates) if begin_date <= date <= end_date]
    
    # getting the corresponding texts
    sub_texts = [text for i, text in enumerate(texts) if i in indx]
    
    print(len(texts))
    
    

416
0
0
0
0
0
0
0
0
0
0
0


In [134]:
len(texts)

0