## We want to be able to do a couple things in this repo:
0. checking consistency and general sanity checks
1. deduplicate (naïve if faster)
2. tokenisation
3. stop-words removal
4. others

In [1]:
# reserved for libraries
import os
import datetime
from tqdm import tqdm
from multiprocessing import pool as P
import pickle
import random
from math import floor
from datetime import timedelta

In [2]:
# scratch check: strptime with '%Y-%m-%d' also accepts a month and day without zero padding
import datetime

datetime.datetime.strptime('2018-2-2', '%Y-%m-%d')

datetime.datetime(2018, 2, 2, 0, 0)

In [3]:
# data dir: relative path of the folder holding the raw pickled files that the cells below list and load
data_dir = "../master_thesis_data/weibo_raw"

In [4]:
# list the raw data directory and split the file names into tweet-text files and date files
all_files = os.listdir(data_dir)

# a file belongs to a group when the part of its name before the first '.' ends with that group's suffix
all_texts = []
all_dates = []
for file_name in all_files:
    stem = file_name.split('.')[0]
    if stem.endswith('texts'):
        all_texts.append(file_name)
    if stem.endswith('dates'):
        all_dates.append(file_name)

In [5]:
# how many days of data did we actually get
# shows: number of text files, number of date files, and the expected count
# (three 365-day years plus one -- presumably because both end dates of the period are included)
len(all_texts), len(all_dates), 365 * 3 + 1

(1096, 1096, 1096)

In [6]:
# check that the text files and the date files cover the same dates
def _date_part(file_name):
    """Return the second '_'-separated field of a file name (the date string)."""
    return file_name.split('_')[1]

actual_dates_0 = sorted(map(_date_part, all_texts))
actual_dates_1 = sorted(map(_date_part, all_dates))

# compare the two sorted lists position by position; the sum counts the positions that agree
matches = [date_0 == actual_dates_1[pos] for pos, date_0 in enumerate(actual_dates_0)]
sum(matches)

1096

It seems we managed to scrape data for nearly all dates, and the text and date files match up correctly. Let's use datetime to check which dates, if any, are missing.

In [7]:
# setting limits: first and last day of the period we check for coverage
# (the missing-date scan below treats both limits as inclusive)
starting_date = datetime.datetime.strptime('2016-4-17', '%Y-%m-%d')
end_date = datetime.datetime.strptime('2019-4-17', '%Y-%m-%d')

In [8]:
# converting the date strings taken from the text file names to datetime objects,
# so they can be compared against the datetime limits
actual_dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in actual_dates_0]

In [9]:
# finding which dates are missing
# Walk every day from starting_date to end_date (both inclusive) and record the
# days for which no file exists. A separate cursor is advanced so starting_date
# itself is left untouched and the cell gives the same answer when re-run.
one_day = datetime.timedelta(days = 1)
available_dates = set(actual_dates)  # set lookup instead of scanning the list once per day

missing_dates = []
current_date = starting_date
# `<=` (rather than `!=`) guarantees the loop ends even if the limits are inverted
while current_date <= end_date:
    if current_date not in available_dates:
        missing_dates.append(current_date)
    current_date += one_day

In [10]:
# number of days in the period without a file (0 means full coverage)
len(missing_dates)

0

Ok so we got the entire period covered

## Now let's see how many tweets we managed in total

In [11]:
# total number of tweets summed over every text file
# NOTE: pickle.load can execute arbitrary code -- only use it on files we produced ourselves
tweet_count = 0
for text_file in all_texts:
    text_path = os.path.join(data_dir, text_file)
    with open(text_path, 'rb') as pickle_handle:
        day_tweets = pickle.load(pickle_handle)
    tweet_count = tweet_count + len(day_tweets)

In [12]:
# shows: total tweets, average tweets per day (len(matches) is the number of days),
# and a rough post-deduplication estimate that assumes 60% of the tweets are kept
tweet_count, round(tweet_count/len(matches), 0), round(tweet_count * 0.6, 0)

(741389, 676.0, 444833.0)

We collected roughly 740K tweets, or on average 676 tweets per day. Assuming about 60% of them survive deduplication, we can expect around 440K tweets in total afterwards.

## Prototyping deduplication

We'll prototype on a single sample day first (the first file in sorted order)

In [13]:
# load the tweets and their dates from the first file (in sorted order) of each group
# NOTE: pickle.load can execute arbitrary code -- only use it on files we produced ourselves
sample_texts_path = os.path.join(data_dir, sorted(all_texts)[0])
with open(sample_texts_path, 'rb') as texts_handle:
    texts = pickle.load(texts_handle)

sample_dates_path = os.path.join(data_dir, sorted(all_dates)[0])
with open(sample_dates_path, 'rb') as dates_handle:
    dates = pickle.load(dates_handle)

In [14]:
# share of the sample's tweets whose text contains the search term
sum(['人工智能' in text for text in texts])/len(texts)
# for this sample almost every tweet (~99%) mentions the search term; the few that do not are presumably retweets -- TODO confirm

0.9943342776203966

In [15]:
# recording the positions of the tweets we want to keep: those whose text contains the search term
Mention_idx = [i for i, text in enumerate(texts) if '人工智能' in text]

In [16]:
# keeping the tweets we want
# NOTE: texts is overwritten in place, so this cell must run exactly once after Mention_idx is built;
# re-running it would apply the old indices to the already-filtered list
texts = [texts[i] for i in Mention_idx]

In [17]:
# keeping the dates we want (same positions as the kept tweets, so the two lists stay aligned)
# NOTE: dates is overwritten in place, so this cell must run exactly once after Mention_idx is built;
# re-running it would apply the old indices to the already-filtered list
dates = [dates[i] for i in Mention_idx]

In [18]:
# sanity check: both lists were filtered with the same indices, so the lengths should agree
len(dates), len(texts)

(702, 702)

In [19]:
# number of distinct tweet texts (exact string duplicates collapse in the set)
len(set(texts))

# so only 573 of the 702 kept tweets are unique

573

Now let's take a look at naive duplicates

In [20]:
# Group tweet positions by exact text. Each cluster lists, in ascending order,
# the positions of one distinct tweet text; clusters appear in order of first
# occurrence. Looking the text up in a dict replaces both the scan over all
# existing clusters and the comparison against every later tweet, so each tweet
# is handled once instead of quadratically.
clusters = []
cluster_of_text = {}  # tweet text -> the cluster (list of positions) it belongs to

for i, tweet in tqdm(enumerate(texts)):
    if tweet in cluster_of_text:
        # text seen before: this position is a duplicate of an earlier tweet
        cluster_of_text[tweet].append(i)
    else:
        # first occurrence: open a new cluster and remember it for later duplicates
        cluster_of_text[tweet] = [i]
        clusters.append(cluster_of_text[tweet])

702it [00:00, 17954.32it/s]


In [21]:
# number of clusters, i.e. one per distinct tweet text
len(clusters)

573

In [22]:
# keep only the clusters holding more than one tweet (the actual duplicates),
# so that we can sample a few and check by hand what kind of duplicates they are
clusters = [clus for clus in clusters if len(clus) > 1]

In [23]:
# pick one duplicate cluster at random and print each of its tweets, followed by a separator line
sampled_cluster = random.choice(clusters)
for idx in sampled_cluster:
    print(texts[idx], '----------------------', ' ', sep='\n')

//@上海交通大学:#交大分享# 在菁菁堂遇见李开复不要错过哦 //@上海交通大学研究生会:O明天晚上19:00菁菁堂 ，李开复人工智能分享会 ，没课的同学不要错过了哦！ //@李开复:明天晚上，我在上海交大等你，想要更深入研究人工智能的同学请踊跃报名哦。
----------------------
 
//@上海交通大学:#交大分享# 在菁菁堂遇见李开复不要错过哦 //@上海交通大学研究生会:O明天晚上19:00菁菁堂 ，李开复人工智能分享会 ，没课的同学不要错过了哦！ //@李开复:明天晚上，我在上海交大等你，想要更深入研究人工智能的同学请踊跃报名哦。
----------------------
 


OK, so let's now go ahead and execute a naïve deduplication

In [24]:
# display the filtered tweets for a visual check
# NOTE(review): this dumps the whole list; a slice such as texts[:10] would keep the output short
texts

['AKA打造的人工智能机器人Musio，有着出色的学习与自然语言处理能力，能用讲话、表情以及姿势等与人交流，每一次互动都会让它变得更智能。它还可以与所有的智能家庭设备连接，并控制与之相连的设备，能关灯、调节恒温器等。',
 'AKA打造的人工智能机器人Musio，有着出色的学习与自然语言处理能力，能用讲话、表情以及姿势等与人交流，每一次互动都会让它变得更智能。它还可以与所有的智能家庭设备连接，并控制与之相连的设备，能关灯、调节恒温器等。',
 '航空公司为什么青睐AI（人工智能）？-科技频道-手机搜狐 O航空公司为什么青睐AI（人工智能）？',
 'AKA打造的人工智能机器人Musio，有着出色的学习与自然语言处理能力，能用讲话、表情以及姿势等与人交流，每一次互动都会让它变得更智能。它还可以与所有的智能家庭设备连接，并控制与之相连的设备，能关灯、调节恒温器等。',
 '【信息科技：五大巨头联合推动人工智能 荐9股】O网页链接',
 'AKA打造的人工智能机器人Musio，有着出色的学习与自然语言处理能力，能用讲话、表情以及姿势等与人交流，每一次互动都会让它变得更智能。它还可以与所有的智能家庭设备连接，并控制与之相连的设备，能关灯、调节恒温器等。',
 '看完了 深恭还是那么美//@会员号是空号的mimo:刚看完录画，还不错。顺说那个人工智能女高中生rinna，是真的人工智能哦，有推特跟line账号的，会回复留言，日本微软开发的（翻过这条新闻 //@系录芥末:马！//@风中劲节_goro酱是小天使: 这个马一个。//@悠幽刨冰: 嗷嗷嗷嗷嗷终于等到了看看',
 '//@李开复: 明天晚上，我在上海交大等你，想要更深入研究人工智能的同学请踊跃报名哦。',
 '【信息科技：五大巨头联合推动人工智能 荐9股】 O网页链接',
 '【《聚焦：人工智能或引发乐视价值井喷》】//@红叶st: 转发微博',
 '横看人脑侧智能，远近高低各不同，不识人脑真面目，只缘脑在此脑中。人工智能超越人脑对于现在来说只能是伪命题，不能以一部虚构的神剧下定论，说白了人脑子还在人脑里，人工智能也是大脑想的。所以现在担心的不是人工智能，反而倒是人。如果现在高度依赖智能网络的社会被别有用心的人控制，那后果……',
 '人工智能(AI)的研发将对人类产生最大的影响。全球的科学家们都在疯狂的研发着人工智能

In [25]:
# date range for the window computations below, given as 'YYYY-MM-DD' strings
# NOTE: end_date rebinds the name set earlier for the coverage check
begin_date = '2018-02-05'
end_date = '2019-02-08'

In [26]:
# converting input dates into datetime format
# The names are rebound from str to datetime. The isinstance guards make the
# cell safe to re-run: strptime raises TypeError when handed a datetime.
if isinstance(begin_date, str):
    begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
if isinstance(end_date, str):
    end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')

In [27]:
# number of whole days from begin_date to end_date
n_days = (end_date - begin_date).days

In [28]:
# window length; the cells below use it as a number of days
window_size = 3

In [29]:
# presumably the number of windows needed to cover the range: the full windows plus one more -- TODO confirm
floor(n_days/window_size)+1

123

In [30]:
# re-check that texts and dates still have the same length
len(texts), len(dates)

(702, 702)

In [31]:
# display the converted start date
begin_date

datetime.datetime(2018, 2, 5, 0, 0)

In [32]:
# last day of a window that starts on begin_date and spans window_size days (begin_date included)
begin_date + timedelta(days = window_size-1)

datetime.datetime(2018, 2, 7, 0, 0)

In [33]:
# for every cluster keep the member with the earliest date and mark all the others for removal
ind_remove = []
for cluster in clusters:
    # min() returns the first position among equal dates, exactly like a strict '<' scan;
    # the default only matters for an empty cluster, which contributes nothing anyway
    earliest = min(cluster, key=lambda pos: dates[pos], default=0)
    ind_remove.extend(pos for pos in cluster if pos != earliest)

In [34]:
# number of tweets marked for removal (every cluster member except the earliest one)
len(ind_remove)

129

In [35]:
# scratch check: difference between two back-to-back time.time() readings
import time
time.time() - time.time()

0.0

In [36]:
from deduplication import load_and_deduplicate

In [37]:
# call the helper imported from the local deduplication module on the raw data directory
# NOTE(review): what it returns is not visible in this notebook -- see deduplication.py
deduplicator = load_and_deduplicate(data_dir)