## We want to be able to do a couple things in this repo:
0. checking consistency and running general sanity checks
1. deduplicate (naïve if faster)
2. tokenisation
3. stop-words removal
4. others

In [77]:
# reserved for libraries
import os
import datetime
from tqdm import tqdm
from multiprocessing import pool as P
import pickle
import random

In [81]:
import datetime

# scratch check: strptime with '%Y-%m-%d' accepts a month and day that are not zero-padded
datetime.datetime.strptime('2018-2-2', '%Y-%m-%d')

datetime.datetime(2018, 2, 2, 0, 0)

In [2]:
# data dir: folder holding the scraped pickle files, given relative to the working directory
data_dir = "../master_thesis_data/weibo_raw"

In [3]:
# list every file in the data dir and separate the names into tweet-text files and date files,
# depending on whether the part before the first '.' ends with 'texts' or with 'dates'
all_files = os.listdir(data_dir)

all_texts = [name for name in all_files if name.split('.')[0].endswith('texts')]
all_dates = [name for name in all_files if name.split('.')[0].endswith('dates')]

In [4]:
# how many days of data did we actually get
# NOTE(review): 365 * 3 + 1 is presumably the inclusive number of days in the scraped period
# (three 365-day years plus the final day) — confirm against the start/end dates used below
len(all_texts), len(all_dates), 365 * 3 + 1

(1096, 1096, 1096)

In [5]:
# check that the text files and the date files line up:
# the day label is the second '_'-separated field of each filename
actual_dates_0 = sorted(name.split('_')[1] for name in all_texts)
actual_dates_1 = sorted(name.split('_')[1] for name in all_dates)

# compare the two sorted lists position by position; the sum is the number of agreeing days
matches = [day == actual_dates_1[pos] for pos, day in enumerate(actual_dates_0)]
sum(matches)

1096

It seems we managed to scrape data for nearly all dates, and the text files and date files match up correctly. Let's use datetime to check which dates, if any, are missing.

In [6]:
# setting limits: first and last day (both included) of the period we expect to have scraped
starting_date = datetime.datetime(2016, 4, 17)
end_date = datetime.datetime(2019, 4, 17)

In [7]:
# turn the sorted day strings taken from the text filenames into datetime objects
actual_dates = [
    datetime.datetime.strptime(day_str, '%Y-%m-%d')
    for day_str in actual_dates_0
]

In [8]:
# finding which dates are missing
# Walk the period one day at a time with a separate cursor, so that `starting_date`
# is left untouched and re-running this cell gives the same answer every time.
scraped_days = set(actual_dates)  # set lookup instead of scanning the whole list for every day
missing_dates = []
current_day = starting_date
# `<=` (rather than `!=`) guarantees the loop ends even if the start lies after the end
while current_day <= end_date:
    if current_day not in scraped_days:
        missing_dates.append(current_day)
    current_day += datetime.timedelta(days = 1)

In [9]:
# number of days in the period for which no scraped file was found
len(missing_dates)

0

Ok so we got the entire period covered

## Now let's see how many tweets we managed in total

In [10]:
# add up the number of entries stored in every text file to get the overall tweet count
# NOTE(review): pickle.load can execute arbitrary code — presumably fine here because
# these files come from our own scraper; confirm before pointing this at foreign data
tweet_count = 0
for name in all_texts:
    path = os.path.join(data_dir, name)
    with open(path, 'rb') as fh:
        tweet_count += len(pickle.load(fh))

In [11]:
# total tweets, average tweets per compared day, and a rough estimate of what survives deduplication
# NOTE(review): 0.6 looks like an assumed share of unique tweets — confirm where it comes from
tweet_count, round(tweet_count/len(matches), 0), round(tweet_count * 0.6, 0)

(741389, 676.0, 444833.0)

We collected roughly 740K tweets, or about 676 tweets per day on average. Assuming around 60% of them survive deduplication, we can expect about 440K tweets in total.

## Prototyping deduplication

We'll prototype on a single sample day first (the first file in sorted filename order).

In [12]:
# loading in the sample date's text and dates:
# the first text file and the first date file in sorted filename order
# NOTE(review): this assumes both sorted lists start with the same day — confirm the
# text and date filenames sort identically
with open(os.path.join(data_dir, sorted(all_texts)[0]), 'rb') as handle:
    texts = pickle.load(handle)
with open(os.path.join(data_dir, sorted(all_dates)[0]), 'rb') as handle:
    dates = pickle.load(handle)

In [13]:
# share of the sample day's tweets that contain the search term
sum(['人工智能' in text for text in texts])/len(texts)
# in the recorded run about 99% of the sample mentions the search term (an earlier note here said 70%);
# the tweets that do not mention it are presumably retweets — TODO confirm

0.9943342776203966

In [14]:
# positions of the tweets that actually contain the search term; these are the ones we keep
Mention_idx = [
    pos
    for pos, tweet in enumerate(texts)
    if '人工智能' in tweet
]

In [15]:
# keeping the tweets we want
# NOTE: this overwrites `texts` using indices that were computed on the unfiltered list,
# so the cell must run exactly once after Mention_idx is built; running it a second time
# would select the wrong tweets or raise an IndexError
texts = [texts[i] for i in Mention_idx]

In [16]:
# keeping the dates we want, selected with the same indices as the tweets
# NOTE: like the tweet filter, this overwrites `dates` in place and must run exactly once
# assumes dates[i] belonged to texts[i] when both were loaded — TODO confirm
dates = [dates[i] for i in Mention_idx]

In [17]:
# sanity check: after filtering, both lists should have the same length
len(dates), len(texts)

(702, 702)

In [18]:
# number of distinct tweet strings (exact copies collapse into one entry)
len(set(texts))

# in the recorded run only 573 of the 702 kept tweets are unique

573

Now let's take a look at naïve duplicates, i.e. tweets whose text is exactly identical

In [19]:
# Group the indices of identical tweets: every cluster lists the positions in `texts`
# that hold exactly the same string. Clusters are ordered by first appearance and the
# indices inside each cluster are ascending.
clusters = []
cluster_of = {}  # tweet text -> position of its cluster inside `clusters`

# One dictionary lookup per tweet replaces re-scanning all clusters and all later tweets.
# Wrapping `texts` (not the enumerate object) lets tqdm know the total number of items.
for i, tweet in enumerate(tqdm(texts)):
    if tweet in cluster_of:
        clusters[cluster_of[tweet]].append(i)
    else:
        cluster_of[tweet] = len(clusters)
        clusters.append([i])

702it [00:00, 14430.79it/s]


In [20]:
# inspect the clusters: each inner list holds the indices into `texts` of identical tweets
clusters

[[0,
  1,
  3,
  5,
  16,
  42,
  61,
  116,
  224,
  276,
  341,
  557,
  671,
  675,
  676,
  678,
  679,
  689],
 [2],
 [4],
 [6],
 [7, 20, 104, 213, 342, 365, 388, 389, 401, 449, 509, 513, 521],
 [8],
 [9],
 [10],
 [11],
 [12],
 [13],
 [14],
 [15],
 [17],
 [18],
 [19],
 [21, 282],
 [22],
 [23],
 [24],
 [25],
 [26],
 [27],
 [28],
 [29],
 [30, 199],
 [31],
 [32],
 [33],
 [34],
 [35],
 [36],
 [37],
 [38],
 [39],
 [40],
 [41],
 [43],
 [44],
 [45],
 [46],
 [47],
 [48],
 [49],
 [50],
 [51],
 [52],
 [53, 284],
 [54],
 [55],
 [56],
 [57],
 [58],
 [59],
 [60],
 [62],
 [63],
 [64],
 [65],
 [66],
 [67],
 [68],
 [69],
 [70],
 [71],
 [72],
 [73],
 [74],
 [75],
 [76],
 [77],
 [78,
  80,
  83,
  297,
  311,
  337,
  343,
  347,
  348,
  349,
  378,
  384,
  392,
  394,
  397,
  410,
  422,
  430,
  434,
  454,
  462,
  465,
  466,
  468,
  480,
  482,
  484,
  499,
  502,
  507,
  512,
  518,
  523,
  524],
 [79],
 [81, 139, 146, 168, 408, 432],
 [82],
 [84],
 [85],
 [86],
 [87],
 [88],
 [89],
 [

In [21]:
# keep only the clusters holding more than one tweet (the actual duplicates),
# so that we can sample a few and check by hand what kind of tweets they are
clusters = [members for members in clusters if len(members) >= 2]

In [76]:
# pick one duplicate cluster at random and print each of its tweets,
# followed by a separator line and a line holding a single space
for idx in random.choice(clusters):
    print(texts[idx], '----------------------', ' ', sep='\n')

#小米电视3s#人工智能电视，蜕变真实的你！
----------------------
 
#小米电视3s#人工智能电视，蜕变真实的你！
----------------------
 
#小米电视3s#人工智能电视，蜕变真实的你！
----------------------
 
#小米电视3s#人工智能电视，蜕变真实的你！
----------------------
 
#小米电视3s#人工智能电视，蜕变真实的你！
----------------------
 
#小米电视3s#人工智能电视，蜕变真实的你！
----------------------
 
#小米电视3s#人工智能电视，蜕变真实的你！
----------------------
 


OK, so let's now go ahead and run a naïve deduplication.