In [26]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import datetime
from tqdm import tqdm
import spacy
from multiprocessing import Pool
import pickle

data_path = '/Users/ruoyangzhang/Documents/PythonWorkingDirectory/news_exploration/Data/'

In [27]:
nlp = spacy.load('en')

### Importing data

In [28]:
data_1 = pd.read_csv(data_path + 'all-the-news/articles1.csv', encoding = 'utf8')

In [29]:
data_1 = data_1.drop(['Unnamed: 0'], axis = 1)

In [30]:
data_1.head()

Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


### Data preprocessing and repartition

- remove punctuations
- lower case everything
- reduce consecutive white spaces
- tokenisation



we repartition the data into intervals of 7 days with a padding period of 23 days:

- this means that for one period, we have 30 days of articles
- the padding is there to ensure the smooth transition between periods
- as well as to ensure we have just slightly more volume, since it is recommended to apply W2V on sufficiently large datasets

In [31]:
data_1.date = [datetime.datetime.strptime(day, '%Y-%m-%d') for day in data_1.date]

We filter the data to only focus on data from 2016

In [32]:
data_1 = data_1[[str(day).split('-')[0] == '2016' for day in data_1.date]].reset_index(drop = True)

remove all non alpha numeric characters

In [33]:
data_1.content = [re.sub('[\W_]+', ' ', text).lower().strip() for text in tqdm(data_1.content)]

100%|██████████| 28451/28451 [00:06<00:00, 4525.48it/s]


Tokenisation

In [34]:
p = Pool(8, maxtasksperchild=1)
tks_1 = p.map(nlp.tokenizer, tqdm(data_1.content))
p.close()

100%|██████████| 28451/28451 [02:00<00:00, 235.51it/s]


In [49]:
data_1.content = [[str(word) for word in list(tk)] for tk in tqdm(tks_1)]

100%|██████████| 28451/28451 [00:24<00:00, 1151.81it/s]


We repartition the data

In [36]:
min_date = min(data_1.date)
max_date = max(data_1.date)

In [37]:
min_date, max_date

(Timestamp('2016-01-01 00:00:00'), Timestamp('2016-12-31 00:00:00'))

In [51]:
# we could parallelise/multithread this process if the datasets are much bigger
begin_date = min_date
repartitioned_articles = []
for i in tqdm(range(int((int((max_date - min_date).days) - 30) / 7))):
    articles = data_1[[begin_date <= day < begin_date + datetime.timedelta(days = 30) for day in data_1.date]].content
    #with open(data_path+'all-the-news/articles_1_preprocessed.txt', 'w') as f:
    #    for article in articles:
    #       f.write(' '.join(article) + ' \n')
    begin_date += datetime.timedelta(days = 7)
    repartitioned_articles.append(articles)

100%|██████████| 47/47 [00:05<00:00,  8.35it/s]


We pickle the above file for later use

In [52]:
with open(data_path+'all-the-news/articles_1_preprocessed.pickle', 'wb') as handle:
    pickle.dump(repartitioned_articles, handle, protocol=pickle.HIGHEST_PROTOCOL)