In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import datetime
from tqdm import tqdm
import spacy
from multiprocessing import Pool
import pickle
import string

data_path = '../Data/'

In [2]:
nlp = spacy.load('en')

### Importing data

In [2]:
data_1 = pd.read_csv(data_path + 'all-the-news/articles1.csv', encoding = 'utf8')

In [3]:
data_1 = data_1.drop(['Unnamed: 0'], axis = 1)

In [5]:
data_1.head()

Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


### Data preprocessing and repartition

- remove punctuations
- lower case everything
- reduce consecutive white spaces
- tokenisation



we repartition the data into intervals of 7 days with a padding period of 23 days:

- this means that for one period, we have 30 days of articles
- the padding is there to ensure the smooth transition between periods
- as well as to ensure we have just slightly more volume, since it is recommended to apply W2V on sufficiently large datasets

In [4]:
data_1.date = [datetime.datetime.strptime(day, '%Y-%m-%d') for day in data_1.date]

We filter the data to only focus on data from 2016

In [26]:
data_1 = data_1[[str(day).split('-')[0] == '2016' for day in data_1.date]].reset_index(drop = True)

remove all non alpha numeric characters

In [27]:
p = Pool(8, maxtasksperchild=1)
tks_1 = p.map(nlp.tokenizer, tqdm(data_1.content))
p.close()

100%|██████████| 28451/28451 [02:13<00:00, 213.81it/s]


data_1.content = [re.sub('[\W_]+', ' ', text).lower().strip() for text in tqdm(data_1.content)]

Tokenisation

We repartition the data

In [54]:
min_date = min(data_1.date)
max_date = max(data_1.date)

In [55]:
min_date, max_date

(Timestamp('2016-01-01 00:00:00'), Timestamp('2016-12-31 00:00:00'))

In [56]:
# we could parallelise/multithread this process if the datasets are much bigger
begin_date = min_date
repartitioned_articles = []
for i in tqdm(range(int((int((max_date - min_date).days) - 30) / 7))):
    articles = data_1[[begin_date <= day < begin_date + datetime.timedelta(days = 30) for day in data_1.date]].content
    #with open(data_path+'all-the-news/articles_1_preprocessed.txt', 'w') as f:
    #    for article in articles:
    #       f.write(' '.join(article) + ' \n')
    begin_date += datetime.timedelta(days = 7)
    repartitioned_articles.append(articles)


  0%|          | 0/47 [00:00<?, ?it/s][A
  2%|▏         | 1/47 [00:00<00:09,  4.77it/s][A
  4%|▍         | 2/47 [00:00<00:20,  2.20it/s][A
  6%|▋         | 3/47 [00:01<00:16,  2.70it/s][A
  9%|▊         | 4/47 [00:01<00:14,  3.06it/s][A
 11%|█         | 5/47 [00:01<00:12,  3.38it/s][A
 13%|█▎        | 6/47 [00:02<00:15,  2.56it/s][A
 15%|█▍        | 7/47 [00:02<00:14,  2.75it/s][A
 17%|█▋        | 8/47 [00:02<00:13,  2.95it/s][A
 19%|█▉        | 9/47 [00:02<00:12,  3.12it/s][A
 21%|██▏       | 10/47 [00:03<00:13,  2.74it/s][A
 23%|██▎       | 11/47 [00:03<00:12,  2.89it/s][A
 26%|██▌       | 12/47 [00:03<00:11,  3.02it/s][A
 28%|██▊       | 13/47 [00:04<00:12,  2.72it/s][A
 30%|██▉       | 14/47 [00:04<00:11,  2.84it/s][A
 32%|███▏      | 15/47 [00:05<00:10,  2.95it/s][A
 34%|███▍      | 16/47 [00:05<00:10,  3.03it/s][A
 36%|███▌      | 17/47 [00:06<00:10,  2.78it/s][A
 38%|███▊      | 18/47 [00:06<00:10,  2.87it/s][A
 40%|████      | 19/47 [00:06<00:09,  2.96it/s]

We pickle the above file for later use

In [57]:
with open(data_path+'all-the-news/articles_1_preprocessed.pickle', 'wb') as handle:
    pickle.dump(repartitioned_articles, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
with open(data_path+'all-the-news/articles_1_preprocessed.pickle', 'rb') as handle:
    df = pickle.load(handle)

## Testing out the class

In [2]:
from preprocessing import news_preprocess

In [3]:
prepro = news_preprocess(cores = 8)

In [4]:
data_dir = '../Data/all-the-news/articles1.csv'
prepro.pre_process(data_dir, 'content', 'date', begin = 20170101, end = 20171231)

100%|██████████| 17908/17908 [01:41<00:00, 176.62it/s]
100%|██████████| 17908/17908 [00:52<00:00, 340.40it/s]


the dataframe is preprocessed successfully


In [5]:
prepro.cut_and_slide(30,7)

100%|██████████| 20/20 [00:02<00:00,  8.37it/s]

articles repartitioned, they can be accessed at self.reparitioned_articles





In [6]:
prepro.save_to_pickle(data_path+'all-the-news/')

In [20]:
with open(data_path + 'articles_1_preprocessed.pickle', 'rb') as handle:
    article_lst = pickle.load(handle)

success