In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import datetime
from tqdm import tqdm
import spacy
from multiprocessing import Pool
import pickle
import string

data_path = '../Data/'

In [2]:
# Load the spaCy English pipeline; the visible cells only use its tokenizer.
# NOTE(review): 'en' is a spaCy v2 shortcut link. spaCy v3 removed shortcut links and
# expects a full package name such as 'en_core_web_sm' — confirm the installed version.
nlp = spacy.load('en')

### Importing data

In [29]:
# Read the first "all-the-news" CSV shard, decoding it as UTF-8.
data_1 = pd.read_csv(data_path + 'all-the-news/articles1.csv', encoding = 'utf8')

In [30]:
# Discard the unnamed first CSV column (presumably a row index saved at export time).
data_1 = data_1.drop(columns = ['Unnamed: 0'])

In [31]:
# Preview the first rows of the loaded frame.
data_1.head()

Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


### Data preprocessing and repartition

- remove punctuation
- lower-case everything
- collapse consecutive white space
- tokenisation



We repartition the data into sliding periods that start every 7 days, each extended by a padding period of 23 days:

- this means that each period contains 30 days of articles
- the padding ensures a smooth transition between consecutive periods
- it also gives each period slightly more volume, since it is recommended to apply W2V (word2vec) to sufficiently large datasets

In [4]:
# Parse the 'YYYY-MM-DD' strings in one vectorised call instead of a Python-level
# strptime loop. Unlike the loop, this is safe to re-run on an already-parsed column
# and turns missing dates into NaT instead of raising TypeError; malformed strings
# still raise ValueError.
data_1.date = pd.to_datetime(data_1.date, format = '%Y-%m-%d')

We filter the data to only focus on data from 2016

In [26]:
# Keep only the articles dated 2016. The comparison uses the datetime accessor
# rather than string-splitting every timestamp; rows with a missing date (NaT)
# compare False and are dropped, as before.
data_1 = data_1[data_1.date.dt.year == 2016].reset_index(drop = True)

Remove all non-alphanumeric characters

In [27]:
# Tokenise every article in parallel with spaCy's tokenizer.
# The pool is a context manager so the worker processes are always cleaned up,
# even when tokenisation raises; the previous version only called close() on
# success and never joined the workers.
# maxtasksperchild=1 is kept as-is: each worker is replaced after a single task.
# NOTE(review): tks_1 is not read by any later cell visible in this notebook.
with Pool(8, maxtasksperchild=1) as p:
    tks_1 = p.map(nlp.tokenizer, tqdm(data_1.content))

100%|██████████| 28451/28451 [02:13<00:00, 213.81it/s]


# Replace every run of non-word characters and underscores with a single space,
# then lower-case and trim each article.
# The pattern is a raw string: '\W' inside a plain literal is an invalid escape
# sequence, which newer Python versions warn about.
data_1.content = [re.sub(r'[\W_]+', ' ', text).lower().strip() for text in tqdm(data_1.content)]

Tokenisation

We repartition the data

In [54]:
# Earliest and latest article dates. Series.min()/max() are vectorised and skip
# NaT, whereas the builtin min()/max() iterate element by element in Python.
min_date = data_1.date.min()
max_date = data_1.date.max()

In [55]:
# Display the date range covered by the filtered articles.
min_date, max_date

(Timestamp('2016-01-01 00:00:00'), Timestamp('2016-12-31 00:00:00'))

In [56]:
# we could parallelise/multithread this process if the datasets are much bigger
begin_date = min_date
repartitioned_articles = []
for i in tqdm(range(int((int((max_date - min_date).days) - 30) / 7))):
    articles = data_1[[begin_date <= day < begin_date + datetime.timedelta(days = 30) for day in data_1.date]].content
    #with open(data_path+'all-the-news/articles_1_preprocessed.txt', 'w') as f:
    #    for article in articles:
    #       f.write(' '.join(article) + ' \n')
    begin_date += datetime.timedelta(days = 7)
    repartitioned_articles.append(articles)


  0%|          | 0/47 [00:00<?, ?it/s][A
  2%|▏         | 1/47 [00:00<00:09,  4.77it/s][A
  4%|▍         | 2/47 [00:00<00:20,  2.20it/s][A
  6%|▋         | 3/47 [00:01<00:16,  2.70it/s][A
  9%|▊         | 4/47 [00:01<00:14,  3.06it/s][A
 11%|█         | 5/47 [00:01<00:12,  3.38it/s][A
 13%|█▎        | 6/47 [00:02<00:15,  2.56it/s][A
 15%|█▍        | 7/47 [00:02<00:14,  2.75it/s][A
 17%|█▋        | 8/47 [00:02<00:13,  2.95it/s][A
 19%|█▉        | 9/47 [00:02<00:12,  3.12it/s][A
 21%|██▏       | 10/47 [00:03<00:13,  2.74it/s][A
 23%|██▎       | 11/47 [00:03<00:12,  2.89it/s][A
 26%|██▌       | 12/47 [00:03<00:11,  3.02it/s][A
 28%|██▊       | 13/47 [00:04<00:12,  2.72it/s][A
 30%|██▉       | 14/47 [00:04<00:11,  2.84it/s][A
 32%|███▏      | 15/47 [00:05<00:10,  2.95it/s][A
 34%|███▍      | 16/47 [00:05<00:10,  3.03it/s][A
 36%|███▌      | 17/47 [00:06<00:10,  2.78it/s][A
 38%|███▊      | 18/47 [00:06<00:10,  2.87it/s][A
 40%|████      | 19/47 [00:06<00:09,  2.96it/s]

We pickle the repartitioned articles for later use

In [57]:
# Serialise the list of per-period article Series with the highest pickle protocol.
pickle_path = data_path+'all-the-news/articles_1_preprocessed.pickle'
with open(pickle_path, 'wb') as out_file:
    pickle.dump(repartitioned_articles, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
# Read back the pickle written to the same path by the previous cell.
# NOTE(review): the loaded object is the pickled list, not a DataFrame, despite the name `df`.
# pickle.load can execute arbitrary code, so only load files produced by this notebook.
with open(data_path+'all-the-news/articles_1_preprocessed.pickle', 'rb') as handle:
    df = pickle.load(handle)

## Testing out the class

In [1]:
from preprocessing import news_preprocess

In [2]:
# Instantiate the project's preprocessing helper.
# NOTE(review): `cores` is presumably the number of worker processes — confirm in preprocessing.py.
prepro = news_preprocess(cores = 8)

In [3]:
# NOTE(review): despite its name, data_dir holds the path of a single CSV file.
data_dir = '../Data/all-the-news/articles1.csv'
# 'content' and 'date' are presumably the text and date column names, and begin/end
# look like YYYYMMDD-encoded bounds (2017-01-01 to 2017-12-31) — confirm in preprocessing.py.
prepro.pre_process(data_dir, 'content', 'date', begin = 20170101, end = 20171231)

100%|██████████| 17908/17908 [01:46<00:00, 167.64it/s]
100%|██████████| 17908/17908 [01:00<00:00, 298.30it/s]


training for and detecting bigrams


100%|██████████| 17908/17908 [00:33<00:00, 542.28it/s]


the dataframe is preprocessed successfully


In [4]:
# NOTE(review): presumably a 30-day window advanced in 7-day steps, mirroring the
# manual repartition loop earlier in this notebook — confirm the argument order.
prepro.cut_and_slide(30,7)

100%|██████████| 20/20 [00:02<00:00,  8.53it/s]

articles repartitioned, they can be accessed at self.reparitioned_articles





In [7]:
# The argument is a path prefix; the next cell reads '…/articles_1_preprocessed.pickle',
# so the method presumably appends 'preprocessed.pickle' — confirm in preprocessing.py.
prepro.save_to_pickle(data_path+'all-the-news/articles_1_')

In [8]:
# Load the repartitioned articles written by the preprocessing helper.
# data_path already ends with '/', so the sub-path must not start with one:
# the previous '/all-the-news/…' produced a doubled separator, unlike every other path here.
# pickle.load can execute arbitrary code, so only load files produced by this project.
with open(data_path + 'all-the-news/articles_1_preprocessed.pickle', 'rb') as handle:
    article_lst = pickle.load(handle)

success

In [12]:
# Number of entries (periods) in the loaded list.
len(article_lst)

20

In [13]:
# Size of each entry of the loaded list (presumably articles per period).
[len(per) for per in article_lst]

[3525,
 3605,
 3548,
 3455,
 3446,
 3364,
 3360,
 3323,
 3207,
 3243,
 3404,
 3589,
 3556,
 3392,
 3037,
 2602,
 2500,
 2410,
 2435,
 2454]

In [15]:
# Save the second period (index 1) on its own.
# NOTE(review): this writes 'articles_1_1_preprocessed.pickle', while the bigram test
# further down loads 'articles_1_0_preprocessed.pickle' — confirm which file is intended.
# No protocol is passed here, so pickle's default protocol is used.
with open(data_path + 'all-the-news/articles_1_1_preprocessed.pickle', 'wb') as handle:
    pickle.dump(article_lst[1],handle)

## Testing bigrams

In [6]:
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser

In [7]:
# Load one pickled period of preprocessed articles.
# NOTE(review): no visible cell writes 'articles_1_0_preprocessed.pickle' — confirm it exists.
# pickle.load can execute arbitrary code, so only load files produced by this project.
with open(data_path + 'all-the-news/articles_1_0_preprocessed.pickle', 'rb') as handle:
    articles = pickle.load(handle)

In [19]:
# Fit the phrase (bigram) model on the first 10 entries of `articles` only.
# NOTE(review): with min_count=10, ten articles is a very small corpus — confirm the slice is intended.
phrases = Phrases(articles[:10], min_count=10, threshold=50)

In [20]:
# Wrap the fitted model in a Phraser, which is what gets applied to token lists below.
bigram = Phraser(phrases)

In [21]:
# Apply the bigram model to the second article and print the resulting tokens;
# detected phrases appear joined with '_' in the output (e.g. 'south_korea').
print(bigram[articles[1]])

['seoul', 'south_korea', 'north_korea', '’s', 'leader', 'kim', 'said', 'on_sunday', 'that', 'his', 'country', 'was', 'making', 'final', 'preparations', 'to', 'conduct', 'its', 'first', 'test', 'of', 'an_intercontinental', 'ballistic_missile', 'a', 'bold', 'statement', 'less_than', 'a', 'month', 'before', 'the', 'inauguration', 'of', 'donald_j.', 'trump', 'although', 'north_korea', 'has', 'conducted', 'five', 'nuclear', 'tests', 'in', 'the', 'last_decade', 'and', 'more_than', '20', 'ballistic_missile', 'tests', 'in', '2016', 'alone', 'and', 'although', 'it', 'habitually', 'threatens', 'to', 'attack', 'the', 'united_states', 'with', 'nuclear_weapons', 'the', 'country', 'has', 'never', 'an_intercontinental', 'ballistic_missile', 'or', 'icbm', 'in', 'his', 'annual_new', 'year', '’s', 'day', 'speech', 'which', 'was', 'broadcast', 'on', 'the', 'north', '’s', 'kctv', 'on_sunday', 'mr.', 'kim', 'spoke', 'proudly', 'of', 'the', 'strides', 'he', 'said', 'his', 'country', 'had', 'made', 'in', 'it

## Converting JSON Lines files to a DataFrame

In [4]:
import jsonlines

# Materialise every record of the Signal Media JSON Lines dump into a list.
with jsonlines.open(data_path + 'signalmedia-1m.jsonl', 'r') as reader:
    data = list(reader)

In [17]:
# Show every field of the first record, each under a banner with the field name.
first_record = data[0]
for field in first_record:
    print('*****',field,'*****')
    print(first_record[field])
    print('------------------------------')

***** id *****
f7ca322d-c3e8-40d2-841f-9d7250ac72ca
------------------------------
***** content *****
VETERANS saluted Worcester's first ever breakfast club for ex-soldiers which won over hearts, minds and bellies. 
 
The Worcester Breakfast Club for HM Forces Veterans met at the Postal Order in Foregate Street at 10am on Saturday. 
 
The club is designed to allow veterans a place to meet, socialise, eat and drink, giving hunger and loneliness their marching orders. 
 
Father-of-two Dave Carney, aged 43, of Merrimans Hill, Worcester, set up the club after being inspired by other similar clubs across the country. 
 
He said: "As you can see from the picture, we had a good response. Five out of the 10 that attended said they saw the article in the newspaper and turned up. 
 
"We even had an old chap travel from Droitwich and he was late on parade by three hours. 
 
"It's generated a lot of interest and I estimate (from other veterans who saw the article) that next month's meeting will a

In [5]:
# Build a frame with one row per JSON record and one column per field.
df = pd.DataFrame(data)

In [6]:
# Keep only the first 10 characters of each timestamp (the 'YYYY-MM-DD' part shown
# in the preview below). The vectorised .str slice replaces the Python loop and
# leaves missing values as NaN instead of raising TypeError.
df.published = df.published.str[:10]

In [25]:
# Turn carriage returns and tabs into spaces in a single pass over the articles,
# instead of rebuilding the whole column once per character.
_cr_tab_to_space = str.maketrans('\r\t', '  ')
df.content = df.content.str.translate(_cr_tab_to_space)

In [12]:
# Preview the first rows of the Signal Media frame.
df.head()

Unnamed: 0,content,id,media-type,published,source,title
0,VETERANS saluted Worcester's first ever breakf...,f7ca322d-c3e8-40d2-841f-9d7250ac72ca,News,2015-09-07,Redditch Advertiser,Worcester breakfast club for veterans gives hu...
1,New Product Gives Marketers Access to Real Key...,609772bc-0672-4db5-8516-4c025cfd54ca,News,2015-09-17,Virtualization Conference & Expo,Jumpshot Gives Marketers Renewed Visibility In...
2,Home »\rStyle » The Return Of The Nike Air Max...,1aa9d1b0-e6ba-4a48-ad0c-66552d896aac,Blog,2015-09-22,Streets Connect,The Return Of The Nike Air Max Sensation Has 8...
3,NYMag.com Daily Intelligencer Vulture The Cut ...,719699f9-47be-4bc7-969b-b53a881c95ae,Blog,2015-09-16,The Cut,This New Dating App Will Ruin Your Internet Game
4,"KUALA LUMPUR, Sept 15 (MySinchew) -- The Kuala...",a080f99a-07d9-47d1-8244-26a540017b7a,News,2015-09-15,My Sinchew,Pay up or face legal action: DBKL


In [26]:
# Export the frame as a comma-separated file.
# NOTE(review): the row index is written as well (to_csv's default), which is why the
# read-back shows an extra 'Unnamed: 0' column. Pass index=False if no consumer relies on it.
df.to_csv(data_path + 'signal.csv', sep = ',')

In [27]:
# Read the exported CSV back and preview it to check that the file parses.
pd.read_csv(data_path + 'signal.csv', sep = ',').head(5)

Unnamed: 0.1,Unnamed: 0,content,id,media-type,published,source,title
0,0,VETERANS saluted Worcester's first ever breakf...,f7ca322d-c3e8-40d2-841f-9d7250ac72ca,News,2015-09-07,Redditch Advertiser,Worcester breakfast club for veterans gives hu...
1,1,New Product Gives Marketers Access to Real Key...,609772bc-0672-4db5-8516-4c025cfd54ca,News,2015-09-17,Virtualization Conference & Expo,Jumpshot Gives Marketers Renewed Visibility In...
2,2,Home » Style » The Return Of The Nike Air Max ...,1aa9d1b0-e6ba-4a48-ad0c-66552d896aac,Blog,2015-09-22,Streets Connect,The Return Of The Nike Air Max Sensation Has 8...
3,3,NYMag.com Daily Intelligencer Vulture The Cut ...,719699f9-47be-4bc7-969b-b53a881c95ae,Blog,2015-09-16,The Cut,This New Dating App Will Ruin Your Internet Game
4,4,"KUALA LUMPUR, Sept 15 (MySinchew) -- The Kuala...",a080f99a-07d9-47d1-8244-26a540017b7a,News,2015-09-15,My Sinchew,Pay up or face legal action: DBKL


In [18]:
import csv

# Walk the exported CSV record by record and report where parsing fails.
# newline='' lets the csv module handle line endings inside quoted fields itself,
# and the encoding is explicit rather than taken from the platform default.
with open(r"../Data/signal.csv", 'r', newline='', encoding='utf8') as f:
    reader = csv.reader(f)
    # Counts parsed records (1-based), not physical lines: one record may span several lines.
    linenumber = 1
    try:
        for row in reader:
            linenumber += 1
    except Exception as e:
        # Python 3 exceptions have no `.message` attribute; formatting the exception
        # itself avoids an AttributeError being raised from inside this handler.
        print("Error line %d: %s %s" % (linenumber, str(type(e)), e))

In [19]:
# Positions of the articles whose text contains a carriage return.
# NOTE(review): this displays the whole list; consider showing only its length or a short slice.
[i for i, article in enumerate(df.content) if '\r' in article]

[2,
 27,
 29,
 32,
 35,
 44,
 52,
 61,
 75,
 81,
 82,
 87,
 90,
 92,
 93,
 105,
 129,
 133,
 136,
 137,
 138,
 140,
 149,
 184,
 185,
 196,
 211,
 229,
 263,
 280,
 289,
 291,
 297,
 309,
 311,
 312,
 313,
 316,
 319,
 320,
 331,
 343,
 344,
 354,
 362,
 370,
 376,
 386,
 394,
 404,
 405,
 437,
 440,
 443,
 451,
 458,
 459,
 461,
 468,
 485,
 495,
 510,
 512,
 528,
 530,
 538,
 566,
 584,
 589,
 609,
 615,
 617,
 636,
 640,
 653,
 672,
 673,
 690,
 693,
 697,
 728,
 738,
 743,
 762,
 764,
 768,
 781,
 785,
 798,
 801,
 813,
 815,
 818,
 819,
 822,
 829,
 835,
 853,
 855,
 856,
 860,
 870,
 872,
 893,
 895,
 901,
 907,
 928,
 944,
 959,
 961,
 982,
 1004,
 1020,
 1053,
 1055,
 1069,
 1084,
 1115,
 1118,
 1125,
 1136,
 1139,
 1143,
 1151,
 1178,
 1181,
 1186,
 1198,
 1207,
 1211,
 1225,
 1233,
 1235,
 1245,
 1249,
 1255,
 1261,
 1263,
 1292,
 1302,
 1303,
 1304,
 1305,
 1319,
 1333,
 1336,
 1351,
 1358,
 1359,
 1366,
 1369,
 1375,
 1377,
 1394,
 1416,
 1430,
 1448,
 1473,
 1488,
 1494,
 

In [22]:
# Inspect one of the articles flagged above.
# NOTE(review): the stored output still contains '\t', and this cell's execution count
# (22) is lower than the replacement cell's (25), so it presumably predates the tab clean-up.
df.content[2]

'Home » Style » The Return Of The Nike Air Max Sensation Has 80’s Babies Hyped!\tPosted on Sep 22, 2015 If you were a basketball fan who was born in the 80s, you were lucky enough to witness the beauty that is 90s basketball. It was truly a great time to be basketball fan. If you played close attention to what the players were wearing on their feet you would have also noticed the wide array of footwear these player used to rock. One of those happens to the the Nike Air Max Sensation, which is also set to receive the retro treatment this year! Originally released back in …read more Author: KicksOnFire\tShare This Post On GoogleFacebookTwitter'