# Create dataset with articles as embeddings

### Author 
Stephen Lee

### Goal
Input raw .txt files of articles and output a dataframe with the articles as word embeddings.

### Date 
6.12.19

## Read Data, Remove Missing Values

In [1]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 


import os 
import math 
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
os.getcwd()

'/home/stephen/Dropbox/General/Projects/Thesis/code/clean-data'

In [3]:
FOLDER_READ = '/home/stephen/Dropbox/General/Projects/Thesis/data'
FILE = 'articles.csv'
ARTICLE_LENGTH = 1000    # max length for an article

In [4]:
os.chdir(FOLDER_READ)

In [5]:
os.listdir()

['vox-politics.tar.gz',
 'pbs-politics.tar.gz',
 'articles.csv',
 'bbc-articles.tar.gz',
 'pbs-politics',
 'fox-politics',
 'fox-politics.tar.gz',
 'vox-politics',
 'archive']

In [6]:
df_all = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)
df_all.head()

Unnamed: 0,article id,source,article
0,fox_politics_166,Fox,Video\n<br>\nFormer New Jersey Gov. Chris Chri...
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B..."
2,fox_politics_423,Fox,"Video\nHoward Kurtz: How Michael Cohen, Democr..."
3,fox_politics_102,Fox,Video\nStudent Union: Make UC Berkeley a sanct...
4,fox_politics_492,Fox,Video\nPresident Trump’s health care executive...


In [7]:
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,article
source,Unnamed: 1_level_1,Unnamed: 2_level_1
Fox,1024,1023
PBS,1752,1752
Vox,2000,1938


In [8]:
df_all = df_all.dropna()
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,article
source,Unnamed: 1_level_1,Unnamed: 2_level_1
Fox,1023,1023
PBS,1752,1752
Vox,1938,1938


### Check for and remove duplicates

In [9]:
df_all.groupby("source").describe()

Unnamed: 0_level_0,article,article,article,article,article id,article id,article id,article id
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,1023,661,Video\nJudicial nominee Neomi Rao may get 'Kav...,4,1023,1023,fox_politics_909,1
PBS,1752,1739,"It is messy, tentacled, and increasingly confu...",5,1752,1752,pbs_politics_556,1
Vox,1938,1027,"Part of The 2018 midterm elections, explained",152,1938,1938,vox_politics_1877,1


In [10]:
df_all = df_all.drop_duplicates('article', keep='first')
df_all.groupby("source").describe()

Unnamed: 0_level_0,article,article,article,article,article id,article id,article id,article id
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,661,661,Video\nOcasio-Cortez suggests a 60 to 70 perce...,1,661,661,fox_politics_365,1
PBS,1739,1739,"ALEXANDRIA, Va. — Prosecutors will ask jurors ...",1,1739,1739,pbs_politics_556,1
Vox,1027,1027,President Donald Trump has declared a national...,1,1027,1027,vox_politics_1514,1


### Replace "video" with ""
This fixes a potential issue with the fox news articles, many of which start with "video\n..."

In [11]:
df_all['article'] = df_all['article'].str.replace('Video\n','')
df_all.head()

Unnamed: 0,article id,source,article
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B..."
2,fox_politics_423,Fox,"Howard Kurtz: How Michael Cohen, Democrats sto..."
3,fox_politics_102,Fox,Student Union: Make UC Berkeley a sanctuary ca...
4,fox_politics_492,Fox,President Trump’s health care executive order:...


### Look at text from Fox and Vox and PBS for idiosyncracies 

In [12]:
df_all[df_all['source'] == "Fox"].head(15)

Unnamed: 0,article id,source,article
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B..."
2,fox_politics_423,Fox,"Howard Kurtz: How Michael Cohen, Democrats sto..."
3,fox_politics_102,Fox,Student Union: Make UC Berkeley a sanctuary ca...
4,fox_politics_492,Fox,President Trump’s health care executive order:...
5,fox_politics_554,Fox,Former Bernie Sanders campaign staffer on repo...
6,fox_politics_490,Fox,Trump takes on ObamaCare subsidies\nBrad Blake...
7,fox_politics_590,Fox,Washington State proposes a new carbon tax\nTh...
8,fox_politics_1,Fox,Do voters think Ocasio-Cortez's Green New Deal...
9,fox_politics_971,Fox,President Trump gave Dr. Ronny Jackson a clean...


In [13]:
df_all[df_all['source'] == "Vox"].head(15)

Unnamed: 0,article id,source,article
1024,vox_politics_396,Vox,Senate Republicans on Thursday revealed the Be...
1025,vox_politics_372,Vox,"“New York will be destroyed,” the state’s Gov...."
1026,vox_politics_602,Vox,The Trump administration wants to send a messa...
1027,vox_politics_1198,Vox,"Donald Trump’s long, improbable journey to pol..."
1028,vox_politics_682,Vox,The Trump administration threw the fate of the...
1029,vox_politics_1634,Vox,"On Wednesday, the White House released a state..."
1030,vox_politics_976,Vox,"Part of The 2018 midterm elections, explained"
1031,vox_politics_590,Vox,Part of Understanding the Trump era
1032,vox_politics_71,Vox,Republicans and Democrats in Congress have fin...
1033,vox_politics_714,Vox,"Two months ago, things looked dire for Obamaca..."


In [14]:
df_all[df_all['source'] == "PBS"].head(15)

Unnamed: 0,article id,source,article
3024,pbs_politics_396,PBS,President Donald Trump’s longtime personal law...
3025,pbs_politics_372,PBS,WASHINGTON — Facing a midnight deadline to avo...
3026,pbs_politics_602,PBS,WASHINGTON — President Donald Trump is exagger...
3027,pbs_politics_1198,PBS,\nPresident Donald Trump says newly confirmed ...
3028,pbs_politics_682,PBS,President Donald Trump is adding a new lawyer ...
3029,pbs_politics_1634,PBS,DALLAS — U.S. Rep. Joe Barton told a woman tha...
3030,pbs_politics_976,PBS,WASHINGTON — President Donald Trump said Tuesd...
3031,pbs_politics_590,PBS,"ALEXANDRIA, Va. — In a blistering back-and for..."
3032,pbs_politics_71,PBS,WASHINGTON (AP) — Michael Cohen’s closed-door ...
3033,pbs_politics_714,PBS,Supreme Court nominee Brett Kavanaugh says he ...


### Remove location text in PBS
This will remove, for example, "DETROIT --- Start of article..." 

In [15]:
df_all['article'] = df_all['article'].str.replace('WASHINGTON', '')
df_all[df_all['source'] == "PBS"].head(20)

Unnamed: 0,article id,source,article
3024,pbs_politics_396,PBS,President Donald Trump’s longtime personal law...
3025,pbs_politics_372,PBS,— Facing a midnight deadline to avoid a parti...
3026,pbs_politics_602,PBS,— President Donald Trump is exaggerating the ...
3027,pbs_politics_1198,PBS,\nPresident Donald Trump says newly confirmed ...
3028,pbs_politics_682,PBS,President Donald Trump is adding a new lawyer ...
3029,pbs_politics_1634,PBS,DALLAS — U.S. Rep. Joe Barton told a woman tha...
3030,pbs_politics_976,PBS,— President Donald Trump said Tuesday that th...
3031,pbs_politics_590,PBS,"ALEXANDRIA, Va. — In a blistering back-and for..."
3032,pbs_politics_71,PBS,(AP) — Michael Cohen’s closed-door testimony ...
3033,pbs_politics_714,PBS,Supreme Court nominee Brett Kavanaugh says he ...


In [16]:
df_all['clean_articles'] = df_all['article'].str.replace(u"\u2014", "")
df_all[df_all['source'] == "PBS"].head(25)

Unnamed: 0,article id,source,article,clean_articles
3024,pbs_politics_396,PBS,President Donald Trump’s longtime personal law...,President Donald Trump’s longtime personal law...
3025,pbs_politics_372,PBS,— Facing a midnight deadline to avoid a parti...,Facing a midnight deadline to avoid a partia...
3026,pbs_politics_602,PBS,— President Donald Trump is exaggerating the ...,President Donald Trump is exaggerating the n...
3027,pbs_politics_1198,PBS,\nPresident Donald Trump says newly confirmed ...,\nPresident Donald Trump says newly confirmed ...
3028,pbs_politics_682,PBS,President Donald Trump is adding a new lawyer ...,President Donald Trump is adding a new lawyer ...
3029,pbs_politics_1634,PBS,DALLAS — U.S. Rep. Joe Barton told a woman tha...,DALLAS U.S. Rep. Joe Barton told a woman that...
3030,pbs_politics_976,PBS,— President Donald Trump said Tuesday that th...,President Donald Trump said Tuesday that the...
3031,pbs_politics_590,PBS,"ALEXANDRIA, Va. — In a blistering back-and for...","ALEXANDRIA, Va. In a blistering back-and fort..."
3032,pbs_politics_71,PBS,(AP) — Michael Cohen’s closed-door testimony ...,(AP) Michael Cohen’s closed-door testimony b...
3033,pbs_politics_714,PBS,Supreme Court nominee Brett Kavanaugh says he ...,Supreme Court nominee Brett Kavanaugh says he ...


## Tokenize

In [18]:
from tqdm import tqdm

PATH = '/home/stephen/Dropbox/CodeWorkspace/data-sets/NLP/quora-questions/embeddings/glove.840B.300d/'
FILE = PATH + 'glove.840B.300d.txt'

embeddings_index = {}
 
with open(FILE, encoding='utf8') as embed:
    for line in tqdm(embed):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print(f"Found {len(embeddings_index)} word vectors")

2196017it [04:19, 8461.66it/s]

Found 2196016 word vectors





In [19]:
def text_to_array(text, article_length=ARTICLE_LENGTH):
    empty_emb = np.zeros(300)                   # each word is represented by a length 300 vector
    text = text[:-1].split()[:article_length]   # each article is length 10000
    
    # look for word embedding, return zero array otherwise. 
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds += [empty_emb] * (article_length - len(embeds))
    return np.array(embeds)

In [35]:
def batch_gen(train_df, batch_size=128):
    n = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.0)
        
        for i in range(n):
            texts = train_df['article'][i*batch_size: (i+1)*batch_size]
            text_arr = np.array([text_to_array(text) for text in texts])
            targets = np.array(train_df['targets'][i*batch_size: (i+1)*batch_size])
            yield text_arr, targets

In [17]:
TARGETS = 5
tokenizer = Tokenizer(num_words=TARGETS)
tokenizer.fit_on_texts(df_all['source'])

targets = tokenizer.texts_to_sequences(df_all['source'])
df_all['targets'] = [i[0] for i in targets]
df_all.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,<br>\nFormer New Jersey Gov. Chris Christie sa...,3
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...","FILE--In this July 28, 2016 file photo, Sen. B...",3
2,fox_politics_423,Fox,"Howard Kurtz: How Michael Cohen, Democrats sto...","Howard Kurtz: How Michael Cohen, Democrats sto...",3
3,fox_politics_102,Fox,Student Union: Make UC Berkeley a sanctuary ca...,Student Union: Make UC Berkeley a sanctuary ca...,3
4,fox_politics_492,Fox,President Trump’s health care executive order:...,President Trump’s health care executive order:...,3


In [18]:
df_all[df_all['source'] == "Vox"].head()

Unnamed: 0,article id,source,article,clean_articles,targets
1024,vox_politics_396,Vox,Senate Republicans on Thursday revealed the Be...,Senate Republicans on Thursday revealed the Be...,2
1025,vox_politics_372,Vox,"“New York will be destroyed,” the state’s Gov....","“New York will be destroyed,” the state’s Gov....",2
1026,vox_politics_602,Vox,The Trump administration wants to send a messa...,The Trump administration wants to send a messa...,2
1027,vox_politics_1198,Vox,"Donald Trump’s long, improbable journey to pol...","Donald Trump’s long, improbable journey to pol...",2
1028,vox_politics_682,Vox,The Trump administration threw the fate of the...,The Trump administration threw the fate of the...,2


In [19]:
df_all[df_all['source'] == "PBS"].head()

Unnamed: 0,article id,source,article,clean_articles,targets
3024,pbs_politics_396,PBS,President Donald Trump’s longtime personal law...,President Donald Trump’s longtime personal law...,1
3025,pbs_politics_372,PBS,— Facing a midnight deadline to avoid a parti...,Facing a midnight deadline to avoid a partia...,1
3026,pbs_politics_602,PBS,— President Donald Trump is exaggerating the ...,President Donald Trump is exaggerating the n...,1
3027,pbs_politics_1198,PBS,\nPresident Donald Trump says newly confirmed ...,\nPresident Donald Trump says newly confirmed ...,1
3028,pbs_politics_682,PBS,President Donald Trump is adding a new lawyer ...,President Donald Trump is adding a new lawyer ...,1


### Save df

In [20]:
df_all.to_csv('clean_article_df.csv', sep='|')

## Split into test and training sets

In [23]:
train_df, test_df = train_test_split(df_all, test_size=0.1)
train_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
4040,pbs_politics_1292,PBS,(AP) — President Donald Trump on Tuesday issu...,(AP) President Donald Trump on Tuesday issue...,1
4031,pbs_politics_648,PBS,When he lost his wife and 1-year-old daughter ...,When he lost his wife and 1-year-old daughter ...,1
4226,pbs_politics_261,PBS,— When you want results in a polarized Washin...,When you want results in a polarized Washing...,1
4185,pbs_politics_559,PBS,— President Donald Trump is edging closer to ...,President Donald Trump is edging closer to d...,1
4389,pbs_politics_589,PBS,"HELSINKI, Finland — President Donald Trump, wh...","HELSINKI, Finland President Donald Trump, whi...",1


In [24]:
test_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
2999,pbs_politics_1191,PBS,"— Nearly two years out of the White House, fo...","Nearly two years out of the White House, for...",1
2589,vox_politics_54,Vox,President Donald Trump has officially declared...,President Donald Trump has officially declared...,2
1994,vox_politics_1719,Vox,House Republicans passed their tax reform plan...,House Republicans passed their tax reform plan...,2
4086,pbs_politics_1350,PBS,— Dealing a serious blow to President Donald ...,Dealing a serious blow to President Donald T...,1
4066,pbs_politics_428,PBS,— Seeking to move past the shadow of the Russ...,Seeking to move past the shadow of the Russi...,1


### Check how similar test and train are

In [25]:
test_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,40,40,40,40
PBS,176,176,176,176
Vox,109,109,109,109


In [26]:
train_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,436,436,436,436
PBS,1563,1563,1563,1563
Vox,918,918,918,918


In [27]:
test_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,40.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0
PBS,176.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,109.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0


In [28]:
train_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,436.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0
PBS,1563.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,918.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0


## Summary 

#### We have done the following: 
- Read in the data 
- Removed missing values
- Removed duplicates
- Performed string replacements to remove idosyncracies 
- Split data into test and training
- Checked for similarity between test and training 
- Build function to tokenize articles with pretrained embeddings 
- Tokenize source

#### Next
- Set up Bidirectional LSTM 
- Play with different configurations 

## Train

In [30]:
from keras.models import Sequential
from keras.layers import GRU, Dense, Bidirectional, LSTM

In [29]:
# witheld for validation
test_articles = np.array([text_to_array(x) for x in tqdm(test_df["clean_articles"])])
test_target = np.array(test_df["targets"])

100%|██████████| 325/325 [00:00<00:00, 469.65it/s]


In [33]:
input_shape = (ARTICLE_LENGTH, 300)

model = Sequential()
model.add(Bidirectional(LSTM(64, return_sequences=True, dropout=0.05, recurrent_dropout=0.1), \
                        input_shape=input_shape))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_3 (Bidirection (None, 1000, 128)         186880    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 285,825
Trainable params: 285,825
Non-trainable params: 0
_________________________________________________________________


In [None]:
data = batch_gen(train_df)
model.fit_generator(data, epochs=2, steps_per_epoch=1000, \
                    validation_data=(test_articles, test_target), \
                    verbose=True)

Epoch 1/2
