# Tokenize and Train

### Author 
Stephen Lee

### Goal
Classify news source based on the article text. Training data: 
- Fox News
- Vox News
- PBS News

### Date 
4.8.19

## Read Data

In [1]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import GRU, Dense, Bidirectional, LSTM

import os 
import math 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn import metrics

Using TensorFlow backend.
  _nan_object_mask = _nan_object_array != _nan_object_array


In [45]:
FOLDER_READ = '/home/smlee_981/data'
FILE = 'clean_article_df.csv'

In [46]:
os.getcwd()

'/home/smlee_981/data'

In [47]:
os.chdir(FOLDER_READ)

In [48]:
os.listdir()

['clean_article_df.csv', 'glove.840B.300d.txt']

In [49]:
df_all = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)
df_all.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,Bolton warns Venezuela's Maduro to stay away f...,Bolton warns Venezuela's Maduro to stay away f...,3
1,fox_politics_390,Fox,Ocasio-Cortez rallies to stop all fossil fuel ...,Ocasio-Cortez rallies to stop all fossil fuel ...,3
2,fox_politics_423,Fox,The Pentagon announced Sunday the deployment o...,The Pentagon announced Sunday the deployment o...,3
3,fox_politics_102,Fox,Mayor Bill de Blasio says that US Rep. Alexand...,Mayor Bill de Blasio says that US Rep. Alexand...,3
4,fox_politics_492,Fox,Who is EPA's Andrew Wheeler?\nEPA administrato...,Who is EPA's Andrew Wheeler?\nEPA administrato...,3


## Remove PBS

In [50]:
df_all = df_all[df_all['source'] != "PBS"]
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,476,476,476,476
Vox,1027,1027,1027,1027


## Duplicate Fox to balance

In [51]:
fox = df_all[df_all['source'] == 'Fox']
fox.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,Bolton warns Venezuela's Maduro to stay away f...,Bolton warns Venezuela's Maduro to stay away f...,3
1,fox_politics_390,Fox,Ocasio-Cortez rallies to stop all fossil fuel ...,Ocasio-Cortez rallies to stop all fossil fuel ...,3
2,fox_politics_423,Fox,The Pentagon announced Sunday the deployment o...,The Pentagon announced Sunday the deployment o...,3
3,fox_politics_102,Fox,Mayor Bill de Blasio says that US Rep. Alexand...,Mayor Bill de Blasio says that US Rep. Alexand...,3
4,fox_politics_492,Fox,Who is EPA's Andrew Wheeler?\nEPA administrato...,Who is EPA's Andrew Wheeler?\nEPA administrato...,3


In [52]:
df_all = df_all.append(fox, ignore_index=True)
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,952,952,952,952
Vox,1027,1027,1027,1027


## Relabel the targets

In [54]:
def label_fox(source):
    if source == "Fox":
        return 1 
    elif source == "Vox": 
        return 0
    else: 
        print(source)
        return None

targets = np.array([label_fox(t) for t in tqdm(df_all["source"])])
df_all['targets'] = targets
df_all.head()

100%|██████████| 1979/1979 [00:00<00:00, 963497.11it/s]


Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,Bolton warns Venezuela's Maduro to stay away f...,Bolton warns Venezuela's Maduro to stay away f...,1
1,fox_politics_390,Fox,Ocasio-Cortez rallies to stop all fossil fuel ...,Ocasio-Cortez rallies to stop all fossil fuel ...,1
2,fox_politics_423,Fox,The Pentagon announced Sunday the deployment o...,The Pentagon announced Sunday the deployment o...,1
3,fox_politics_102,Fox,Mayor Bill de Blasio says that US Rep. Alexand...,Mayor Bill de Blasio says that US Rep. Alexand...,1
4,fox_politics_492,Fox,Who is EPA's Andrew Wheeler?\nEPA administrato...,Who is EPA's Andrew Wheeler?\nEPA administrato...,1


In [55]:
df_all.groupby('source').mean()

Unnamed: 0_level_0,targets
source,Unnamed: 1_level_1
Fox,1
Vox,0


## Split into test and training

In [56]:
train_df, test_df = train_test_split(df_all, test_size=0.1)
train_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
164,fox_politics_496,Fox,'Draft Beto' grassroots effort urges Texas Dem...,'Draft Beto' grassroots effort urges Texas Dem...,1
1814,fox_politics_403,Fox,President Trump calls for border security in m...,President Trump calls for border security in m...,1
377,fox_politics_398,Fox,Sen. Graham slams Nancy Pelosi for being 'absu...,Sen. Graham slams Nancy Pelosi for being 'absu...,1
88,fox_politics_600,Fox,Sen. Kamala Harris: Facts still unfolding in J...,Sen. Kamala Harris: Facts still unfolding in J...,1
1616,fox_politics_598,Fox,Howard Kurtz: A 77-year-old curmudgeon in a cr...,Howard Kurtz: A 77-year-old curmudgeon in a cr...,1


In [57]:
test_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
1172,vox_politics_709,Vox,This is an effort to imagine the Affordable Ca...,This is an effort to imagine the Affordable Ca...,0
1055,vox_politics_703,Vox,Senate Republican leaders are postponing a vot...,Senate Republican leaders are postponing a vot...,0
1622,fox_politics_574,Fox,Is Gov. Hickenlooper looking ahead to 2020?\nA...,Is Gov. Hickenlooper looking ahead to 2020?\nA...,1
11,fox_politics_451,Fox,Trump 2020 campaign vows to go toe-to-toe with...,Trump 2020 campaign vows to go toe-to-toe with...,1
648,vox_politics_943,Vox,Vermont is now the ninth state to legalize mar...,Vermont is now the ninth state to legalize mar...,0


### Check for similarity between test and training

In [58]:
test_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,102,102,102,102
Vox,96,96,96,96


In [59]:
train_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,850,850,850,850
Vox,931,931,931,931


## Get Embeddings, Build Functions

In [60]:
from tqdm import tqdm
import numpy as np

In [61]:
# glove embeddings and data are in same folder

EMBEDS = 'glove.840B.300d.txt'

embeddings_index = {}
 
with open(EMBEDS, encoding='utf8') as embed:
    for line in tqdm(embed):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print("Found {n} word vectors".format(n=len(embeddings_index)))

2196017it [03:15, 11240.02it/s]

Found 2196016 word vectors





In [62]:
def text_to_array(text, article_length=500):
    empty_emb = np.zeros(300)                   # each word is represented by a length 300 vector
    text = text[:-1].split()[:article_length]   # each article is length 10000
    
    # look for word embedding, return zero array otherwise. 
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds += [empty_emb] * (article_length - len(embeds))
    return np.array(embeds)

In [63]:
def batch_gen(train_df, batch_size=64, article_length=500):
    n = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.0)
        
        for i in range(n):
            texts = train_df['article'][i*batch_size: (i+1)*batch_size]
            targets = np.array(train_df['targets'][i*batch_size: (i+1)*batch_size])
            text_arr = np.array([text_to_array(text, article_length=article_length) for text in texts])
            yield text_arr, targets

## Setup Model

In [64]:
ARTICLE_LENGTH = 500
BATCH_SIZE = 64
DROPOUT = 0.1
REC_DROPOUT = 0.1

In [65]:
# witheld for validation
 
x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array(test_df["targets"])

100%|██████████| 198/198 [00:00<00:00, 588.94it/s]


In [66]:
# note...
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model = Sequential()
model.add(Bidirectional(LSTM(lstm_in, return_sequences=True, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))
model.add(Bidirectional(LSTM(lstm_in)))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_3 (Bidirection (None, 500, 64)           85248     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 110,145
Trainable params: 110,145
Non-trainable params: 0
_________________________________________________________________


In [67]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f5811d8e4a8>

In [79]:
y_pred = model.predict(x_test)
y_pred[:7] > 0.5

array([[False],
       [False],
       [ True],
       [ True],
       [False],
       [ True],
       [ True]], dtype=bool)

In [81]:
test_df[['source', 'clean_articles', 'targets']].head(7)

Unnamed: 0,source,clean_articles,targets
1172,Vox,This is an effort to imagine the Affordable Ca...,0
1055,Vox,Senate Republican leaders are postponing a vot...,0
1622,Fox,Is Gov. Hickenlooper looking ahead to 2020?\nA...,1
11,Fox,Trump 2020 campaign vows to go toe-to-toe with...,1
648,Vox,Vermont is now the ninth state to legalize mar...,0
454,Fox,Trump on Rashida Tlaib’s impeachment comments:...,1
1775,Fox,Judge delays sentencing for Michael Flynn\nSen...,1


In [72]:
for i in np.arange(0.25, 0.75, 0.05):
    res = metrics.f1_score(y_test, y_pred > i)
    print("Threshold {i}, f1 score {res}".format(i=round(i,2),res=res))

Threshold 0.25, f1 score 0.9852216748768472
Threshold 0.3, f1 score 0.9852216748768472
Threshold 0.35, f1 score 0.9852216748768472
Threshold 0.4, f1 score 0.9852216748768472
Threshold 0.45, f1 score 0.9852216748768472
Threshold 0.5, f1 score 0.9852216748768472
Threshold 0.55, f1 score 0.9852216748768472
Threshold 0.6, f1 score 0.9852216748768472
Threshold 0.65, f1 score 0.9852216748768472
Threshold 0.7, f1 score 0.9852216748768472
