# Vox and PBS

### Author 
Stephen Lee

### Goal
Classify news source based on the article text. Training data: 
- Vox News
- PBS News

### Date 
4.8.19

## Read Data

In [18]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import GRU, Dense, Bidirectional, LSTM, Activation

import os 
import math 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
FOLDER_READ = '/home/smlee_981/data'
FILE = 'clean_article_df.csv'

In [3]:
os.getcwd()

'/home/smlee_981'

In [4]:
os.chdir(FOLDER_READ)

In [5]:
os.listdir()

['clean_article_df.csv', 'glove.840B.300d.txt']

In [6]:
df_all = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)
df_all.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,Bolton warns Venezuela's Maduro to stay away f...,Bolton warns Venezuela's Maduro to stay away f...,3
1,fox_politics_390,Fox,Ocasio-Cortez rallies to stop all fossil fuel ...,Ocasio-Cortez rallies to stop all fossil fuel ...,3
2,fox_politics_423,Fox,The Pentagon announced Sunday the deployment o...,The Pentagon announced Sunday the deployment o...,3
3,fox_politics_102,Fox,Mayor Bill de Blasio says that US Rep. Alexand...,Mayor Bill de Blasio says that US Rep. Alexand...,3
4,fox_politics_492,Fox,Who is EPA's Andrew Wheeler?\nEPA administrato...,Who is EPA's Andrew Wheeler?\nEPA administrato...,3


## Remove Fox

In [7]:
df_all = df_all[df_all['source'] != "Fox"]
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PBS,1739,1739,1739,1739
Vox,1027,1027,1027,1027


## Duplicate Vox to balance

In [8]:
vox = df_all[df_all['source'] == 'Vox'][:700]
vox.head()

Unnamed: 0,article id,source,article,clean_articles,targets
476,vox_politics_396,Vox,Senate Republicans on Thursday revealed the Be...,Senate Republicans on Thursday revealed the Be...,2
477,vox_politics_372,Vox,"“New York will be destroyed,” the state’s Gov....","“New York will be destroyed,” the state’s Gov....",2
478,vox_politics_602,Vox,The Trump administration wants to send a messa...,The Trump administration wants to send a messa...,2
479,vox_politics_1198,Vox,"Donald Trump’s long, improbable journey to pol...","Donald Trump’s long, improbable journey to pol...",2
480,vox_politics_682,Vox,The Trump administration threw the fate of the...,The Trump administration threw the fate of the...,2


In [9]:
df_all = df_all.append(vox, ignore_index=True)
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PBS,1739,1739,1739,1739
Vox,1727,1727,1727,1727


## Relabel the targets

In [10]:
from tqdm import tqdm
import numpy as np

In [11]:
def label_vox(source):
    if source == "Vox":
        return 1 
    elif source == "PBS": 
        return 0
    else: 
        print(source)
        return None

targets = np.array([label_vox(t) for t in tqdm(df_all["source"])])
df_all['targets'] = targets
df_all.head()

100%|██████████| 3466/3466 [00:00<00:00, 1219994.77it/s]


Unnamed: 0,article id,source,article,clean_articles,targets
0,vox_politics_396,Vox,Senate Republicans on Thursday revealed the Be...,Senate Republicans on Thursday revealed the Be...,1
1,vox_politics_372,Vox,"“New York will be destroyed,” the state’s Gov....","“New York will be destroyed,” the state’s Gov....",1
2,vox_politics_602,Vox,The Trump administration wants to send a messa...,The Trump administration wants to send a messa...,1
3,vox_politics_1198,Vox,"Donald Trump’s long, improbable journey to pol...","Donald Trump’s long, improbable journey to pol...",1
4,vox_politics_682,Vox,The Trump administration threw the fate of the...,The Trump administration threw the fate of the...,1


In [12]:
df_all.groupby('source').mean()

Unnamed: 0_level_0,targets
source,Unnamed: 1_level_1
PBS,0
Vox,1


## Split into test and training

In [13]:
train_df, test_df = train_test_split(df_all, test_size=0.1)
train_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
2690,pbs_politics_70,PBS,— Senior White House officials pushed a proje...,Senior White House officials pushed a projec...,0
576,vox_politics_1296,Vox,The Congressional Budget Office’s forecast tha...,The Congressional Budget Office’s forecast tha...,1
2493,pbs_politics_1152,PBS,— Senate Majority Leader Mitch McConnell on T...,Senate Majority Leader Mitch McConnell on Tu...,0
3141,vox_politics_867,Vox,Donald Trump has won the White House. But it’s...,Donald Trump has won the White House. But it’s...,1
3451,vox_politics_1504,Vox,Georgia secretary of state and Republican gube...,Georgia secretary of state and Republican gube...,1


In [14]:
test_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
1634,pbs_politics_1353,PBS,— A former staffer to Rep. John Conyers says ...,A former staffer to Rep. John Conyers says h...,0
342,vox_politics_895,Vox,Republicans are taking a victory lap after Bre...,Republicans are taking a victory lap after Bre...,1
2162,pbs_politics_1562,PBS,ST. LOUIS — A seven-term prosecuting attorney ...,ST. LOUIS A seven-term prosecuting attorney i...,0
283,vox_politics_1394,Vox,President Donald Trump conducted tax reform ne...,President Donald Trump conducted tax reform ne...,1
1429,pbs_politics_1402,PBS,12:35 p.m.: Senate leaders have reached an agr...,12:35 p.m.: Senate leaders have reached an agr...,0


### Check for similarity between test and training

In [15]:
test_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PBS,179,179,179,179
Vox,168,168,168,168


In [16]:
train_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PBS,1560,1560,1560,1560
Vox,1559,1559,1559,1559


## Get Embeddings, Build Functions

In [17]:
# glove embeddings and data are in same folder

EMBEDS = 'glove.840B.300d.txt'

embeddings_index = {}
 
with open(EMBEDS, encoding='utf8') as embed:
    for line in tqdm(embed):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print("Found {n} word vectors".format(n=len(embeddings_index)))

2196017it [03:50, 9528.66it/s]

Found 2196016 word vectors





In [19]:
def text_to_array(text, article_length=500):
    empty_emb = np.zeros(300)                   # each word is represented by a length 300 vector
    text = text[:-1].split()[:article_length]   # each article is length 10000
    
    # look for word embedding, return zero array otherwise. 
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds += [empty_emb] * (article_length - len(embeds))
    return np.array(embeds)

In [20]:
def batch_gen(train_df, batch_size=64, article_length=500):
    n = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.0)
        
        for i in range(n):
            texts = train_df['article'][i*batch_size: (i+1)*batch_size]
            targets = np.array(train_df['targets'][i*batch_size: (i+1)*batch_size])
            text_arr = np.array([text_to_array(text, article_length=article_length) for text in texts])
            yield text_arr, targets

## Setup Model

In [21]:
ARTICLE_LENGTH = 500
BATCH_SIZE = 64
DROPOUT = 0.1
REC_DROPOUT = 0.1

In [22]:
# witheld for validation
 
x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array(test_df["targets"])

100%|██████████| 347/347 [00:00<00:00, 526.19it/s]


In [23]:
# note...
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model = Sequential()
model.add(Bidirectional(LSTM(lstm_in, return_sequences=True, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))
model.add(Activation('relu'))
model.add(Bidirectional(LSTM(lstm_in)))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 500, 64)           85248     
_________________________________________________________________
activation_1 (Activation)    (None, 500, 64)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 110,145
Trainable params: 110,145
Non-trainable params: 0
_________________________________________________________________


In [24]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7efe3fb5db00>

In [49]:
y_pred = model.predict(x_test)
y_pred[10:17] > 0.5

array([[ True],
       [False],
       [ True],
       [ True],
       [ True],
       [ True],
       [False]], dtype=bool)

In [50]:
test_df[['source', 'clean_articles', 'targets']][10:17]

Unnamed: 0,source,clean_articles,targets
452,Vox,North Carolina is in the middle of at least th...,1
1815,PBS,The Republican-led Congress narrowly passed ...,0
3437,Vox,Republicans really want to cut the corporate t...,1
3299,Vox,Donald Trump claimed $916 million in losses fr...,1
2999,Vox,"On Tuesday, as the Trump administration sunk f...",1
1314,PBS,Days after a failed mail bomb plot and a deadl...,0
1540,PBS,Democrats on the House Financial Services Co...,0


In [51]:
for i in np.arange(0.25, 0.80, 0.05):
    res = metrics.f1_score(y_test, y_pred > i)
    print("Threshold {i}, f1 score {res}".format(i=round(i,2),res=res))

Threshold 0.25, f1 score 0.8619047619047618
Threshold 0.3, f1 score 0.86810551558753
Threshold 0.35, f1 score 0.8695652173913043
Threshold 0.4, f1 score 0.8801955990220048
Threshold 0.45, f1 score 0.8861386138613861
Threshold 0.5, f1 score 0.8827930174563592
Threshold 0.55, f1 score 0.885
Threshold 0.6, f1 score 0.8911392405063291
Threshold 0.65, f1 score 0.8877551020408164
Threshold 0.7, f1 score 0.8923076923076922
Threshold 0.75, f1 score 0.8935064935064936
