# FOX and PBS

### Author 
Stephen Lee

### Goal
Classify news source based on the article text. Training data: 
- Fox News
- PBS News

### Date 
4.8.19

## Read Data

In [1]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import GRU, Dense, Bidirectional, LSTM, Activation

import os 
import math 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn import metrics

Using TensorFlow backend.
  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
FOLDER_READ = '/home/smlee_981/data'
FILE = 'clean_article_df.csv'

In [3]:
os.getcwd()

'/home/smlee_981'

In [4]:
os.chdir(FOLDER_READ)

In [5]:
os.listdir()

['clean_article_df.csv', 'glove.840B.300d.txt']

In [6]:
df_all = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)
df_all.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,Bolton warns Venezuela's Maduro to stay away f...,Bolton warns Venezuela's Maduro to stay away f...,3
1,fox_politics_390,Fox,Ocasio-Cortez rallies to stop all fossil fuel ...,Ocasio-Cortez rallies to stop all fossil fuel ...,3
2,fox_politics_423,Fox,The Pentagon announced Sunday the deployment o...,The Pentagon announced Sunday the deployment o...,3
3,fox_politics_102,Fox,Mayor Bill de Blasio says that US Rep. Alexand...,Mayor Bill de Blasio says that US Rep. Alexand...,3
4,fox_politics_492,Fox,Who is EPA's Andrew Wheeler?\nEPA administrato...,Who is EPA's Andrew Wheeler?\nEPA administrato...,3


## Remove Vox

In [7]:
df_all = df_all[df_all['source'] != "Vox"]
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,476,476,476,476
PBS,1739,1739,1739,1739


## Duplicate Fox and to balance

In [11]:
fox = df_all[df_all['source'] == 'Fox']
fox.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,Bolton warns Venezuela's Maduro to stay away f...,Bolton warns Venezuela's Maduro to stay away f...,3
1,fox_politics_390,Fox,Ocasio-Cortez rallies to stop all fossil fuel ...,Ocasio-Cortez rallies to stop all fossil fuel ...,3
2,fox_politics_423,Fox,The Pentagon announced Sunday the deployment o...,The Pentagon announced Sunday the deployment o...,3
3,fox_politics_102,Fox,Mayor Bill de Blasio says that US Rep. Alexand...,Mayor Bill de Blasio says that US Rep. Alexand...,3
4,fox_politics_492,Fox,Who is EPA's Andrew Wheeler?\nEPA administrato...,Who is EPA's Andrew Wheeler?\nEPA administrato...,3


In [12]:
df_all = df_all.append([fox]*3, ignore_index=True)
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,1904,1904,1904,1904
PBS,1739,1739,1739,1739


## Relabel the targets

In [14]:
from tqdm import tqdm
import numpy as np

In [15]:
def label_fox(source):
    if source == "Fox":
        return 1 
    elif source == "PBS": 
        return 0
    else: 
        print(source)
        return None

targets = np.array([label_fox(t) for t in tqdm(df_all["source"])])
df_all['targets'] = targets
df_all.head()

100%|██████████| 3643/3643 [00:00<00:00, 1007772.69it/s]


Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,Bolton warns Venezuela's Maduro to stay away f...,Bolton warns Venezuela's Maduro to stay away f...,1
1,fox_politics_390,Fox,Ocasio-Cortez rallies to stop all fossil fuel ...,Ocasio-Cortez rallies to stop all fossil fuel ...,1
2,fox_politics_423,Fox,The Pentagon announced Sunday the deployment o...,The Pentagon announced Sunday the deployment o...,1
3,fox_politics_102,Fox,Mayor Bill de Blasio says that US Rep. Alexand...,Mayor Bill de Blasio says that US Rep. Alexand...,1
4,fox_politics_492,Fox,Who is EPA's Andrew Wheeler?\nEPA administrato...,Who is EPA's Andrew Wheeler?\nEPA administrato...,1


In [16]:
df_all.groupby('source').mean()

Unnamed: 0_level_0,targets
source,Unnamed: 1_level_1
Fox,1
PBS,0


## Split into test and training

In [17]:
train_df, test_df = train_test_split(df_all, test_size=0.1)
train_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
379,fox_politics_196,Fox,CNN's Jim Acosta takes heat after showing bord...,CNN's Jim Acosta takes heat after showing bord...,1
1657,pbs_politics_306,PBS,— The House voted overwhelmingly Thursday to ...,The House voted overwhelmingly Thursday to m...,0
2723,fox_politics_532,Fox,"Frosh Dems rock Capitol boat with brash, confr...","Frosh Dems rock Capitol boat with brash, confr...",1
2682,fox_politics_546,Fox,"Michael Flynn, President Trump's former nation...","Michael Flynn, President Trump's former nation...",1
1707,pbs_politics_1395,PBS,A veteran Kansas legislator has won the Democr...,A veteran Kansas legislator has won the Democr...,0


In [18]:
test_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
2326,fox_politics_691,Fox,Trump expected to reallocate $8 billion from o...,Trump expected to reallocate $8 billion from o...,1
2049,pbs_politics_27,PBS,"EL PASO, Texas — Top defense officials toured ...","EL PASO, Texas Top defense officials toured s...",0
392,fox_politics_26,Fox,2020 presidential hopeful Kamala Harris secure...,2020 presidential hopeful Kamala Harris secure...,1
2482,fox_politics_715,Fox,"Kamala Harris’ career, from California distric...","Kamala Harris’ career, from California distric...",1
1636,pbs_politics_346,PBS,— President Donald Trump escalated his long-r...,President Donald Trump escalated his long-ru...,0


### Check for similarity between test and training

In [19]:
test_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,187,187,187,187
PBS,178,178,178,178


In [20]:
train_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,1717,1717,1717,1717
PBS,1561,1561,1561,1561


## Get Embeddings, Build Functions

In [21]:
# glove embeddings and data are in same folder

EMBEDS = 'glove.840B.300d.txt'

embeddings_index = {}
 
with open(EMBEDS, encoding='utf8') as embed:
    for line in tqdm(embed):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print("Found {n} word vectors".format(n=len(embeddings_index)))

2196017it [03:56, 9303.61it/s]

Found 2196016 word vectors





In [22]:
def text_to_array(text, article_length=500):
    empty_emb = np.zeros(300)                   # each word is represented by a length 300 vector
    text = text[:-1].split()[:article_length]   # each article is length 10000
    
    # look for word embedding, return zero array otherwise. 
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds += [empty_emb] * (article_length - len(embeds))
    return np.array(embeds)

In [23]:
def batch_gen(train_df, batch_size=64, article_length=500):
    n = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.0)
        
        for i in range(n):
            texts = train_df['article'][i*batch_size: (i+1)*batch_size]
            targets = np.array(train_df['targets'][i*batch_size: (i+1)*batch_size])
            text_arr = np.array([text_to_array(text, article_length=article_length) for text in texts])
            yield text_arr, targets

## Setup Model

In [24]:
ARTICLE_LENGTH = 500
BATCH_SIZE = 64
DROPOUT = 0.1
REC_DROPOUT = 0.1

In [25]:
# witheld for validation
 
x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array(test_df["targets"])

100%|██████████| 365/365 [00:00<00:00, 568.93it/s]


In [26]:
# note...
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model = Sequential()
model.add(Bidirectional(LSTM(lstm_in, return_sequences=True, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))
model.add(Activation('relu'))
model.add(Bidirectional(LSTM(lstm_in)))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 500, 64)           85248     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 110,145
Trainable params: 110,145
Non-trainable params: 0
_________________________________________________________________


In [27]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb036ff6748>

In [31]:
y_pred = model.predict(x_test)
y_pred[:7] > 0.5

array([[ True],
       [ True],
       [ True],
       [ True],
       [False],
       [ True],
       [False]], dtype=bool)

In [33]:
test_df[['source', 'clean_articles', 'targets']].head(7)

Unnamed: 0,source,clean_articles,targets
2326,Fox,Trump expected to reallocate $8 billion from o...,1
2049,PBS,"EL PASO, Texas Top defense officials toured s...",0
392,Fox,2020 presidential hopeful Kamala Harris secure...,1
2482,Fox,"Kamala Harris’ career, from California distric...",1
1636,PBS,President Donald Trump escalated his long-ru...,0
3578,Fox,Trump responds Pelosi by on Scribd,1
731,PBS,Congressional Republicans have distanced the...,0


In [32]:
for i in np.arange(0.25, 0.75, 0.05):
    res = metrics.f1_score(y_test, y_pred > i)
    print("Threshold {i}, f1 score {res}".format(i=round(i,2),res=res))

Threshold 0.25, f1 score 0.9664082687338501
Threshold 0.3, f1 score 0.9664082687338501
Threshold 0.35, f1 score 0.9664082687338501
Threshold 0.4, f1 score 0.9664082687338501
Threshold 0.45, f1 score 0.9664082687338501
Threshold 0.5, f1 score 0.9664082687338501
Threshold 0.55, f1 score 0.9664082687338501
Threshold 0.6, f1 score 0.9664082687338501
Threshold 0.65, f1 score 0.9664082687338501
Threshold 0.7, f1 score 0.9714285714285714
