# Fox and PBS

### Author 
Stephen Lee

### Goal
Classify news source based on the article text. Training data: 
- Fox News
- PBS News

### Date 
First  : 4.8.19

Update : 6.24.19


## Read Data

In [1]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import GRU, Dense, Bidirectional, LSTM, Activation

import os 
import math 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn import metrics

Using TensorFlow backend.
  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
FOLDER_READ = '/home/smlee_981/data'
FILE = 'clean_article_df.csv'

In [3]:
os.getcwd()

'/home/smlee_981'

In [4]:
os.chdir(FOLDER_READ)

In [5]:
os.listdir()

['clean_article_df.csv', 'glove.840B.300d.txt']

In [6]:
df_all = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)
df_all.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,<br>\nFormer New Jersey Gov. Chris Christie sa...,2
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...","FILE--In this July 28, 2016 file photo, Sen. B...",2
2,fox_politics_423,Fox,"Howard Kurtz: How Michael Cohen, Democrats sto...","Howard Kurtz: How Michael Cohen, Democrats sto...",2
3,fox_politics_102,Fox,Student Union: Make UC Berkeley a sanctuary ca...,Student Union: Make UC Berkeley a sanctuary ca...,2
4,fox_politics_492,Fox,President Trump’s health care executive order:...,President Trump’s health care executive order:...,2


# MAKE DATASETS

## First Dataset

#### Remove Vox

In [7]:
df_all = df_all[df_all['source'] != "Vox"].drop('article', axis=1)
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,661,661,661
PBS,1739,1739,1739


In [8]:
pbs = df_all[df_all['source'] == 'PBS']
fox = df_all[df_all['source'] == 'Fox']

#### Duplicate Fox to balance

In [11]:
# balance df by resampling from fox 
len_pbs = len(pbs)

df_first = pbs.copy()
df_first = df_first.append(fox.sample(len_pbs, replace=True), ignore_index=True)
df_first.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,1739,1739,1739
PBS,1739,1739,1739


#### Relabel the targets

In [12]:
from tqdm import tqdm
import numpy as np

In [13]:
def label_fox(source):
    if source == "Fox":
        return 1 
    elif source == "PBS": 
        return 0
    else: 
        print(source)
        return None

targets = np.array([label_fox(t) for t in tqdm(df_first["source"])])
df_first['targets'] = targets

100%|██████████| 3478/3478 [00:00<00:00, 1397030.20it/s]


In [14]:
# make sure that the targets are correct
df_first.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,1739.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
PBS,1739.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Second Dataset
#### Sample from Vox to balance

In [16]:
fox.count()

article id        661
source            661
clean_articles    661
targets           661
dtype: int64

In [17]:
pbs.count()

article id        1739
source            1739
clean_articles    1739
targets           1739
dtype: int64

In [18]:
# make second dataset for training
# start with fox and append with a sample of PBS
len_fox = len(fox)

df_second = fox.copy()
df_second = df_second.append(pbs.sample(len_fox), ignore_index=True)
df_second.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,661,661,661
PBS,661,661,661


#### Relabel the targets

In [20]:
targets = np.array([label_fox(t) for t in tqdm(df_second["source"])])
df_second['targets'] = targets
df_second.head()

100%|██████████| 1322/1322 [00:00<00:00, 1098429.06it/s]


Unnamed: 0,article id,source,clean_articles,targets
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,1
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...",1
2,fox_politics_423,Fox,"Howard Kurtz: How Michael Cohen, Democrats sto...",1
3,fox_politics_102,Fox,Student Union: Make UC Berkeley a sanctuary ca...,1
4,fox_politics_492,Fox,President Trump’s health care executive order:...,1


In [21]:
# make sure that the targets are correct
df_second.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,661.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
PBS,661.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Get Embeddings and Define Helper Functions

In [22]:
# glove embeddings and data are in same folder

EMBEDS = 'glove.840B.300d.txt'

embeddings_index = {}
 
with open(EMBEDS, encoding='utf8') as embed:
    for line in tqdm(embed):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print("Found {n} word vectors".format(n=len(embeddings_index)))

2196017it [03:41, 9905.89it/s] 

Found 2196016 word vectors





In [23]:
def text_to_array(text, article_length=500):
    empty_emb = np.zeros(300)                   # each word is represented by a length 300 vector
    text = text[:-1].split()[:article_length]   # each article is length 500
    
    # look for word embedding, return zero array otherwise. 
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds += [empty_emb] * (article_length - len(embeds))
    return np.array(embeds)

In [24]:
def batch_gen(train_df, batch_size=64, article_length=500):
    n = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.0)
        
        for i in range(n):
            texts = train_df['clean_articles'][i*batch_size: (i+1)*batch_size]
            targets = np.array(train_df['targets'][i*batch_size: (i+1)*batch_size])
            text_arr = np.array([text_to_array(text, article_length=article_length) for text in texts])
            yield text_arr, targets

# Train Dataset 1

### Define Models

#### Model 1: Bidirectional LSTM

In [25]:
# parameters
ARTICLE_LENGTH = 500
BATCH_SIZE = 64
DROPOUT = 0.2
REC_DROPOUT = 0.1

In [26]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model_1 = Sequential()
model_1.add(Bidirectional(LSTM(lstm_in, return_sequences=False, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))

model_1.add(Activation('relu'))
#model.add(Bidirectional(LSTM(lstm_in)))

model_1.add(Dense(1, activation="sigmoid"))
model_1.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 64)                85248     
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 85,313
Trainable params: 85,313
Non-trainable params: 0
_________________________________________________________________


#### Model 2: Regular LSTM

In [27]:
# SINGLE LAYER LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE)

model_2 = Sequential()
model_2.add(LSTM(lstm_in, return_sequences=False, dropout=DROPOUT, \
                 recurrent_dropout=REC_DROPOUT, input_shape=input_shape))

model_2.add(Activation('relu'))

model_2.add(Dense(1, activation="sigmoid"))
model_2.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 64)                93440     
_________________________________________________________________
activation_2 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 93,505
Trainable params: 93,505
Non-trainable params: 0
_________________________________________________________________


#### Split into test and training

In [43]:
train_df, test_df = train_test_split(df_first, test_size=0.1)

#### Check for similarity between test and training

In [44]:
test_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,178.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
PBS,170.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
train_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,1561.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
PBS,1569.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Prepare test set for validation

In [46]:
# witheld for validation

x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array(test_df["targets"])

100%|██████████| 348/348 [00:00<00:00, 585.87it/s]


#### Train

In [47]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_1.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ff869102518>

In [48]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_2.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ff863897b38>

#### Look at predictions

In [49]:
y_pred_1 = model_1.predict(x_test)
y_pred_1[:7]

array([[ 0.98009944],
       [ 0.99831539],
       [ 0.03991568],
       [ 0.02625093],
       [ 0.00290164],
       [ 0.00283876],
       [ 0.99653018]], dtype=float32)

In [50]:
y_pred_2 = model_2.predict(x_test)
y_pred_2[:7]

array([[ 0.78650844],
       [ 0.73700398],
       [ 0.18937233],
       [ 0.0717358 ],
       [ 0.02441296],
       [ 0.02862272],
       [ 0.9660058 ]], dtype=float32)

In [51]:
test_df[['source', 'clean_articles', 'targets']].head(7)

Unnamed: 0,source,clean_articles,targets
3233,Fox,The investigation into Russian involvement in ...,1
3382,Fox,House Speaker-to-be Nancy Pelosi vacations in ...,1
1304,PBS,The Republican-led House oversight committee...,0
856,PBS,As he departed for his speech at the NRA conve...,0
219,PBS,"RICHMOND, Va. A panel of federal judges has c...",0
1513,PBS,"DES MOINES, Iowa After months of speculation ...",0
2126,Fox,"Beto O'Rourke plans solo road trip, sparking n...",1


In [55]:
for i in np.arange(0.25, 0.8, 0.05):
    res_1 = metrics.f1_score(y_test, y_pred_1 > i)
    res_2 = metrics.f1_score(y_test, y_pred_2 > i)
    print("Threshold {i} \nF1 score model 1: {res_1} \nF1 score model 2: {res_2} \n".format(\
                                                                                       i=round(i,2), \
                                                                                       res_1 = round(res_1, 3), \
                                                                                       res_2 = round(res_2, 3)))

Threshold 0.25 
F1 score model 1: 0.962 
F1 score model 2: 0.818 

Threshold 0.3 
F1 score model 1: 0.97 
F1 score model 2: 0.821 

Threshold 0.35 
F1 score model 1: 0.973 
F1 score model 2: 0.826 

Threshold 0.4 
F1 score model 1: 0.975 
F1 score model 2: 0.828 

Threshold 0.45 
F1 score model 1: 0.975 
F1 score model 2: 0.836 

Threshold 0.5 
F1 score model 1: 0.981 
F1 score model 2: 0.822 

Threshold 0.55 
F1 score model 1: 0.978 
F1 score model 2: 0.81 

Threshold 0.6 
F1 score model 1: 0.978 
F1 score model 2: 0.802 

Threshold 0.65 
F1 score model 1: 0.981 
F1 score model 2: 0.798 

Threshold 0.7 
F1 score model 1: 0.983 
F1 score model 2: 0.798 

Threshold 0.75 
F1 score model 1: 0.983 
F1 score model 2: 0.638 



# Train Dataset 2

### Refresh Models

#### Model 1: Bidirectional LSTM

In [56]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model_1 = Sequential()
model_1.add(Bidirectional(LSTM(lstm_in, return_sequences=False, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))

model_1.add(Activation('relu'))
#model.add(Bidirectional(LSTM(lstm_in)))

model_1.add(Dense(1, activation="sigmoid"))
model_1.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_2 (Bidirection (None, 64)                85248     
_________________________________________________________________
activation_3 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 85,313
Trainable params: 85,313
Non-trainable params: 0
_________________________________________________________________


#### Model 2: Regular LSTM

In [59]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE)

model_2 = Sequential()
model_2.add(LSTM(lstm_in, return_sequences=False, dropout=DROPOUT, \
                 recurrent_dropout=REC_DROPOUT, input_shape=input_shape))

model_2.add(Activation('relu'))

model_2.add(Dense(1, activation="sigmoid"))
model_2.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 64)                93440     
_________________________________________________________________
activation_4 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 93,505
Trainable params: 93,505
Non-trainable params: 0
_________________________________________________________________


#### Split into test and training

In [62]:
train_df, test_df = train_test_split(df_second, test_size=0.1)
train_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,592.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
PBS,597.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Check for similarity between test and training

In [63]:
test_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,69.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
PBS,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### prepare test set for validation

In [64]:
# witheld for validation
 
x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array(test_df["targets"])

100%|██████████| 133/133 [00:00<00:00, 763.77it/s]


#### train

In [65]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_1.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ff861cc4080>

In [66]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_2.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ff8628da630>

#### Look at predictions

In [67]:
y_pred_1 = model_1.predict(x_test)
y_pred_1[:7]

array([[ 0.9978168 ],
       [ 0.99771464],
       [ 0.06080681],
       [ 0.9991371 ],
       [ 0.41710827],
       [ 0.00227717],
       [ 0.99331832]], dtype=float32)

In [68]:
y_pred_2 = model_2.predict(x_test)
y_pred_2[:7]

array([[ 0.90293288],
       [ 0.35094097],
       [ 0.47305709],
       [ 0.79048741],
       [ 0.95655388],
       [ 0.05289701],
       [ 0.8803134 ]], dtype=float32)

In [70]:
for i in np.arange(0.25, 0.8, 0.05):
    res_1 = metrics.f1_score(y_test, y_pred_1 > i)
    res_2 = metrics.f1_score(y_test, y_pred_2 > i)
    print("Threshold {i} \nF1 score model 1: {res_1} \nF1 score model 2: {res_2} \n".format(\
                                                                                       i=round(i,2), \
                                                                                       res_1 = round(res_1, 3), \
                                                                                       res_2 = round(res_2, 3)))

Threshold 0.25 
F1 score model 1: 0.944 
F1 score model 2: 0.71 

Threshold 0.3 
F1 score model 1: 0.964 
F1 score model 2: 0.707 

Threshold 0.35 
F1 score model 1: 0.964 
F1 score model 2: 0.699 

Threshold 0.4 
F1 score model 1: 0.957 
F1 score model 2: 0.69 

Threshold 0.45 
F1 score model 1: 0.964 
F1 score model 2: 0.681 

Threshold 0.5 
F1 score model 1: 0.964 
F1 score model 2: 0.696 

Threshold 0.55 
F1 score model 1: 0.964 
F1 score model 2: 0.686 

Threshold 0.6 
F1 score model 1: 0.964 
F1 score model 2: 0.696 

Threshold 0.65 
F1 score model 1: 0.964 
F1 score model 2: 0.687 

Threshold 0.7 
F1 score model 1: 0.971 
F1 score model 2: 0.672 

Threshold 0.75 
F1 score model 1: 0.971 
F1 score model 2: 0.436 

