# Fox and Vox

### Author 
Stephen Lee

### Goal
Classify news source based on the article text. Training data: 
- Fox News
- PBS News

### Date 
First  : 4.8.19

Update : 6.24.19


## Read Data

In [1]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import GRU, Dense, Bidirectional, LSTM, Activation

import os 
import math 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn import metrics

Using TensorFlow backend.
  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
FOLDER_READ = '/home/smlee_981/data'
FILE = 'clean_article_df.csv'

In [3]:
os.getcwd()

'/home/smlee_981'

In [4]:
os.chdir(FOLDER_READ)

In [5]:
os.listdir()

['clean_article_df.csv', 'glove.840B.300d.txt']

In [6]:
df_all = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)
df_all.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,<br>\nFormer New Jersey Gov. Chris Christie sa...,2
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...","FILE--In this July 28, 2016 file photo, Sen. B...",2
2,fox_politics_423,Fox,"Howard Kurtz: How Michael Cohen, Democrats sto...","Howard Kurtz: How Michael Cohen, Democrats sto...",2
3,fox_politics_102,Fox,Student Union: Make UC Berkeley a sanctuary ca...,Student Union: Make UC Berkeley a sanctuary ca...,2
4,fox_politics_492,Fox,President Trump’s health care executive order:...,President Trump’s health care executive order:...,2


# MAKE DATASETS

## First Dataset

#### Remove PBS

In [7]:
df_all = df_all[df_all['source'] != "PBS"].drop('article', axis=1)
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,661,661,661
Vox,1027,1027,1027


In [8]:
fox = df_all[df_all['source'] == 'Fox']
vox = df_all[df_all['source'] == 'Vox']

#### Duplicate Fox to balance

In [9]:
# balance df be resampling from fox 
diff = len(vox) - len(fox)
df_first = df_all.append(fox.sample(diff), ignore_index=True)
df_first.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,1027,1027,1027
Vox,1027,1027,1027


#### Relabel the targets

In [10]:
from tqdm import tqdm
import numpy as np

In [11]:
def label_fox(source):
    if source == "Fox":
        return 1 
    elif source == "Vox": 
        return 0
    else: 
        print(source)
        return None

targets = np.array([label_fox(t) for t in tqdm(df_first["source"])])
df_first['targets'] = targets

100%|██████████| 2054/2054 [00:00<00:00, 1161690.99it/s]


In [12]:
# make sure that the targets are correct
df_first.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,1027.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,1027.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Second Dataset
#### Sample from Vox to balance

In [13]:
vox.count()

article id        1027
source            1027
clean_articles    1027
targets           1027
dtype: int64

In [14]:
fox.count()

article id        661
source            661
clean_articles    661
targets           661
dtype: int64

In [15]:
# make second dataset for training
# start with fox and append with a sample of PBS
df_second = fox.copy()

len_fox = len(fox)
df_second = df_second.append(vox.sample(len_fox), ignore_index=True)
df_second.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,661,661,661
Vox,661,661,661


#### Relabel the targets

In [17]:
targets = np.array([label_fox(t) for t in tqdm(df_second["source"])])
df_second['targets'] = targets
df_second.head()

100%|██████████| 1322/1322 [00:00<00:00, 991926.63it/s]


Unnamed: 0,article id,source,clean_articles,targets
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,1
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...",1
2,fox_politics_423,Fox,"Howard Kurtz: How Michael Cohen, Democrats sto...",1
3,fox_politics_102,Fox,Student Union: Make UC Berkeley a sanctuary ca...,1
4,fox_politics_492,Fox,President Trump’s health care executive order:...,1


In [18]:
# make sure that the targets are correct
df_second.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,661.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,661.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Get Embeddings and Define Helper Functions

In [19]:
# glove embeddings and data are in same folder

EMBEDS = 'glove.840B.300d.txt'

embeddings_index = {}
 
with open(EMBEDS, encoding='utf8') as embed:
    for line in tqdm(embed):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print("Found {n} word vectors".format(n=len(embeddings_index)))

2196017it [03:06, 11777.41it/s]

Found 2196016 word vectors





In [20]:
def text_to_array(text, article_length=500):
    empty_emb = np.zeros(300)                   # each word is represented by a length 300 vector
    text = text[:-1].split()[:article_length]   # each article is length 500
    
    # look for word embedding, return zero array otherwise. 
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds += [empty_emb] * (article_length - len(embeds))
    return np.array(embeds)

In [21]:
def batch_gen(train_df, batch_size=64, article_length=500):
    n = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.0)
        
        for i in range(n):
            texts = train_df['clean_articles'][i*batch_size: (i+1)*batch_size]
            targets = np.array(train_df['targets'][i*batch_size: (i+1)*batch_size])
            text_arr = np.array([text_to_array(text, article_length=article_length) for text in texts])
            yield text_arr, targets

# Train Dataset 1

### Define Models

#### Model 1: Bidirectional LSTM

In [22]:
# parameters
ARTICLE_LENGTH = 500
BATCH_SIZE = 64
DROPOUT = 0.2
REC_DROPOUT = 0.1

In [23]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model_1 = Sequential()
model_1.add(Bidirectional(LSTM(lstm_in, return_sequences=False, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))

model_1.add(Activation('relu'))
#model.add(Bidirectional(LSTM(lstm_in)))

model_1.add(Dense(1, activation="sigmoid"))
model_1.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 64)                85248     
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 85,313
Trainable params: 85,313
Non-trainable params: 0
_________________________________________________________________


#### Model 2: Regular LSTM

In [24]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE)

model_2 = Sequential()
model_2.add(LSTM(lstm_in, return_sequences=False, dropout=DROPOUT, \
                 recurrent_dropout=REC_DROPOUT, input_shape=input_shape))

model_2.add(Activation('relu'))

model_2.add(Dense(1, activation="sigmoid"))
model_2.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 64)                93440     
_________________________________________________________________
activation_2 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 93,505
Trainable params: 93,505
Non-trainable params: 0
_________________________________________________________________


#### Split into test and training

In [35]:
train_df, test_df = train_test_split(df_first, test_size=0.1)
test_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,100.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,106.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Check for similarity between test and training

In [36]:
train_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,927.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,921.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Prepare test set for validation

In [37]:
# witheld for validation

x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array(test_df["targets"])

100%|██████████| 206/206 [00:00<00:00, 635.84it/s]


#### Train

In [38]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_1.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fbd22498d30>

In [39]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_2.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fbd20c51ba8>

#### Look at predictions

In [40]:
y_pred_1 = model_1.predict(x_test)
y_pred_1[:7]

array([[  6.01682067e-03],
       [  9.97356653e-01],
       [  8.78721476e-04],
       [  9.86332655e-01],
       [  2.59389281e-02],
       [  9.98768806e-01],
       [  6.67256117e-03]], dtype=float32)

In [41]:
y_pred_2 = model_2.predict(x_test)
y_pred_2[:7]

array([[ 0.18196282],
       [ 0.6149677 ],
       [ 0.08993238],
       [ 0.49294728],
       [ 0.13965061],
       [ 0.81147218],
       [ 0.12582016]], dtype=float32)

In [42]:
test_df[['source', 'clean_articles', 'targets']].head(7)

Unnamed: 0,source,clean_articles,targets
1323,Vox,"In 2016, this drug was linked to more deaths t...",0
437,Fox,Trump may veto legislation that blocks nationa...,1
1169,Vox,The Republican Party’s polling for the 2018 el...,0
1719,Fox,Trump makes remarks after a working visit to C...,1
993,Vox,Members of the Republican Steering Committee v...,0
412,Fox,Klobuchar: We need to stop governing from chao...,1
1491,Vox,President Trump likes to say that he doesn’t w...,0


In [43]:
for i in np.arange(0.25, 0.8, 0.05):
    res_1 = metrics.f1_score(y_test, y_pred_1 > i)
    res_2 = metrics.f1_score(y_test, y_pred_2 > i)
    print("Threshold {i} \nF1 score model 1: {res_1} \nF1 score model 2: {res_2} \n".format(\
                                                                                       i=round(i,2), \
                                                                                       res_1 = round(res_1, 3), \
                                                                                       res_2 = round(res_2, 3)))

Threshold 0.25 
F1 score model 1: 0.942 
F1 score model 2: 0.797 

Threshold 0.3 
F1 score model 1: 0.947 
F1 score model 2: 0.814 

Threshold 0.35 
F1 score model 1: 0.956 
F1 score model 2: 0.788 

Threshold 0.4 
F1 score model 1: 0.961 
F1 score model 2: 0.793 

Threshold 0.45 
F1 score model 1: 0.956 
F1 score model 2: 0.785 

Threshold 0.5 
F1 score model 1: 0.96 
F1 score model 2: 0.766 

Threshold 0.55 
F1 score model 1: 0.955 
F1 score model 2: 0.758 

Threshold 0.6 
F1 score model 1: 0.96 
F1 score model 2: 0.748 

Threshold 0.65 
F1 score model 1: 0.955 
F1 score model 2: 0.707 

Threshold 0.7 
F1 score model 1: 0.955 
F1 score model 2: 0.637 

Threshold 0.75 
F1 score model 1: 0.949 
F1 score model 2: 0.571 



# Train Dataset 2

### Refresh Models

#### Model 1: Bidirectional LSTM

In [44]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model_1 = Sequential()
model_1.add(Bidirectional(LSTM(lstm_in, return_sequences=False, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))

model_1.add(Activation('relu'))
#model.add(Bidirectional(LSTM(lstm_in)))

model_1.add(Dense(1, activation="sigmoid"))
model_1.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_2 (Bidirection (None, 64)                85248     
_________________________________________________________________
activation_3 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 85,313
Trainable params: 85,313
Non-trainable params: 0
_________________________________________________________________


#### Model 2: Regular LSTM

In [46]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE)

model_2 = Sequential()
model_2.add(LSTM(lstm_in, return_sequences=False, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT, \
                        input_shape=input_shape))

model_2.add(Activation('relu'))

model_2.add(Dense(1, activation="sigmoid"))
model_2.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 64)                93440     
_________________________________________________________________
activation_4 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 93,505
Trainable params: 93,505
Non-trainable params: 0
_________________________________________________________________


#### Split into test and training

In [47]:
train_df, test_df = train_test_split(df_second, test_size=0.1)
train_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,599.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,590.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Check for similarity between test and training

In [48]:
test_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,62.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,71.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### prepare test set for validation

In [49]:
# witheld for validation
 
x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array(test_df["targets"])

100%|██████████| 133/133 [00:00<00:00, 695.28it/s]


#### train

In [50]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_1.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fbcfcf2a240>

In [51]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_2.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fbcfcf2a518>

#### Look at predictions

In [52]:
y_pred_1 = model_1.predict(x_test)
y_pred_1[:7]

array([[  9.97459412e-01],
       [  9.97844100e-01],
       [  8.10474157e-04],
       [  9.17318702e-01],
       [  3.55854630e-03],
       [  9.98294830e-01],
       [  1.42860413e-03]], dtype=float32)

In [53]:
y_pred_2 = model_2.predict(x_test)
y_pred_2[:7]

array([[ 0.953192  ],
       [ 0.953192  ],
       [ 0.27449441],
       [ 0.24609151],
       [ 0.18887109],
       [ 0.953192  ],
       [ 0.69498497]], dtype=float32)

In [54]:
for i in np.arange(0.25, 0.8, 0.05):
    res_1 = metrics.f1_score(y_test, y_pred_1 > i)
    res_2 = metrics.f1_score(y_test, y_pred_2 > i)
    print("Threshold {i} \nF1 score model 1: {res_1} \nF1 score model 2: {res_2} \n".format(\
                                                                                       i=round(i,2), \
                                                                                       res_1 = round(res_1, 3), \
                                                                                       res_2 = round(res_2, 3)))

Threshold 0.25 
F1 score model 1: 0.937 
F1 score model 2: 0.809 

Threshold 0.3 
F1 score model 1: 0.945 
F1 score model 2: 0.806 

Threshold 0.35 
F1 score model 1: 0.945 
F1 score model 2: 0.812 

Threshold 0.4 
F1 score model 1: 0.945 
F1 score model 2: 0.812 

Threshold 0.45 
F1 score model 1: 0.952 
F1 score model 2: 0.84 

Threshold 0.5 
F1 score model 1: 0.952 
F1 score model 2: 0.846 

Threshold 0.55 
F1 score model 1: 0.952 
F1 score model 2: 0.859 

Threshold 0.6 
F1 score model 1: 0.935 
F1 score model 2: 0.839 

Threshold 0.65 
F1 score model 1: 0.927 
F1 score model 2: 0.829 

Threshold 0.7 
F1 score model 1: 0.918 
F1 score model 2: 0.836 

Threshold 0.75 
F1 score model 1: 0.924 
F1 score model 2: 0.821 

