# PBS and Vox

### Author 
Stephen Lee

### Goal
Classify news source based on the article text. Training data: 
- Fox News
- PBS News

### Date 
First  : 4.8.19

Update : 6.24.19


## Read Data

In [1]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import GRU, Dense, Bidirectional, LSTM, Activation

import os 
import math 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn import metrics

Using TensorFlow backend.
  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
FOLDER_READ = '/home/smlee_981/data'
FILE = 'clean_article_df.csv'

In [3]:
os.getcwd()

'/home/smlee_981'

In [4]:
os.chdir(FOLDER_READ)

In [5]:
os.listdir()

['clean_article_df.csv', 'glove.840B.300d.txt']

In [6]:
df_all = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)
df_all.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,<br>\nFormer New Jersey Gov. Chris Christie sa...,2
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...","FILE--In this July 28, 2016 file photo, Sen. B...",2
2,fox_politics_423,Fox,"Howard Kurtz: How Michael Cohen, Democrats sto...","Howard Kurtz: How Michael Cohen, Democrats sto...",2
3,fox_politics_102,Fox,Student Union: Make UC Berkeley a sanctuary ca...,Student Union: Make UC Berkeley a sanctuary ca...,2
4,fox_politics_492,Fox,President Trump’s health care executive order:...,President Trump’s health care executive order:...,2


# MAKE DATASETS

## First Dataset

#### Remove Fox

In [7]:
df_all = df_all[df_all['source'] != "Fox"].drop('article', axis=1)
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PBS,1739,1739,1739
Vox,1027,1027,1027


In [8]:
pbs = df_all[df_all['source'] == 'PBS']
vox = df_all[df_all['source'] == 'Vox']

#### Duplicate Vox to balance

In [9]:
# balance df by resampling from vox 
diff = len(pbs) - len(vox)

df_first = df_all.append(vox.sample(diff), ignore_index=True)
df_first.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PBS,1739,1739,1739
Vox,1739,1739,1739


#### Relabel the targets

In [10]:
from tqdm import tqdm
import numpy as np

In [11]:
def label_fox(source):
    if source == "PBS":
        return 1 
    elif source == "Vox": 
        return 0
    else: 
        print(source)
        return None

targets = np.array([label_fox(t) for t in tqdm(df_first["source"])])
df_first['targets'] = targets

100%|██████████| 3478/3478 [00:00<00:00, 1345985.36it/s]


In [12]:
# make sure that the targets are correct
df_first.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
PBS,1739.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,1739.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Second Dataset
#### Sample from Vox to balance

In [13]:
vox.count()

article id        1027
source            1027
clean_articles    1027
targets           1027
dtype: int64

In [14]:
pbs.count()

article id        1739
source            1739
clean_articles    1739
targets           1739
dtype: int64

In [15]:
# make second dataset for training
# start with vox and append with a sample of PBS
df_second = vox.copy()

len_vox = len(vox)
df_second = df_second.append(pbs.sample(len_vox), ignore_index=True)
df_second.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PBS,1027,1027,1027
Vox,1027,1027,1027


#### Relabel the targets

In [16]:
targets = np.array([label_fox(t) for t in tqdm(df_second["source"])])
df_second['targets'] = targets
df_second.head()

100%|██████████| 2054/2054 [00:00<00:00, 1187798.21it/s]


Unnamed: 0,article id,source,clean_articles,targets
0,vox_politics_396,Vox,Senate Republicans on Thursday revealed the Be...,0
1,vox_politics_372,Vox,"“New York will be destroyed,” the state’s Gov....",0
2,vox_politics_602,Vox,The Trump administration wants to send a messa...,0
3,vox_politics_1198,Vox,"Donald Trump’s long, improbable journey to pol...",0
4,vox_politics_682,Vox,The Trump administration threw the fate of the...,0


In [17]:
# make sure that the targets are correct
df_second.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
PBS,1027.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,1027.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Get Embeddings and Define Helper Functions

In [18]:
# glove embeddings and data are in same folder

EMBEDS = 'glove.840B.300d.txt'

embeddings_index = {}
 
with open(EMBEDS, encoding='utf8') as embed:
    for line in tqdm(embed):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print("Found {n} word vectors".format(n=len(embeddings_index)))

2196017it [03:39, 9994.96it/s] 

Found 2196016 word vectors





In [19]:
def text_to_array(text, article_length=500):
    empty_emb = np.zeros(300)                   # each word is represented by a length 300 vector
    text = text[:-1].split()[:article_length]   # each article is length 500
    
    # look for word embedding, return zero array otherwise. 
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds += [empty_emb] * (article_length - len(embeds))
    return np.array(embeds)

In [20]:
def batch_gen(train_df, batch_size=64, article_length=500):
    n = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.0)
        
        for i in range(n):
            texts = train_df['clean_articles'][i*batch_size: (i+1)*batch_size]
            targets = np.array(train_df['targets'][i*batch_size: (i+1)*batch_size])
            text_arr = np.array([text_to_array(text, article_length=article_length) for text in texts])
            yield text_arr, targets

# Train Dataset 1

### Define Models

#### Model 1: Bidirectional LSTM

In [21]:
# parameters
ARTICLE_LENGTH = 500
BATCH_SIZE = 64
DROPOUT = 0.2
REC_DROPOUT = 0.1

In [22]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model_1 = Sequential()
model_1.add(Bidirectional(LSTM(lstm_in, return_sequences=False, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))

model_1.add(Activation('relu'))
#model.add(Bidirectional(LSTM(lstm_in)))

model_1.add(Dense(1, activation="sigmoid"))
model_1.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 64)                85248     
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 85,313
Trainable params: 85,313
Non-trainable params: 0
_________________________________________________________________


#### Model 2: Regular LSTM

In [23]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE)

model_2 = Sequential()
model_2.add(LSTM(lstm_in, return_sequences=False, dropout=DROPOUT, \
                 recurrent_dropout=REC_DROPOUT, input_shape=input_shape))

model_2.add(Activation('relu'))

model_2.add(Dense(1, activation="sigmoid"))
model_2.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 64)                93440     
_________________________________________________________________
activation_2 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 93,505
Trainable params: 93,505
Non-trainable params: 0
_________________________________________________________________


#### Split into test and training

In [24]:
train_df, test_df = train_test_split(df_first, test_size=0.1)

#### Check for similarity between test and training

In [25]:
test_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
PBS,173.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,175.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
train_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
PBS,1566.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,1564.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Prepare test set for validation

In [27]:
# witheld for validation

x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array(test_df["targets"])

100%|██████████| 348/348 [00:00<00:00, 530.80it/s]


#### Train

In [28]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_1.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fba67588f28>

In [29]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_2.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fba80887f98>

#### Look at predictions

In [30]:
y_pred_1 = model_1.predict(x_test)
y_pred_1[:7]

array([[ 0.07707784],
       [ 0.01962113],
       [ 0.29655272],
       [ 0.9961046 ],
       [ 0.99453592],
       [ 0.07331619],
       [ 0.07310385]], dtype=float32)

In [31]:
y_pred_2 = model_2.predict(x_test)
y_pred_2[:7]

array([[ 0.2888881 ],
       [ 0.10282174],
       [ 0.16594806],
       [ 0.24206144],
       [ 0.95199358],
       [ 0.0716047 ],
       [ 0.24206111]], dtype=float32)

In [32]:
test_df[['source', 'clean_articles', 'targets']].head(7)

Unnamed: 0,source,clean_articles,targets
3235,Vox,"For the past three years, the US Health and Hu...",0
2904,Vox,"With a repeal bill off the table, the Trump ad...",0
929,Vox,Starbucks executive chair Howard Schultz had e...,0
1427,PBS,Supreme Court nominee Brett Kavanaugh is den...,1
1817,PBS,Federal prosecutors say in a court filing that...,1
3249,Vox,Barreling toward a noon vote that could reopen...,0
2846,Vox,Vermont is now the ninth state to legalize mar...,0


In [33]:
for i in np.arange(0.25, 0.8, 0.05):
    res_1 = metrics.f1_score(y_test, y_pred_1 > i)
    res_2 = metrics.f1_score(y_test, y_pred_2 > i)
    print("Threshold {i} \nF1 score model 1: {res_1} \nF1 score model 2: {res_2} \n".format(\
                                                                                       i=round(i,2), \
                                                                                       res_1 = round(res_1, 3), \
                                                                                       res_2 = round(res_2, 3)))

Threshold 0.25 
F1 score model 1: 0.885 
F1 score model 2: 0.689 

Threshold 0.3 
F1 score model 1: 0.892 
F1 score model 2: 0.657 

Threshold 0.35 
F1 score model 1: 0.906 
F1 score model 2: 0.641 

Threshold 0.4 
F1 score model 1: 0.924 
F1 score model 2: 0.636 

Threshold 0.45 
F1 score model 1: 0.929 
F1 score model 2: 0.615 

Threshold 0.5 
F1 score model 1: 0.937 
F1 score model 2: 0.595 

Threshold 0.55 
F1 score model 1: 0.931 
F1 score model 2: 0.59 

Threshold 0.6 
F1 score model 1: 0.927 
F1 score model 2: 0.578 

Threshold 0.65 
F1 score model 1: 0.919 
F1 score model 2: 0.578 

Threshold 0.7 
F1 score model 1: 0.912 
F1 score model 2: 0.569 

Threshold 0.75 
F1 score model 1: 0.889 
F1 score model 2: 0.508 



# Train Dataset 2

### Refresh Models

#### Model 1: Bidirectional LSTM

In [34]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model_1 = Sequential()
model_1.add(Bidirectional(LSTM(lstm_in, return_sequences=False, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))

model_1.add(Activation('relu'))
#model.add(Bidirectional(LSTM(lstm_in)))

model_1.add(Dense(1, activation="sigmoid"))
model_1.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_2 (Bidirection (None, 64)                85248     
_________________________________________________________________
activation_3 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 85,313
Trainable params: 85,313
Non-trainable params: 0
_________________________________________________________________


#### Model 2: Regular LSTM

In [37]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE)

model_2 = Sequential()
model_2.add(LSTM(lstm_in, return_sequences=False, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT, \
                        input_shape=input_shape))

model_2.add(Activation('relu'))

model_2.add(Dense(1, activation="sigmoid"))
model_2.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 64)                93440     
_________________________________________________________________
activation_4 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 93,505
Trainable params: 93,505
Non-trainable params: 0
_________________________________________________________________


#### Split into test and training

In [42]:
train_df, test_df = train_test_split(df_second, test_size=0.1)
train_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
PBS,926.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,922.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Check for similarity between test and training

In [43]:
test_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
PBS,101.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Vox,105.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### prepare test set for validation

In [44]:
# witheld for validation
 
x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array(test_df["targets"])

100%|██████████| 206/206 [00:00<00:00, 684.22it/s]


#### train

In [45]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_1.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fba643cd198>

In [46]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_2.fit_generator(data, epochs=2, steps_per_epoch=250, \
                    validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fba4416d320>

#### Look at predictions

In [47]:
y_pred_1 = model_1.predict(x_test)
y_pred_1[:7]

array([[ 0.00570861],
       [ 0.63534874],
       [ 0.61335051],
       [ 0.0974611 ],
       [ 0.80614477],
       [ 0.05343789],
       [ 0.83199769]], dtype=float32)

In [48]:
y_pred_2 = model_2.predict(x_test)
y_pred_2[:7]

array([[ 0.0710642 ],
       [ 0.41897047],
       [ 0.95573109],
       [ 0.41897756],
       [ 0.85572755],
       [ 0.11307797],
       [ 0.8565073 ]], dtype=float32)

In [49]:
for i in np.arange(0.25, 0.8, 0.05):
    res_1 = metrics.f1_score(y_test, y_pred_1 > i)
    res_2 = metrics.f1_score(y_test, y_pred_2 > i)
    print("Threshold {i} \nF1 score model 1: {res_1} \nF1 score model 2: {res_2} \n".format(\
                                                                                       i=round(i,2), \
                                                                                       res_1 = round(res_1, 3), \
                                                                                       res_2 = round(res_2, 3)))

Threshold 0.25 
F1 score model 1: 0.851 
F1 score model 2: 0.797 

Threshold 0.3 
F1 score model 1: 0.86 
F1 score model 2: 0.803 

Threshold 0.35 
F1 score model 1: 0.864 
F1 score model 2: 0.805 

Threshold 0.4 
F1 score model 1: 0.863 
F1 score model 2: 0.822 

Threshold 0.45 
F1 score model 1: 0.86 
F1 score model 2: 0.792 

Threshold 0.5 
F1 score model 1: 0.853 
F1 score model 2: 0.804 

Threshold 0.55 
F1 score model 1: 0.856 
F1 score model 2: 0.781 

Threshold 0.6 
F1 score model 1: 0.843 
F1 score model 2: 0.772 

Threshold 0.65 
F1 score model 1: 0.822 
F1 score model 2: 0.763 

Threshold 0.7 
F1 score model 1: 0.827 
F1 score model 2: 0.747 

Threshold 0.75 
F1 score model 1: 0.814 
F1 score model 2: 0.73 

