# Tokenize and Train

### Author 
Stephen Lee

### Goal
Classify news source based on the article text. Training data: 
- Fox News
- Vox News
- PBS News

### Date 
4.8.19

## Read Data

In [3]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import GRU, Dense, Bidirectional, LSTM, Activation
from keras.utils import to_categorical

import os 
import math 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn import metrics

Using TensorFlow backend.
  _nan_object_mask = _nan_object_array != _nan_object_array


In [1]:
FOLDER_READ = '/home/smlee_981/data'
FILE = 'clean_article_df.csv'
ARTICLE_LENGTH = 500    # max length for an article

In [4]:
os.getcwd()

'/home/smlee_981'

In [5]:
os.chdir(FOLDER_READ)

In [6]:
os.listdir()

['clean_article_df.csv', 'glove.840B.300d.txt']

In [7]:
df_all = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1).drop('article', axis=1)
df_all.head()

Unnamed: 0,article id,source,clean_articles,targets
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,2
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...",2
2,fox_politics_423,Fox,"Howard Kurtz: How Michael Cohen, Democrats sto...",2
3,fox_politics_102,Fox,Student Union: Make UC Berkeley a sanctuary ca...,2
4,fox_politics_492,Fox,President Trump’s health care executive order:...,2


#### Relabel targets from 0 - 2 for one-hot vectors

In [8]:
df_all['targets'] = df_all['targets'].replace(3,0)
df_all.groupby('source').mean()

Unnamed: 0_level_0,targets
source,Unnamed: 1_level_1
Fox,2
PBS,0
Vox,1


# MAKE DATASETS

## First dataset
#### Bootstrap data for balance

In [9]:
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,661,661,661
PBS,1739,1739,1739
Vox,1027,1027,1027


#### Add more Fox and Vox

In [10]:
fox = df_all[df_all['source'] == 'Fox']
vox = df_all[df_all['source'] == 'Vox']
pbs = df_all[df_all['source'] == 'PBS']

In [11]:
# start with the full dataset and append with 
# shorter targets
pbs_fox_diff = len(pbs) - len(fox)
pbs_vox_diff = len(pbs) - len(vox)

fox_append = fox.sample(pbs_fox_diff, replace=True)
vox_append = vox.sample(pbs_vox_diff)

df_first = df_all.copy()
df_first = df_first.append(fox_append, ignore_index=True)
df_first = df_first.append(vox_append, ignore_index=True)
df_first.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,1739,1739,1739
PBS,1739,1739,1739
Vox,1739,1739,1739


## Second dataset
#### Reduce to min (i.e. fox) for balance

In [12]:
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,661,661,661
PBS,1739,1739,1739
Vox,1027,1027,1027


#### Sample from PBS and Vox

In [13]:
# start with the smallest dataset and 
# sample from larger sets
len_fox = len(fox)

df_second = fox.copy()
df_second = df_second.append(pbs.sample(len_fox), ignore_index=True)
df_second = df_second.append(vox.sample(len_fox), ignore_index=True)
df_second.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,661,661,661
PBS,661,661,661
Vox,661,661,661


## Get Embeddings, Build Functions

In [14]:
from tqdm import tqdm
import numpy as np

In [15]:
# glove embeddings and data are in same folder

EMBEDS = 'glove.840B.300d.txt'

embeddings_index = {}
 
with open(EMBEDS, encoding='utf8') as embed:
    for line in tqdm(embed):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print("Found {n} word vectors".format(n=len(embeddings_index)))

2196017it [03:06, 11772.23it/s]

Found 2196016 word vectors





In [16]:
def target_to_one_hot(target, num_classes=3):
    return to_categorical(target, num_classes=num_classes)

In [17]:
def text_to_array(text, article_length=500):
    empty_emb = np.zeros(300)                   # each word is represented by a length 300 vector
    text = text[:-1].split()[:article_length]   # each article is length 500
    
    # look for word embedding, return zero array otherwise. 
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds += [empty_emb] * (article_length - len(embeds))
    return np.array(embeds)

In [18]:
def batch_gen(train_df, batch_size=64, article_length=500, num_classes=3):
    n = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.0)
        
        for i in range(n):
            texts = train_df['clean_articles'][i*batch_size: (i+1)*batch_size]
            targets = train_df['targets'][i*batch_size: (i+1)*batch_size]
            
            targets = np.array([target_to_one_hot(t, num_classes) for t in targets])
            text_arr = np.array([text_to_array(text, article_length=article_length) for text in texts])
            yield text_arr, targets

# Train Dataset 1

### Define Models

#### Model 1: Bidirectional LSTM

In [19]:
ARTICLE_LENGTH = 500
BATCH_SIZE = 64
DROPOUT = 0.2
REC_DROPOUT = 0.1

In [26]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model_1 = Sequential()
model_1.add(Bidirectional(LSTM(lstm_in, return_sequences=False, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))

model_1.add(Activation('relu'))
#model.add(Bidirectional(LSTM(lstm_in)))

model_1.add(Dense(3, activation="sigmoid"))
model_1.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_2 (Bidirection (None, 64)                85248     
_________________________________________________________________
activation_3 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 195       
Total params: 85,443
Trainable params: 85,443
Non-trainable params: 0
_________________________________________________________________


#### Model 2: Regular LSTM

In [27]:
# SINGLE LAYER LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE)

model_2 = Sequential()
model_2.add(LSTM(lstm_in, return_sequences=False, dropout=DROPOUT, \
                 recurrent_dropout=REC_DROPOUT, input_shape=input_shape))

model_2.add(Activation('relu'))

model_2.add(Dense(3, activation="sigmoid"))
model_2.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 64)                93440     
_________________________________________________________________
activation_4 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 195       
Total params: 93,635
Trainable params: 93,635
Non-trainable params: 0
_________________________________________________________________


#### Split into test and training

In [28]:
train_df, test_df = train_test_split(df_first, test_size=0.1)
train_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,1575.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0
PBS,1562.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Vox,1558.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


#### Check for similarity between test and training

In [29]:
test_df.groupby('source').describe()

Unnamed: 0_level_0,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,164.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0
PBS,177.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Vox,181.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


#### Prepare test set for validation

In [30]:
# witheld for validation
 
x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array([target_to_one_hot(t) for t in tqdm(test_df["targets"])])

100%|██████████| 522/522 [00:00<00:00, 658.03it/s]
100%|██████████| 522/522 [00:00<00:00, 53490.67it/s]


#### Train

In [None]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_1.fit_generator(data, epochs=2, steps_per_epoch=500, \
                      validation_data=None, verbose=True)

Epoch 1/2

In [None]:
model_2.fit_generator(data, epochs=2, steps_per_epoch=500, \
                      validation_data=None, verbose=True)

#### Look at predictions

In [34]:
y_pred_1 = model_1.predict(x_test, batch_size=BATCH_SIZE)
y_pred_class_1 = np.argmax(y_pred_1, axis=1)
y_pred_one_hot_1 = to_categorical(y_pred_class_1, num_classes=3)
y_pred_1[0:5]

array([[ 0.14582711,  0.89899731,  0.01299682],
       [ 0.06213471,  0.92457575,  0.01566279],
       [ 0.09577557,  0.77901024,  0.15444529],
       [ 0.18759233,  0.69716638,  0.02712774],
       [ 0.50319964,  0.03062788,  0.47726333]], dtype=float32)

In [35]:
y_pred_2 = model_2.predict(x_test, batch_size=BATCH_SIZE)
y_pred_class_2 = np.argmax(y_pred_2, axis=1)
y_pred_one_hot_2 = to_categorical(y_pred_class_2, num_classes=3)
y_pred_2[0:5]

array([[ 0.35714245,  0.24999118,  0.36904681],
       [ 0.41697901,  0.24259883,  0.38628492],
       [ 0.45707053,  0.48080242,  0.4636448 ],
       [ 0.43490097,  0.20910415,  0.36009645],
       [ 0.51947153,  0.08273306,  0.37040639]], dtype=float32)

In [36]:
test_df[['source', 'clean_articles', 'targets']].head()

Unnamed: 0,source,clean_articles,targets
2734,PBS,Congress appears headed towards a government s...,0
1599,Vox,The Trump administration is paving the road fo...,1
5129,Vox,"Warren Buffett’s multinational conglomerate, B...",1
5118,Vox,Donald Trump won the 2016 presidential electio...,1
2424,PBS,Democratic Sen. Cory Booker of New Jersey says...,0


In [37]:
res_1 = metrics.f1_score(y_test, y_pred_one_hot_1, average='micro')
res_2 = metrics.f1_score(y_test, y_pred_one_hot_2, average='micro')
print("F1 score model 1: {res_1} \nF1 score model 2: {res_2}".format(res_1=res_1, 
                                                                     res_2=res_2))

F1 score model 1: 0.8869731800766284 
F1 score model 2: 0.5172413793103449


## Train Dataset 2

### Refresh Models

#### Model 1: Bidirectional LSTM

In [25]:
# SINGLE LAYER BIDIRECTIONAL LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model_1 = Sequential()
model_1.add(Bidirectional(LSTM(lstm_in, return_sequences=False, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))

model_1.add(Activation('relu'))
#model.add(Bidirectional(LSTM(lstm_in)))

model_1.add(Dense(1, activation="sigmoid"))
model_1.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 64)                85248     
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 85,443
Trainable params: 85,443
Non-trainable params: 0
_________________________________________________________________


#### Model 2: Regular LSTM

In [25]:
# SINGLE LAYER LTSM
# 
# note...
#
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE)

model_2 = Sequential()
model_2.add(LSTM(lstm_in, return_sequences=False, dropout=DROPOUT, \
                 recurrent_dropout=REC_DROPOUT, input_shape=input_shape))

model_2.add(Activation('relu'))

model_2.add(Dense(1, activation="sigmoid"))
model_2.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])

model_2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 64)                85248     
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 85,443
Trainable params: 85,443
Non-trainable params: 0
_________________________________________________________________


#### Split into test and training

In [14]:
train_df, test_df = train_test_split(df_second, test_size=0.1)
train_df.groupby('source').describe()

Unnamed: 0,article id,source,clean_articles,targets
4571,vox_politics_1803,Vox,"On January 31, Reuters released a survey that ...",1
4889,vox_politics_246,Vox,Sen. Elizabeth Warren (D-MA) isn’t ready to ce...,1
4160,fox_politics_890,Fox,Michael Cohen testifies behind closed doors on...,2
2916,pbs_politics_777,PBS,Democratic senators plan to press Attorney G...,0
4747,vox_politics_1931,Vox,"Buried in Republicans’ tax bill, in a provisio...",1


#### Check for similarity between test and training

In [16]:
test_df.groupby('source').describe()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,173,173,173
PBS,156,156,156
Vox,193,193,193


#### Prepare test set for validation

In [24]:
# witheld for validation
 
x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array([target_to_one_hot(t) for t in tqdm(test_df["targets"])])

100%|██████████| 522/522 [00:00<00:00, 576.99it/s]
100%|██████████| 522/522 [00:00<00:00, 60797.14it/s]


#### Train

In [28]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model_1.fit_generator(data, epochs=2, steps_per_epoch=250, \
                      validation_data=None, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ffa9e40d860>

In [None]:
model_2.fit_generator(data, epochs=2, steps_per_epoch=250, \
                      validation_data=None, verbose=True)

#### Look at predictions

In [29]:
y_pred_1 = model_1.predict(x_test, batch_size=BATCH_SIZE)
y_pred_class_1 = np.argmax(y_pred, axis=1)
y_pred_one_hot_1 = to_categorical(y_pred_class, num_classes=3)
y_pred_1[0:5]

In [30]:
y_pred_2 = model_2.predict(x_test, batch_size=BATCH_SIZE)
y_pred_class_2 = np.argmax(y_pred, axis=1)
y_pred_one_hot_2 = to_categorical(y_pred_class, num_classes=3)
y_pred_2[0:5]

array([[  8.88640702e-01,   1.08859196e-01,   2.50014453e-03],
       [  9.99586936e-03,   2.20355950e-03,   9.87800598e-01],
       [  1.00982422e-02,   9.89862919e-01,   3.88756453e-05],
       [  9.98584032e-01,   7.99316331e-04,   6.16667385e-04],
       [  9.91307318e-01,   8.12769495e-03,   5.65017574e-04]], dtype=float32)

In [31]:
test_df[['source', 'clean_articles', 'targets']].head()

Unnamed: 0,source,clean_articles,targets
2990,PBS,The 24-hour news cycle is filled with politics...,0
411,Fox,Pelosi works on Democratic leadership term-lim...,2
1209,Vox,In an interview that aired Sunday on 60 Minute...,1
1925,PBS,White House press secretary Sarah Huckabee S...,0
3166,PBS,President Donald Trump says he has a “great lo...,0


In [33]:
res_1 = metrics.f1_score(y_test, y_pred_one_hot_1, average='micro')
res_2 = metrics.f1_score(y_test, y_pred_one_hot_2, average='micro')
print("F1 score model 1: {res_1} \nF1 score model 2: {res_2}".format(res_1=res_1, 
                                                                     res_2=res_2))

f1 score 0.9425287356321839
