# Tokenize and Train

### Author 
Stephen Lee

### Goal
Classify news source based on the article text. Training data: 
- Fox News
- Vox News
- PBS News

### Date 
4.8.19

## Read Data

In [1]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import GRU, Dense, Bidirectional, LSTM, Activation
from keras.utils import to_categorical

import os 
import math 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn import metrics

Using TensorFlow backend.
  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
FOLDER_READ = '/home/smlee_981/data'
FILE = 'clean_article_df.csv'
ARTICLE_LENGTH = 1000    # max length for an article

In [3]:
os.getcwd()

'/home/smlee_981'

In [4]:
os.chdir(FOLDER_READ)

In [5]:
os.listdir()

['clean_article_df.csv', 'glove.840B.300d.txt']

In [14]:
df_all = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)
df_all.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,Bolton warns Venezuela's Maduro to stay away f...,Bolton warns Venezuela's Maduro to stay away f...,3
1,fox_politics_390,Fox,Ocasio-Cortez rallies to stop all fossil fuel ...,Ocasio-Cortez rallies to stop all fossil fuel ...,3
2,fox_politics_423,Fox,The Pentagon announced Sunday the deployment o...,The Pentagon announced Sunday the deployment o...,3
3,fox_politics_102,Fox,Mayor Bill de Blasio says that US Rep. Alexand...,Mayor Bill de Blasio says that US Rep. Alexand...,3
4,fox_politics_492,Fox,Who is EPA's Andrew Wheeler?\nEPA administrato...,Who is EPA's Andrew Wheeler?\nEPA administrato...,3


## Relabel targets from 0 - 2 for one-hot vectors

In [19]:
df_all['targets'] = df_all['targets'].replace(3,0)
df_all.groupby('source').mean()

Unnamed: 0_level_0,targets
source,Unnamed: 1_level_1
Fox,0
PBS,1
Vox,2


## Bootstrap data for balance

In [31]:
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,476,476,476,476
PBS,1739,1739,1739,1739
Vox,1027,1027,1027,1027


## Add more Fox and Vox

In [32]:
fox = df_all[df_all['source'] == 'Fox']
vox = df_all[df_all['source'] == 'Vox'][:700]

In [33]:
df_all = df_all.append([fox]*3, ignore_index=True)
df_all = df_all.append(vox, ignore_index=True)
df_all.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,1904,1904,1904,1904
PBS,1739,1739,1739,1739
Vox,1727,1727,1727,1727


## Split into test and training

In [34]:
train_df, test_df = train_test_split(df_all, test_size=0.1)
train_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
4876,vox_politics_1487,Vox,Mitt Romney is making the case that he’s more ...,Mitt Romney is making the case that he’s more ...,2
1856,pbs_politics_970,PBS,— President Donald Trump said Thursday that p...,President Donald Trump said Thursday that pe...,1
1219,vox_politics_1045,Vox,There are some posts so strange you don’t quit...,There are some posts so strange you don’t quit...,2
484,vox_politics_71,Vox,Republicans and Democrats in Congress have fin...,Republicans and Democrats in Congress have fin...,2
5103,vox_politics_701,Vox,Just hours after Senate Republican leaders rel...,Just hours after Senate Republican leaders rel...,2


In [35]:
test_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
755,vox_politics_693,Vox,The biggest policy fight left in 2017 is stuck...,The biggest policy fight left in 2017 is stuck...,2
4557,fox_politics_625,Fox,DOJ asks Supreme Court to take up transgender ...,DOJ asks Supreme Court to take up transgender ...,0
1153,vox_politics_1399,Vox,Senate Republicans are in such a rush to pass ...,Senate Republicans are in such a rush to pass ...,2
2480,pbs_politics_1623,PBS,The desperate sobbing of 10 Central American c...,The desperate sobbing of 10 Central American c...,1
4989,vox_politics_1212,Vox,"On Friday, the Department of Justice sent lett...","On Friday, the Department of Justice sent lett...",2


### Check for similarity between test and training

In [36]:
test_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,198,198,198,198
PBS,161,161,161,161
Vox,178,178,178,178


In [37]:
train_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,1706,1706,1706,1706
PBS,1578,1578,1578,1578
Vox,1549,1549,1549,1549


## Get Embeddings, Build Functions

In [38]:
from tqdm import tqdm
import numpy as np

In [39]:
# glove embeddings and data are in same folder

EMBEDS = 'glove.840B.300d.txt'

embeddings_index = {}
 
with open(EMBEDS, encoding='utf8') as embed:
    for line in tqdm(embed):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print("Found {n} word vectors".format(n=len(embeddings_index)))

2196017it [03:46, 9676.34it/s]

Found 2196016 word vectors





In [44]:
def target_to_one_hot(target, num_classes=3):
    return to_categorical(target, num_classes=num_classes)

In [41]:
def text_to_array(text, article_length=500):
    empty_emb = np.zeros(300)                   # each word is represented by a length 300 vector
    text = text[:-1].split()[:article_length]   # each article is length 10000
    
    # look for word embedding, return zero array otherwise. 
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds += [empty_emb] * (article_length - len(embeds))
    return np.array(embeds)

In [42]:
def batch_gen(train_df, batch_size=64, article_length=500, num_classes=3):
    n = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.0)
        
        for i in range(n):
            texts = train_df['article'][i*batch_size: (i+1)*batch_size]
            targets = train_df['targets'][i*batch_size: (i+1)*batch_size]
            
            targets = np.array([target_to_one_hot(t, num_classes) for t in targets])
            text_arr = np.array([text_to_array(text, article_length=article_length) for text in texts])
            yield text_arr, targets

## Setup Model

In [47]:
ARTICLE_LENGTH = 500
BATCH_SIZE = 128
DROPOUT = 0.1
REC_DROPOUT = 0.1

In [48]:
# witheld for validation
 
x_test = np.array([text_to_array(x, article_length=ARTICLE_LENGTH) \
                          for x in tqdm(test_df["clean_articles"])])
y_test = np.array([target_to_one_hot(t) for t in tqdm(test_df["targets"])])


  0%|          | 0/537 [00:00<?, ?it/s][A
 13%|█▎        | 70/537 [00:00<00:00, 692.13it/s][A
 26%|██▌       | 139/537 [00:00<00:00, 689.05it/s][A
 39%|███▊      | 208/537 [00:00<00:00, 687.04it/s][A
 52%|█████▏    | 280/537 [00:00<00:00, 692.76it/s][A
 65%|██████▌   | 350/537 [00:00<00:00, 694.85it/s][A
 77%|███████▋  | 416/537 [00:00<00:00, 677.97it/s][A
 89%|████████▉ | 479/537 [00:00<00:00, 660.17it/s][A
100%|██████████| 537/537 [00:00<00:00, 674.92it/s][A
  0%|          | 0/537 [00:00<?, ?it/s][A
100%|██████████| 537/537 [00:00<00:00, 68518.53it/s][A

In [55]:
# note...
#      batch_size         -> words per batch
#      article_length     -> words per article
#      embed_length       -> vector length per word

input_shape = (ARTICLE_LENGTH, 300)
lstm_in = int(BATCH_SIZE/2)

model = Sequential()
model.add(Bidirectional(LSTM(lstm_in, return_sequences=True, \
                        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT), \
                        input_shape=input_shape))
model.add(Activation('relu'))
model.add(Bidirectional(LSTM(lstm_in)))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_9 (Bidirection (None, 500, 128)          186880    
_________________________________________________________________
activation_3 (Activation)    (None, 500, 128)          0         
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 387       
Total params: 286,083
Trainable params: 286,083
Non-trainable params: 0
_________________________________________________________________


In [56]:
data = batch_gen(train_df, batch_size=BATCH_SIZE, article_length=ARTICLE_LENGTH)
model.fit_generator(data, epochs=2, steps_per_epoch=500, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f8b459b9630>

In [58]:
y_pred = model.predict(x_test, batch_size=BATCH_SIZE)
y_pred_class = np.argmax(y_pred, axis=1)
y_pred_one_hot = to_categorical(y_pred_class, num_classes=3)

In [65]:
y_pred[0:5]

array([[  1.77483298e-06,   1.17231739e-05,   9.99986529e-01],
       [  9.99990702e-01,   9.16947010e-06,   8.62704042e-08],
       [  7.02655758e-04,   1.10057428e-01,   8.89239907e-01],
       [  2.69893244e-05,   1.90524617e-03,   9.98067796e-01],
       [  2.75058817e-04,   9.16835248e-01,   8.28896910e-02]], dtype=float32)

In [60]:
test_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
755,vox_politics_693,Vox,The biggest policy fight left in 2017 is stuck...,The biggest policy fight left in 2017 is stuck...,2
4557,fox_politics_625,Fox,DOJ asks Supreme Court to take up transgender ...,DOJ asks Supreme Court to take up transgender ...,0
1153,vox_politics_1399,Vox,Senate Republicans are in such a rush to pass ...,Senate Republicans are in such a rush to pass ...,2
2480,pbs_politics_1623,PBS,The desperate sobbing of 10 Central American c...,The desperate sobbing of 10 Central American c...,1
4989,vox_politics_1212,Vox,"On Friday, the Department of Justice sent lett...","On Friday, the Department of Justice sent lett...",2


In [64]:
res = metrics.f1_score(y_test, y_pred_one_hot, average='micro')
print("f1 score {res}".format(res=res))

f1 score 0.8901303538175046
