# Tokenize and Train

### Author 
Stephen Lee

### Goal
Classify news source based on the article text. Training data: 
- Fox News
- Vox News
- PBS News

### Date 
3.20.19

## Read Data

In [1]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import GRU, Dense, Bidirectional, LSTM

import os 
import math 
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
FOLDER_READ = '/home/stephen/Dropbox/CodeWorkspace/data-sets/Thesis/'
FILE = 'clean_article_df.csv'
ARTICLE_LENGTH = 1000    # max length for an article

In [3]:
os.getcwd()

'/home/stephen/Dropbox/CodeWorkspace/other/Anaconda/NLP/thesis'

In [4]:
os.chdir(FOLDER_READ)

In [5]:
os.listdir()

['vox-politics.tar.gz',
 'fox-entertainment.tar.gz',
 'vox-culture.tar.gz',
 'fox-tech.tar.gz',
 'pbs-politics.tar.gz',
 'fox-politics.tar.gz',
 'bbc-articles.tar.gz',
 'clean_article_df.csv',
 'articles.csv']

In [6]:
df_all = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)
df_all.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,Bolton warns Venezuela's Maduro to stay away f...,Bolton warns Venezuela's Maduro to stay away f...,3
1,fox_politics_390,Fox,Ocasio-Cortez rallies to stop all fossil fuel ...,Ocasio-Cortez rallies to stop all fossil fuel ...,3
2,fox_politics_423,Fox,The Pentagon announced Sunday the deployment o...,The Pentagon announced Sunday the deployment o...,3
3,fox_politics_102,Fox,Mayor Bill de Blasio says that US Rep. Alexand...,Mayor Bill de Blasio says that US Rep. Alexand...,3
4,fox_politics_492,Fox,Who is EPA's Andrew Wheeler?\nEPA administrato...,Who is EPA's Andrew Wheeler?\nEPA administrato...,3


## Split into test and training

In [7]:
train_df, test_df = train_test_split(df_all, test_size=0.1)
train_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
3240,pbs_politics_1317,PBS,"URBANA, Ill. (AP) — Former President Barack Ob...","URBANA, Ill. (AP) Former President Barack Oba...",1
135,fox_politics_32,Fox,Democratic Sens. Elizabeth Warren of Massachus...,Democratic Sens. Elizabeth Warren of Massachus...,3
2829,pbs_politics_428,PBS,— Seeking to move past the shadow of the Russ...,Seeking to move past the shadow of the Russi...,1
1464,vox_politics_1767,Vox,President Trump signed a new executive order T...,President Trump signed a new executive order T...,2
984,vox_politics_1585,Vox,The Republican Party’s polling for the 2018 el...,The Republican Party’s polling for the 2018 el...,2


In [8]:
test_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
2682,pbs_politics_614,PBS,— President Donald Trump leaves out a big com...,President Donald Trump leaves out a big comp...,1
300,fox_politics_568,Fox,Judge Nap on New York Times Report Trump Inter...,Judge Nap on New York Times Report Trump Inter...,3
1345,vox_politics_1500,Vox,The White House threw in the towel on two of P...,The White House threw in the towel on two of P...,2
523,vox_politics_1820,Vox,Two more Senate Republicans came out against t...,Two more Senate Republicans came out against t...,2
3238,pbs_politics_1730,PBS,"WEST PALM BEACH, Fla. — President Donald Trump...","WEST PALM BEACH, Fla. President Donald Trump ...",1


### Check for similarity between test and training

In [9]:
test_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,50,50,50,50
PBS,175,175,175,175
Vox,100,100,100,100


In [10]:
train_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,426,426,426,426
PBS,1564,1564,1564,1564
Vox,927,927,927,927


## Get Embeddings, Build Functions

In [11]:
from tqdm import tqdm

In [None]:
PATH = '/home/stephen/Dropbox/CodeWorkspace/data-sets/NLP/quora-questions/embeddings/glove.840B.300d/'
FILE = PATH + 'glove.840B.300d.txt'

embeddings_index = {}
 
with open(FILE, encoding='utf8') as embed:
    for line in tqdm(embed):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print(f"Found {len(embeddings_index)} word vectors")

1046541it [04:55, 5055.69it/s]

In [13]:
def text_to_array(text, article_length=500):
    empty_emb = np.zeros(300)                   # each word is represented by a length 300 vector
    text = text[:-1].split()[:article_length]   # each article is length 10000
    
    # look for word embedding, return zero array otherwise. 
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds += [empty_emb] * (article_length - len(embeds))
    return np.array(embeds)

In [18]:
def batch_gen(train_df, batch_size=64):
    n = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.0)
        
        for i in range(n):
            texts = train_df['article'][i*batch_size: (i+1)*batch_size]
            text_arr = np.array([text_to_array(text) for text in texts])
            targets = np.array(train_df['targets'][i*batch_size: (i+1)*batch_size])
            yield text_arr, targets

## Setup Model

In [19]:
# witheld for validation
test_articles = np.array([text_to_array(x) for x in tqdm(test_df["clean_articles"])])
test_target = np.array(test_df["targets"])

100%|██████████| 325/325 [00:00<00:00, 928.34it/s]


In [20]:
# note batch_size = 64         -> words per batch
#      article_length = 500    -> words per article
#      embed_length = 300      -> vector length per word

input_shape = (500, 300)

model = Sequential()
model.add(Bidirectional(LSTM(32, return_sequences=True, dropout=0.05, recurrent_dropout=0.1), \
                        input_shape=input_shape))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_3 (Bidirection (None, 500, 64)           85248     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 110,145
Trainable params: 110,145
Non-trainable params: 0
_________________________________________________________________


In [21]:
data = batch_gen(train_df)
model.fit_generator(data, epochs=2, steps_per_epoch=500, \
                    validation_data=(test_articles, test_target), \
                    verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa5a6435278>