# Tokenize and Train

### Author 
Stephen Lee

### Goal
Classify news source based on the article text. Training data: 
- Fox News
- Vox News
- PBS News

### Date 
3.20.19

## Read Data

In [1]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import GRU, Dense, Bidirectional, LSTM

import os 
import math 
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
FOLDER_READ = '/home/stephen/Dropbox/CodeWorkspace/data-sets/Thesis/'
FILE = 'clean_article_df.csv'
ARTICLE_LENGTH = 1000    # max length for an article

In [3]:
os.getcwd()

'/home/stephen/Dropbox/CodeWorkspace/other/Anaconda/NLP/thesis'

In [4]:
os.chdir(FOLDER_READ)

In [5]:
os.listdir()

['vox-politics.tar.gz',
 'fox-entertainment.tar.gz',
 'vox-culture.tar.gz',
 'fox-tech.tar.gz',
 'pbs-politics.tar.gz',
 'fox-politics.tar.gz',
 'bbc-articles.tar.gz',
 'clean_article_df.csv',
 'articles.csv']

In [6]:
df_all = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)
df_all.head()

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,Bolton warns Venezuela's Maduro to stay away f...,Bolton warns Venezuela's Maduro to stay away f...,3
1,fox_politics_390,Fox,Ocasio-Cortez rallies to stop all fossil fuel ...,Ocasio-Cortez rallies to stop all fossil fuel ...,3
2,fox_politics_423,Fox,The Pentagon announced Sunday the deployment o...,The Pentagon announced Sunday the deployment o...,3
3,fox_politics_102,Fox,Mayor Bill de Blasio says that US Rep. Alexand...,Mayor Bill de Blasio says that US Rep. Alexand...,3
4,fox_politics_492,Fox,Who is EPA's Andrew Wheeler?\nEPA administrato...,Who is EPA's Andrew Wheeler?\nEPA administrato...,3


## Split into test and training

In [7]:
train_df, test_df = train_test_split(df_all, test_size=0.1)
train_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
2261,pbs_politics_25,PBS,House Democrats formally kicked off the next r...,House Democrats formally kicked off the next r...,1
1858,pbs_politics_1250,PBS,Americans remain divided on the issue of gun c...,Americans remain divided on the issue of gun c...,1
2452,pbs_politics_730,PBS,"— In this election season, lawmakers are taki...","In this election season, lawmakers are takin...",1
1129,vox_politics_1327,Vox,"As we learned on Election Day 2016, national p...","As we learned on Election Day 2016, national p...",2
3096,pbs_politics_265,PBS,— President Donald Trump’s nominee for attorn...,President Donald Trump’s nominee for attorne...,1


In [8]:
test_df.head()

Unnamed: 0,article id,source,article,clean_articles,targets
3115,pbs_politics_1002,PBS,"RICHMOND, Va. — Virginia Gov. Ralph Northam to...","RICHMOND, Va. Virginia Gov. Ralph Northam tol...",1
1615,pbs_politics_334,PBS,NEW DELHI (AP) — The Democratic victory in the...,NEW DELHI (AP) The Democratic victory in the ...,1
1097,vox_politics_1575,Vox,Take a step back and appreciate the vertiginou...,Take a step back and appreciate the vertiginou...,2
308,fox_politics_143,Fox,Andrew McCabe believes 'it's possible' Preside...,Andrew McCabe believes 'it's possible' Preside...,3
1954,pbs_politics_1721,PBS,ATLANTA — A U.S. Supreme Court ruling has clea...,ATLANTA A U.S. Supreme Court ruling has clear...,1


### Check for similarity between test and training

In [9]:
test_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,58,58,58,58
PBS,159,159,159,159
Vox,108,108,108,108


In [10]:
train_df.groupby('source').count()

Unnamed: 0_level_0,article id,article,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fox,418,418,418,418
PBS,1580,1580,1580,1580
Vox,919,919,919,919


## Get Embeddings, Build Functions

In [11]:
from tqdm import tqdm

In [12]:
PATH = '/home/stephen/Dropbox/CodeWorkspace/data-sets/NLP/quora-questions/embeddings/glove.840B.300d/'
FILE = PATH + 'glove.840B.300d.txt'

embeddings_index = {}
 
with open(FILE, encoding='utf8') as embed:
    for line in tqdm(embed):
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print(f"Found {len(embeddings_index)} word vectors")

2196017it [04:22, 8360.84it/s]

Found 2196016 word vectors





In [13]:
def text_to_array(text, article_length=500):
    empty_emb = np.zeros(300)                   # each word is represented by a length 300 vector
    text = text[:-1].split()[:article_length]   # each article is length 10000
    
    # look for word embedding, return zero array otherwise. 
    embeds = [embeddings_index.get(x, empty_emb) for x in text]
    embeds += [empty_emb] * (article_length - len(embeds))
    return np.array(embeds)

In [14]:
def batch_gen(train_df, batch_size=64):
    n = math.ceil(len(train_df) / batch_size)
    while True: 
        train_df = train_df.sample(frac=1.0)
        
        for i in range(n):
            texts = train_df['article'][i*batch_size: (i+1)*batch_size]
            text_arr = np.array([text_to_array(text) for text in texts])
            targets = np.array(train_df['targets'][i*batch_size: (i+1)*batch_size])
            yield text_arr, targets

## Setup Model

In [15]:
# witheld for validation
test_articles = np.array([text_to_array(x) for x in tqdm(test_df["clean_articles"])])
test_target = np.array(test_df["targets"])

100%|██████████| 325/325 [00:00<00:00, 841.85it/s]


In [16]:
input_shape = (500, 300)

model = Sequential()
model.add(Bidirectional(LSTM(32, return_sequences=True, dropout=0.05, recurrent_dropout=0.1), \
                        input_shape=input_shape))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", \
              optimizer="adam", \
              metrics=["accuracy"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 500, 128)          186880    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 285,825
Trainable params: 285,825
Non-trainable params: 0
_________________________________________________________________


In [None]:
data = batch_gen(train_df)
model.fit_generator(data, epochs=2, steps_per_epoch=500, \
                    validation_data=(test_articles, test_target), \
                    verbose=True)

Epoch 1/2