# LSTM

First read the json files containing the news article data into a Pandas DataFrame.

In [1]:
import os
import pandas as pd

directory = 'data\jsons'

dfs = [] # an empty list to store the data frames
for f in os.listdir(directory):
    file = os.path.join(directory, f)
    data = pd.read_json(file, typ='series').to_frame().T # read data frame from json file
    dfs.append(data) # append the data frame to the list

df = pd.concat(dfs, ignore_index=True)
df

Unnamed: 0,topic,source,bias,url,title,date,authors,content,content_original,source_url,bias_text,ID
0,terrorism,New York Times - News,0,http://www.nytimes.com/2016/09/20/nyregion/ahm...,"Bomb Suspect Changed After Trip Abroad, Friend...",2016-09-20,N. R. Kleinfield,"Besides his most recent trip to Quetta , Mr. R...","Besides his most recent trip to Quetta, Mr. Ra...",www.nytimes.com,left,004Gt3gcsotuiYmz
1,supreme_court,Vox,0,https://www.vox.com/policy-and-politics/2018/9...,Why Susan Collins claims she’s being bribed ov...,2018-09-12,"Emily Stewart, Terry Nguyen, Rebecca Jennings,...",Is Maine Republican Sen. Susan Collins being b...,Is Maine Republican Sen. Susan Collins being b...,www.vox.com,left,00eP4XD3VdMmHITE
2,education,Ezra Klein,0,http://www.npr.org/blogs/thetwo-way/2014/05/06...,Poll: Prestigious Colleges Won't Make You Happ...,2014-05-06,Anya Kamenetz,Poll : Prestigious Colleges Wo n't Make You Ha...,Poll: Prestigious Colleges Won't Make You Happ...,www.npr.org,left,00FTGIZEd6B8zQ4U
3,us_house,Breitbart News,2,http://www.breitbart.com/big-government/2017/0...,Paul Ryan Reportedly Says No Chance for Border...,2017-09-12,Ian Mason,"House Speaker Paul Ryan , at a private dinner ...","House Speaker Paul Ryan, at a private dinner e...",www.breitbart.com,right,00HGGqBRf1kzPRlg
4,white_house,Guest Writer - Left,0,https://www.cnn.com/2019/07/11/politics/donald...,OPINION: Trump seeking change of legal fortune...,2019-07-11,Analysis Stephen Collinson,( CNN ) President Donald Trump has reason to h...,(CNN) President Donald Trump has reason to hop...,www.cnn.com,left,00IzI5ynahBVtC9l
...,...,...,...,...,...,...,...,...,...,...,...,...
37549,race_and_racism,Yahoo! The 360,1,https://news.yahoo.com/how-do-we-address-racis...,How do we address racism in 'Gone With the Wind'?,2020-06-18,Julia Munslow,“ The 360 ” shows you diverse perspectives on ...,“The 360” shows you diverse perspectives on th...,www.news.yahoo.com,center,zzwPV6NCYsMiDb0a
37550,elections,The Hill,1,https://thehill.com/homenews/campaign/445504-t...,The top 10 Democrats in the 2020 race,2019-05-28,,The race for the Democratic presidential nomin...,The race for the Democratic presidential nomin...,www.thehill.com,center,zZwwVzN0ZltBq302
37551,violence_in_america,Townhall,2,https://townhall.com/tipsheet/leahbarkoukis/20...,Report: Police Questioned YouTube Shooter Morn...,2018-04-04,"Leah Barkoukis, Matt Vespa, Timothy Meads, Kat...",Police confirmed they found and questioned Nas...,Police confirmed they found and questioned Nas...,www.townhall.com,right,ZzXppS8L4v4WsVWq
37552,free_speech,NPR Online News,1,http://www.npr.org/blogs/parallels/2015/02/10/...,The French Debate: Free Speech Versus Hate Speech,2015-02-10,Eleanor Beardsley,When terrorists attacked a satirical magazine ...,The French Debate: Free Speech Versus Hate Spe...,www.npr.org,center,zzZMTdCEiAiVRRO3


For this proof of concept we are going to use the preprocessed data provided in the dataset so we can drop the unused columns.

In [2]:
df.drop(['topic', 'source', 'url', 'title', 'date', 'authors', 'content_original', 'source_url', 'bias_text', 'ID'],
        axis=1, inplace=True)
df

Unnamed: 0,bias,content
0,0,"Besides his most recent trip to Quetta , Mr. R..."
1,0,Is Maine Republican Sen. Susan Collins being b...
2,0,Poll : Prestigious Colleges Wo n't Make You Ha...
3,2,"House Speaker Paul Ryan , at a private dinner ..."
4,0,( CNN ) President Donald Trump has reason to h...
...,...,...
37549,1,“ The 360 ” shows you diverse perspectives on ...
37550,1,The race for the Democratic presidential nomin...
37551,2,Police confirmed they found and questioned Nas...
37552,1,When terrorists attacked a satirical magazine ...


Next, remove all punctuation from the text, make all the words lowercase, remove stop words, tokenize and one-hot-encode the sentences.

In [3]:
df['content'] = df['content'].str.replace('[^\w\s]','')
df['content'] = df['content'].str.lower()

df.head()

  df['content'] = df['content'].str.replace('[^\w\s]','')


Unnamed: 0,bias,content
0,0,besides his most recent trip to quetta mr rah...
1,0,is maine republican sen susan collins being br...
2,0,poll prestigious colleges wo nt make you happ...
3,2,house speaker paul ryan at a private dinner e...
4,0,cnn president donald trump has reason to hop...


In [8]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

df['content'] = df['content'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [14]:
import numpy as np
'''
get average article word length to use as word count limit for LSTM, 
articles that are shorter than this will be zero-padded
'''
max_length = int(np.round(df['content'].apply(len).mean()))

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

X = df['content'].to_numpy()

NUM_TOP_WORDS = None

tokenizer = Tokenizer(num_words=max_length)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

word_index = tokenizer.word_index
NUM_TOP_WORDS = len(word_index) if NUM_TOP_WORDS==None else NUM_TOP_WORDS
top_words = min((len(word_index),NUM_TOP_WORDS))
print('Found %s unique tokens. Distilled to %d top words.' % (len(word_index),top_words))

X = sequence.pad_sequences(sequences, maxlen=max_length)

print('Shape of data tensor:', X.shape)
print(np.max(X))

Found 203115 unique tokens. Distilled to 203115 top words.
Shape of data tensor: (37554, 3970)
3969


Let's check the class distribution for our target variable.

In [16]:
np.unique(df['bias'].to_numpy(), return_counts=True)

(array([0, 1, 2], dtype=object), array([13005, 10815, 13734], dtype=int64))

As we can see they are relatively balanced so we can go ahead and split this into our train, validation, and test sets.

In [18]:
from sklearn.model_selection import train_test_split
import keras
from keras import backend

X_train, X_test, y_train, y_test = train_test_split(X, df['bias'].to_numpy(), test_size=0.1, stratify=df['bias'].to_numpy())
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1.0/9.0, stratify=y_train)

NUM_CLASSES = len(np.unique(df['bias']))
y_train_ohe = keras.utils.np_utils.to_categorical(y_train, NUM_CLASSES)
y_test_ohe = keras.utils.np_utils.to_categorical(y_test, NUM_CLASSES)
y_val_ohe = keras.utils.np_utils.to_categorical(y_val, NUM_CLASSES)

print(len(X_train), len(X_test), len(X_val))

30042 3756 3756


With our dataset split we will now create a pre-trained embedding matrix using GloVe to be used with the LSTM.

In [21]:
%%time
EMBED_SIZE = 300
# the embed size should match the file you load glove from
embeddings_index = {}
f = open('glove.6B.300d.txt', encoding="utf8")
# save key/array pairs of the embeddings
#  the key of the dictionary is the word, the array is the embedding
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# now fill in the matrix, using the ordering from the
#  keras word tokenizer from before
found_words = 0
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be ALL-ZEROS
        embedding_matrix[i] = embedding_vector
        found_words = found_words+1

print("Embedding Shape:",embedding_matrix.shape, "\n",
      "Total words found:",found_words, "\n",
      "Percentage:",100*found_words/embedding_matrix.shape[0])

Found 400000 word vectors.
Embedding Shape: (203116, 300) 
 Total words found: 94645 
 Percentage: 46.596526123003606
CPU times: total: 20.1 s
Wall time: 20.1 s


In [23]:
from tensorflow.keras.layers import Embedding

# create pre-trained embedding layer
embedding_layer = Embedding(len(word_index) + 1,
                            EMBED_SIZE,
                            weights=[embedding_matrix],# here is the embedding getting saved
                            input_length=max_length,
                            trainable=False)

Now let's build and train the LSTM.

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

lstm = Sequential()
lstm.add(embedding_layer)
lstm.add(LSTM(300,dropout=0.2, recurrent_dropout=0.2))
lstm.add(Dense(NUM_CLASSES, activation='sigmoid'))
lstm.compile(loss='categorical_crossentropy', 
              optimizer='rmsprop', 
              metrics=['accuracy'])
lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3970, 300)         60934800  
_________________________________________________________________
lstm (LSTM)                  (None, 300)               721200    
_________________________________________________________________
dense (Dense)                (None, 3)                 903       
Total params: 61,656,903
Trainable params: 722,103
Non-trainable params: 60,934,800
_________________________________________________________________


In [25]:
history = lstm.fit(X_train, y_train_ohe, validation_data=(X_val, y_val_ohe), epochs=10, batch_size=64)

Epoch 1/10


ResourceExhaustedError:  OOM when allocating tensor with shape[300,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node sequential/lstm/while/body/_1/sequential/lstm/while/lstm_cell/split}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_3921]

Function call stack:
train_function
