In [1]:
import csv
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional

import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


In [2]:
#Remove stop words from data and fill in the article and labels lists

articles = []
labels = []

with open("/Users/alikazmi/Desktop/bbc-text.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[0])
        article = row[1]
        for word in stop_words:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            article = article.replace(' ', ' ')
        articles.append(article)

In [3]:
#Set parameters required to build and train the model.

vocab_size = 5000 # make the top list of words (common words)
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' # OOV = Out of Vocabulary
training_portion = .8

In [4]:
#Create Training and Testing Set

train_size = int(len(articles) * training_portion)

train_articles = articles[0: train_size]
train_labels = labels[0: train_size]

test_articles = articles[train_size:]
test_labels = labels[train_size:]

In [5]:
print("train_size", train_size)
print(f"train_articles {len(train_articles)}")
print("train_labels", len(train_labels))
print("test_articles", len(test_articles))
print("test_labels", len(test_labels))

train_size 1780
train_articles 1780
train_labels 1780
test_articles 445
test_labels 445


In [10]:
#Tokenization, Sequencing and Padding

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_articles)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

test_sequences = tokenizer.texts_to_sequences(test_articles)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [11]:
#Create labels

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
test_label_seq = np.array(label_tokenizer.texts_to_sequences(test_labels))

In [13]:
#Create the Model

model = Sequential()

model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(6, activation = 'softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          320000    
_________________________________________________________________
dropout (Dropout)            (None, None, 64)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 6)                 774       
Total params: 386,822
Trainable params: 386,822
Non-trainable params: 0
_________________________________________________________________


In [14]:
#Compile the model

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [19]:
#Train the model

num_epochs = 10
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(test_padded, test_label_seq), verbose=2)


Epoch 1/10
56/56 - 6s - loss: 1.5849 - accuracy: 0.3107 - val_loss: 1.3481 - val_accuracy: 0.3775
Epoch 2/10
56/56 - 5s - loss: 1.2430 - accuracy: 0.5213 - val_loss: 0.8115 - val_accuracy: 0.7685
Epoch 3/10
56/56 - 5s - loss: 0.6894 - accuracy: 0.7944 - val_loss: 1.1697 - val_accuracy: 0.6517
Epoch 4/10
56/56 - 5s - loss: 0.5651 - accuracy: 0.8517 - val_loss: 0.4854 - val_accuracy: 0.8494
Epoch 5/10
56/56 - 5s - loss: 0.3777 - accuracy: 0.8775 - val_loss: 0.5309 - val_accuracy: 0.8067
Epoch 6/10
56/56 - 6s - loss: 0.3371 - accuracy: 0.8820 - val_loss: 0.4008 - val_accuracy: 0.9079
Epoch 7/10
56/56 - 5s - loss: 0.1601 - accuracy: 0.9697 - val_loss: 0.3196 - val_accuracy: 0.9303
Epoch 8/10
56/56 - 6s - loss: 0.0975 - accuracy: 0.9831 - val_loss: 0.3195 - val_accuracy: 0.9079
Epoch 9/10
56/56 - 6s - loss: 0.0609 - accuracy: 0.9899 - val_loss: 0.2347 - val_accuracy: 0.9438
Epoch 10/10
56/56 - 5s - loss: 0.0385 - accuracy: 0.9933 - val_loss: 0.2135 - val_accuracy: 0.9416


In [22]:
txt = ["Ryerson's basketball team won against Hamilton Warriors on Sunday, July 3rd 2020."]

seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_length)
pred = model.predict(padded)
labels = ['sport', 'bussiness', 'politics', 'tech', 'entertainment'] 

print(pred)
print(np.argmax(pred))
print(labels[np.argmax(pred)-1])

[[3.9621271e-04 9.7422421e-01 6.6736010e-03 5.8249221e-04 9.7991928e-05
  1.8025404e-02]]
1
sport
