<a href="https://colab.research.google.com/github/projjal1/News-Category-Prediction/blob/master/BBC_News_Category_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing TensorFlow and checking its version

In [None]:
import tensorflow as tf
# Print the installed TensorFlow version so the recorded outputs can be tied to a release.
print(tf.__version__)

2.3.0


Downloading NLTK Corpora

In [None]:
import nltk 
# Fetch the NLTK 'stopwords' corpus; it is read later through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Importing needed modules

In [None]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
# English stop-word list from NLTK, stored as a set; used when cleaning the articles.
STOPWORDS = set(stopwords.words('english'))

Declaring hyperparameters for text processing

In [None]:
# --- Vocabulary and embedding ---
vocab_size = 5000          # passed to Tokenizer(num_words=...) and used as the Embedding input size
oov_tok = "<OOV>"          # placeholder token handed to the Tokenizer as oov_token
embedding_dim = 64         # embedding width; also reused for the LSTM and hidden Dense sizes

# --- Sequence shaping ---
max_length = 200           # every article is padded/truncated to this many tokens
padding_type = "post"      # add padding at the end of short sequences
trunc_type = "post"        # cut long sequences at the end

# --- Data split ---
training_portion = 0.8     # leading share of the dataset used for training

Parsing the CSV file for labels and text content

In [None]:
# Read "bbc-text.csv" into two parallel lists: the category label (column 0)
# and the article text (column 1) with English stop words removed.
articles = []
labels = []

# newline='' is the mode the csv module documents for file objects it reads.
with open("bbc-text.csv", 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the first row (treated as a header)
    for row in reader:
        labels.append(row[0])
        # Filter word by word instead of replacing ' word ' substrings:
        #  - one pass over the article rather than one scan per stop word,
        #  - also drops stop words that are repeated back to back or sit at
        #    the very start/end of the text, which substring replacement misses,
        #  - re-joining with single spaces collapses runs of whitespace.
        kept_words = [word for word in row[1].split() if word not in STOPWORDS]
        articles.append(' '.join(kept_words))
print(len(labels))
print(len(articles))

2225
2225


Displaying a few labels

In [None]:
# Peek at the first three category labels (bare expression, so the notebook displays it).
labels[:3]

['tech', 'business', 'sport']

Splitting the dataset into training and validation sets

In [None]:
# Number of leading examples that go to the training set.
train_size = int(training_portion * len(articles))

# Sequential split (no shuffling): head -> training, tail -> validation.
train_articles, validation_articles = articles[:train_size], articles[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]

# Sanity-check the sizes of every piece of the split.
for count in (
    train_size,
    len(train_articles),
    len(train_labels),
    len(validation_articles),
    len(validation_labels),
):
    print(count)

1780
1780
1780
445
445


Tokenizing word content

In [None]:
# Build the word vocabulary from the training articles only, so the validation
# text does not influence it. `oov_tok` is the placeholder used for words the
# tokenizer does not encode.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
# Mapping word -> integer id learned by the fit above.
word_index = tokenizer.word_index

In [None]:
# Show the first ten (word, id) pairs of the vocabulary.
{word: idx for word, idx in list(word_index.items())[:10]}

{'<OOV>': 1,
 'also': 6,
 'mr': 3,
 'new': 8,
 'one': 10,
 'people': 7,
 'said': 2,
 'us': 9,
 'would': 4,
 'year': 5}

In [None]:
# Convert every training article into a list of integer word ids.
train_sequences = tokenizer.texts_to_sequences(train_articles)

In [None]:
# Inspect the encoded form of one training article (index 10).
print(train_sequences[10])

[2432, 1, 225, 4994, 22, 642, 587, 225, 4994, 1, 1, 1661, 1, 1, 2432, 22, 565, 1, 1, 140, 278, 1, 140, 278, 796, 823, 662, 2308, 1, 1145, 1692, 1, 1719, 4995, 1, 1, 1, 1, 1, 4737, 1, 1, 122, 4513, 1, 2, 2875, 1505, 352, 4738, 1, 52, 341, 1, 352, 2171, 3962, 41, 22, 3794, 1, 1, 1, 1, 543, 1, 1, 1, 835, 631, 2367, 347, 4739, 1, 365, 22, 1, 787, 2368, 1, 4301, 138, 10, 1, 3665, 682, 3531, 1, 22, 1, 414, 823, 662, 1, 90, 13, 633, 1, 225, 4994, 1, 599, 1, 1692, 1021, 1, 4996, 807, 1863, 117, 1, 1, 1, 2975, 22, 1, 99, 278, 1, 1606, 4997, 543, 492, 1, 1443, 4740, 779, 1320, 1, 1860, 10, 33, 642, 319, 1, 62, 478, 565, 301, 1506, 22, 479, 1, 1, 1664, 1, 797, 1, 3067, 1, 1364, 6, 1, 2432, 565, 22, 2972, 4734, 1, 1, 1, 1, 1, 850, 39, 1824, 675, 297, 26, 979, 1, 882, 22, 361, 22, 13, 301, 1506, 1342, 374, 20, 63, 883, 1096, 4302, 247]


Padding (or truncating) each tokenized article at the end so that every sequence has length `max_length`

In [None]:
# Bring every training sequence to exactly `max_length` ids, padding or truncating
# on the side selected by `padding_type` / `trunc_type`.
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# Compare the raw and padded lengths for a few sample articles.
for sample_idx in (0, 1, 10):
    print(len(train_sequences[sample_idx]))
    print(len(train_padded[sample_idx]))

425
200
192
200
186
200


In [None]:
# Raw (unpadded) ids of article 10, shown for comparison with its padded version.
print(train_sequences[10])

[2432, 1, 225, 4994, 22, 642, 587, 225, 4994, 1, 1, 1661, 1, 1, 2432, 22, 565, 1, 1, 140, 278, 1, 140, 278, 796, 823, 662, 2308, 1, 1145, 1692, 1, 1719, 4995, 1, 1, 1, 1, 1, 4737, 1, 1, 122, 4513, 1, 2, 2875, 1505, 352, 4738, 1, 52, 341, 1, 352, 2171, 3962, 41, 22, 3794, 1, 1, 1, 1, 543, 1, 1, 1, 835, 631, 2367, 347, 4739, 1, 365, 22, 1, 787, 2368, 1, 4301, 138, 10, 1, 3665, 682, 3531, 1, 22, 1, 414, 823, 662, 1, 90, 13, 633, 1, 225, 4994, 1, 599, 1, 1692, 1021, 1, 4996, 807, 1863, 117, 1, 1, 1, 2975, 22, 1, 99, 278, 1, 1606, 4997, 543, 492, 1, 1443, 4740, 779, 1320, 1, 1860, 10, 33, 642, 319, 1, 62, 478, 565, 301, 1506, 22, 479, 1, 1, 1664, 1, 797, 1, 3067, 1, 1364, 6, 1, 2432, 565, 22, 2972, 4734, 1, 1, 1, 1, 1, 850, 39, 1824, 675, 297, 26, 979, 1, 882, 22, 361, 22, 13, 301, 1506, 1342, 374, 20, 63, 883, 1096, 4302, 247]


In [None]:
# Padded version of article 10: shorter than max_length, so zeros are appended at the end.
print(train_padded[10])

[2432    1  225 4994   22  642  587  225 4994    1    1 1661    1    1
 2432   22  565    1    1  140  278    1  140  278  796  823  662 2308
    1 1145 1692    1 1719 4995    1    1    1    1    1 4737    1    1
  122 4513    1    2 2875 1505  352 4738    1   52  341    1  352 2171
 3962   41   22 3794    1    1    1    1  543    1    1    1  835  631
 2367  347 4739    1  365   22    1  787 2368    1 4301  138   10    1
 3665  682 3531    1   22    1  414  823  662    1   90   13  633    1
  225 4994    1  599    1 1692 1021    1 4996  807 1863  117    1    1
    1 2975   22    1   99  278    1 1606 4997  543  492    1 1443 4740
  779 1320    1 1860   10   33  642  319    1   62  478  565  301 1506
   22  479    1    1 1664    1  797    1 3067    1 1364    6    1 2432
  565   22 2972 4734    1    1    1    1    1  850   39 1824  675  297
   26  979    1  882   22  361   22   13  301 1506 1342  374   20   63
  883 1096 4302  247    0    0    0    0    0    0    0    0    0    0
    0 

In [None]:
# Raw (unpadded) ids of article 0, which is longer than max_length.
print(train_sequences[0])

[91, 160, 1142, 1106, 49, 979, 755, 1, 89, 1304, 4288, 129, 175, 3654, 1215, 1196, 1576, 42, 7, 893, 91, 1, 334, 85, 20, 14, 130, 3262, 1216, 2422, 570, 451, 1375, 58, 3378, 3521, 1659, 8, 921, 730, 10, 844, 1, 9, 598, 1577, 1107, 395, 1939, 1106, 731, 49, 537, 1397, 2010, 1621, 134, 249, 113, 2356, 795, 4979, 980, 584, 10, 3956, 3957, 921, 2563, 129, 344, 175, 3654, 1, 1, 39, 62, 2868, 28, 9, 4722, 18, 1305, 136, 416, 7, 143, 1422, 71, 4500, 436, 4980, 91, 1107, 77, 1, 82, 2011, 53, 1, 91, 6, 1008, 609, 89, 1304, 91, 1961, 131, 137, 420, 9, 2869, 38, 152, 1234, 89, 1304, 4723, 7, 436, 4980, 3154, 6, 2493, 1, 431, 1127, 1, 1423, 571, 1261, 1901, 1, 766, 9, 537, 1397, 2010, 134, 2068, 400, 845, 1962, 1599, 34, 1715, 2870, 1, 1, 2423, 244, 9, 2625, 82, 732, 6, 1173, 1197, 152, 720, 591, 1, 124, 28, 1305, 1688, 432, 83, 933, 115, 20, 14, 18, 3155, 1, 37, 1484, 1, 23, 37, 87, 335, 2357, 37, 467, 255, 1962, 1358, 328, 1, 299, 732, 1174, 18, 2871, 1715, 1, 294, 756, 1074, 395, 2012, 387, 431

In [None]:
# Article 0 after pad_sequences: cut down to max_length ids.
print(train_padded[0])

[  91  160 1142 1106   49  979  755    1   89 1304 4288  129  175 3654
 1215 1196 1576   42    7  893   91    1  334   85   20   14  130 3262
 1216 2422  570  451 1375   58 3378 3521 1659    8  921  730   10  844
    1    9  598 1577 1107  395 1939 1106  731   49  537 1397 2010 1621
  134  249  113 2356  795 4979  980  584   10 3956 3957  921 2563  129
  344  175 3654    1    1   39   62 2868   28    9 4722   18 1305  136
  416    7  143 1422   71 4500  436 4980   91 1107   77    1   82 2011
   53    1   91    6 1008  609   89 1304   91 1961  131  137  420    9
 2869   38  152 1234   89 1304 4723    7  436 4980 3154    6 2493    1
  431 1127    1 1423  571 1261 1901    1  766    9  537 1397 2010  134
 2068  400  845 1962 1599   34 1715 2870    1    1 2423  244    9 2625
   82  732    6 1173 1197  152  720  591    1  124   28 1305 1688  432
   83  933  115   20   14   18 3155    1   37 1484    1   23   37   87
  335 2357   37  467  255 1962 1358  328    1  299  732 1174   18 2871
 1715 

Tokenizing and padding the validation portion of the text content

In [None]:
# Encode the validation articles with the tokenizer fitted on the training set,
# then pad/truncate them with the same settings as the training sequences.
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Number of validation examples and the resulting (examples, max_length) array shape.
print(len(validation_sequences))
print(validation_padded.shape)

445
(445, 200)


In [None]:
# List the distinct category names present in the dataset.
print(set(labels))

{'business', 'tech', 'entertainment', 'politics', 'sport'}


In [None]:
# A second tokenizer, fitted on the label strings, assigns each category name an integer id.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

# Each label becomes a one-element sequence, so both arrays have shape (n_examples, 1).
# NOTE(review): the recorded outputs show ids in the range 1-5 (no 0), so consumers
# of these ids must treat them as 1-based.
training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [None]:
# For each split, show the first three encoded labels followed by the array shape.
for label_seq in (training_label_seq, validation_label_seq):
    for encoded_label in label_seq[:3]:
        print(encoded_label)
    print(label_seq.shape)

[4]
[2]
[1]
(1780, 1)
[5]
[4]
[3]
(445, 1)


In [None]:
# First three encoded validation labels, displayed as a 2-D array.
validation_label_seq[:3]

array([[5],
       [4],
       [3]])

In [None]:
# Invert the vocabulary: integer id -> word.
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode_article(text):
    """Turn a sequence of word ids back into a space-separated string.

    Ids that are missing from `reverse_word_index` are rendered as '?'.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)
print(decode_article(train_padded[10]))
print('---')
print(train_articles[10])

berlin <OOV> anti nazi film german movie anti nazi <OOV> <OOV> drawn <OOV> <OOV> berlin film festival <OOV> <OOV> final days <OOV> final days member white rose movement <OOV> 21 arrested <OOV> brother hans <OOV> <OOV> <OOV> <OOV> <OOV> tyranny <OOV> <OOV> director marc <OOV> said feeling responsibility keep legacy <OOV> going must <OOV> keep ideas alive added film drew <OOV> <OOV> <OOV> <OOV> trial <OOV> <OOV> <OOV> east germany secret police discovery <OOV> behind film <OOV> worked closely <OOV> relatives including one <OOV> sisters ensure historical <OOV> film <OOV> members white rose <OOV> group first started <OOV> anti nazi <OOV> summer <OOV> arrested dropped <OOV> munich university calling day <OOV> <OOV> <OOV> regime film <OOV> six days <OOV> arrest intense trial saw <OOV> initially deny charges ended <OOV> appearance one three german films <OOV> top prize festival south african film version <OOV> <OOV> opera <OOV> shot <OOV> town <OOV> language also <OOV> berlin festival film en

Building the model with relu and softmax activation

In [None]:
# The label ids produced by `label_tokenizer` start at 1, so the output layer
# needs one slot per category plus one for the never-used id 0
# (five categories -> six units).
num_classes = len(label_tokenizer.word_index) + 1

model = tf.keras.Sequential([
    # Map each word id (vocabulary of `vocab_size`) to a vector of `embedding_dim` values.
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # Read the sequence in both directions; the two LSTM outputs are concatenated,
    # giving 2 * embedding_dim features per article.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    # Fully connected hidden layer with ReLU activation.
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    # Output layer: softmax turns the scores into a probability distribution over class ids.
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          320000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 6)                 390       
Total params: 394,694
Trainable params: 394,694
Non-trainable params: 0
_________________________________________________________________


Compiling the model with sparse categorical cross-entropy loss (multi-class classification) and the Adam optimizer

In [None]:
# The targets are integer class ids (training_label_seq), which is what the
# 'sparse' variant of categorical cross-entropy expects; accuracy is tracked as the metric.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Training the model

In [None]:
# Train for 10 epochs, using the held-out split as validation data.
# The History object returned by fit() is the cell's last expression, so the notebook displays its repr.
model.fit(train_padded, training_label_seq, epochs=10, validation_data=(validation_padded, validation_label_seq))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa596477ef0>

Predicting the category of a sample text

In [None]:
# Classify one unseen snippet with the trained model.
txt = ["A WeWork shareholder has taken the company to court over the near-$1.7bn (£1.3bn) leaving package approved for ousted co-founder Adam Neumann."]
seq = tokenizer.texts_to_sequences(txt)
# Pad/truncate exactly like the training data; without these arguments
# pad_sequences falls back to its 'pre' defaults, which the model never saw.
padded = pad_sequences(seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
pred = model.predict(padded)
# The model was trained on the ids assigned by `label_tokenizer`, which start at 1
# (slot 0 is unused). Translate the winning id back through that tokenizer rather
# than indexing a hand-written 0-based list, which is off by one, can raise
# IndexError for id 5, and would overwrite the dataset's `labels` variable.
predicted_id = int(np.argmax(pred[0]))
print(label_tokenizer.index_word.get(predicted_id, '<unknown>'))

politics
