### Multiclass Text Classification using LSTM

In [1]:
import pandas as pd
import numpy as np

In [2]:
news_data = pd.read_csv('https://raw.githubusercontent.com/susanli2016/PyCon-Canada-2019-NLP-Tutorial/master/bbc-text.csv')

In [3]:
news_data.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
news_data.category.value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

#### Check count of each category

In [5]:
#pip install plotly

In [6]:
# import plotly.express as px
# import plotly
# plotly.offline.init_notebook_mode(connected=True)

# df = px.data.tips()
# # fig = px.histogram(df, x="sex", y="tip", histfunc="avg", color="smoker", barmode="group",
# #              facet_row="time", facet_col="day", category_orders={"day": ["Thur", "Fri", "Sat", "Sun"],
# #                                                                 "time": ["Lunch", "Dinner"]})
# fig = px.bar(news_data )
# fig.show()

In [7]:
import spacy
from nltk.corpus import stopwords

In [8]:
stopwords = stopwords.words('english')

In [9]:
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import string

# Remove HTML Tags
def remove_html(text):
    soup = BeautifulSoup(text,'lxml')
    html_free_text = soup.get_text()
    return html_free_text

# Remove Punctuations
def punctuation_remover(text):
    punctuation_free_text = " ".join([word for word in text if word \
                                    not in string.punctuation])
    return punctuation_free_text


# Stop Word Removal
# cached_stop_words = stopwords.words('english') # Provides 70 X Speedup
def stop_words_remover(text):
#     text = text.lower().split()
    words = [word for word in text if \
             word not in stopwords]
    return words

# Convert to lower case
def convert_to_lowercase(tokens):
    low = []
    for tok in tokens:
        low.append(tok.lower().strip())
    return low

# # Lemmatization
# def lemmatize_words(text):
#     words = nlp(str(text))
#     return [word.lemma_ for word in words if word.lemma_ != '-PRON-']  

# def replace_urls(tokens):
#     re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", tokens)
#     return tokens


# def remove_short_strings(text, length):
#     array = []
#     for word in text:
#         if len(word) > length:
#             array.append(word)
#     return array

In [10]:
def clean_text_for_tasks(text, for_pos_tagging = False):
    cleaned_text = remove_html(text)
#     cleaned_text = replace_urls(cleaned_text)
    cleaned_text = word_tokenize(cleaned_text)
    cleaned_text = stop_words_remover(cleaned_text)
    
#     if for_pos_tagging is False:
#         cleaned_text = convert_to_lowercase(cleaned_text)
#         cleaned_text = stop_words_remover(cleaned_text)
#         cleaned_text = lemmatize_words(cleaned_text)
#     cleaned_text = remove_short_strings(cleaned_text,2)
    cleaned_text = punctuation_remover(cleaned_text)
    
    return cleaned_text

In [11]:
from tqdm.notebook import tqdm_notebook

tqdm_notebook.pandas()
# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=True)

news_data['cleaned_text'] = news_data.text.progress_apply(clean_text_for_tasks)
# tokenized_documents['cleaned_text'] = tokenized_documents.text.parallel_apply(clean_text_for_tasks,axis = 1 )

  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, max=2225.0), HTML(value='')))




In [12]:
news_data.head()

Unnamed: 0,category,text,cleaned_text
0,tech,tv future in the hands of viewers with home th...,tv future hands viewers home theatre systems p...
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldcom...
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary farrell gamble leicester say rushe...
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle fa cup premiership side...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean twelve raids box office ocean twelve cri...


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train,X_valid,y_train,y_valid = train_test_split(news_data['cleaned_text'],news_data['category'],test_size = 0.3)

In [15]:
import tensorflow as tf

In [16]:
vocab_size = 5000 # Choose 5000 top words in the Vocabulary
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

In [17]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = vocab_size,
                                                  oov_token=oov_tok
                                                 )
tokenizer.fit_on_texts(X_train)

In [18]:
word_index = tokenizer.word_index

In [19]:
dict(list(word_index.items())[:15])

{'<OOV>': 1,
 'said': 2,
 'mr': 3,
 'would': 4,
 'year': 5,
 'also': 6,
 'us': 7,
 'new': 8,
 'people': 9,
 'one': 10,
 'could': 11,
 'last': 12,
 'first': 13,
 'time': 14,
 'two': 15}

In [20]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)

In [21]:
print(X_train_sequences[10])

[1, 1390, 903, 611, 1, 1, 49, 2020, 24, 113, 132, 1610, 2598, 1, 2776, 1013, 2020, 1, 16, 36, 1167, 545, 4732, 903, 611, 5, 466, 174, 94, 1, 703, 1, 647, 1, 209, 1, 4059, 1, 1, 403, 3859, 2074, 1, 1, 128, 1, 1, 1036, 5, 110, 1, 2, 334, 1, 3860, 105, 4245, 2777, 1, 1, 762, 241, 520, 39, 1432, 168, 264, 2, 24, 140, 667, 4, 616, 1432, 648, 174, 6, 762, 241, 718]


#### We will add padding to the input sequences so as to make all inputs of the same size

In [22]:
X_train_padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(X_train_sequences,
                                                                      maxlen=max_length,
                                                                      truncating=trunc_type,
                                                                      padding=padding_type)

In [23]:
print(f"Length of input before padding {len(X_train_sequences[1])}")

Length of input before padding 492


In [24]:
print(f"Length of input after padding {len(X_train_padded_inputs[1])}")

Length of input after padding 200


In [25]:
X_valid_sequences = tokenizer.texts_to_sequences(X_valid)
X_valid_padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(X_valid_sequences,
                                                                      maxlen = max_length,
                                                                      truncating=trunc_type,
                                                                      padding=padding_type)

In [26]:
X_valid_padded_inputs.shape

(668, 200)

#### Now all input sequences are of the same length

In [27]:
#----------------------------------

In [28]:
#----------------------------------

#### Let's tokenize the labels

In [29]:
label_tokenizer = tf.keras.preprocessing.text.Tokenizer()

In [30]:
label_tokenizer.fit_on_texts(news_data.category)

In [31]:
y_train_sequences = np.array(label_tokenizer.texts_to_sequences(y_train))
y_valid_sequences = np.array(label_tokenizer.texts_to_sequences(y_valid))


In [32]:
np.unique(y_train_sequences)

array([1, 2, 3, 4, 5])

In [33]:
np.unique(y_valid_sequences)

array([1, 2, 3, 4, 5])

#### So the 5 textual categories have been converted into numeric ones

In [34]:
reverse_word_index = dict([(value,key) for (key,value) in word_index.items()])
def decode_article(text):
    return ' '.join([reverse_word_index.get(i,'?') for i in text])

print(decode_article(X_train_padded_inputs[10]))

<OOV> suspended drugs test <OOV> <OOV> says banned three months international tennis federation <OOV> testing positive banned <OOV> world number 60 failed routine drugs test year french open plans <OOV> appeal <OOV> believes <OOV> given <OOV> doctor <OOV> <OOV> injury blame producing <OOV> <OOV> system <OOV> <OOV> 27 year old <OOV> said statement <OOV> defeated britain greg rusedski <OOV> <OOV> davis cup september set miss start season said three month ban would mean miss australian open also davis cup australia ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?


#### Many words have been replaced by OOV as they do not constitute the top 5000 words of the Vocabulary

#### Initializing Tensorboard

In [35]:
%load_ext tensorboard

In [36]:
import os
logs_base_dir = "logs"
os.makedirs(logs_base_dir, exist_ok=True)
%tensorboard --logdir {logs_base_dir}

Reusing TensorBoard on port 6006 (pid 25328), started 0:03:02 ago. (Use '!kill 25328' to kill it.)

#### Model

In [37]:
model = tf.keras.Sequential([
    # Embedding Layer 
    tf.keras.layers.Embedding(input_dim=vocab_size,output_dim=embedding_dim),
    # Bidiretional LSTM for learning Long term dependencies
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    # Dense Layer with RELU
    tf.keras.layers.Dense(embedding_dim,activation='relu'),
    # Ouput layer with 6 units beacuse label Tokenizer starts with 1 but 
    # sparse_categorical_crossentropy loss function thinks 0 as a possible 
    # label as well so we have to give labels [0,1,2,3,4,5] even though 
    # 0 is not used
    tf.keras.layers.Dense(6,activation='softmax')
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          320000    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 390       
Total params: 394,694
Trainable params: 394,694
Non-trainable params: 0
_________________________________________________________________


In [38]:
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy']
             )

In [39]:
import datetime
log_dir = "logs\\" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1)

In [40]:
try:
    history = model.fit(X_train_padded_inputs, y_train_sequences,
                        epochs=10, batch_size=64,
                        validation_data = (X_valid_padded_inputs,y_valid_sequences),
                        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=10),
                                   tensorboard_callback],
                        verbose=1)

except KeyboardInterrupt:
    model.save('multiclass_classification_model.h5')
    print('Model Saved because of user input')   

Train on 1557 samples, validate on 668 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [71]:
test_sentence = """Fifa tournament 'a great idea'
Along with keeping fit so he is ready if football resumes, Jota has been busy competing in the ePremier League Invitational Fifa 20 tournament.

He beat Alexander-Arnold with a golden goal in Saturday's final, a match the Portugal international described as “very tense”.

Jota says his Fifa success is partly down to using set-pieces he would play in real life on the video game and hailed the tournament as “a great idea”.

“Trent Alexander-Arnold was a very good competitor,” he added.

“I needed to be very focussed because one mistake would cost me the tournament.

“We were playing for a good cause, to help the NHS, and give the fans something to watch. It’s sad for everyone this period we are living and we need to keep the fans busy.

"This was a great idea. To be crowned champion makes everything even better."""

In [72]:
test_sentence_cleaned = clean_text_for_tasks(test_sentence)

In [73]:
test_sentence_cleaned

"Fifa tournament great idea' Along keeping fit ready football resumes Jota busy competing ePremier League Invitational Fifa 20 tournament He beat Alexander-Arnold golden goal Saturday 's final match Portugal international described “ tense ” Jota says Fifa success partly using set-pieces would play real life video game hailed tournament “ great idea ” “ Trent Alexander-Arnold good competitor ” added “ I needed focussed one mistake would cost tournament “ We playing good cause help NHS give fans something watch It ’ sad everyone period living need keep fans busy `` This great idea To crowned champion makes everything even better"

In [74]:
test_sentence_sequence = tokenizer.texts_to_sequences(test_sentence_cleaned)

In [76]:
import itertools

def flatten_text_sequence(text):
    flatten = itertools.chain.from_iterable
    text = list(flatten(text))
    return text

test_sentence_sequence = flatten_text_sequence(test_sentence_sequence)

print(test_sentence_sequence)

[3112, 1, 3112, 2464, 1488, 1, 1, 1119, 2011, 2464, 1, 239, 2011, 1488, 1008, 1119, 239, 2464, 1488, 1, 1, 239, 2464, 1, 2464, 1873, 1, 2011, 1008, 3332, 239, 239, 1950, 1, 2011, 1008, 3112, 1, 1488, 1119, 239, 2464, 1, 1, 3112, 1, 1, 1488, 1140, 2464, 1873, 1873, 1119, 239, 4560, 1, 1, 239, 4560, 975, 1, 1488, 2464, 1140, 1, 4560, 1, 1863, 1, 1, 1950, 239, 1488, 1, 2011, 1008, 239, 1950, 1119, 239, 1, 1, 239, 1119, 1873, 239, 2464, 1008, 1, 239, 1, 2011, 943, 1, 1488, 2464, 1488, 1, 1, 2011, 2464, 1873, 3112, 1, 3112, 2464, 44, 287, 1488, 1, 1, 1119, 2011, 2464, 1, 239, 2011, 1488, 3689, 239, 1140, 239, 2464, 1488, 2464, 1873, 239, 2708, 2464, 2011, 1, 239, 1119, 2464, 1119, 2011, 1, 1873, 1, 1008, 1, 1873, 1, 239, 2011, 1008, 1, 2464, 1873, 4560, 2464, 1488, 1, 1119, 1, 2464, 1, 1, 4560, 3112, 1, 2011, 2464, 1873, 1, 2464, 1488, 1863, 3689, 1950, 1, 1119, 1488, 1, 1008, 2464, 1873, 1, 2011, 1488, 239, 1119, 2011, 2464, 1488, 1, 1, 2011, 2464, 1873, 1, 239, 4560, 1863, 1119, 1, 1140, 

In [77]:
test_sentence_padded = tf.keras.preprocessing.sequence.pad_sequences([test_sentence_sequence],
                                                                      maxlen=max_length,
                                                                      truncating=trunc_type,
                                                                      padding=padding_type)

In [78]:
test_sentence_padded

array([[3112,    1, 3112, 2464, 1488,    1,    1, 1119, 2011, 2464,    1,
         239, 2011, 1488, 1008, 1119,  239, 2464, 1488,    1,    1,  239,
        2464,    1, 2464, 1873,    1, 2011, 1008, 3332,  239,  239, 1950,
           1, 2011, 1008, 3112,    1, 1488, 1119,  239, 2464,    1,    1,
        3112,    1,    1, 1488, 1140, 2464, 1873, 1873, 1119,  239, 4560,
           1,    1,  239, 4560,  975,    1, 1488, 2464, 1140,    1, 4560,
           1, 1863,    1,    1, 1950,  239, 1488,    1, 2011, 1008,  239,
        1950, 1119,  239,    1,    1,  239, 1119, 1873,  239, 2464, 1008,
           1,  239,    1, 2011,  943,    1, 1488, 2464, 1488,    1,    1,
        2011, 2464, 1873, 3112,    1, 3112, 2464,   44,  287, 1488,    1,
           1, 1119, 2011, 2464,    1,  239, 2011, 1488, 3689,  239, 1140,
         239, 2464, 1488, 2464, 1873,  239, 2708, 2464, 2011,    1,  239,
        1119, 2464, 1119, 2011,    1, 1873,    1, 1008,    1, 1873,    1,
         239, 2011, 1008,    1, 2464, 

In [79]:
model.predict_classes(test_sentence_padded)

array([5], dtype=int64)

In [80]:
label_tokenizer.word_index

{'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}

In [81]:
test_sentence2 = """For decades, the West, led by US strategic thinking, bet that full-on engagement with Beijing would alter the opaque nature of Chinese politics, making it more liberal and open. The onset of the Covid-19 pandemic should ensure a quick burial to this belief. The free and open liberal world order has run into the great political wall of China with deleterious consequences. Not only did the intense engagement with China fail to alter its politics, but many liberal democracies have also adopted Chinese-style industrial planning policies. The irony of today’s geopolitical moment is that Western taxpayers underwrote China’s bid for global influence. Successive US administrations, egged on by Big Business and Big Finance, played a crucial role in bringing China into the global community, culminating in Bill Clinton’s decision to welcome China into the WorldTrade Organisation (WTO) system.
"""

In [82]:
test_sentence_cleaned = clean_text_for_tasks(test_sentence2)

In [83]:
test_sentence_sequence = tokenizer.texts_to_sequences(test_sentence_cleaned)

In [84]:
test_sentence_sequence = flatten_text_sequence(test_sentence_sequence)

In [85]:
test_sentence_padded = tf.keras.preprocessing.sequence.pad_sequences([test_sentence_sequence],
                                                                      maxlen=max_length,
                                                                      truncating=trunc_type,
                                                                      padding=padding_type)

In [86]:
model.predict_classes(test_sentence_padded)

array([3], dtype=int64)

#### The Model Correctly predicted the label for a news related to Politics but it labelled a News Related to FIFA as entertainment.