# ARD - project: RNN

Author: Brenda Lesniczakova, LES0045 <br>
Dataset: BBC articles

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re # regular expression
from nltk.corpus import stopwords
from nltk.stem import wordnet
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

from tensorflow import string as tf_string
from tensorflow import keras
from keras.models import Model
from keras.layers.experimental.preprocessing import TextVectorization
from keras.layers import LSTM, Bidirectional, Input, Embedding, Dropout, Dense
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# BBC articles

In [None]:
# List every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the BBC articles CSV into a DataFrame and preview its first rows.
data = pd.read_csv('/kaggle/input/bbc-fulltext-and-category/bbc-text.csv')
data.head()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
data.info()

# Categories

In [None]:
# Number of articles in each category.
data['category'].value_counts()

In [None]:
# Bar chart with the number of articles in each category.
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(111)
# Name the column through keywords: passing the values positionally is
# deprecated in seaborn 0.11 and is no longer interpreted as `x` from 0.12 on.
# The axes are passed explicitly instead of relying on the "current axes".
sns.countplot(x='category', data=data, ax=ax)
plt.xlabel('Category', size=15)
plt.ylabel('Count', size=15)
plt.xticks(size=12)
plt.title("Count of articles by categories", size=18)
plt.show()

In [None]:
# Encode the category names as integer labels and keep a small lookup table
# (one row per category, ordered by category name) for later reporting.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['category'])
class_names = (
    data[['category', 'label']]
    .drop_duplicates()
    .sort_values(['category', 'label'])
    .reset_index(drop=True)
)
class_names

# Cleaning data

In [None]:
# Show the raw text of the first article before any cleaning.
print(data.text[0])

In [None]:
# Text normalisation: keep letters only, lowercase, trim, drop English stop
# words and finally lemmatize every remaining token as a verb.
data['clean_txt'] = data['text'].apply(
    lambda x: re.sub(r'[^A-Za-z]+', ' ', x).lower().strip())

stop_words = stopwords.words('english')
# Membership is tested once per token of every article, so use a set (O(1)
# lookups) instead of scanning the stop-word list each time.
stop_word_set = set(stop_words)
data['clean_txt'] = data['clean_txt'].apply(lambda x: ' '.join(
    word for word in x.split() if word not in stop_word_set))

lem = wordnet.WordNetLemmatizer()
data['clean_txt'] = data['clean_txt'].apply(lambda x: ' '.join(
    lem.lemmatize(item, pos='v') for item in x.split()))

In [None]:
# Show the first article again, this time after cleaning.
print(data.clean_txt[0])

In [None]:
# Preview the DataFrame, which now also holds the `label` and `clean_txt` columns.
data.head()

# Vocabulary

In [None]:
# Count how often every word occurs across all cleaned articles.
from collections import Counter

word_freq = Counter()
for txt in data.clean_txt:
    # Keys must be the word strings themselves. Keying by `word.index` (the
    # bound `str.index` method) never matches an existing entry, which inflates
    # the number of "unique" words and therefore the vocabulary size.
    word_freq.update(txt.split())
print('Count of unique words:', len(word_freq))

In [None]:
# Settings shared by the vectorisation layer and the embedding of the first model.
vocab_size = len(word_freq)
sequence_length = 64
embedding_dim = 128

# Layer that maps raw strings to integer token ids, padded/truncated to
# `sequence_length`; its vocabulary is learned from the cleaned articles.
vect_layer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)
vect_layer.adapt(data.clean_txt.values)

vocabulary = vect_layer.get_vocabulary()
print('Vocabulary example: ', vocabulary[:10])
print('Vocabulary shape: ', len(vocabulary))

# Splitting the data into training, validation and testing parts

In [None]:
# Features are the cleaned texts, targets the integer category labels.
X, y = data.clean_txt, data.label

# 80 % train / 20 % test, then 10 % of the training part is held out for
# validation; both splits are stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, random_state=13, stratify=y_train)

for split_name, split_X, split_y in (('Train:', X_train, y_train),
                                     ('Test:', X_test, y_test),
                                     ('Validation:', X_valid, y_valid)):
    print(split_name, split_X.shape, split_y.shape)

# One-hot encode the labels for the categorical cross-entropy loss.
y_train_vect = to_categorical(y_train)
y_valid_vect = to_categorical(y_valid)
print('\nEncoding labels example:')
for raw_label, one_hot in zip(list(y_train)[:5], y_train_vect[:5]):
    print('  ', raw_label, '  ', one_hot)

# Model

In [None]:
def show_history(history):
    """Plot every recorded training metric against the epoch number.

    Args:
        history: object with a ``history`` dict (metric name -> per-epoch
            values) and an ``epoch`` list, such as the value returned by
            ``model.fit`` in this notebook.
    """
    plt.figure()
    for metric_name, values in history.history.items():
        plt.plot(history.epoch, values, label=metric_name)
    plt.legend()
    plt.tight_layout()

In [None]:
# Model 1: raw strings -> TextVectorization -> trainable embedding ->
# two stacked bidirectional LSTMs -> dense classifier over 5 outputs.
input_layer = Input(shape=(1,), dtype=tf_string)
x_v = vect_layer(input_layer)
emb = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(x_v)

# The first recurrent layer returns the whole sequence so the second can consume it.
x = Bidirectional(LSTM(units=128, return_sequences=True))(emb)
x = Dropout(rate=0.5)(x)
x = Bidirectional(LSTM(units=64))(x)
x = Dropout(rate=0.5)(x)

x = Dense(units=64, activation='relu')(x)
output_layer = Dense(units=5, activation='softmax')(x)

model = Model(inputs=input_layer, outputs=output_layer)
model.summary()
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the first model and keep the per-epoch metrics in `history`.
batch_size = 128
epochs = 50
# `patience` has to be smaller than `epochs`: with a patience larger than the
# epoch budget (e.g. 70 vs. 50) the callback can never stop training early.
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, restore_best_weights=True)
history = model.fit(X_train.values, y_train_vect, validation_data=(X_valid.values, y_valid_vect),
                    callbacks=[es], epochs=epochs, batch_size=batch_size)

In [None]:
# Plot the training curves of the first model.
show_history(history)

# Classification report

In [None]:
def class_report(y_test, y_pred_vect):
    """Print classification metrics and draw the confusion matrix.

    Args:
        y_test: true integer labels; must expose ``.values`` and ``.size``
            (a pandas Series in this notebook).
        y_pred_vect: per-class scores, one row per sample; the predicted
            label is the arg-max of each row.

    The axis labels of the confusion matrix come from the module-level
    ``class_names`` table.
    """
    predicted = np.argmax(y_pred_vect, axis=1)

    # Text metrics: manual accuracy, sklearn accuracy, macro F1, per-class report.
    n_correct = np.sum(predicted == y_test.values)
    print('Test accuracy:', n_correct / y_test.size)
    print('Accuracy score: ', accuracy_score(y_test, predicted))
    print('F1 score: ', f1_score(y_test, predicted, average='macro'), '\n')
    print(classification_report(y_true=y_test, y_pred=predicted))

    # Confusion matrix as a heat map labelled with the category names.
    category_labels = class_names.category
    matrix = pd.DataFrame(confusion_matrix(y_test, predicted),
                          index=category_labels, columns=category_labels)
    plt.figure(figsize=(12, 5))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted label', size=15)
    plt.ylabel('True label', size=15)
    plt.title('Confusion matrix', size=15)
    plt.show()

In [None]:
# Evaluate the first model on the held-out test set. The texts are passed as a
# NumPy array of strings, the same input type that was used for `model.fit`.
class_report(y_test, model.predict(X_test.values))

# Embedding file: GloVe Dictionary
File **glove.840B.300d.pkl** was imported from https://www.kaggle.com/authman/pickled-glove840b300d-for-10sec-loading

In [None]:
# Load the pre-pickled GloVe table; allow_pickle=True is required because the
# file holds a pickled Python object rather than a plain array.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted source.
glove_embeddings = np.load('../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl',
                           allow_pickle=True)
# Vector length is read from the entry for 'the'
# (presumably a dict-like mapping word -> vector - TODO confirm).
embedding_dim = len(glove_embeddings['the'])
print("There are", len(glove_embeddings), "words and", embedding_dim, "dimensions in Glove Dictionary.")

In [None]:
# Fit a Keras tokenizer on the cleaned articles; words not seen during fitting
# are mapped to the "<OOV>" token.
tokenizer_keras = Tokenizer(oov_token = "<OOV>")
tokenizer_keras.fit_on_texts(data.clean_txt)
# word -> integer index mapping learned by the tokenizer
word_index = tokenizer_keras.word_index
vocab_size_token = len(word_index)
print('Vocabulary shape:', vocab_size_token)
# Peek at the first ten (word, index) pairs.
list(word_index.items())[:10]

In [None]:
# Build the embedding matrix: row `idx` holds the GloVe vector of the word with
# tokenizer index `idx`; row 0 and words missing from GloVe stay all-zero.
embedding_mtx = np.zeros((vocab_size_token + 1, embedding_dim))
for word, idx in word_index.items():
    if word in glove_embeddings:
        embedding_mtx[idx] = glove_embeddings[word]

# For inspection only: join every word with its vector in a single DataFrame.
tokenized = pd.DataFrame(list(word_index.items()), columns=['words', 'index'])
temp_mtx = pd.DataFrame(embedding_mtx).reset_index().iloc[1:]  # skip row 0
df_embedding_mtx = tokenized.merge(temp_mtx, on='index')
df_embedding_mtx

In [None]:
def prepare_data(X, tokenizer, max_len):
    """Convert texts to integer sequences of exactly ``max_len`` tokens.

    Args:
        X: iterable of texts accepted by ``tokenizer.texts_to_sequences``.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        max_len: target sequence length.

    Returns:
        The result of ``pad_sequences``: shorter sequences are padded at the
        end and longer ones are truncated at the end.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(X),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )

In [None]:
# Tokenize and pad all three splits to the same fixed length.
max_len = 512
X_train_vect, X_valid_vect, X_test_vect = (
    prepare_data(split, tokenizer_keras, max_len)
    for split in (X_train, X_valid, X_test)
)

# Model with embedding file

In [None]:
# Model 2: padded token ids -> frozen embedding initialised from the GloVe
# matrix -> same recurrent/dense stack as the first model.
input_layer = Input(shape=(max_len,))
emb = Embedding(
    input_dim=vocab_size_token + 1,
    output_dim=embedding_dim,
    weights=[embedding_mtx],
    trainable=False,  # keep the pre-trained vectors fixed during training
)(input_layer)

x = Bidirectional(LSTM(units=128, return_sequences=True))(emb)
x = Dropout(rate=0.5)(x)
x = Bidirectional(LSTM(units=64))(x)
x = Dropout(rate=0.5)(x)

x = Dense(units=64, activation='relu')(x)
output_layer = Dense(units=5, activation='softmax')(x)

model_glove = Model(inputs=input_layer, outputs=output_layer)
model_glove.summary()
model_glove.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the GloVe-based model and keep the per-epoch metrics in `history`.
batch_size = 128
epochs = 50
# `patience` has to be smaller than `epochs`: with a patience larger than the
# epoch budget (e.g. 70 vs. 50) the callback can never stop training early.
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, restore_best_weights=True)
history = model_glove.fit(X_train_vect, y_train_vect, validation_data=(X_valid_vect, y_valid_vect),
                          callbacks=[es], epochs=epochs, batch_size=batch_size)

In [None]:
# Plot the training curves of the GloVe-based model.
show_history(history)

In [None]:
# Evaluate the GloVe-based model on the tokenized and padded test set.
class_report(y_test, model_glove.predict(X_test_vect))