In [None]:
# Data Manipulation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Basket Analysis
from mlxtend.frequent_patterns import apriori, association_rules

# Deep Learning
import tensorflow as tf
import torch
import torch.nn as nn 
from torch.utils.data import DataLoader
import torch.nn.functional as F
from sklearn import metrics
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
# Pretrained 'bert-base-uncased' tokenizer, created once at import time; the BERT
# helper `processing_data` further down calls `tokenizer.encode(...)` on this object.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
from tqdm import tqdm

# Text
import re
import nltk
from nltk.corpus import stopwords
# Download the NLTK stopword corpus so that `stopwords.words` below can read it.
nltk.download('stopwords')
# English stopword list; `clean_text` filters tokens against it.
stopw = stopwords.words('english')

In [None]:
# Both research-article files live in the same Kaggle input folder.
research_dir = '../input/topic-modeling-for-research-articles-20/'
df = pd.read_csv(research_dir + 'Train.csv')
df_test = pd.read_csv(research_dir + 'Test.csv')
# Peek at the first training rows.
df.head()

Each text has more than one topic. For example, one text can be about computer science, statistics and machine learning at the same time. 

Let's find out the total number of topics for each text with the sum function.

In [None]:
# Count how many topics are flagged on each abstract by summing the topic
# indicator columns (everything after the first two columns, as before).
# Derived columns are excluded explicitly so that re-running this cell does not
# add 'no_topics' (or the later 'category' column) into the sum.
topic_columns = [col for col in df.columns[2:] if col not in ('no_topics', 'category')]
df['no_topics'] = df[topic_columns].sum(axis = 1)
df.sort_values(by = 'no_topics', ascending = False).head()

In [None]:
df['no_topics'].value_counts()

* As you can see, the highest total number of topics is 7. In fact, every text has more than one topic.
* I picked one of the texts that has 7 different topics. You can see the whole text and its topics below.

In [None]:
df.loc[12706,'ABSTRACT' ]

In [None]:
df.loc[df['id'] == 11080].T

# Basket Analysis
* I want to reduce the number of topics by grouping related topics together.
* Basically, I want to create a main category for each text so that I can classify them.
* To find the relationships between topics, I used basket analysis.

In [None]:
# Frequent itemsets over the topic indicator columns: one "basket" is the set of
# topics flagged on a single abstract.  mlxtend expects a boolean one-hot frame
# (non-bool input is deprecated and slower), so the 0/1 flags are cast first.
topic_flags = df.iloc[:, 2:31].astype(bool)
frequency_result = (
    apriori(topic_flags, min_support = 0.01, use_colnames = True)
    .sort_values('support', ascending = False)
    .reset_index(drop = True)
)
# Keep only rules whose lift is at least 1.
association = association_rules(frequency_result, metric = 'lift', min_threshold = 1)

In [None]:
#print(association.sort_values(by = ['confidence','support'], ascending = False).to_string())

In [None]:
association.sort_values(by = ['confidence','support'], ascending = False).head()

**Result of Basket Analysis**


* We can see the most related categories in the table.
* I looked at the confidence value to find the relationships.
* For example, the fourth row shows that Strongly Correlated Electrons and Physics appear together in 6.3% of the whole dataset. Based on the confidence figure, Strongly Correlated Electrons and Physics are highly related.
* Hence, we can say that Strongly Correlated Electrons is a subcategory of Physics.

In [None]:
df.columns

* Based on confidence figure, I have decided to shrink the categories into 3 main categories.
* Machine Learning and AI -> Statistics, Statistics Theory, Machine Learning, Artificial Intelligence, Computer Vision and Pattern Recognition
* Physics -> Physics, Superconductivity, Strongly Correlated Electrons,  Astrophysics of Galaxies, Cosmology and Nongalactic Astrophysics, Earth and Planetary Astrophysics, Fluid Dynamics, Instrumentation and Methods for Astrophysics, Materials Science
* Mathematics -> Mathematics, Number Theory, Analysis of PDEs, Differential Geometry, Optimization and Control, Representation Theory

In [None]:
df_test.head()

In [None]:
# Topic columns that roll up into each main category (see the list above).
ML_AI_TOPICS = ['Statistics', 'Statistics Theory', 'Machine Learning', 'Artificial Intelligence',
                'Computer Vision and Pattern Recognition']
PHYSICS_TOPICS = ['Physics', 'Superconductivity', 'Strongly Correlated Electrons', 'Astrophysics of Galaxies',
                  'Cosmology and Nongalactic Astrophysics', 'Earth and Planetary Astrophysics', 'Fluid Dynamics',
                  'Instrumentation and Methods for Astrophysics', 'Materials Science']
MATH_TOPICS = ['Mathematics', 'Number Theory', 'Analysis of PDEs', 'Differential Geometry',
               'Optimization and Control', 'Representation Theory']
MAIN_CATEGORIES = ['Machine Learning and AI', 'Physics', 'Mathematics']


def _has_any_topic(frame, topics):
    """Return a boolean Series that is True for rows where any of `topics` equals 1."""
    return frame[topics].eq(1).any(axis = 1)


# np.select takes the first matching condition, so the priority order is the
# same as in the original if/else chain: ML & AI, then Physics, then Mathematics;
# rows matching none of them become 'other'.
df['category'] = np.select(
    [_has_any_topic(df, ML_AI_TOPICS), _has_any_topic(df, PHYSICS_TOPICS), _has_any_topic(df, MATH_TOPICS)],
    MAIN_CATEGORIES,
    default = 'other',
)

# The test frame is categorised from the top-level topic columns only.
df_test['category'] = np.select(
    [_has_any_topic(df_test, ['Statistics', 'Computer Science']),
     _has_any_topic(df_test, ['Physics']),
     _has_any_topic(df_test, ['Mathematics'])],
    MAIN_CATEGORIES,
    default = 'other',
)

In [None]:
df.head()

In [None]:
df['category'].value_counts()

* As we can see, almost half of the texts are about machine learning and AI. 
* I dropped the other category for train dataset. So, we will have three main categories.

In [None]:
def _to_text_frame(frame):
    """Keep the abstract and category of rows with a main category; rename ABSTRACT to text."""
    has_main_category = frame['category'] != 'other'
    return frame.loc[has_main_category, ['ABSTRACT', 'category']].rename(columns = {'ABSTRACT' : 'text'})


# Same reduction for the training and the test frame.
df_train = _to_text_frame(df)
df_test = _to_text_frame(df_test)

In [None]:
df_train.head()

# The total number of words 

In [None]:
def _word_count(text):
    """Number of whitespace-separated words in `text`."""
    return len(text.split())


df_train['len_text'] = df_train['text'].map(_word_count)
df_test['len_text'] = df_test['text'].map(_word_count)

In [None]:
df_train.head()

In [None]:
len(df_train)

In [None]:
sns.set(font_scale=1.2)
sns.set_style("white")
# Two stacked panels sharing the x axis: a thin box plot above a histogram.
fig, ax = plt.subplots(2,sharex = True, figsize = (8,8), gridspec_kw={"height_ratios": (.2, .85)})
box_ax, hist_ax = ax
# Pass the data as `x=` explicitly: the meaning of the first positional argument
# of sns.boxplot differs between seaborn versions, and the box must stay
# horizontal to line up with the histogram's x axis.
sns.boxplot(x = df_train['len_text'], ax = box_ax)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot once the minimum seaborn version is confirmed.
sns.distplot(df_train['len_text'], ax = hist_ax, kde =False)
box_ax.set(xlabel = '', yticks = [])
box_ax.set_title('The Distibution of Length of Texts', fontsize = 20)
hist_ax.set_xlabel('Length of Text', fontsize = 14)
hist_ax.set_ylabel('Count', fontsize = 14)
# One call is enough: left=True removes the left spine in addition to the
# default top/right spines (the earlier duplicate call was redundant).
sns.despine(ax = box_ax, left = True)
plt.subplots_adjust(hspace = .05)
plt.show()

* As you can see from the chart above, most of the text lengths are between 0 and 300. There are 3 outliers in the texts.

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit the category -> integer mapping on the training data only and reuse that
# same mapping for the test data.  Re-fitting on the test frame (as before)
# could assign different integers to the same category if the test set lacks a
# class; `transform` instead raises on a category never seen in training.
enc = LabelEncoder()
df_train['label'] = enc.fit_transform(df_train['category'])
df_test['label'] = enc.transform(df_test['category'])

In [None]:
df_train['category'].value_counts()

In [None]:
df_train.head()

In [None]:
fig, ax = plt.subplots(figsize = (8,8))
category_names = df_train['category'].unique()
# Overlay one histogram per category on the same axes, ignoring very long texts.
for name in category_names:
    in_category = (df_train['category'] == name) & (df_train['len_text'] < 350)
    sns.distplot(df_train.loc[in_category, 'len_text'], kde = False, ax = ax)

# Legend entries follow the order in which the histograms were drawn.
ax.legend(category_names)
ax.set_title('The Distribution of Length of Texts by Category', fontsize = 18)
ax.set_xlabel('Length of Texts')
sns.despine(left = True)
ax.set_ylabel('Count')
plt.show()

The texts in the Mathematics category are shorter than those in the other categories.

In [None]:
# Lookup table with one row per category and the integer label assigned to it.
category_encoding = df_train[['category', 'label']].groupby('category').min().reset_index()
category_encoding

* The table above shows how the category names are encoded as integer labels. BERT tokenization (used later) produces three vectors for each text: [input_ids, token_type_ids, attention_mask].

# **Cleaning the Text**
* It is better to get rid of punctuation and stopwords from the text.

In [None]:
def clean_text(text, stop_words = None):
    '''
    Strip punctuation/digits and stopwords from a text.

    Every character that is not an ASCII letter or an apostrophe is replaced by a
    space, the text is split on whitespace, stopwords are dropped, and the
    remaining words are joined with single spaces (original casing is kept).

    text       : the raw string to clean.
    stop_words : optional iterable of stopwords; defaults to the module-level
                 NLTK list `stopw`.

    Stopwords are matched case-insensitively, so capitalised occurrences such as
    "The" at the start of a sentence are removed as well (the previous exact
    comparison against the lower-case list let them through).
    '''
    if stop_words is None:
        stop_words = stopw
    # A set gives constant-time membership checks instead of scanning the list.
    stop_set = {word.lower() for word in stop_words}
    tokens = re.sub(r'[^a-zA-Z\']', ' ', text).split()
    # split() + join() already collapses runs of spaces, so no further
    # whitespace substitutions are needed.
    return ' '.join(token for token in tokens if token.lower() not in stop_set)

In [None]:
# Use the first remaining row by position: index label 0 is not guaranteed to
# exist in df_train because rows in the 'other' category were dropped.
sample_text = df_train['text'].iloc[0]
print("---- Text ----\n")
print(sample_text)
print("\n---- Cleaned Text ----\n")
print(clean_text(sample_text))

In [None]:
# Apply the cleaner to every abstract of both frames.
df_train['cleaned_text'] = df_train['text'].map(clean_text)
df_test['cleaned_text'] = df_test['text'].map(clean_text)

# N-Grams
* This function returns each unique word (unigram) or sequence of words (n-gram) together with its occurrence frequency.

In [None]:
def n_gram(df, category = '', n_gram = 1, text_column= 'cleaned_text'):
    '''
    Count how often every n-gram (sequence of `n_gram` consecutive words) occurs.

    df          : DataFrame holding the texts (and a 'category' column when
                  `category` is given).
    category    : if not '', only rows whose 'category' equals this value are used.
    n_gram      : number of words per sequence; must be >= 1.
    text_column : column with whitespace-separated text.

    Returns a DataFrame with two columns, '<category>_n_grams_<n>' and
    '<category>_counts_<n>', one row per distinct n-gram in first-seen order.
    Raises ValueError when `n_gram` is smaller than 1.
    '''
    if n_gram < 1:
        raise ValueError("n_gram must be a positive integer")

    if category != '':
        df = df[df['category'] == category]

    counts = {}
    for text in tqdm(df[text_column]):
        # Split once per text and slide a window over the words; only complete
        # windows are counted (previously the loop ran once per character and
        # re-split the text on every iteration).
        tokens = text.split()
        for start in range(len(tokens) - n_gram + 1):
            gram = ' '.join(tokens[start:start + n_gram])
            counts[gram] = counts.get(gram, 0) + 1

    return pd.DataFrame(
        list(counts.items()),
        columns = [category + '_n_grams_'+ str(n_gram), category+ '_counts_' + str(n_gram)],
    )

# Unigram, Bigram, and Trigram
* A new dataframe is created to show the top 30 word sequences for each category.

In [None]:
# Top-30 table of 1-, 2- and 3-grams for every category, keyed in creation order.
temp_result = {}
for category in df_train['category'].unique():
    for i in range(1,4):
        count_column = category + "_counts_"+ str(i)
        ranked = n_gram(df_train, category = category, n_gram = i, text_column = 'cleaned_text')
        ranked = ranked.sort_values(by = count_column, ascending = False)
        temp_result[category + "_", str(i)] = ranked.head(30).reset_index(drop =True)


# Put all tables side by side (two columns per table) in a single concat.
n_gram_result = pd.concat([pd.DataFrame({})] + list(temp_result.values()), axis = 1)

# Physics N-Gram

In [None]:
n_gram_result.iloc[:,0:6]

# Machine Learning and AI N-Gram

In [None]:
n_gram_result.iloc[:,6:12]

# Mathematics N-Gram

In [None]:
n_gram_result.iloc[:,12:18]

# Tensorflow with Glove Embeddings

* Glove provides vector representation for words. I will use Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors).
* It provides a 300-dimensional vector for each word. Every dimension represents a different feature of the word. For example, one could be the positivity of the word, another its negativity, and so on.
* We will find all the words in the dataset and create an embedding matrix for them.
* A bidirectional layer will be used in the neural network. A bidirectional layer is important for text data because it processes each text both as it is and in reverse order.
* **For example, "I like a bar that plays jazz music" - "I like a bar of white chocolate not small piece". As you can see first 3 words are same. If we don't train dataset with bidirectional layers, we cannot understand real meaning of the bar.**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# NOTE(review): allow_pickle=True un-pickles the file, which can execute arbitrary
# code on load — only use this with a file from a trusted source.
# The loaded object is used below as a word -> vector mapping
# (`glove_embeddings.keys()`, `glove_embeddings[word]`).
glove_embeddings = np.load('../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl', allow_pickle=True)

In [None]:
print("There are {} words and every word has {} dimensions in Glove Dictionary. I used the word 'sister' as an example.".format(len(glove_embeddings.keys()),len(glove_embeddings['sister']),))

# Uncovered words in the Glove Embeddings
* We will find the words that Glove doesn't include and proportion of uncovered words.

In [None]:
def _count_glove_coverage(texts):
    """Count word occurrences in `texts`, split into words GloVe has and words it lacks.

    Returns a `(covered, uncovered)` pair of dicts mapping word -> occurrence count.
    """
    vocabulary = glove_embeddings.keys()
    covered, uncovered = {}, {}
    for text in tqdm(texts):
        for word in text.split():
            bucket = covered if word in vocabulary else uncovered
            bucket[word] = bucket.get(word, 0) + 1
    return covered, uncovered


# Per-category counts.  The dicts are pre-seeded so the category order stays
# fixed regardless of the order in which categories appear in the data.
covered_words_by_category = {'Physics' : {},'Machine Learning and AI':{},'Mathematics' : {} }
uncovered_words_by_category = {'Physics' : {},'Machine Learning and AI':{},'Mathematics' : {} }
for category in df_train['category'].unique():
    # The mask is built from df_train itself (it used to come from `df`, a
    # different frame that only worked through index alignment).
    category_texts = df_train.loc[df_train['category'] == category, 'cleaned_text']
    covered_words_by_category[category], uncovered_words_by_category[category] = _count_glove_coverage(category_texts)

# Counts over the whole training set.
covered_words, uncovered_words = _count_glove_coverage(df_train['cleaned_text'])

In [None]:
def _coverage_message(scope, covered, uncovered):
    """Build the summary line for one scope.

    scope     : text inserted after "words in", e.g. "the whole dataset".
    covered   : word -> count mapping of words found in GloVe.
    uncovered : word -> count mapping of words missing from GloVe.
    The percentage is reported as 0.00 when there are no words at all.
    """
    total = len(uncovered) + len(covered)
    uncovered_share = len(uncovered) / total * 100 if total else 0.0
    return "---There are {} words in {}, and {:.2f}% of the words aren't covered by Glove---".format(total, scope, uncovered_share)


print(_coverage_message("the whole dataset", covered_words, uncovered_words))
for category in ['Physics', 'Machine Learning and AI', 'Mathematics']:
    print(_coverage_message("the {} category".format(category),
                            covered_words_by_category[category],
                            uncovered_words_by_category[category]))

* With GloVe embeddings, it is helpful to clean the text. GloVe doesn't cover about 25% of the words because these three topics use a very large vocabulary. If we don't clean the text, the proportion of uncovered words is around 70%.

In [None]:
def _as_word_table(counts_by_category):
    """Turn {category: {word: count}} into a frame with a 'words' column and one column per category."""
    table = pd.DataFrame(counts_by_category).reset_index()
    return table.rename(columns = {'index' :'words'})


covered_words_by_category = _as_word_table(covered_words_by_category)
uncovered_words_by_category = _as_word_table(uncovered_words_by_category)

**Physics uncovered words**

In [None]:
uncovered_words_by_category[['words', 'Physics']].sort_values(by = 'Physics', ascending = False).head(20)

**Machine Learning and AI uncovered words**

In [None]:
uncovered_words_by_category[['words', 'Machine Learning and AI']].sort_values(by = 'Machine Learning and AI', ascending = False).head(20)

**Mathematics uncovered words**

In [None]:
uncovered_words_by_category[['words', 'Mathematics']].sort_values(by = 'Mathematics', ascending = False).head(20)

# Embedding Matrix
**Creating embedding matrix for all the words that are in the whole dataset.**

In [None]:
# Word-level Keras tokenizer fitted on the raw training abstracts.
tokenizer_keras = Tokenizer(num_words = 36073, oov_token = '<OOV>' )
tokenizer_keras.fit_on_texts(df_train['text'])
word_index = tokenizer_keras.word_index
vocab_size_keras = len(word_index)
embedding_dim = 300
# One row per entry of word_index; rows of words missing from GloVe stay all-zero.
word_embeddings = np.zeros((vocab_size_keras , embedding_dim))

glove_vocabulary = glove_embeddings.keys()
for word, token_id in word_index.items():
    if word not in glove_vocabulary:
        continue
    # The vector of the word with token id `token_id` is stored in row `token_id - 1`.
    word_embeddings[token_id - 1] = glove_embeddings[word]

In [None]:
# word -> token id table.
word_tokens = pd.DataFrame(list(word_index.items()), columns = ['words', 'tokens'])
# Embedding rows with a 'tokens' key column; row r of the matrix belongs to token r + 1.
keras_word_embeddings = pd.DataFrame(word_embeddings)
keras_word_embeddings.insert(0, 'tokens', np.arange(1, len(keras_word_embeddings) + 1))

In [None]:
word_tokens.merge(keras_word_embeddings , how = 'left' , on = 'tokens')

In [None]:
def prepare_data(df, tokenizer, text_column = 'cleaned_text',label_column = 'labels', max_len = 256):
    '''
    Convert a text column into fixed-length token-id rows plus one-hot labels.

    df           : DataFrame containing `text_column` and `label_column`.
    tokenizer    : fitted tokenizer exposing `texts_to_sequences`.
    max_len      : number of tokens kept per text. Longer texts are cut at the end
                   (truncating = 'post'); shorter ones are padded at the end
                   (padding = 'post').
    Returns (padded, labels): the padded token matrix and the one-hot encoded labels.
    '''
    token_rows = pad_sequences(
        tokenizer.texts_to_sequences(df[text_column]),
        maxlen = max_len,
        padding = 'post',
        truncating = 'post',
    )
    one_hot_labels = tf.keras.utils.to_categorical(df[label_column])
    return token_rows, one_hot_labels

In [None]:
df_train.head()

In [None]:
df_train.head()

In [None]:
# Number of tokens kept per text; reused by the models further down.
max_len = 128
padded, labels = prepare_data(
    df = df_train,
    tokenizer = tokenizer_keras,
    text_column = 'cleaned_text',
    label_column = 'label',
    max_len = max_len,
)

**Creating the validation data**

In [None]:
# First 75% of the rows (in their current order) train the model, the rest validate it.
training_portion = 0.75
training_size = int(len(df_train) * training_portion)
padded_training, padded_val = padded[:training_size], padded[training_size:]
labels_training, labels_val = labels[:training_size], labels[training_size:]

In [None]:
len(word_embeddings)

In [None]:
padded_training[6]

In [None]:
# The Embedding layer looks up row `token_id`, the Keras tokenizer numbers words
# from 1, and pad_sequences pads with 0.  `word_embeddings` keeps the vector of
# token i in row i - 1, so feeding it in unchanged would give every word the
# vector of the next word in the vocabulary.  Prepending an all-zero row for the
# padding id restores the "row i <-> token i" alignment.
embedding_weights = np.vstack([np.zeros((1, embedding_dim)), word_embeddings])

model_glove = tf.keras.Sequential()
# Frozen GloVe vectors: the embedding weights are not updated during training.
model_glove.add(tf.keras.layers.Embedding(embedding_weights.shape[0], embedding_dim, input_length = max_len, weights = [embedding_weights], trainable = False))
# First recurrent layer returns the whole sequence so a second one can be stacked on it.
model_glove.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences = True)))
model_glove.add(tf.keras.layers.Dropout(0.5))
model_glove.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model_glove.add(tf.keras.layers.Dropout(0.5))
model_glove.add(tf.keras.layers.Dense(32, activation = 'relu'))
model_glove.add(tf.keras.layers.Dropout(0.5))
# One softmax output per main category.
model_glove.add(tf.keras.layers.Dense(3, activation = 'softmax'))

model_glove.compile(optimizer = 'Adam', loss= 'categorical_crossentropy', metrics = ['accuracy'])
model_glove.summary()

In [None]:
model_glove.fit(padded_training, labels_training, epochs = 10, verbose = 1, validation_data = (padded_val, labels_val))

**Testing Model with Glove**

In [None]:
# Encode the test texts with the same sequence length the model was trained on
# (the shared `max_len` rather than a second hard-coded 128).
test_data, test_label = prepare_data(df_test, tokenizer_keras, text_column = 'cleaned_text', label_column = 'label', max_len = max_len)
# Compare class indices: argmax of the one-hot truth vs. argmax of the softmax output.
output_label = np.argmax(test_label, axis =1 ).flatten()
predicted_label = np.argmax(model_glove.predict(test_data), axis = 1).flatten()
test_accuracy = np.sum(output_label == predicted_label) / len(output_label)
print("The test set includes {} texts and the accuracy is {:.2f}".format(len(output_label), test_accuracy * 100))

# Tensorflow with Bert

In [None]:
tf.keras.backend.clear_session()
import tensorflow_hub as hub
from sklearn.model_selection import StratifiedKFold

In [None]:
def processing_data( row, row_label,category_encoding = category_encoding ,max_len = max_len):
    
    '''
    Encode one text and its category name into the arrays BERT needs.

    row               : the text to tokenize with the module-level BERT `tokenizer`.
    row_label         : category name, looked up in `category_encoding`.
    category_encoding : frame with 'category' and 'label' columns.
    max_len           : target length; shorter encodings are right-padded with 0.

    Returns (input_ids, attention_masks, token_type_ids, label) as numpy arrays;
    the attention mask is 1 for real tokens and 0 for padding, the token type ids
    are all 0.
    '''
    matching_rows = category_encoding['category'] == row_label
    label = category_encoding.loc[matching_rows, 'label'].values[0]
    # NOTE(review): truncation to max_len relies on the tokenizer's handling of
    # `max_length` alone — confirm for the installed transformers version.
    encoded = tokenizer.encode(row, max_length = max_len)
    n_tokens = len(encoded)
    n_padding = max_len - n_tokens
    input_ids = encoded + [0] * n_padding
    attention_masks = [1] * n_tokens + [0] * n_padding
    token_type_ids = [0] * max_len
    return np.array(input_ids), np.array(attention_masks), np.array(token_type_ids), np.array(label)


def encode(df, max_len = max_len, text_column = 'cleaned_text', category_encoding = category_encoding):
    '''
    Encode every row of `df` for BERT and stack the results into arrays.

    df                : DataFrame with `text_column` and a 'category' column.
    max_len           : sequence length passed on to `processing_data`.
    category_encoding : category -> label table passed on to `processing_data`.

    Returns (ids, masks, token_ids, labels): three int32 arrays with one row per
    text, and the one-hot encoded labels.  The token arrays are integer typed
    because they hold ids and feed int32 model inputs (they used to be float32).
    '''
    encoded_rows = [
        processing_data(text, category, category_encoding, max_len = max_len)
        for text, category in zip(df[text_column], df['category'])
    ]
    # Transpose the list of 4-tuples into four parallel lists.
    ids, masks, token_ids, labels = map(list, zip(*encoded_rows))
    ids = np.array(ids, dtype = 'int32')
    masks = np.array(masks, dtype = 'int32')
    token_ids = np.array(token_ids, dtype = 'int32')
    labels = tf.keras.utils.to_categorical(labels)
    return ids, masks, token_ids, labels

In [None]:
category_encoding

In [None]:
def build_model(max_len = max_len, no_category = 3):
    '''
    Build and compile a BERT-based text classifier.

    max_len     : length of each of the three integer input sequences.
    no_category : number of output classes (size of the softmax layer).

    Returns a compiled Keras model taking [ids, masks, token_ids] and producing
    class probabilities.
    '''
    ids = tf.keras.Input(shape = (max_len, ), dtype = tf.int32)
    masks = tf.keras.Input(shape = (max_len, ), dtype = tf.int32)
    # Shape must be a 1-tuple like the other two inputs; `(max_len)` is just an int.
    token_ids = tf.keras.Input(shape = (max_len, ), dtype = tf.int32)
    
    # Pretrained BERT encoder from TF Hub; trainable=True fine-tunes its weights.
    bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",  trainable=True)
    
    pooled_output , sequence_output = bert_layer([ids, masks, token_ids])
    
    # Use the output at the first sequence position as the representation of the text.
    output = sequence_output[:,0,:]

    out = tf.keras.layers.Dense(no_category, activation = 'softmax')(output)
    
    model = tf.keras.models.Model(inputs = [ids, masks, token_ids], outputs = out)
    
    model.compile(loss = 'categorical_crossentropy', optimizer= tf.optimizers.Adam(learning_rate = 3e-5), metrics = ['accuracy'])
                  
    return model

In [None]:
tf.keras.backend.clear_session()
# Seed the shuffled folds so the split (and therefore the reported validation
# scores) can be reproduced on a re-run.
skf = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 42)
ids, masks, token_ids, labels = encode(df_train, max_len = max_len)

# Stratify on the class index recovered from the one-hot labels.
for i , (train_index, val_index) in enumerate(skf.split(ids, labels.argmax(1))):
    ids_train, masks_train, token_ids_train, labels_train = (
        array[train_index] for array in (ids, masks, token_ids, labels)
    )
    ids_val, masks_val, token_ids_val, labels_val = (
        array[val_index] for array in (ids, masks, token_ids, labels)
    )
    print("Fold :{}".format(i+1))
    # A fresh model is trained for every fold; after the loop `model` refers to
    # the one trained on the last fold.
    model = build_model(max_len = max_len)
    
    model.fit((ids_train, masks_train, token_ids_train), labels_train, verbose = 1, epochs = 4, batch_size = 32, validation_data = ((ids_val, masks_val, token_ids_val), labels_val))

In [None]:
def flat_accuracy(output, prediction):
    '''
    Fraction of rows whose predicted class matches the true class.

    output     : 2-D array of true labels (one-hot / scores), one row per sample.
    prediction : 2-D array of predicted scores with the same number of rows.
    The class of a row is the index of its largest value.
    '''
    true_classes = np.argmax(output, axis=1).flatten()
    predicted_classes = np.argmax(prediction, axis=1).flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
# Encode the test set with the same sequence length used for training.
ids_test, masks_test, token_ids_test, labels_test = encode(df_test, max_len = max_len)
prediction = model.predict((ids_test, masks_test, token_ids_test))
# Use the labels produced by `encode`: they come from `category_encoding`, the
# same category -> label table the training labels were built from, so truth
# and prediction are guaranteed to share one class numbering.
output = labels_test
test_result = flat_accuracy(output, prediction)
print("The test set includes {} texts and the accuracy is {:.2f}".format(len(output), test_result * 100))

# Classification for both Research and BBC Datasets
* I am adding extra data for broader classification.
* I have already implemented BERT classification in another notebook. You can reach it from the link below.
* https://www.kaggle.com/cempek/bbc-multiclass-glove-tf-vs-bert-pytorch-tf-99-5
* We had 3 categories in previous dataset. I am now adding 5 more categories from BBC dataset includes less technical topics.
* BBC dataset includes news about politics, entertainment, sport, tech, and business.

In [None]:
df_bbc = pd.read_csv('../input/bbc-fulltext-and-category/bbc-text.csv')
df_bbc.head()

* We retrieve sample data from the research dataset since BBC dataset has around 450 news in each category.

In [None]:
# Draw the same number of abstracts (600) from every research category; the
# fixed random_state keeps the sample repeatable.
research_groups = df_train.groupby('category')
df_research = research_groups.sample(n = 600, random_state = 1)

In [None]:
# Stack the sampled research abstracts on top of the BBC articles and renumber the rows.
research_part = df_research[['text', 'category']]
df_merge = pd.concat([research_part, df_bbc]).reset_index(drop = True)
df_merge['category'].value_counts()

**Splitting data into test and training**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 5% of the merged data as a test set; the fixed random_state makes the
# split repeatable.
# NOTE(review): the split is not stratified, so with such a small test share a
# category could end up under-represented in the test set — consider `stratify`.
X_train, X_test, y_train, y_test = train_test_split(df_merge['text'], df_merge['category'], test_size = 0.05, random_state = 42)

In [None]:
# Re-attach each text to its category for the training and the held-out split.
df_merge = X_train.to_frame().join(y_train)
df_merge_test = X_test.to_frame().join(y_test)

In [None]:
# Learn the category -> integer mapping from the training split only and apply
# that same mapping to the held-out split.  Re-fitting on the small test split
# (as before) would renumber the classes whenever a category is missing there.
enc2 = LabelEncoder()
df_merge['label'] = enc2.fit_transform(df_merge['category'])
df_merge['cleaned_text'] = df_merge['text'].apply(clean_text)
df_merge_test['label'] = enc2.transform(df_merge_test['category'])
df_merge_test['cleaned_text'] = df_merge_test['text'].apply(clean_text)
df_merge_test = df_merge_test.reset_index(drop= True)

In [None]:
# Lookup table with one row per merged category and its integer label.
category_encoding_merge = df_merge[['category', 'label']].groupby('category').min().reset_index()
category_encoding_merge

I have used same model for this merged dataset, I just changed the number category in the model.

In [None]:
tf.keras.backend.clear_session()
# Seed the shuffled folds so the split can be reproduced on a re-run.
skf = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 42)
ids, masks, token_ids, labels = encode(df_merge, max_len = max_len, category_encoding = category_encoding_merge)
# Size the output layer from the one-hot label width instead of a hard-coded 8.
no_category = labels.shape[1]

# Stratify on the class index recovered from the one-hot labels.
for i , (train_index, val_index) in enumerate(skf.split(ids, labels.argmax(1))):
    ids_train, masks_train, token_ids_train, labels_train = (
        array[train_index] for array in (ids, masks, token_ids, labels)
    )
    ids_val, masks_val, token_ids_val, labels_val = (
        array[val_index] for array in (ids, masks, token_ids, labels)
    )
    print("Fold :{}".format(i+1))
    # A fresh model per fold; `model_merge` ends up as the last fold's model.
    model_merge = build_model(max_len = max_len, no_category = no_category)
    
    model_merge.fit((ids_train, masks_train, token_ids_train), labels_train, verbose = 1, epochs = 4, batch_size = 32, validation_data = ((ids_val, masks_val, token_ids_val), labels_val))

In [None]:
ids_test, masks_test, token_ids_test, labels_test = encode(df_merge_test, max_len = max_len, category_encoding = category_encoding_merge)
prediction = model_merge.predict((ids_test, masks_test, token_ids_test))
# Ground truth comes from `encode`, i.e. from `category_encoding_merge` — the
# table the training labels were built from — so truth and prediction share one
# class numbering even if the small test split lacks a category.
output = labels_test
test_result = flat_accuracy(output, prediction)
print("The test set includes {} texts and the accuracy is {:.2f}".format(len(output), test_result * 100))

# Result
* On the research dataset, BERT performed better than the bidirectional LSTM model.
* After merging the Research and BBC datasets, I obtained better results (more than 95%) on the test data.
* It is hard to classify texts with very similar topics, such as Mathematics and Physics, so the accuracy increased on the merged dataset, which also includes clearly distinct topics such as politics, entertainment, sport, business, machine learning and so on.