**Multiclass Text Classification with DistilBERT**

Dataset Link - https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv

In [None]:
!pip uninstall transformers==4.20.1 -y

In [None]:
!python3 -m pip install transformers==4.22.1 -q

In [None]:
import transformers

In [None]:
!pip install tensorflow_probability==0.13.0

In [None]:
!pip install -U tensorflow==2.10

In [None]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline

import tensorflow as tf
import pandas as pd
import json
import gc

from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus so the word list below can be loaded.
nltk.download('stopwords')
# English stop-word list.
# NOTE(review): `stopw` is not referenced by any of the visible cells; confirm it is still needed.
stopw = stopwords.words('english')

import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot

from tqdm import tqdm

In [None]:
# Location of the BBC text CSV that is read in the next cell.
# NOTE(review): presumably the file from the dataset link at the top of the
# notebook must be downloaded/uploaded to this path first. Confirm.
root_path = '/content/bbc-text.csv'

In [None]:
# Load the CSV at `root_path` into a DataFrame and preview its first rows.
df = pd.read_csv(root_path)
df.head()

In [None]:
# (rows, columns) of the loaded DataFrame.
df.shape

**Histogram of the number of words per text**

In [None]:
# Number of whitespace-separated tokens in each article, stored as a new column.
df['count'] = [len(article.split()) for article in df['text']]

In [None]:
# Preview the first rows again, now including the new `count` column.
df.head()

In [None]:
# Distribution of per-article word counts.
# `sns.displot` is a figure-level function: it always creates its own figure,
# so a preceding `plt.figure(figsize=...)` only produces an empty extra figure
# and its size is ignored. The 8x8 inch size is therefore requested through
# `height` (inches) and `aspect` (width / height) instead.
sns.displot(df['count'], height=8, aspect=1)

# Restrict the x-axis to the 0-1000 word range.
plt.xlim(0, 1000)

plt.xlabel('The num of words ', fontsize = 16)
plt.title("The Number of Words Distribution", fontsize = 18)
plt.show()

**Bar plot of the number of articles in each news category**

In [None]:
# Number of rows per value of the `category` column.
category_count = df['category'].value_counts()

# The category labels (index of the counts), in the same order as the counts.
categories = category_count.index

categories

In [None]:
# Display the per-category row counts.
category_count

In [None]:
# Display the category labels that index the counts.
category_count.index

In [None]:
# One bar per category, each annotated with its label and count.
fig, ax = plt.subplots(figsize=(12, 5))

sns.barplot(x=category_count.index, y=category_count, ax=ax)

# Rounded, semi-transparent outline drawn around every annotation.
label_box = {
    'boxstyle': 'round',
    'facecolor': 'none',
    'edgecolor': 'white',
    'alpha': 0.5,
}

for idx, bar in enumerate(ax.patches):
    bar_height = bar.get_height()
    bar_center_x = bar.get_x() + bar.get_width() / 2.0
    ax.annotate(
        f'{categories[idx]}\n{bar_height:.0f}',
        xy=(bar_center_x, bar_height),
        xytext=(0, -25),              # shift the label 25 points below the bar top
        textcoords='offset points',
        size=13,
        color='white',
        ha='center',
        va='center',
        bbox=label_box,
    )

plt.xlabel('Categories', size=15)
plt.ylabel('The Number of News', size=15)
plt.xticks(size=12)
plt.title("The number of News by Categories", size=18)

plt.show()

In [None]:
# Distinct values present in the `category` column.
df['category'].unique()

In [None]:
# Map each category name to its integer code; despite the name, the
# `encoded_text` column holds the encoded *category* label.
category_as_categorical = df['category'].astype('category')
df['encoded_text'] = category_as_categorical.cat.codes

df.head(10)

In [None]:
# Plain Python lists of the article texts and their integer labels.
data_texts = df['text'].tolist()
data_labels = df['encoded_text'].tolist()

**Train / Validation / Test Split**

In [None]:
# First split: hold out 20% of the data as the validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_texts,
    data_labels,
    test_size=0.2,
    random_state=0,
)

# Second split: carve 1% of the remaining training data off as a small test set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.01,
    random_state=0,
)

**Tokenization and Dataset Preparation**

In [None]:
# Tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode the training and validation texts with truncation and padding enabled.
train_encodings = tokenizer(
    train_texts,
    padding=True,
    truncation=True,
)
val_encodings = tokenizer(
    val_texts,
    padding=True,
    truncation=True,
)

In [None]:
# Pair each encoded example (as a dict of features) with its label and wrap
# the pairs in tf.data datasets.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_labels)
)
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(val_encodings), val_labels)
)

**Fine-tuning with the TFTrainer class**

In [None]:
# Load DistilBERT with a 5-way sequence-classification head.
# NOTE(review): this instance is not the one handed to TFTrainer below (that is
# `trainer_model`, created inside the strategy scope), so `trainer.train()`
# does not fine-tune it.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

In [None]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

# Training configuration: output/log locations first, then the schedule
# and optimisation settings.
training_args = TFTrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=7,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=1e-5,
    eval_steps=100,
)

# The model that will be trained is created inside the distribution-strategy
# scope exposed by the training arguments.
with training_args.strategy.scope():
    trainer_model = TFDistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=5,
    )

# Wire the model, configuration and datasets together.
trainer = TFTrainer(
    args=training_args,
    model=trainer_model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Run fine-tuning with the configured trainer.
trainer.train()

In [None]:
# Evaluate the trainer's model on the dataset passed as `eval_dataset` (the validation split).
trainer.evaluate()

**Saving & Loading the model**

In [None]:
# Directory that receives the model weights/config and the tokenizer files.
save_directory = "/saved_models"

# Save `trainer_model`, the instance that was passed to TFTrainer and therefore
# carries the fine-tuned weights. The separately loaded `model` was never
# trained, so saving it would persist an untrained classification head.
trainer_model.save_pretrained(save_directory)

# Store the tokenizer alongside the model so both can be reloaded from one place.
tokenizer.save_pretrained(save_directory)

**Loading the Saved Model**

In [None]:
# Reload the model and the tokenizer from the directory they were saved to.
model_fine_tuned = TFDistilBertForSequenceClassification.from_pretrained(
    save_directory
)
tokenizer_fine_tuned = DistilBertTokenizer.from_pretrained(
    save_directory
)

In [None]:
# Take the second example of the held-out test split as the sample to classify.
test_text = test_texts[1]

# Display the raw text of the chosen sample.
test_text

In [None]:
# Encode the sample text as a TensorFlow tensor of token ids.
predict_input = tokenizer_fine_tuned.encode(
    test_text,
    return_tensors='tf',
    padding=True,
    truncation=True,
)

# The first element of the model output is used as the per-class scores.
model_outputs = model_fine_tuned(predict_input)
output = model_outputs[0]

# Index of the highest-scoring class for the single input example.
predicted_indices = tf.argmax(output, axis=1)
prediction_value = predicted_indices.numpy()[0]

prediction_value