## **Installation of Required Packages**

In [1]:
!pip install transformers
!pip install memory_profiler
!pip install emoji



## **Importing Libraries**

In [2]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf
import re
import emoji
import string
import time
from memory_profiler import memory_usage


## **Loading TIL Test Data**

In [3]:
# Read the test CSV into a DataFrame.
df_test = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/IABTestdata - IABTestdata.csv"
)


In [4]:
# Preview the loaded test frame (the rendered repr elides the middle rows).
df_test

Unnamed: 0.1,Unnamed: 0,URL,msid,data
0,0,https://timesofindia.indiatimes.com/city/gurga...,103823534,GURGAON: The state government is planning to c...
1,1,https://timesofindia.indiatimes.com/education/...,103827662,AP EAPCET 2nd Phase Allotment Result 2023: The...
2,2,https://m.timesofindia.com/sports/cricket/news...,103826953,NEW DELHI: West Indies and Kolkata Knight Ride...
3,3,https://timesofindia.indiatimes.com/city/rajko...,103822709,RAJKOT: Three more young men succumbed to card...
4,4,https://timesofindia.indiatimes.com/city/delhi...,103829697,NEW DELHI: Gangster Lawrence Bishnoi on Thursd...
...,...,...,...,...
3396,3396,https://m.timesofindia.com/life-style/events/w...,103965613,"Eid Milad-un-Nabi, also known as Mawlid al-Nab..."
3397,3397,https://m.timesofindia.com/city/guwahati/assam...,104004515,GUWAHATI: Day after Karbi villagers of Assam’s...
3398,3398,https://timesofindia.indiatimes.com/city/hyder...,104008059,HYDERABAD: The popular Ganesh laddu of Hyderab...
3399,3399,https://timesofindia.indiatimes.com/entertainm...,104008630,"Chandramukhi 2', the highly anticipated sequel..."


In [5]:
# Force the article text column to strings before it is handed to the tokenizer.
# NOTE(review): missing values would presumably become the literal "nan" here —
# confirm that classifying such rows is intended.
df_test['data'] = df_test['data'].astype(str)

## **DistilBERT Text Classification Pipeline**

In [16]:
class DistilBERTClassifier:
    """Inference wrapper around a fine-tuned DistilBERT sequence classifier.

    Loads the TensorFlow model and its tokenizer from ``model_path`` and maps
    the arg-max logit of each input text to a category name.
    """

    # Class index -> category name.
    # NOTE(review): must match the label encoding used during fine-tuning — confirm.
    LABEL_MAPPING = {
        0: 'academic interests', 1: 'arts and culture', 2: 'automotives',
        3: 'books and literature', 4: 'business and finance', 5: 'careers',
        6: 'family and relationships', 7: 'food and drinks', 8: 'health',
        9: 'healthy living', 10: 'hobbies and interests', 11: 'home and garden',
        12: 'movies', 13: 'music and audio', 14: 'news and politics',
        15: 'personal finance', 16: 'pets',
        17: 'pharmaceuticals, conditions, and symptoms', 18: 'real estate',
        19: 'shopping', 20: 'sports', 21: 'style and fashion',
        22: 'technology and computing', 23: 'television', 24: 'travel', 25: 'video gaming'
    }

    def __init__(self, model_path):
        """Load the model and tokenizer stored at ``model_path``.

        Args:
            model_path: Location passed to ``from_pretrained`` for both the
                model and the tokenizer.
        """
        # Load the pre-trained DistilBERT model
        self.model = TFDistilBertForSequenceClassification.from_pretrained(model_path)
        # Load the tokenizer
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)

    # NOTE: no text-cleaning step is applied; texts go to the tokenizer as given.

    def tokenize_text(self, text):
        """Tokenize ``text`` with truncation and padding, returning TF tensors.

        Args:
            text: A string or a list of strings.

        Returns:
            The tokenizer output (includes ``input_ids`` and ``attention_mask``).
        """
        tokens = self.tokenizer(text, truncation=True, padding=True, return_tensors="tf")
        return tokens

    def predict_categories(self, texts, batch_size=64):
        """Predict the category name for one text or for every text in a collection.

        Args:
            texts: Either a single ``str`` or an iterable of strings (list,
                pandas Series, ...).
            batch_size: Number of examples per batch fed to ``model.predict``.

        Returns:
            A single category name (``str``) when ``texts`` is a ``str``;
            otherwise a list with one category name per input, in input order.
            An empty iterable yields an empty list.
        """
        single_input = isinstance(texts, str)
        # The tokenizer is always given a list so the output has a batch axis.
        batch = [texts] if single_input else list(texts)
        if not batch:
            return []

        # Tokenize with truncation; padding=True pads every row to the longest
        # sequence in this call so the rows can be stacked into tensors.
        tokens = self.tokenizer(batch, truncation=True, padding=True, return_tensors="tf")

        # Create a TensorFlow Dataset that yields batches of model inputs
        test_dataset = tf.data.Dataset.from_tensor_slices(
            {'input_ids': tokens['input_ids'],
             'attention_mask': tokens['attention_mask']}
        ).batch(batch_size)

        test_predictions = self.model.predict(test_dataset)
        logits = np.asarray(test_predictions.logits)

        # Highest-scoring class index for every row, not just the first one
        predicted_labels = np.argmax(logits, axis=1)
        predicted_categories = [self.LABEL_MAPPING[int(label)] for label in predicted_labels]

        # Keep the str -> str contract relied on by ``Series.apply`` callers.
        return predicted_categories[0] if single_input else predicted_categories


## **Model Loading, Inference, and Results**

In [7]:
# Load the fine-tuned DistilBERT model and tokenizer.
# NOTE(review): the path ends in ".h5" but is passed to from_pretrained as the
# model location — confirm it is the intended checkpoint.
model_path = "/content/drive/MyDrive/model_on_unp_2000_10_.h5"
classifier = DistilBERTClassifier(model_path)

# Time the whole prediction pass with perf_counter: it is monotonic, so the
# measured interval cannot be skewed by wall-clock adjustments.
start_time = time.perf_counter()
# One predict_categories call per row; each call returns that row's category.
df_test["predicted_categories"] = df_test['data'].apply(classifier.predict_categories)
end_time = time.perf_counter()
inference_time = end_time - start_time
print("INFERENCE TIME (seconds) using DistilBERT model:- ", inference_time)

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/model_on_unp_2000_10_.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


INFERENCE TIME (seconds) using DistilBERT model:-  290.665429353714


## **Results in DataFrame**

In [11]:
# Inspect the frame again, now including the predicted_categories column.
df_test

Unnamed: 0,URL,msid,data,predicted_categories
0,https://timesofindia.indiatimes.com/city/gurga...,103823534,GURGAON: The state government is planning to c...,real estate
1,https://timesofindia.indiatimes.com/education/...,103827662,AP EAPCET 2nd Phase Allotment Result 2023: The...,academic interests
2,https://m.timesofindia.com/sports/cricket/news...,103826953,NEW DELHI: West Indies and Kolkata Knight Ride...,music and audio
3,https://timesofindia.indiatimes.com/city/rajko...,103822709,RAJKOT: Three more young men succumbed to card...,health
4,https://timesofindia.indiatimes.com/city/delhi...,103829697,NEW DELHI: Gangster Lawrence Bishnoi on Thursd...,health
...,...,...,...,...
3396,https://m.timesofindia.com/life-style/events/w...,103965613,"Eid Milad-un-Nabi, also known as Mawlid al-Nab...",health
3397,https://m.timesofindia.com/city/guwahati/assam...,104004515,GUWAHATI: Day after Karbi villagers of Assam’s...,personal finance
3398,https://timesofindia.indiatimes.com/city/hyder...,104008059,HYDERABAD: The popular Ganesh laddu of Hyderab...,health
3399,https://timesofindia.indiatimes.com/entertainm...,104008630,"Chandramukhi 2', the highly anticipated sequel...",movies


In [13]:
# Persist the predictions. index=False keeps the row index out of the file;
# otherwise it comes back as an extra "Unnamed: 0" column on the next read_csv.
df_test.to_csv("/content/drive/MyDrive/TIL_Test_dataset_with_predictions.csv", index=False)