In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import string 
import emoji
import re

# Preprocessing

In [2]:
df = pd.read_csv("sample_df (4).zip")
df

Unnamed: 0,text,target,Word Count
0,NEW DELHI: The Andhra Pradesh Public Service C...,academic interests,239
1,PUNE: Two weeks after the new academic year ha...,academic interests,500
2,GUWAHATI: The results of the CBSE Class X exam...,academic interests,470
3,"admission into the IIMs,"" said Kapoor. Across ...",academic interests,108
4,Mangaluru: The Mangalore Institute of Technolo...,academic interests,336
...,...,...,...
519995,NAGPUR: Akshay Zadgaonkar is a child prodigy. ...,video gaming,500
519996,\nBayonetta lead Hideki Kamiya reckons the lat...,video gaming,290
519997,\nAl Pacino thinks the original Godfather is b...,video gaming,259
519998,The latest episode of Imlie begins with Aryan ...,video gaming,251


In [3]:
print(len(pd.unique(df.target)))

26


# Lowercase the dataset

In [4]:
df.text = df.text.str.lower()
df.text = df.text.astype(str)
df.dtypes

text          object
target        object
Word Count     int64
dtype: object

In [5]:
df

Unnamed: 0,text,target,Word Count
0,new delhi: the andhra pradesh public service c...,academic interests,239
1,pune: two weeks after the new academic year ha...,academic interests,500
2,guwahati: the results of the cbse class x exam...,academic interests,470
3,"admission into the iims,"" said kapoor. across ...",academic interests,108
4,mangaluru: the mangalore institute of technolo...,academic interests,336
...,...,...,...
519995,nagpur: akshay zadgaonkar is a child prodigy. ...,video gaming,500
519996,\nbayonetta lead hideki kamiya reckons the lat...,video gaming,290
519997,\nal pacino thinks the original godfather is b...,video gaming,259
519998,the latest episode of imlie begins with aryan ...,video gaming,251


In [6]:
df["text"].iloc[0]

'new delhi: the andhra pradesh public service commission (appsc) has released appsc group 2 notification on its official website on december 31, 2018. as per the notification, there are 446 vacancies to be filled through the examination. interested and eligible candidates can apply for the appsc group 2 vacancies from january 10, 2019 to january 31, 2019 through the official website — https://psc.ap.gov.in/. the ap psc will conduct screening/preliminary test on may 5, 2019 while the main exam for the successful candidates would be conducted on july 18 and 19, 2019. appsc group 2: important dates    event date   opening date of application 1-jan-19   closing date of application 31-jan-19 appsc group 2: vacancy details    type carried forward   executive posts 16   non–executive posts 94   type fresh posts   executive posts 138   non–executive posts 198   total 446 educational qualificationgraduation degree in any discipline from a recognized university or institute. age limit (as on jul

# Removing Punctuations

In [7]:
import string
import pandas as pd

def remove_punct(text):
    if isinstance(text, str):
        # Create a translation table that removes all punctuation except for full stops
        translation_table = str.maketrans("", "", string.punctuation.replace(".", ""))
        return text.translate(translation_table)
    elif pd.notna(text):
        return str(text)
    else:
        return text

# Apply the function to the 'text' column in your DataFrame
df['text'] = df['text'].apply(remove_punct)

# Now, full stops will be preserved in the text

In [8]:
df["text"].iloc[0]

'new delhi the andhra pradesh public service commission appsc has released appsc group 2 notification on its official website on december 31 2018. as per the notification there are 446 vacancies to be filled through the examination. interested and eligible candidates can apply for the appsc group 2 vacancies from january 10 2019 to january 31 2019 through the official website — httpspsc.ap.gov.in. the ap psc will conduct screeningpreliminary test on may 5 2019 while the main exam for the successful candidates would be conducted on july 18 and 19 2019. appsc group 2 important dates    event date   opening date of application 1jan19   closing date of application 31jan19 appsc group 2 vacancy details    type carried forward   executive posts 16   non–executive posts 94   type fresh posts   executive posts 138   non–executive posts 198   total 446 educational qualificationgraduation degree in any discipline from a recognized university or institute. age limit as on july 1 2018 minimum 18 y

# Removing URLs and Emojis

In [9]:
def removeURLandEmoji(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = emoji.demojize(text)
    text = re.sub(r":[^:]+:", "", text)
    
    return text
    
df.text = df.text.apply(removeURLandEmoji)
df

Unnamed: 0,text,target,Word Count
0,new delhi the andhra pradesh public service co...,academic interests,239
1,pune two weeks after the new academic year has...,academic interests,500
2,guwahati the results of the cbse class x exams...,academic interests,470
3,admission into the iims said kapoor. across 13...,academic interests,108
4,mangaluru the mangalore institute of technolog...,academic interests,336
...,...,...,...
519995,nagpur akshay zadgaonkar is a child prodigy. h...,video gaming,500
519996,\nbayonetta lead hideki kamiya reckons the lat...,video gaming,290
519997,\nal pacino thinks the original godfather is b...,video gaming,259
519998,the latest episode of imlie begins with aryan ...,video gaming,251


In [10]:
df["text"].iloc[0]

'new delhi the andhra pradesh public service commission appsc has released appsc group 2 notification on its official website on december 31 2018. as per the notification there are 446 vacancies to be filled through the examination. interested and eligible candidates can apply for the appsc group 2 vacancies from january 10 2019 to january 31 2019 through the official website —  the ap psc will conduct screeningpreliminary test on may 5 2019 while the main exam for the successful candidates would be conducted on july 18 and 19 2019. appsc group 2 important dates    event date   opening date of application 1jan19   closing date of application 31jan19 appsc group 2 vacancy details    type carried forward   executive posts 16   non–executive posts 94   type fresh posts   executive posts 138   non–executive posts 198   total 446 educational qualificationgraduation degree in any discipline from a recognized university or institute. age limit as on july 1 2018 minimum 18 yearsmaximum 42 year

# Remove Emails and Numbers
    

In [11]:
def remove_Emails_and_Numbers(text):
    email_pattern = r'\S+@\S+\.\S+'
    number_pattern = r'\b\d+\b'

    cleaned_text = re.sub(email_pattern, '', text)
    cleaned_text = re.sub(number_pattern, '', cleaned_text)
    return cleaned_text

df.text = df.text.apply(remove_Emails_and_Numbers)
df

Unnamed: 0,text,target,Word Count
0,new delhi the andhra pradesh public service co...,academic interests,239
1,pune two weeks after the new academic year has...,academic interests,500
2,guwahati the results of the cbse class x exams...,academic interests,470
3,admission into the iims said kapoor. across i...,academic interests,108
4,mangaluru the mangalore institute of technolog...,academic interests,336
...,...,...,...
519995,nagpur akshay zadgaonkar is a child prodigy. h...,video gaming,500
519996,\nbayonetta lead hideki kamiya reckons the lat...,video gaming,290
519997,\nal pacino thinks the original godfather is b...,video gaming,259
519998,the latest episode of imlie begins with aryan ...,video gaming,251


In [12]:
df["text"].iloc[0]

'new delhi the andhra pradesh public service commission appsc has released appsc group  notification on its official website on december  . as per the notification there are  vacancies to be filled through the examination. interested and eligible candidates can apply for the appsc group  vacancies from january   to january   through the official website —  the ap psc will conduct screeningpreliminary test on may   while the main exam for the successful candidates would be conducted on july  and  . appsc group  important dates    event date   opening date of application 1jan19   closing date of application 31jan19 appsc group  vacancy details    type carried forward   executive posts    non–executive posts    type fresh posts   executive posts    non–executive posts    total  educational qualificationgraduation degree in any discipline from a recognized university or institute. age limit as on july   minimum  yearsmaximum  years feesapplication fees rs   examination fees rs  no examinat

In [13]:
import re
import codecs

def removeSpecialChar(text):
    pattern = r'[^a-zA-Z\s.]'  # Include '.' in the pattern

    cleaned_text = re.sub(pattern, '', text)
    cleaned_text = cleaned_text.replace('\n', ' ') # remove '\n' with ' '
    cleaned_text = codecs.decode(cleaned_text, 'unicode_escape') # remove '\' with ''
    cleaned_text = cleaned_text.replace('\xa0', ' ') # remove '\xa0' with ' '
    return cleaned_text

# Apply the modified function to the 'text' column in your DataFrame
df['text'] = df['text'].apply(removeSpecialChar)


In [14]:
df["text"].iloc[0]

'new delhi the andhra pradesh public service commission appsc has released appsc group  notification on its official website on december  . as per the notification there are  vacancies to be filled through the examination. interested and eligible candidates can apply for the appsc group  vacancies from january   to january   through the official website   the ap psc will conduct screeningpreliminary test on may   while the main exam for the successful candidates would be conducted on july  and  . appsc group  important dates    event date   opening date of application jan   closing date of application jan appsc group  vacancy details    type carried forward   executive posts    nonexecutive posts    type fresh posts   executive posts    nonexecutive posts    total  educational qualificationgraduation degree in any discipline from a recognized university or institute. age limit as on july   minimum  yearsmaximum  years feesapplication fees rs   examination fees rs  no examination fees f

In [15]:
df["text"].iloc[1]

'pune two weeks after the new academic year has begun over  seats reserved for students from the economically backward sections in private unaided schools under the right to education rte act remain vacant in pune and mumbai schools. a study by the rte resource centre established at the indian institute of management iim ahmedabad says not a single school in mumbai has filled up all its vacancies while in pune only  schools have  admissions for the  reserved quota. the study also reveals that  schools in pune and mumbai have failed to admit a single student under the act at the standard i entry level. the analysis reveals that only  seats have been filled in standard i and  in preprimary at the end of first round of rte admissions in mumbai during the  academic year. while pune received more applications than vacancies for admission in preprimary classes nearly twothirds of the seats remain vacant at the end of the first round of the admission process in the city. even in standard i on

In [16]:
df["text"].iloc[3]

'admission into the iims said kapoor. across  iims  seats have been added for admissions this year a total of  seats. the number of women appearing for cat has been steadily increasing as well  from  in  to  in . for  more than  women appeared for the cat. additionally female aspirants are achieving better success on the cat with  women scoring higher than the  percentile on cat . fraudulent universities even conducted admission melas in the city about two years ago and admitted genuine students from the state who faced deportation following the state department crackdown said a source.'

In [18]:
df.to_csv("text_preprocessed_df.csv")

In [5]:
import pandas as pd
df=pd.read_csv("text_preprocessed_df.csv")
df

Unnamed: 0.1,Unnamed: 0,text,target,Word Count
0,0,new delhi the andhra pradesh public service co...,academic interests,239
1,1,pune two weeks after the new academic year has...,academic interests,500
2,2,guwahati the results of the cbse class x exams...,academic interests,470
3,3,admission into the iims said kapoor. across i...,academic interests,108
4,4,mangaluru the mangalore institute of technolog...,academic interests,336
...,...,...,...,...
519995,519995,nagpur akshay zadgaonkar is a child prodigy. h...,video gaming,500
519996,519996,bayonetta lead hideki kamiya reckons the late...,video gaming,290
519997,519997,al pacino thinks the original godfather is be...,video gaming,259
519998,519998,the latest episode of imlie begins with aryan ...,video gaming,251


# 26 IAB Categories 

In [17]:
categories=df.target.unique()
categories

array(['academic interests', 'arts and culture', 'automotives',
       'books and literature', 'business and finance', 'careers',
       'family and relationships', 'food and drinks', 'health',
       'healthy living', 'hobbies and interests', 'home and garden',
       'movies', 'music and audio', 'news and politics',
       'personal finance', 'pets',
       'pharmaceuticals, conditions, and symptoms', 'real estate',
       'shopping', 'sports', 'style and fashion',
       'technology and computing', 'television', 'travel', 'video gaming'],
      dtype=object)

# 10  Sample from each Categories

In [18]:
sample_df = pd.DataFrame(columns=['text', 'target'])
for category in categories:
    category_samples = df[df['target'] == category].sample(10)
    sample_df = pd.concat([sample_df, category_samples])

sample_df.reset_index(drop=True, inplace=True)
sample_df

Unnamed: 0,text,target,Word Count
0,guwahati covid has compelled the national test...,academic interests,500.0
1,mathematics is an essential part of our everyd...,academic interests,500.0
2,the indian institute of foreign trade iift rec...,academic interests,308.0
3,tamil nadu common entrance test tancet admit ...,academic interests,246.0
4,to their consumer base. the lockdown has broug...,academic interests,320.0
...,...,...,...
255,the lord of the rings gollum has finally gott...,video gaming,364.0
256,unrequited love a friendship going awry or a m...,video gaming,233.0
257,there havent been any public updates on the d...,video gaming,341.0
258,star wars jedi survivor introduces plenty of ...,video gaming,474.0


#  Generating word embeddings using DistilBert Model

In [19]:
from transformers import DistilBertTokenizer, DistilBertModel
import pandas as pd
import torch

# Load the pretrained DistilBERT model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

def generate_embeddings(text):
    # Tokenize and encode the text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True,max_length=512)
    
    # Pass the input through the model
    outputs = model(**inputs)
    
    # Extract embeddings from the hidden states (outputs[0])
    embeddings = outputs[0]
    
    return embeddings



# Apply the generate_embeddings function to the 'text' of the DataFrame
sample_df["embeddings"] =sample_df["text"].apply(generate_embeddings)
sample_df

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,text,target,Word Count,embeddings
0,guwahati covid has compelled the national test...,academic interests,500.0,"[[[tensor(-0.0325, grad_fn=<UnbindBackward0>),..."
1,mathematics is an essential part of our everyd...,academic interests,500.0,"[[[tensor(-0.1660, grad_fn=<UnbindBackward0>),..."
2,the indian institute of foreign trade iift rec...,academic interests,308.0,"[[[tensor(-0.1927, grad_fn=<UnbindBackward0>),..."
3,tamil nadu common entrance test tancet admit ...,academic interests,246.0,"[[[tensor(-0.1798, grad_fn=<UnbindBackward0>),..."
4,to their consumer base. the lockdown has broug...,academic interests,320.0,"[[[tensor(0.0231, grad_fn=<UnbindBackward0>), ..."
...,...,...,...,...
255,the lord of the rings gollum has finally gott...,video gaming,364.0,"[[[tensor(-0.2078, grad_fn=<UnbindBackward0>),..."
256,unrequited love a friendship going awry or a m...,video gaming,233.0,"[[[tensor(-0.2623, grad_fn=<UnbindBackward0>),..."
257,there havent been any public updates on the d...,video gaming,341.0,"[[[tensor(-0.2989, grad_fn=<UnbindBackward0>),..."
258,star wars jedi survivor introduces plenty of ...,video gaming,474.0,"[[[tensor(-0.0432, grad_fn=<UnbindBackward0>),..."


In [20]:
sample_df["embeddings"].iloc[0].shape

torch.Size([1, 512, 768])

In [21]:
def convert(inp):
    inp=np.array(torch.detach(inp[0]))
    return inp  

In [22]:
import torch
X=sample_df["embeddings"].apply(convert)


In [23]:
x=X.values

In [28]:
x.shape

(260,)

In [29]:
x[0].shape

(512, 768)

In [30]:
x[4].shape

(379, 768)

# Padding x to fixed dimension

In [31]:
import numpy as np

# Assuming you have a 2D NumPy array X where each row is a sequence

# Calculate the maximum sequence length
max_length = max(len(seq) for seq in x)

# Create a new array for padded sequences
padded_x = []

# Perform zero-padding for each sequence in X
for seq in x:
    # Calculate the length of the current sequence
    seq_len = len(seq)
    
    # Calculate the amount of padding needed
    padding_len = max_length - seq_len
    
    # Pad the sequence with zeros using np.pad along the second axis (axis=1)
    padded_seq = np.pad(seq, [(padding_len, 0), (0, 0)], constant_values=0.0)
    
    # Append the padded sequence to the list
    padded_x.append(padded_seq)

# Convert the list of padded sequences to a 2D NumPy array
padded_x = np.array(padded_x, dtype=np.float32)


In [32]:
padded_x[0].shape

(512, 768)

In [33]:
padded_x[4].shape

(512, 768)

In [34]:
y=sample_df["target"].values
y

array(['academic interests', 'academic interests', 'academic interests',
       'academic interests', 'academic interests', 'academic interests',
       'academic interests', 'academic interests', 'academic interests',
       'academic interests', 'arts and culture', 'arts and culture',
       'arts and culture', 'arts and culture', 'arts and culture',
       'arts and culture', 'arts and culture', 'arts and culture',
       'arts and culture', 'arts and culture', 'automotives',
       'automotives', 'automotives', 'automotives', 'automotives',
       'automotives', 'automotives', 'automotives', 'automotives',
       'automotives', 'books and literature', 'books and literature',
       'books and literature', 'books and literature',
       'books and literature', 'books and literature',
       'books and literature', 'books and literature',
       'books and literature', 'books and literature',
       'business and finance', 'business and finance',
       'business and finance', 'busin

In [35]:
from sklearn.preprocessing import OneHotEncoder
label=OneHotEncoder()
y=label.fit_transform(sample_df[["target"]])
y

<260x26 sparse matrix of type '<class 'numpy.float64'>'
	with 260 stored elements in Compressed Sparse Row format>

In [36]:
y=y.toarray()
y

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [37]:
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_x, y, test_size=0.2, random_state=42)

# LSTM

In [38]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense,Dropout

# Define your LSTM model
model = Sequential([
    LSTM(256, return_sequences=True, input_shape=(512,768)),
    Dropout(0.4),
    LSTM(128, return_sequences=False),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(26, activation='softmax')
])

2023-09-13 07:45:52.393040: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-13 07:45:55.156602: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-13 07:46:06.129830: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-13 07:46:06.130151: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [39]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train,y_train, epochs=100, validation_split=0.2, shuffle=True)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [40]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Loss: 6.4715
Test Accuracy: 21.15%
