
## **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import ast
import torch
from torchtext.vocab import GloVe
from torchtext.data.utils import get_tokenizer

## **Load preprocessed Dataset**

In [None]:

preprocessed_df = pd.read_csv('/content/drive/MyDrive/TIL/preprocessed_df.zip')
preprocessed_df

Unnamed: 0,target,tokens
0,academic interests,"('new', 'delhi', 'andhra', 'pradesh', 'public'..."
1,academic interests,"('pune', 'two', 'week', 'new', 'academic', 'ye..."
2,academic interests,"('guwahati', 'result', 'cbse', 'class', 'x', '..."
3,academic interests,"('admission', 'iims', 'say', 'kapoor', 'across..."
4,academic interests,"('mangaluru', 'mangalore', 'institute', 'techn..."
...,...,...
518330,video gaming,"('nagpur', 'akshay', 'zadgaonkar', 'child', 'p..."
518331,video gaming,"('bayonetta', 'lead', 'hideki', 'kamiya', 'rec..."
518332,video gaming,"('al', 'pacino', 'think', 'original', 'godfath..."
518333,video gaming,"('late', 'episode', 'imlie', 'begin', 'aryan',..."


## **Convert the tokens column from string representations to Python lists**

In [None]:
preprocessed_df.tokens = preprocessed_df.tokens.apply(ast.literal_eval)
preprocessed_df

Unnamed: 0,target,tokens
0,academic interests,"(new, delhi, andhra, pradesh, public, service,..."
1,academic interests,"(pune, two, week, new, academic, year, begin, ..."
2,academic interests,"(guwahati, result, cbse, class, x, exam, annou..."
3,academic interests,"(admission, iims, say, kapoor, across, iims, s..."
4,academic interests,"(mangaluru, mangalore, institute, technology, ..."
...,...,...
518330,video gaming,"(nagpur, akshay, zadgaonkar, child, prodigy, c..."
518331,video gaming,"(bayonetta, lead, hideki, kamiya, reckons, lat..."
518332,video gaming,"(al, pacino, think, original, godfather, well,..."
518333,video gaming,"(late, episode, imlie, begin, aryan, receive, ..."


## **26 IAB CATEGORIES**

In [None]:
preprocessed_df['target'].unique()

array(['academic interests', 'arts and culture', 'automotives',
       'books and literature', 'business and finance', 'careers',
       'family and relationships', 'food and drinks', 'health',
       'healthy living', 'hobbies and interests', 'home and garden',
       'movies', 'music and audio', 'news and politics',
       'personal finance', 'pets',
       'pharmaceuticals, conditions, and symptoms', 'real estate',
       'shopping', 'sports', 'style and fashion',
       'technology and computing', 'television', 'travel', 'video gaming'],
      dtype=object)

## **Glove Model**

In [None]:
# Load GloVe embeddings
glove=GloVe(name='6B',dim=200)

## **Save the Model Using Joblib**

In [None]:
!pip install joblib
import joblib as jb
jb.dump(glove, '/content/drive/MyDrive/TIL/golve_model.joblib')



['/content/drive/MyDrive/TIL/golve_model.joblib']

## **Load The Glove Model**

In [None]:
glove = jb.load('/content/drive/MyDrive/TIL/golve_model.joblib')

## **generate_glove_embeddings function**

In [None]:
import torch
import numpy as np
#glove = jb.load('golve model.joblib')
#glove = GloVe(name='6B', dim=100)

# Function to generate GloVe embeddings
def generate_glove_embeddings(tokens):
    embeddings = [glove[token] for token in tokens]
    embeddings_tensor = torch.stack(embeddings)
    embeddings_tensor=np.mean(embeddings_tensor.numpy(),axis=1)
    return embeddings_tensor

Calculate and store the count of tokens in each row

In [None]:
preprocessed_df['Count'] = preprocessed_df.tokens.apply(len)
preprocessed_df

Unnamed: 0,target,tokens,Count
0,academic interests,"(new, delhi, andhra, pradesh, public, service,...",151
1,academic interests,"(pune, two, week, new, academic, year, begin, ...",269
2,academic interests,"(guwahati, result, cbse, class, x, exam, annou...",261
3,academic interests,"(admission, iims, say, kapoor, across, iims, s...",56
4,academic interests,"(mangaluru, mangalore, institute, technology, ...",203
...,...,...,...
518330,video gaming,"(nagpur, akshay, zadgaonkar, child, prodigy, c...",291
518331,video gaming,"(bayonetta, lead, hideki, kamiya, reckons, lat...",176
518332,video gaming,"(al, pacino, think, original, godfather, well,...",147
518333,video gaming,"(late, episode, imlie, begin, aryan, receive, ...",130



Filter the preprocessed DataFrame to keep only rows with a Count value greater than or equal to 50

In [None]:
preprocessed_df=preprocessed_df[preprocessed_df.Count >=50]
preprocessed_df


Unnamed: 0,target,tokens,Count
0,academic interests,"(new, delhi, andhra, pradesh, public, service,...",151
1,academic interests,"(pune, two, week, new, academic, year, begin, ...",269
2,academic interests,"(guwahati, result, cbse, class, x, exam, annou...",261
3,academic interests,"(admission, iims, say, kapoor, across, iims, s...",56
4,academic interests,"(mangaluru, mangalore, institute, technology, ...",203
...,...,...,...
518330,video gaming,"(nagpur, akshay, zadgaonkar, child, prodigy, c...",291
518331,video gaming,"(bayonetta, lead, hideki, kamiya, reckons, lat...",176
518332,video gaming,"(al, pacino, think, original, godfather, well,...",147
518333,video gaming,"(late, episode, imlie, begin, aryan, receive, ...",130


In [None]:
preprocessed_df.drop(columns=['Count'], inplace=True)
preprocessed_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessed_df.drop(columns=['Count'], inplace=True)


Unnamed: 0,target,tokens
0,academic interests,"(new, delhi, andhra, pradesh, public, service,..."
1,academic interests,"(pune, two, week, new, academic, year, begin, ..."
2,academic interests,"(guwahati, result, cbse, class, x, exam, annou..."
3,academic interests,"(admission, iims, say, kapoor, across, iims, s..."
4,academic interests,"(mangaluru, mangalore, institute, technology, ..."
...,...,...
518330,video gaming,"(nagpur, akshay, zadgaonkar, child, prodigy, c..."
518331,video gaming,"(bayonetta, lead, hideki, kamiya, reckons, lat..."
518332,video gaming,"(al, pacino, think, original, godfather, well,..."
518333,video gaming,"(late, episode, imlie, begin, aryan, receive, ..."


## **Define a function to create category dataframes**

In [None]:

def category_dataframes(preprocessed_df):
    unique_categories = preprocessed_df['target'].unique()
    category_dataframes = {}

    for category in unique_categories:
        filtered_df = preprocessed_df[preprocessed_df['target'] == category].copy()
        filtered_df.reset_index(drop=True, inplace=True)
        category_dataframes[category] = filtered_df
    return category_dataframes

## **Get category dataframes**

In [None]:
category_dfs = category_dataframes(preprocessed_df)

## **Academic Interests**

In [None]:
category_dfs['academic interests']
embedding1 = category_dfs['academic interests'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['academic interests'].copy()
embedding
final_df1 = pd.concat([em, embedding], axis = 1)
final_df1

Unnamed: 0,embeddings,target,tokens
0,"[-0.06352922, 0.001389485, 0.093901254, 0.1092...",academic interests,"(new, delhi, andhra, pradesh, public, service,..."
1,"[0.027382972, 0.020358846, -0.063034385, -0.06...",academic interests,"(pune, two, week, new, academic, year, begin, ..."
2,"[0.016307771, -0.057271723, -0.07627775, -0.03...",academic interests,"(guwahati, result, cbse, class, x, exam, annou..."
3,"[0.008284974, -0.0022982287, -0.08882646, 0.07...",academic interests,"(admission, iims, say, kapoor, across, iims, s..."
4,"[0.0, 0.051746417, -0.035366558, -0.021043224,...",academic interests,"(mangaluru, mangalore, institute, technology, ..."
...,...,...,...
19979,"[0.013106747, -0.009391723, -0.057271723, -0.0...",academic interests,"(odisha, th, result, board, secondary, educati..."
19980,"[-0.059588894, -0.057428002, -0.041993886, 0.0...",academic interests,"(fund, federal, government, programme, urge, b..."
19981,"[0.027382972, -0.021834278, -0.03161209, -0.01...",academic interests,"(pune, student, appear, secondary, school, cer..."
19982,"[-0.06352922, 0.001389485, -0.016731532, 0.020...",academic interests,"(new, delhi, past, two, day, see, surge, onlin..."


In [None]:
final_df1.embeddings.iloc[1].shape

(269,)

In [None]:
print(final_df1.embeddings.iloc[1])

[ 0.02738297  0.02035885 -0.06303439 -0.06352922 -0.05823029 -0.05605913
 -0.07062508  0.0587532  -0.08236111 -0.02183428 -0.05159032  0.00243444
  0.07144253  0.0084388   0.07432721 -0.07898504  0.01385185 -0.0796576
  0.06786099 -0.06644168 -0.01161659  0.05743524  0.02738297  0.01952537
 -0.07898504 -0.07167516  0.06786099 -0.03867244  0.03625012 -0.074986
  0.03330103 -0.03536656 -0.06112597 -0.03986193  0.02797004 -0.08882646
  0.09384246 -0.07898504  0.01952537 -0.02319264 -0.04185401  0.02738297
 -0.07898504  0.00828497 -0.08236111 -0.10614736 -0.07167516 -0.01805367
 -0.00282266 -0.07898504  0.02738297  0.01952537 -0.0972591  -0.02188545
  0.09384246 -0.02183428 -0.06644168 -0.00416452  0.01388022 -0.03194802
 -0.05079876  0.0137391   0.0587532  -0.02319264 -0.00416452  0.
 -0.06096484 -0.00746276 -0.04450922  0.06786099  0.00828497  0.01952537
 -0.05823029 -0.05605913  0.02738297 -0.00219455  0.051827   -0.04185401
  0.00828497  0.         -0.03895469  0.0911735   0.          

In [None]:
final_df1.embeddings.shape

(19984,)

In [None]:
# Print the embeddings for the first row
print(final_df1.embeddings)

0        [-0.06352922, 0.001389485, 0.093901254, 0.1092...
1        [0.027382972, 0.020358846, -0.063034385, -0.06...
2        [0.016307771, -0.057271723, -0.07627775, -0.03...
3        [0.008284974, -0.0022982287, -0.08882646, 0.07...
4        [0.0, 0.051746417, -0.035366558, -0.021043224,...
                               ...                        
19979    [0.013106747, -0.009391723, -0.057271723, -0.0...
19980    [-0.059588894, -0.057428002, -0.041993886, 0.0...
19981    [0.027382972, -0.021834278, -0.03161209, -0.01...
19982    [-0.06352922, 0.001389485, -0.016731532, 0.020...
19983    [0.010947246, -0.021801576, -0.00022430137, 0....
Name: embeddings, Length: 19984, dtype: object


In [None]:
final_df1.embeddings.iloc[1].shape

(269,)

In [None]:
final_df1.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/academic_interests_.csv")

## **Read The File**

In [None]:
dff=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/academic_interests_.csv")
dff

Unnamed: 0.1,Unnamed: 0,embeddings,target,tokens
0,0,[-0.06352922 0.00138948 0.09390125 0.109289...,academic interests,"('new', 'delhi', 'andhra', 'pradesh', 'public'..."
1,1,[ 0.02738297 0.02035885 -0.06303439 -0.063529...,academic interests,"('pune', 'two', 'week', 'new', 'academic', 'ye..."
2,2,[ 1.63077712e-02 -5.72717227e-02 -7.62777478e-...,academic interests,"('guwahati', 'result', 'cbse', 'class', 'x', '..."
3,3,[ 8.2849739e-03 -2.2982287e-03 -8.8826463e-02 ...,academic interests,"('admission', 'iims', 'say', 'kapoor', 'across..."
4,4,[ 0. 0.05174642 -0.03536656 -0.021043...,academic interests,"('mangaluru', 'mangalore', 'institute', 'techn..."
...,...,...,...,...
19979,19979,[ 0.01310675 -0.00939172 -0.05727172 -0.079574...,academic interests,"('odisha', 'th', 'result', 'board', 'secondary..."
19980,19980,[-0.05958889 -0.057428 -0.04199389 0.006478...,academic interests,"('fund', 'federal', 'government', 'programme',..."
19981,19981,[ 0.02738297 -0.02183428 -0.03161209 -0.011474...,academic interests,"('pune', 'student', 'appear', 'secondary', 'sc..."
19982,19982,[-0.06352922 0.00138948 -0.01673153 0.020358...,academic interests,"('new', 'delhi', 'past', 'two', 'day', 'see', ..."


In [None]:
def convert_embeddings(embeddings):
  embeddings=embeddings.strip("[]")
  return np.array([float(value) for value in embeddings.split()])
dff["embeddings"]=dff["embeddings"].apply(convert_embeddings)

In [None]:
dff.embeddings.iloc[1].shape

(269,)

In [None]:
print(final_df1.embeddings.iloc[1])

[ 0.02738297  0.02035885 -0.06303439 -0.06352922 -0.05823029 -0.05605913
 -0.07062508  0.0587532  -0.08236111 -0.02183428 -0.05159032  0.00243444
  0.07144253  0.0084388   0.07432721 -0.07898504  0.01385185 -0.0796576
  0.06786099 -0.06644168 -0.01161659  0.05743524  0.02738297  0.01952537
 -0.07898504 -0.07167516  0.06786099 -0.03867244  0.03625012 -0.074986
  0.03330103 -0.03536656 -0.06112597 -0.03986193  0.02797004 -0.08882646
  0.09384246 -0.07898504  0.01952537 -0.02319264 -0.04185401  0.02738297
 -0.07898504  0.00828497 -0.08236111 -0.10614736 -0.07167516 -0.01805367
 -0.00282266 -0.07898504  0.02738297  0.01952537 -0.0972591  -0.02188545
  0.09384246 -0.02183428 -0.06644168 -0.00416452  0.01388022 -0.03194802
 -0.05079876  0.0137391   0.0587532  -0.02319264 -0.00416452  0.
 -0.06096484 -0.00746276 -0.04450922  0.06786099  0.00828497  0.01952537
 -0.05823029 -0.05605913  0.02738297 -0.00219455  0.051827   -0.04185401
  0.00828497  0.         -0.03895469  0.0911735   0.          

## **Arts and Culture**

In [None]:
category_dfs['arts and culture']
embedding1 = category_dfs['arts and culture'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['arts and culture'].copy()
embedding
final_df2 = pd.concat([em, embedding], axis = 1)
final_df2

Unnamed: 0,embeddings,target,tokens
0,"[0.07518134, 0.037957367, -0.009900642, -0.002...",arts and culture,"(japan, foundation, travel, exhibition, title,..."
1,"[-0.0074627614, -0.077870324, 0.013579364, -0....",arts and culture,"(first, come, kolkata, school, believe, city, ..."
2,"[-0.002929388, 0.029800858, 0.00018485538, -0....",arts and culture,"(filmmaker, vivek, agnihotri, saturday, tell, ..."
3,"[0.0, 0.051746417, -0.05433779, 0.01550076, 0....",arts and culture,"(mangaluru, mangalore, university, award, phd,..."
4,"[-0.019977637, -0.027807226, 0.04802774, -0.03...",arts and culture,"(graphic, novelist, abhijeet, kini, give, read..."
...,...,...,...
19928,"[-0.011748527, -0.0033203035, -0.027415223, 0....",arts and culture,"(chandigarh, theme, tribe, india, fourth, edit..."
19929,"[0.027382972, 0.08500374, 0.00019786485, 0.116...",arts and culture,"(pune, air, force, station, afs, lohegaon, dis..."
19930,"[-0.002929388, 0.03803761, 0.07864907, -0.0557...",arts and culture,"(filmmaker, mahesh, bhatt, confirm, adult, fil..."
19931,"[0.02041562, -0.028789511, 0.0001472801, -0.04...",arts and culture,"(hour, half, long, wait, gate, finally, open, ..."


In [None]:
final_df2.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/arts_and_culture_.csv")

## **Automotives**

In [None]:
category_dfs['automotives']
embedding1 = category_dfs['automotives'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['automotives'].copy()
embedding
final_df3 = pd.concat([em, embedding], axis = 1)
final_df3

Unnamed: 0,embeddings,target,tokens
0,"[-0.048018366, 0.021344757, -0.032308113, -0.0...",automotives,"(november, pv, sale, november, top, pv, sale, ..."
1,"[0.092972584, 0.0, -0.018460117, -0.001665985,...",automotives,"(visteon, ecarx, jointly, support, lead, inveh..."
2,"[-0.02065463, 0.030843038, -0.009804915, 0.034...",automotives,"(well, ajay, singh, run, state, bank, india, k..."
3,"[0.050460763, 0.068941996, 0.003075177, -0.021...",automotives,"(live, northeastern, united, state, must, snow..."
4,"[0.0, -0.057892542, -0.040311422, 0.023588333,...",automotives,"(gefco, would, allow, cma, cgms, logistics, di..."
...,...,...,...
19947,"[-0.0014565245, 0.092219695, -0.077870324, 0.0...",automotives,"(technologically, car, come, long, way, past, ..."
19948,"[0.0, 0.0147119425, -0.030803375, 0.038334593,...",automotives,"(ticgard, kepong, professional, window, tint, ..."
19949,"[-0.044927403, -0.04018984, 0.0, 0.01915518, -...",automotives,"(recent, spike, covid, case, impact, operation..."
19950,"[-0.015765615, -0.08313125, -0.054995093, 0.0,...",automotives,"(auto, market, reel, costofliving, squeeze, br..."


In [None]:
final_df3.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/automotives_.csv")

## **Books and Literature**

In [None]:
category_dfs['books and literature']
embedding1 = category_dfs['books and literature'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['books and literature'].copy()
embedding
final_df4 = pd.concat([em, embedding], axis = 1)
final_df4

Unnamed: 0,embeddings,target,tokens
0,"[0.0011394823, -0.0066071735, 0.083295114, 0.0...",books and literature,"(second, book, hidden, series, magnificent, fa..."
1,"[0.0, 0.036713246, -0.07536586, -0.010626113, ...",books and literature,"(gothgirl, raven, date, dream, boyfriend, comp..."
2,"[0.013554619, 0.0422884, -0.009122442, -0.1241...",books and literature,"(bengaluru, former, bureaucrat, manu, baligar,..."
3,"[0.027544385, -0.065953456, -0.03767439, -0.00...",books and literature,"(volume, put, together, first, time, one, volu..."
4,"[-0.022914799, -0.043035608, -0.05874419, 0.0,...",books and literature,"(director, randhir, ranjan, roycast, shekhar, ..."
...,...,...,...
19555,"[-0.03289719, 0.0049916725, -0.006246912, 0.08...",books and literature,"(claire, river, race, mailbox, receive, letter..."
19556,"[0.0911735, 0.019309534, -0.056059126, 0.05046...",books and literature,"(nearly, twenty, year, live, california, march..."
19557,"[0.026056089, -0.029061222, 0.0, 0.0032683918,...",books and literature,"(j, r, longawaited, novel, william, gaddi, aut..."
19558,"[-0.09510309, 0.0, -0.025174899, -0.0074627614...",books and literature,"(john, braines, remarkable, first, novel, room..."


In [None]:
final_df4.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/books_and_literature_.csv")

## **Business and Finance**

In [None]:
category_dfs['business and finance']
embedding1 = category_dfs['business and finance'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['business and finance'].copy()
embedding
final_df5 = pd.concat([em, embedding], axis = 1)
final_df5

Unnamed: 0,embeddings,target,tokens
0,"[0.013579364, -0.021801576, -0.038509574, -0.0...",business and finance,"(kolkata, state, cabinet, monday, give, approv..."
1,"[0.018681312, 0.037418462, -0.031666275, -0.00...",business and finance,"(bangalore, microfinance, institution, mfis, f..."
2,"[-0.06352922, 0.001389485, 0.02652526, -0.0390...",business and finance,"(new, delhi, three, month, rollout, new, indir..."
3,"[-0.06352922, 0.001389485, -0.0061807577, -0.0...",business and finance,"(new, delhi, apr, concern, recent, case, highi..."
4,"[-0.03328934, -0.057545174, -0.08367307, -0.02...",business and finance,"(tweens, spend, report, billion, money, year, ..."
...,...,...,...
19899,"[-0.047078848, -0.046634465, 0.0047493423, -0....",business and finance,"(possible, go, event, learn, attendance, way, ..."
19900,"[0.013579364, 0.01799695, 0.0, -0.080753505, -...",business and finance,"(kolkata, niti, aayog, order, detailed, evalua..."
19901,"[-0.032096393, 0.0, 0.0, -0.035366558, -0.0201...",business and finance,"(div, classsectiondiv, classnormalthe, institu..."
19902,"[-0.06435118, 0.0, -0.09134774, 0.0792667, 0.0...",business and finance,"(keep, lowdenomination, currency, circulation,..."


In [None]:
final_df5.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/business_and_finance_.csv")

## **Careers**

In [None]:
category_dfs['careers']
embedding1 = category_dfs['careers'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['careers'].copy()
embedding
final_df6 = pd.concat([em, embedding], axis = 1)
final_df6

Unnamed: 0,embeddings,target,tokens
0,"[-0.07246368, -0.009022876, -0.074325755, -0.0...",careers,"(job, title, business, administrator, hour, mo..."
1,"[-0.026967645, -0.06352922, -0.031040845, -0.0...",careers,"(engineer, new, career, lead, blue, chip, expe..."
2,"[-0.015546732, -0.082498856, 0.03235175, -0.02...",careers,"(commis, chef, exclusive, contract, caterer, c..."
3,"[-0.009838372, 0.020446358, 0.010609622, -0.00...",careers,"(research, nurse, full, training, offer, becom..."
4,"[0.05879292, 0.05879292, -0.06962599, -0.05759...",careers,"(client, client, lead, high, street, restauran..."
...,...,...,...
19766,"[-0.03539809, -0.0272506, -0.06962599, 0.00149...",careers,"(harbour, jones, lead, independent, innovative..."
19767,"[0.116959885, 0.022511942, 0.0028491805, -0.05...",careers,"(jr, data, scientist, share, find, similar, ca..."
19768,"[0.05879292, -0.055335652, -0.005565719, -0.02...",careers,"(client, look, lamp, development, system, engi..."
19769,"[0.047658462, 0.00898942, 0.03625012, 0.019155...",careers,"(customer, contact, centre, case, officer, lee..."


In [None]:
final_df6.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/careers_.csv")

## **family and relationships**

In [None]:
category_dfs['family and relationships']
embedding1 = category_dfs['family and relationships'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['family and relationships'].copy()
embedding
final_df7 = pd.concat([em, embedding], axis = 1)
final_df7

Unnamed: 0,embeddings,target,tokens
0,"[0.017163485, 0.09019718, 0.12805064, 0.0, -0....",family and relationships,"(kannada, actress, aishwarya, salimath, ready,..."
1,"[-0.06352922, 0.001389485, 0.001389485, -0.115...",family and relationships,"(new, delhi, delhi, commission, woman, dcw, mo..."
2,"[0.019525371, 0.001389485, 0.038585916, -0.089...",family and relationships,"(mumbai, delhi, police, continue, record, stat..."
3,"[-0.01721782, 0.0, -0.060404133, -0.023570327,...",family and relationships,"(nagpur, durupyog, play, noga, wasahat, indora..."
4,"[0.0437402, 0.038585916, -0.0634655, 0.0380987...",family and relationships,"(chennai, police, find, body, yearold, homemak..."
...,...,...,...
19835,"[0.011563225, -0.089818865, -0.05170766, -0.05...",family and relationships,"(britain, continue, hold, sign, similar, deala..."
19836,"[-0.00024123251, -0.042358413, -0.07536586, 0....",family and relationships,"(wendy, will, date, indian, man, learn, tough,..."
19837,"[-0.062351078, -0.056059126, 0.0, 0.0, -0.0893...",family and relationships,"(quite, year, rashmika, mandanna, college, stu..."
19838,"[0.027382972, 0.0, -0.022581501, 0.018265981, ...",family and relationships,"(pune, yearold, youth, arrest, thursday, day, ..."


In [None]:
final_df7.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/family_and_relationships_.csv")

## **Food and Drinks**

In [None]:
category_dfs['food and drinks']
embedding1 = category_dfs['food and drinks'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['food and drinks'].copy()
embedding
final_df8 = pd.concat([em, embedding], axis = 1)
final_df8

Unnamed: 0,embeddings,target,tokens
0,"[0.025660804, -0.05170766, 0.02652526, 0.07025...",food and drinks,"(thane, hold, three, acre, fertile, agricultur..."
1,"[0.027382972, 0.019648537, -0.05751889, -0.064...",food and drinks,"(pune, doctor, notice, considerable, increase,..."
2,"[-0.0417292, -0.062390566, 0.03407128, 0.02281...",food and drinks,"(final, solution, destitute, home, recovery, h..."
3,"[0.095408686, 0.048095062, -0.042832807, 0.045...",food and drinks,"(northwestern, turkey, city, canakkale, tragic..."
4,"[0.0, -0.046258852, 0.058753196, 0.0, -0.03080...",food and drinks,"(yearsno, dance, seat, areasno, professional, ..."
...,...,...,...
19920,"[0.037978575, 0.046463966, 0.0, 0.0072600604, ...",food and drinks,"(india, kisan, sangarsh, coordination, committ..."
19921,"[0.09019718, 0.021616247, 0.049979836, -0.0232...",food and drinks,"(actress, shruti, haasan, last, see, telugu, f..."
19922,"[-0.04613663, -0.007130676, 0.0, -0.03595628, ...",food and drinks,"(skip, bouquet, petalladen, bubble, bath, put,..."
19923,"[-0.016807204, -0.04118435, -0.0074627614, -0....",food and drinks,"(night, march, first, felt, dryness, throat, m..."


In [None]:
final_df8.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/food_and_drinks_.csv")

## **Health**

In [None]:
category_dfs['health']
embedding1 = category_dfs['health'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['health'].copy()
embedding
final_df9 = pd.concat([em, embedding], axis = 1)
final_df9

Unnamed: 0,embeddings,target,tokens
0,"[0.014032086, 0.012647117, -0.05433779, -0.040...",health,"(nashik, maharashtra, university, health, scie..."
1,"[0.029113086, 0.0011394823, -0.0004687923, 0.0...",health,"(patna, second, wave, covid, pandemic, proving..."
2,"[0.029113086, -0.04793695, -0.064695045, 0.020...",health,"(patna, chief, minister, nitish, kumar, thursd..."
3,"[0.0437402, -0.014065171, -0.07519167, -0.0450...",health,"(chennai, containment, effort, intensify, acro..."
4,"[-0.00027761408, -0.023893595, 0.0021071928, 0...",health,"(shimla, use, mask, glove, increase, manifold,..."
...,...,...,...
19914,"[-0.06352922, 0.001389485, -0.051896516, 0.0, ...",health,"(new, delhi, global, covid, case, surge, due, ..."
19915,"[0.029113086, 0.01845548, -0.010303025, 0.0249...",health,"(patna, third, phase, nationwide, covid, vacci..."
19916,"[-0.011748527, -0.04118435, 0.00014366001, 0.0...",health,"(chandigarh, march, record, almost, sixtime, s..."
19917,"[0.033165038, -0.039025117, -0.06002869, -0.01...",health,"(four, month, age, group, complete, target, fi..."


In [None]:
final_df9.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/health_.csv")

## **movies**

In [None]:
category_dfs['movies']
embedding1 = category_dfs['movies'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['movies'].copy()
embedding
final_df10 = pd.concat([em, embedding], axis = 1)
final_df10

Unnamed: 0,embeddings,target,tokens
0,"[0.055371266, -0.038808588, -0.0977615, -0.063...",movies,"(crew, member, help, make, directorial, debut,..."
1,"[0.036713246, -0.017732438, 0.04370352, 0.0252...",movies,"(raven, chooses, orgasm, part, one, fantasy, s..."
2,"[-0.052060805, 0.032974854, -0.04285579, 0.0, ...",movies,"(kourtney, kardashian, scott, disick, split, a..."
3,"[-0.059051447, -0.033459473, 0.04029088, -0.06...",movies,"(time, problem, fan, noticeable, promotion, pr..."
4,"[0.049059942, -0.05809763, -0.04271748, 0.0271...",movies,"(mammootty, hint, yet, another, blockbuster, t..."
...,...,...,...
19959,"[0.038350347, 0.053957645, -0.065037556, 0.030...",movies,"(sanya, malhotra, await, release, next, kathal..."
19960,"[0.0, 0.055910513, 0.034909032, -0.01318562, 0...",movies,"(muchawaited, telugu, film, agent, star, akhil..."
19961,"[-0.019076187, -0.07956566, 0.070633486, 0.031...",movies,"(music, composer, rahul, raj, busy, compose, s..."
19962,"[0.02194696, -0.0037143496, -0.02262401, 0.066...",movies,"(anil, sharma, back, sequel, iconic, gadar, ye..."


In [None]:
final_df10.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/movies_.csv")

## **music and audio**

In [None]:
category_dfs['music and audio']
embedding1 = category_dfs['music and audio'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['music and audio'].copy()
embedding
final_df11 = pd.concat([em, embedding], axis = 1)
final_df11

Unnamed: 0,embeddings,target,tokens
0,"[0.008656172, 0.019012911, -0.03579998, 0.0437...",music and audio,"(varanasi, emotional, appeal, jail, mafia, muk..."
1,"[0.08362419, 0.012511327, 0.03195927, 0.022485...",music and audio,"(audio, c, v, kumars, thirukumaran, entertainm..."
2,"[-0.0074627614, -0.04534745, -0.008089382, -0....",music and audio,"(first, day, shravan, maas, celebrate, devotee..."
3,"[-0.019076187, -0.022914799, -0.033790663, 0.0...",music and audio,"(music, director, raja, narayan, deb, almost, ..."
4,"[0.026300821, -0.01882515, 0.040970158, -0.063...",music and audio,"(rapper, frank, oceans, new, album, rumour, ca..."
...,...,...,...
19930,"[-0.078224674, -0.02152608, 0.08362419, -0.075...",music and audio,"(increase, clarity, audio, ensure, sound, guit..."
19931,"[0.040549878, -0.011527377, 0.057094216, 0.012...",music and audio,"(citroen, set, launch, c, india, market, july,..."
19932,"[0.074542575, 0.0, 0.025285669, 0.012559309, -...",music and audio,"(track, sohne, di, pasand, become, quite, roar..."
19933,"[-0.021801576, -0.0019081724, -0.070495844, -0...",music and audio,"(state, sue, obama, administration, issue, gui..."


In [None]:
final_df11.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/music_and_audio_.csv")

In [None]:
category_dfs['news and politics']
embedding1 = category_dfs['news and politics'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['news and politics'].copy()
embedding
final_df12 = pd.concat([em, embedding], axis = 1)
final_df12

Unnamed: 0,embeddings,target,tokens
0,"[0.04257759, -0.030348832, 0.04257759, -0.0303...",news and politics,"(bangladesh, journal, bangladesh, journal, inn..."
1,"[-0.08882646, -0.016166463, -0.057892542, -0.0...",news and politics,"(say, care, would, stop, fight, russia, aggres..."
2,"[0.04377553, -0.00016055942, -0.05183334, -0.0...",news and politics,"(jail, kremlin, critic, alexei, navalny, tuesd..."
3,"[-0.061253663, -0.005378024, -0.030803375, 0.0...",news and politics,"(here, tip, professional, scoffer, sneer, trum..."
4,"[-0.027505241, 0.040779337, 0.018589303, 0.024...",news and politics,"(bro, fire, surf, one, worlds, dangerous, wave..."
...,...,...,...
19946,"[0.032755602, -0.027307766, 0.012075496, 0.049...",news and politics,"(dehradun, ongoing, investigation, uttarakhand..."
19947,"[-0.023893595, -0.077870324, -0.076517895, -0....",news and politics,"(use, come, ask, win, election, use, offer, mo..."
19948,"[-0.038950272, 0.0013705146, -0.0393516, 0.038...",news and politics,"(february, deputy, superintendent, police, sha..."
19949,"[0.0026147836, 0.0021679725, -0.07215045, -0.0...",news and politics,"(ludhiana, leader, shiromani, akali, dal, wedn..."


In [None]:
final_df12.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/news_and_politics_.csv")

In [None]:
category_dfs['personal finance']
embedding1 = category_dfs['personal finance'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['personal finance'].copy()
embedding
final_df13 = pd.concat([em, embedding], axis = 1)
final_df13

Unnamed: 0,embeddings,target,tokens
0,"[0.037978575, 0.025229463, -0.09322947, -0.045...",personal finance,"(india, part, plan, strengthen, security, tie,..."
1,"[0.019525371, -0.015132241, -0.08596944, -0.04...",personal finance,"(mumbai, didnt, think, current, volatility, ru..."
2,"[-0.06352922, 0.001389485, -0.11026699, -0.064...",personal finance,"(new, delhi, finance, minister, nirmala, sitha..."
3,"[-0.02024317, 0.03620368, -0.11534062, -0.0595...",personal finance,"(everybody, love, arbitrage, fund, since, unio..."
4,"[0.07423546, -0.10294973, -0.0953787, -0.03387...",personal finance,"(jammu, bill, seek, enable, employee, increase..."
...,...,...,...
19877,"[-0.051224194, -0.041077506, 0.05051178, -0.01...",personal finance,"(different, cost, component, like, preferentia..."
19878,"[0.019525371, 0.058567137, 0.04980832, -0.0616...",personal finance,"(mumbai, piramal, realty, real, estate, arm, p..."
19879,"[-0.06352922, 0.001389485, 0.001389485, -0.023...",personal finance,"(new, delhi, delhi, development, authority, dd..."
19880,"[-0.06352922, 0.001389485, 0.0036411565, -0.00...",personal finance,"(new, delhi, allahabad, bank, idbi, top, list,..."


In [None]:
final_df13.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/personal_and_finanace_.csv")

In [None]:
category_dfs['pets']
embedding1 = category_dfs['pets'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['pets'].copy()
embedding
final_df14 = pd.concat([em, embedding], axis = 1)
final_df14

Unnamed: 0,embeddings,target,tokens
0,"[-0.006922849, -0.02849333, -0.009514161, 0.02...",pets,"(attracts, lot, people, curly, lock, win, smil..."
1,"[0.019525371, 0.020358846, 0.0, -0.056059126, ...",pets,"(mumbai, two, covid, year, record, low, noise,..."
2,"[-0.06405064, 0.013312213, -0.017672868, 0.118...",pets,"(london, ever, heard, canine, living, clothes,..."
3,"[0.041940637, 0.059553254, 0.03420705, 0.00702...",pets,"(hyderabad, metro, service, expand, connect, l..."
4,"[0.027382972, 0.03925317, 0.005790466, -0.0611...",pets,"(pune, animal, activist, tuesday, accuse, civi..."
...,...,...,...
19925,"[-0.08530069, 0.019877678, 0.09394922, 0.05053...",pets,"(washington, nursing, broken, heart, brain, gi..."
19926,"[-0.06352922, 0.001389485, -0.02525633, 0.0571...",pets,"(new, delhi, senior, bjp, leader, yashwant, si..."
19927,"[0.001389485, 0.015416753, -0.0018743384, -0.0...",pets,"(delhi, zoo, official, decision, put, white, t..."
19928,"[-0.06352922, 0.001389485, -0.033212136, -0.05...",pets,"(new, delhi, staff, selection, commission, ssc..."


In [None]:
final_df14.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/pets_.csv")

In [None]:
category_dfs['pharmaceuticals, conditions, and symptoms']
embedding1 = category_dfs['pharmaceuticals, conditions, and symptoms'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['pharmaceuticals, conditions, and symptoms'].copy()
embedding
final_df15 = pd.concat([em, embedding], axis = 1)
final_df15

Unnamed: 0,embeddings,target,tokens
0,"[0.033298865, 0.0, -0.061734434, -0.0500739, 0...","pharmaceuticals, conditions, and symptoms","(bsd, rubiscospecific, assembly, chaperone, fo..."
1,"[0.054306254, -0.016407952, 0.009361258, 0.0, ...","pharmaceuticals, conditions, and symptoms","(detection, designer, steroid, methylstenbolon..."
2,"[0.048764963, 0.039601047, 0.066178955, -0.075...","pharmaceuticals, conditions, and symptoms","(atrioventricular, conduction, disturbance, ea..."
3,"[-0.044874147, -0.04317094, 0.026078431, -0.01...","pharmaceuticals, conditions, and symptoms","(experience, concern, lesbian, gay, bisexual, ..."
4,"[0.014688087, 0.16560239, -0.122534044, 0.0, 0...","pharmaceuticals, conditions, and symptoms","(visual, portable, strategy, copperii, detecti..."
...,...,...,...
19995,"[-0.01972608, 0.020682504, 0.01919685, 0.08120...","pharmaceuticals, conditions, and symptoms","(municipal, solid, waste, landfill, obvious, i..."
19996,"[-0.036852468, -0.03718729, -0.017077819, -0.0...","pharmaceuticals, conditions, and symptoms","(food, environment, cause, obesity, epidemic, ..."
19997,"[-0.04965472, -0.023893595, -0.0064084763, 0.0...","pharmaceuticals, conditions, and symptoms","(cue, use, black, fly, simulium, annulus, attr..."
19998,"[-0.018279884, -0.05962837, 0.0, -0.03572012, ...","pharmaceuticals, conditions, and symptoms","(enantioselective, quenching, roomtemperature,..."


In [None]:
final_df15.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/pharmaceuticals_conditions_and_symptoms_.csv")

In [None]:
category_dfs['real estate']
embedding1 = category_dfs['real estate'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['real estate'].copy()
embedding
final_df16 = pd.concat([em, embedding], axis = 1)
final_df16

Unnamed: 0,embeddings,target,tokens
0,"[0.012964742, -0.049493093, -0.055335652, -0.0...",real estate,"(every, goan, look, forward, budget, hope, anx..."
1,"[0.031661406, -0.016807448, 0.028908243, -0.04...",real estate,"(kanpur, dwarka, indore, pushkar, current, fin..."
2,"[0.018196529, -0.009378158, -0.03161209, -0.07...",real estate,"(story, originally, appear, jan, mumbai, ajay,..."
3,"[0.028516347, -0.061646245, -0.04420792, -0.01...",real estate,"(palghar, real, estate, agent, adjoin, thane, ..."
4,"[0.049396485, -0.06654749, 0.038544763, 0.0151...",real estate,"(visakhapatnam, ap, build, construction, worke..."
...,...,...,...
19899,"[-0.05754394, 0.032209914, -0.061646245, -0.04...",real estate,"(great, noida, real, estate, project, resident..."
19900,"[0.06038356, -0.003471133, -0.06313005, -0.046...",real estate,"(idbi, bank, make, successful, transformation,..."
19901,"[0.013579364, 0.05587144, 0.049854353, 0.0, -0...",real estate,"(kolkata, sleuth, probe, trinetra, muddle, try..."
19902,"[0.025846329, -0.047411993, -0.06352922, -0.08...",real estate,"(jaipur, transition, new, order, regime, usual..."


In [None]:
final_df16.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/real_estate_.csv")

In [None]:
category_dfs['shopping']
embedding1 = category_dfs['shopping'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['shopping'].copy()
embedding
final_df17 = pd.concat([em, embedding], axis = 1)
final_df17

Unnamed: 0,embeddings,target,tokens
0,"[-0.01721782, 0.013256208, -0.042840805, -0.02...",shopping,"(nagpur, grow, discontent, among, shopkeeper, ..."
1,"[0.012511327, -0.016630217, -0.04322924, -0.03...",shopping,"(c, provide, easy, way, new, upcoming, store, ..."
2,"[0.000835586, -0.063034385, -0.015512615, 0.01...",shopping,"(agra, week, daylight, robbery, r, lakh, worth..."
3,"[0.025660804, -0.12406182, -0.0015974406, 0.03...",shopping,"(thane, wine, shop, employee, attack, robbed, ..."
4,"[0.025846329, 0.038585916, -0.023999577, -0.04...",shopping,"(jaipur, police, team, investigate, acid, atta..."
...,...,...,...
19921,"[0.013579364, 0.0, 0.008177213, 0.028037526, 0...",shopping,"(kolkata, senco, gold, diamond, one, lead, jew..."
19922,"[-0.06352922, 0.001389485, 0.001389485, -0.041...",shopping,"(new, delhi, delhi, government, tuesday, bid, ..."
19923,"[-0.019172031, -0.02262401, -0.16000539, -0.03...",shopping,"(claw, back, cautious, expansion, cloud, kitch..."
19924,"[-0.07062508, 0.0, -0.0848494, -0.021221593, -...",shopping,"(begin, hadnt, define, role, well, thing, like..."


In [None]:
final_df17.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/shopping_.csv")

In [None]:
category_dfs['sports']
embedding1 = category_dfs['sports'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['sports'].copy()
embedding
final_df18 = pd.concat([em, embedding], axis = 1)
final_df18

Unnamed: 0,embeddings,target,tokens
0,"[-0.015602353, -0.003308999, -0.02525633, -0.0...",sports,"(like, guy, senior, candidate, fall, regular, ..."
1,"[-0.011748527, -0.042832807, -0.019857176, -0....",sports,"(chandigarh, city, still, devoid, velodrome, b..."
2,"[0.053210273, 0.0073569613, -0.023999577, 0.00...",sports,"(bagger, sport, team, sport, superstore, bagge..."
3,"[-0.03392712, 0.024237225, -0.0020954588, -0.0...",sports,"(birmingham, one, worst, performances, recent,..."
4,"[-0.01605144, -0.04534745, -0.07215045, -0.084...",sports,"(mohali, day, shiromani, akali, dal, sad, lead..."
...,...,...,...
19942,"[0.013579364, 0.0422884, 0.04983406, 0.0407437...",sports,"(kolkata, former, olympian, sibling, archer, d..."
19943,"[-0.016499653, -0.08874839, 0.10133958, -0.009...",sports,"(margao, brace, subhash, singh, help, pune, fc..."
19944,"[0.04173316, -0.02929278, -0.002450541, -0.023...",sports,"(opposite, direction, saddle, mark, could, kee..."
19945,"[0.041940637, 0.0, 0.0, 0.080533355, 0.0656134...",sports,"(hyderabad, kvr, hemanth, kumar, bag, allindia..."


In [None]:
final_df18.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/sports_.csv")

In [None]:
category_dfs['style and fashion']
embedding1 = category_dfs['style and fashion'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['style and fashion'].copy()
embedding
final_df19 = pd.concat([em, embedding], axis = 1)
final_df19

Unnamed: 0,embeddings,target,tokens
0,"[-0.06352922, 0.001389485, -0.025450002, 0.013...",style and fashion,"(new, delhi, reliance, brand, part, mukesh, am..."
1,"[-0.069111854, -0.051868647, -0.08974306, 0.06...",style and fashion,"(heady, cocktail, style, popup, shop, music, m..."
2,"[-0.04534745, -0.025853274, -0.0095948335, 0.0...",style and fashion,"(day, ago, urfi, javed, got, heat, argument, s..."
3,"[0.03343729, 0.02652526, -0.021834278, 0.01967...",style and fashion,"(bhubaneswar, three, student, national, instit..."
4,"[0.0086236205, 0.02652526, -0.04534745, -0.048...",style and fashion,"(spread, three, day, jury, meet, thetimes, unl..."
...,...,...,...
19946,"[0.0815667, -0.032308113, -0.034045458, 0.0258...",style and fashion,"(amazon, sale, present, excite, opportunity, g..."
19947,"[0.08057142, 0.0544409, 0.0073569613, -0.00640...",style and fashion,"(deepika, padukone, sport, black, mermaid, dre..."
19948,"[0.03093016, 0.006313714, -0.009935616, 0.0201...",style and fashion,"(septum, pierce, moment, sun, celebrity, like,..."
19949,"[-0.06352922, 0.001389485, 0.0404785, -0.01640...",style and fashion,"(new, delhi, australian, designer, create, sti..."


In [None]:
final_df19.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/style_and_fashion_.csv")

In [None]:
category_dfs['technology and computing']
embedding1 = category_dfs['technology and computing'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['technology and computing'].copy()
embedding
final_df20 = pd.concat([em, embedding], axis = 1)
final_df20

Unnamed: 0,embeddings,target,tokens
0,"[0.038585916, 0.037978575, 0.017999828, -0.041...",technology and computing,"(police, india, symbol, government, visible, s..."
1,"[-0.09198022, -0.028475896, -0.100625165, -0.0...",technology and computing,"(believe, everyone, learn, code, skill, cod, d..."
2,"[-0.008327144, -0.008327144, 0.013194804, -0.0...",technology and computing,"(br, br, pantaloon, retail, r, attract, buy, i..."
3,"[-0.018053673, -0.06435118, 0.050153613, 0.079...",technology and computing,"(also, keep, intact, shallow, back, sit, chair..."
4,"[-0.0019353746, 0.03656672, -0.063739635, -0.0...",technology and computing,"(stockholm, quantum, physic, invisibility, clo..."
...,...,...,...
19859,"[0.049396485, 0.06855351, 0.049023837, 0.06211...",technology and computing,"(visakhapatnam, artificial, intelligence, mach..."
19860,"[0.0056165364, 0.0, 0.0, 0.0, -0.021834278, -0...",technology and computing,"(kochi, anfil, shajo, thirdyear, student, toc,..."
19861,"[-0.044927403, -0.07515458, 0.021509638, -0.08...",technology and computing,"(recent, decision, us, appellate, court, refer..."
19862,"[0.0056165364, 0.020358846, -0.063034385, 0.0,...",technology and computing,"(kochi, two, week, edusap, intelligent, studen..."


In [None]:
final_df20.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/technology_and_computing_.csv")


In [None]:
category_dfs['healthy living']
embedding1 = category_dfs['healthy living'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['healthy living'].copy()
embedding
final_df21 = pd.concat([em, embedding], axis = 1)
final_df21

Unnamed: 0,embeddings,target,tokens
0,"[-0.029101616, 0.0078081964, -0.038141903, -0....",healthy living,"(underlie, medical, issue, like, vitamin, defi..."
1,"[-0.010233113, 0.046449784, -0.060299378, -0.0...",healthy living,"(stage, grief, mean, tell, feel, feel, exactly..."
2,"[-0.02208984, -0.025976896, -0.054860264, -0.0...",healthy living,"(youre, among, always, experiment, western, fa..."
3,"[-0.029439371, -0.063745216, -0.075855196, 0.0...",healthy living,"(lucknow, equate, tobacco, weapon, mass, destr..."
4,"[0.015308537, 0.012075496, -0.07296657, 0.0302...",healthy living,"(criminal, investigation, underway, six, peopl..."
...,...,...,...
19852,"[0.02797004, 0.03533413, 0.033611584, -0.08878...",healthy living,"(ahmedabad, vinay, oza, call, change, engineer..."
19853,"[0.0, 0.024237225, -0.0074627614, -0.04476505,...",healthy living,"(introchances, one, first, place, turn, quick,..."
19854,"[-0.025565533, -0.05103225, -0.026861606, -0.0...",healthy living,"(key, manage, conflict, first, foremost, recog..."
19855,"[-0.021725731, -0.00685773, -0.04474772, -0.02...",healthy living,"(christmas, fever, get, everyone, infect, thus..."


In [None]:
final_df21.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/healthy_living_.csv")

In [None]:
category_dfs['hobbies and interests']
embedding1 = category_dfs['hobbies and interests'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['hobbies and interests'].copy()
embedding
final_df22 = pd.concat([em, embedding], axis = 1)
final_df22

Unnamed: 0,embeddings,target,tokens
0,"[-0.08080925, 0.05943162, 0.04019517, 0.044093...",hobbies and interests,"(arent, satoshi, tajiri, japanese, designer, i..."
1,"[-0.043573905, -0.045794636, -0.09687876, -0.0...",hobbies and interests,"(president, trump, pick, replace, antonin, sca..."
2,"[-0.057374973, -0.06659152, 0.029462133, -0.03...",hobbies and interests,"(hardly, occasion, software, professional, aru..."
3,"[0.0040154443, -0.0977615, -0.07116422, -0.071...",hobbies and interests,"(program, help, explore, write, workshop, kid,..."
4,"[-0.06352922, 0.001389485, -0.050968047, 0.015...",hobbies and interests,"(new, delhi, royal, mail, go, credit, populari..."
...,...,...,...
19865,"[0.0, 0.0, -0.08984347, 0.009815102, 0.0114977...",hobbies and interests,"(ayushmann, khurrana, apply, fly, license, you..."
19866,"[-0.077870324, -0.03767439, -0.03893806, -0.00...",hobbies and interests,"(come, together, many, artist, art, lover, cit..."
19867,"[0.06374874, -0.08080192, -0.038523063, 0.0380...",hobbies and interests,"(tv, industry, know, gruelling, schedule, long..."
19868,"[0.049396485, -0.021033032, -0.056059126, -0.0...",hobbies and interests,"(visakhapatnam, newly, year, new, beginning, n..."


In [None]:
final_df22.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/hobbies_and_interests_.csv")

In [None]:
category_dfs['home and garden']
embedding1 = category_dfs['home and garden'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['home and garden'].copy()
embedding
final_df23 = pd.concat([em, embedding], axis = 1)
final_df23

Unnamed: 0,embeddings,target,tokens
0,"[-0.02327495, -0.00018387317, -0.04534745, -0....",home and garden,"(unsolicited, valentine, day, public, service,..."
1,"[-0.06352922, 0.001389485, 0.0, -0.05278541, -...",home and garden,"(new, delhi, tailormade, low, bounce, fast, gr..."
2,"[-0.0470141, -0.009351195, 0.049697038, -0.078...",home and garden,"(married, couple, spouse, increase, togetherne..."
3,"[-0.10110509, -0.080523156, -0.016166463, 0.00...",home and garden,"(advice, take, care, farm, yamanappa, benefici..."
4,"[0.0077687614, 0.03723866, 0.024815511, -0.057...",home and garden,"(thousand, goans, tourist, alike, flock, samba..."
...,...,...,...
19905,"[0.041940637, 0.010947246, 0.021632204, 0.0166...",home and garden,"(hyderabad, telangana, rashtra, samithi, trs, ..."
19906,"[-0.007342085, 0.08997911, -0.01972608, 0.0486...",home and garden,"(vijaywada, vijayawada, municipal, corporation..."
19907,"[-0.009391723, -0.091622636, -0.08323206, -0.0...",home and garden,"(th, defeat, season, new, york, pull, point, l..."
19908,"[0.013554619, 0.024998799, 0.041790027, -0.078...",home and garden,"(bengaluru, nationwide, lockdown, enforce, res..."


In [None]:
final_df23.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/home_and_garden.csv")

In [None]:
category_dfs['television']
embedding1 = category_dfs['television'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['television'].copy()
embedding
final_df24 = pd.concat([em, embedding], axis = 1)
final_df24

Unnamed: 0,embeddings,target,tokens
0,"[0.02797004, 0.06374874, -0.0049599423, 0.0, 0...",television,"(ahmedabad, tv, burn, shortcircuit, cable, con..."
1,"[0.019525371, 0.06374874, -0.060530707, 0.0342...",television,"(mumbai, tv, credit, service, financial, socie..."
2,"[0.037531357, -0.011527377, 0.03343763, 0.0593...",television,"(google, set, motion, competitor, amazon, blas..."
3,"[0.058178455, 0.0, 0.03566435, 0.049059942, -0...",television,"(sensation, mollywood, megastar, mammootty, ce..."
4,"[0.12769732, 0.03097391, 0.0, -0.10444688, 0.0...",television,"(priyanka, naidu, madhubabu, celebrate, seven,..."
...,...,...,...
19960,"[0.037978575, 0.02671257, -6.179958e-05, -0.04...",television,"(india, electricity, fluctuation, trip, common..."
19961,"[-0.06352922, 0.001389485, -0.050164126, -0.08...",television,"(new, delhi, planning, buying, scooter, rs, di..."
19962,"[0.043759476, 0.06374874, 0.0001472801, 0.0219...",television,"(malayalam, tv, long, list, reality, show, boa..."
19963,"[-0.06352922, -0.042988747, 0.06374874, -0.020...",television,"(new, mythological, tv, show, dnyaneshwar, mau..."


In [None]:
final_df24.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/television_.csv")

In [None]:
category_dfs['travel']
embedding1 = category_dfs['travel'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['travel'].copy()
embedding
final_df25 = pd.concat([em, embedding], axis = 1)
final_df25

Unnamed: 0,embeddings,target,tokens
0,"[-0.08897043, -0.0015902787, -0.0074815345, -0...",travel,"(james, beard, legendary, american, chef, writ..."
1,"[0.019525371, 0.026340218, -0.009900642, 0.054...",travel,"(mumbai, online, travel, site, release, monthl..."
2,"[0.0437402, -0.029157918, 0.014715543, 0.04680...",travel,"(chennai, summer, several, train, chennai, alr..."
3,"[-0.06352922, 0.001389485, -0.023457803, -0.06...",travel,"(new, delhi, buoy, success, paisa, travel, ins..."
4,"[-0.0067641144, -0.0074231494, 0.021509638, -0...",travel,"(amenity, color, us, shock, guest, prefer, pay..."
...,...,...,...
19907,"[0.019525371, 0.0011394823, -0.019755138, -0.0...",travel,"(mumbai, second, consecutive, year, dubai, emi..."
19908,"[0.0011394823, -0.028558142, -0.014284893, 0.0...",travel,"(second, big, startup, hub, bengaluru, say, is..."
19909,"[-0.0035659717, 0.031667084, -0.05478721, -0.0...",travel,"(annually, world, little, town, say, shires, r..."
19910,"[-0.02208984, -0.056040343, 0.04185174, -0.085...",travel,"(youre, talk, donna, think, thing, need, make,..."


In [None]:
final_df25.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/travel_.csv")

In [None]:
category_dfs['video gaming']
embedding1 = category_dfs['video gaming'].tokens.apply(generate_glove_embeddings)
em = pd.DataFrame(embedding1)
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
embedding = category_dfs['video gaming'].copy()
embedding
final_df26 = pd.concat([em, embedding], axis = 1)
final_df26

Unnamed: 0,embeddings,target,tokens
0,"[0.024659358, -0.10918879, -0.033521418, -0.02...",video gaming,"(marvel, announce, next, big, spiderman, cross..."
1,"[0.006230319, -0.03893806, -0.06352922, 0.0302...",video gaming,"(unlike, many, new, release, dont, expect, chr..."
2,"[0.0, -0.012385723, 0.0, -0.0036891627, 0.0969...",video gaming,"(fathersin, addition, fourtime, champion, lang..."
3,"[0.0, 0.028791033, -0.013642687, -0.00282266, ...",video gaming,"(gloomhaven, button, bug, reveal, surprise, ne..."
4,"[-0.021043224, -0.023893595, -0.086239874, 0.0...",video gaming,"(technology, use, political, communicationthos..."
...,...,...,...
19937,"[-0.01721782, 0.07121307, 0.0, 0.014257083, 0....",video gaming,"(nagpur, akshay, zadgaonkar, child, prodigy, c..."
19938,"[0.0508124, -0.06962599, 0.07814604, 0.0224533...",video gaming,"(bayonetta, lead, hideki, kamiya, reckons, lat..."
19939,"[-0.0638693, 0.0067938687, -0.08596944, 0.0171...",video gaming,"(al, pacino, think, original, godfather, well,..."
19940,"[-0.08438859, 0.045592237, 0.0, -0.07062508, -...",video gaming,"(late, episode, imlie, begin, aryan, receive, ..."


In [None]:
final_df26.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/video_gaming_.csv")

In [None]:
df1=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/academic_interests_.csv")
df2=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/arts_and_culture_.csv")
df3=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/automotives_.csv")
df4=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/books_and_literature_.csv")
df5=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/business_and_finance_.csv")
df6=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/careers_.csv")
df7=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/family_and_relationships_.csv")
df8=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/food_and_drinks_.csv")
df9=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/healthy_living_.csv")
df10=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/health_.csv")
df11=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/hobbies_and_interests_.csv")
df12=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/home_and_garden.csv")
df13=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/movies_.csv")
df14=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/music_and_audio_.csv")
df15=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/news_and_politics_.csv")
df16=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/personal_and_finanace_.csv")
df17=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/pets_.csv")
df18=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/pharmaceuticals_conditions_and_symptoms_.csv")
df20=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/real_estate_.csv")
df19=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/shopping_.csv")
df21=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/sports_.csv")
df22=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/style_and_fashion_.csv")
df23=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/technology_and_computing_.csv")
df24=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/travel_.csv")
df25=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/television_.csv")
df26=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/video_gaming_.csv")

In [None]:
# List of all DataFrames
dfs = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15, df16, df17, df18, df19, df20, df21, df22, df23, df24, df25, df26]

# Combine DataFrames and reset index
_df = pd.concat(dfs, ignore_index=True)

_df

Unnamed: 0.1,Unnamed: 0,embeddings,target,tokens
0,0,[-0.06352922 0.00138948 0.09390125 0.109289...,academic interests,"('new', 'delhi', 'andhra', 'pradesh', 'public'..."
1,1,[ 0.02738297 0.02035885 -0.06303439 -0.063529...,academic interests,"('pune', 'two', 'week', 'new', 'academic', 'ye..."
2,2,[ 1.63077712e-02 -5.72717227e-02 -7.62777478e-...,academic interests,"('guwahati', 'result', 'cbse', 'class', 'x', '..."
3,3,[ 8.2849739e-03 -2.2982287e-03 -8.8826463e-02 ...,academic interests,"('admission', 'iims', 'say', 'kapoor', 'across..."
4,4,[ 0. 0.05174642 -0.03536656 -0.021043...,academic interests,"('mangaluru', 'mangalore', 'institute', 'techn..."
...,...,...,...,...
517493,19937,[-0.01721782 0.07121307 0. 0.014257...,video gaming,"('nagpur', 'akshay', 'zadgaonkar', 'child', 'p..."
517494,19938,[ 0.0508124 -0.06962599 0.07814604 0.022453...,video gaming,"('bayonetta', 'lead', 'hideki', 'kamiya', 'rec..."
517495,19939,[-0.0638693 0.00679387 -0.08596944 0.017149...,video gaming,"('al', 'pacino', 'think', 'original', 'godfath..."
517496,19940,[-0.08438859 0.04559224 0. -0.070625...,video gaming,"('late', 'episode', 'imlie', 'begin', 'aryan',..."


In [None]:
_df.to_csv("/content/drive/MyDrive/TIL/Word_Embeddings/_df_.csv")

In [None]:
import pandas as pd
df=pd.read_csv("/content/drive/MyDrive/TIL/Word_Embeddings/_df_.csv")
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,embeddings,target,tokens
0,0,0,[-0.06352922 0.00138948 0.09390125 0.109289...,academic interests,"('new', 'delhi', 'andhra', 'pradesh', 'public'..."
1,1,1,[ 0.02738297 0.02035885 -0.06303439 -0.063529...,academic interests,"('pune', 'two', 'week', 'new', 'academic', 'ye..."
2,2,2,[ 1.63077712e-02 -5.72717227e-02 -7.62777478e-...,academic interests,"('guwahati', 'result', 'cbse', 'class', 'x', '..."
3,3,3,[ 8.2849739e-03 -2.2982287e-03 -8.8826463e-02 ...,academic interests,"('admission', 'iims', 'say', 'kapoor', 'across..."
4,4,4,[ 0. 0.05174642 -0.03536656 -0.021043...,academic interests,"('mangaluru', 'mangalore', 'institute', 'techn..."
...,...,...,...,...,...
517493,517493,19937,[-0.01721782 0.07121307 0. 0.014257...,video gaming,"('nagpur', 'akshay', 'zadgaonkar', 'child', 'p..."
517494,517494,19938,[ 0.0508124 -0.06962599 0.07814604 0.022453...,video gaming,"('bayonetta', 'lead', 'hideki', 'kamiya', 'rec..."
517495,517495,19939,[-0.0638693 0.00679387 -0.08596944 0.017149...,video gaming,"('al', 'pacino', 'think', 'original', 'godfath..."
517496,517496,19940,[-0.08438859 0.04559224 0. -0.070625...,video gaming,"('late', 'episode', 'imlie', 'begin', 'aryan',..."


In [None]:
# Drop the "Unnamed: 0" column
df = df.drop("Unnamed: 0.1", axis=1)

In [None]:
df = df.drop(columns=['tokens'])

In [None]:
df

Unnamed: 0.1,Unnamed: 0,embeddings,target
0,0,[-0.06352922 0.00138948 0.09390125 0.109289...,academic interests
1,1,[ 0.02738297 0.02035885 -0.06303439 -0.063529...,academic interests
2,2,[ 1.63077712e-02 -5.72717227e-02 -7.62777478e-...,academic interests
3,3,[ 8.2849739e-03 -2.2982287e-03 -8.8826463e-02 ...,academic interests
4,4,[ 0. 0.05174642 -0.03536656 -0.021043...,academic interests
...,...,...,...
517493,19937,[-0.01721782 0.07121307 0. 0.014257...,video gaming
517494,19938,[ 0.0508124 -0.06962599 0.07814604 0.022453...,video gaming
517495,19939,[-0.0638693 0.00679387 -0.08596944 0.017149...,video gaming
517496,19940,[-0.08438859 0.04559224 0. -0.070625...,video gaming


In [None]:
import numpy as np
def convert_embeddings(embeddings):
  embeddings=embeddings.strip("[]")
  return np.array([float(value) for value in embeddings.split()])
df["embeddings"]=df["embeddings"].apply(convert_embeddings)

In [None]:
df

Unnamed: 0.1,Unnamed: 0,embeddings,target
0,0,"[-0.06352922, 0.00138948, 0.09390125, 0.109289...",academic interests
1,1,"[0.02738297, 0.02035885, -0.06303439, -0.06352...",academic interests
2,2,"[0.0163077712, -0.0572717227, -0.0762777478, -...",academic interests
3,3,"[0.0082849739, -0.0022982287, -0.088826463, 0....",academic interests
4,4,"[0.0, 0.05174642, -0.03536656, -0.02104322, 0....",academic interests
...,...,...,...
517493,19937,"[-0.01721782, 0.07121307, 0.0, 0.01425708, 0.0...",video gaming
517494,19938,"[0.0508124, -0.06962599, 0.07814604, 0.0224533...",video gaming
517495,19939,"[-0.0638693, 0.00679387, -0.08596944, 0.017149...",video gaming
517496,19940,"[-0.08438859, 0.04559224, 0.0, -0.07062508, -0...",video gaming


In [None]:
X = df['embeddings']
y = df['target']

In [None]:
categories=df.target.unique()
categories

array(['academic interests', 'arts and culture', 'automotives',
       'books and literature', 'business and finance', 'careers',
       'family and relationships', 'food and drinks', 'healthy living',
       'health', 'hobbies and interests', 'home and garden', 'movies',
       'music and audio', 'news and politics', 'personal finance', 'pets',
       'pharmaceuticals, conditions, and symptoms', 'shopping',
       'real estate', 'sports', 'style and fashion',
       'technology and computing', 'travel', 'television', 'video gaming'],
      dtype=object)

In [None]:
sample_df = pd.DataFrame(columns=['embeddings', 'target'])
for category in categories:
    category_samples = df[df['target'] == category].sample(2000)
    sample_df = pd.concat([sample_df, category_samples])

sample_df.reset_index(drop=True, inplace=True)
sample_df

Unnamed: 0.1,embeddings,target,Unnamed: 0
0,"[-0.01174853, 0.03625012, -0.07089538, 0.02652...",academic interests,6238.0
1,"[-0.00763873, -0.06327738, -0.0002243, -0.0179...",academic interests,3295.0
2,"[0.01355462, -0.02325785, -0.03234862, 0.00307...",academic interests,10644.0
3,"[-0.02598906, -0.07737079, -0.07651789, 0.0677...",academic interests,10470.0
4,"[0.00138948, -0.05433779, -0.08179064, -0.0140...",academic interests,6869.0
...,...,...,...
51995,"[0.00735696, -0.01887533, 0.01385185, -0.02508...",video gaming,19742.0
51996,"[0.0, 0.00466237, 0.04778038, -0.0397411, -0.0...",video gaming,2825.0
51997,"[0.01431744, 0.00363123, 0.0, -0.05675694, 0.0...",video gaming,12783.0
51998,"[-0.08367307, -0.08882646, 0.0422884, 0.031815...",video gaming,3382.0


In [None]:
from sklearn.model_selection import train_test_split

# Assuming your DataFrame is named 'df' and the target column is named 'target'
X = sample_df['embeddings'].values  # Features (all columns except the target)
y = sample_df['target'].values  # Target column

# Split the data while ensuring equal distribution of target values
# X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.75, stratify=y, random_state=42)

# Now, X_train, X_test, y_train, and y_test contain your training and testing data

In [None]:
max_length = max(len(seq) for seq in X)
max_length


500

In [None]:
padded_x = []
for seq in X:
    seq_len = len(seq)
    padded_seq = np.pad(seq, (0, max_length - seq_len), constant_values=0.0)
    padded_x.append(padded_seq)

padded_x = np.array(padded_x, dtype=np.float32)
padded_x

array([[-0.01174853,  0.03625012, -0.07089538, ...,  0.        ,
         0.        ,  0.        ],
       [-0.00763873, -0.06327738, -0.0002243 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.01355462, -0.02325785, -0.03234862, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.01431744,  0.00363123,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.08367307, -0.08882646,  0.0422884 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.01412074,  0.01681493,  0.03469527, ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [None]:
padded_x.shape

(52000, 500)

In [None]:
# Split the data while ensuring equal distribution of target values
X_train, X_test, y_train, y_test = train_test_split(padded_x, y,train_size=0.80, stratify=y, random_state=42)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Create an SVM classifier with a linear kernel (you can adjust the kernel and C parameter)
svm_classifier = SVC(kernel='linear', C=6)

In [None]:
# Fit the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = svm_classifier.predict(X_test)