In [9]:
import numpy as np
import pandas as pd
import ast
import torch
from transformers import BertModel, BertTokenizer

In [2]:
preprocessed_df = pd.read_csv('preprocessed_sample_df.csv')
preprocessed_df

Unnamed: 0,text,target,tokens
0,last summer my granddaughter brooke played in ...,academic interests,"['last', 'summer', 'granddaughter', 'brooke', ..."
1,new delhi uttar pradesh police recruitment boa...,academic interests,"['new', 'delhi', 'uttar', 'pradesh', 'police',..."
2,belgaum karnataka higher education minister c...,academic interests,"['belgaum', 'karnataka', 'high', 'education', ..."
3,lincoln university has welcomed the governmen...,academic interests,"['lincoln', 'university', 'welcome', 'governme..."
4,new delhi the uttar pradesh public service com...,academic interests,"['new', 'delhi', 'uttar', 'pradesh', 'public',..."
...,...,...,...
517707,death stranding has been announced at the ga...,video gaming,"['death', 'stranding', 'announce', 'game', 'aw..."
517708,raj kundra dons multiple hats but one thing th...,video gaming,"['raj', 'kundra', 'don', 'multiple', 'hat', 'o..."
517709,mario rabbids sparks of hope one of the best...,video gaming,"['mario', 'rabbids', 'spark', 'hope', 'one', '..."
517710,ahmedabad a yearold man from adalaj who marrie...,video gaming,"['ahmedabad', 'yearold', 'man', 'adalaj', 'mar..."


In [3]:
preprocessed_df.iloc[0].tokens

"['last', 'summer', 'granddaughter', 'brooke', 'play', 'us', 'kid', 'golf', 'world', 'championship', 'pinehurst', 'north', 'carolina', 'pay', 'think', 'inside', 'box']"

In [4]:
preprocessed_df.tokens = preprocessed_df.tokens.apply(ast.literal_eval)

## Sampling code for generating word embeddings using 'bert-base-uncased' model

In [5]:
sample = preprocessed_df.head(100)
sample

Unnamed: 0,text,target,tokens
0,last summer my granddaughter brooke played in ...,academic interests,"[last, summer, granddaughter, brooke, play, us..."
1,new delhi uttar pradesh police recruitment boa...,academic interests,"[new, delhi, uttar, pradesh, police, recruitme..."
2,belgaum karnataka higher education minister c...,academic interests,"[belgaum, karnataka, high, education, minister..."
3,lincoln university has welcomed the governmen...,academic interests,"[lincoln, university, welcome, government, dec..."
4,new delhi the uttar pradesh public service com...,academic interests,"[new, delhi, uttar, pradesh, public, service, ..."
...,...,...,...
95,visakhapatnam apprehensive parents of students...,academic interests,"[visakhapatnam, apprehensive, parent, student,..."
96,new delhi the kartet answer key has been rele...,academic interests,"[new, delhi, kartet, answer, key, release, nov..."
97,ahmedabad the new education policy nep has bee...,academic interests,"[ahmedabad, new, education, policy, nep, prepa..."
98,new delhi defence research development organi...,academic interests,"[new, delhi, defence, research, development, o..."


In [6]:
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
def generate_embeddings(token):
    token_ids = tokenizer.encode(token, add_special_tokens=True, max_length=512, return_tensors='pt', truncation=True)
    
    with torch.no_grad():
        outputs = model(token_ids)
        token_embeddings = outputs.last_hidden_state
#     token_embeddings = token_embeddings.numpy()
    
    return token_embeddings

In [27]:
sample['embeddings'] = sample['tokens'].apply(generate_embeddings)
sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['embeddings'] = sample['tokens'].apply(generate_embeddings)


Unnamed: 0,text,target,tokens,embeddings
0,last summer my granddaughter brooke played in ...,academic interests,"[last, summer, granddaughter, brooke, play, us...","[[[tensor(-0.0398), tensor(0.0898), tensor(0.2..."
1,new delhi uttar pradesh police recruitment boa...,academic interests,"[new, delhi, uttar, pradesh, police, recruitme...","[[[tensor(-0.0948), tensor(0.0705), tensor(0.0..."
2,belgaum karnataka higher education minister c...,academic interests,"[belgaum, karnataka, high, education, minister...","[[[tensor(-0.0959), tensor(0.0608), tensor(0.1..."
3,lincoln university has welcomed the governmen...,academic interests,"[lincoln, university, welcome, government, dec...","[[[tensor(0.1248), tensor(0.2531), tensor(0.22..."
4,new delhi the uttar pradesh public service com...,academic interests,"[new, delhi, uttar, pradesh, public, service, ...","[[[tensor(-0.3586), tensor(0.0036), tensor(-0...."
...,...,...,...,...
95,visakhapatnam apprehensive parents of students...,academic interests,"[visakhapatnam, apprehensive, parent, student,...","[[[tensor(0.0386), tensor(0.2406), tensor(0.09..."
96,new delhi the kartet answer key has been rele...,academic interests,"[new, delhi, kartet, answer, key, release, nov...","[[[tensor(-0.0935), tensor(0.1588), tensor(0.1..."
97,ahmedabad the new education policy nep has bee...,academic interests,"[ahmedabad, new, education, policy, nep, prepa...","[[[tensor(-0.1235), tensor(-0.0790), tensor(0...."
98,new delhi defence research development organi...,academic interests,"[new, delhi, defence, research, development, o...","[[[tensor(-0.0390), tensor(0.1840), tensor(0.0..."


In [28]:
sample.iloc[0].embeddings

tensor([[[-0.0398,  0.0898,  0.2421,  ..., -0.3452,  0.1710,  0.1143],
         [-0.5097, -0.8875,  0.5615,  ..., -0.3648,  0.1800,  0.1951],
         [-0.1292,  0.2927,  0.0892,  ..., -0.7827,  0.0801, -0.8439],
         ...,
         [ 0.3515, -0.3867,  0.6344,  ...,  0.0792,  0.3978, -0.3380],
         [ 0.0046, -0.3359,  0.2507,  ...,  0.0475, -0.0955,  0.0109],
         [ 0.4190,  0.3783, -0.0905,  ...,  0.0106, -0.6071, -0.1755]]])

## Generating Word Embeddings for each category using BERT

In [43]:
preprocessed_df['target'].unique()

array(['academic interests', 'arts and culture', 'automotive',
       'books and literature', 'business and finance', 'careers',
       'family and relationships', 'food and drink', 'health',
       'healthy living', 'hobbies and interests', 'home and garden',
       'movies', 'music and audio', 'news and politics',
       'personal finance', 'pets',
       'pharmaceuticals, conditions, and symptoms', 'real estate',
       'shopping', 'sports', 'style and fashion',
       'technology and computing', 'television', 'travel', 'video gaming'],
      dtype=object)

In [34]:
def category_dataframes(preprocessed_df):
   
    unique_categories = preprocessed_df['target'].unique()
    
    category_dataframes = {}
    
    for category in unique_categories:
        filtered_df = preprocessed_df[preprocessed_df['target'] == category].copy()
        filtered_df.reset_index(drop=True, inplace=True)
        category_dataframes[category] = filtered_df

    return category_dataframes

category_dfs = category_dataframes(preprocessed_df)

#### ACADEMIC INTERESTS

In [35]:
category_dfs['academic interests']

Unnamed: 0,text,target,tokens
0,last summer my granddaughter brooke played in ...,academic interests,"[last, summer, granddaughter, brooke, play, us..."
1,new delhi uttar pradesh police recruitment boa...,academic interests,"[new, delhi, uttar, pradesh, police, recruitme..."
2,belgaum karnataka higher education minister c...,academic interests,"[belgaum, karnataka, high, education, minister..."
3,lincoln university has welcomed the governmen...,academic interests,"[lincoln, university, welcome, government, dec..."
4,new delhi the uttar pradesh public service com...,academic interests,"[new, delhi, uttar, pradesh, public, service, ..."
...,...,...,...
19995,the admission committee for professional under...,academic interests,"[admission, committee, professional, undergrad..."
19996,new delhi a virtual meeting of g education min...,academic interests,"[new, delhi, virtual, meeting, g, education, m..."
19997,jaipur anil a devoted fifth grade student of a...,academic interests,"[jaipur, anil, devote, fifth, grade, student, ..."
19998,jayanti bharadwaj final year psychology honou...,academic interests,"[jayanti, bharadwaj, final, year, psychology, ..."


In [36]:
embed1 = category_dfs['academic interests'].tokens.apply(generate_embeddings)

In [37]:
embed1

0        [[[tensor(-0.0398), tensor(0.0898), tensor(0.2...
1        [[[tensor(-0.0948), tensor(0.0705), tensor(0.0...
2        [[[tensor(-0.0959), tensor(0.0608), tensor(0.1...
3        [[[tensor(0.1248), tensor(0.2531), tensor(0.22...
4        [[[tensor(-0.3586), tensor(0.0036), tensor(-0....
                               ...                        
19995    [[[tensor(-0.1531), tensor(0.2153), tensor(-0....
19996    [[[tensor(0.0786), tensor(0.2683), tensor(0.11...
19997    [[[tensor(0.1589), tensor(-0.0967), tensor(0.1...
19998    [[[tensor(0.0722), tensor(0.2184), tensor(0.15...
19999    [[[tensor(-0.0039), tensor(0.0708), tensor(0.4...
Name: tokens, Length: 20000, dtype: object

In [38]:
em = pd.DataFrame(embed1)
em

Unnamed: 0,tokens
0,"[[[tensor(-0.0398), tensor(0.0898), tensor(0.2..."
1,"[[[tensor(-0.0948), tensor(0.0705), tensor(0.0..."
2,"[[[tensor(-0.0959), tensor(0.0608), tensor(0.1..."
3,"[[[tensor(0.1248), tensor(0.2531), tensor(0.22..."
4,"[[[tensor(-0.3586), tensor(0.0036), tensor(-0...."
...,...
19995,"[[[tensor(-0.1531), tensor(0.2153), tensor(-0...."
19996,"[[[tensor(0.0786), tensor(0.2683), tensor(0.11..."
19997,"[[[tensor(0.1589), tensor(-0.0967), tensor(0.1..."
19998,"[[[tensor(0.0722), tensor(0.2184), tensor(0.15..."


In [40]:
em.rename(columns = {'tokens':'embeddings'}, inplace = True)
em

Unnamed: 0,embeddings
0,"[[[tensor(-0.0398), tensor(0.0898), tensor(0.2..."
1,"[[[tensor(-0.0948), tensor(0.0705), tensor(0.0..."
2,"[[[tensor(-0.0959), tensor(0.0608), tensor(0.1..."
3,"[[[tensor(0.1248), tensor(0.2531), tensor(0.22..."
4,"[[[tensor(-0.3586), tensor(0.0036), tensor(-0...."
...,...
19995,"[[[tensor(-0.1531), tensor(0.2153), tensor(-0...."
19996,"[[[tensor(0.0786), tensor(0.2683), tensor(0.11..."
19997,"[[[tensor(0.1589), tensor(-0.0967), tensor(0.1..."
19998,"[[[tensor(0.0722), tensor(0.2184), tensor(0.15..."


In [39]:
embed = category_dfs['academic interests'].copy()
embed

Unnamed: 0,text,target,tokens
0,last summer my granddaughter brooke played in ...,academic interests,"[last, summer, granddaughter, brooke, play, us..."
1,new delhi uttar pradesh police recruitment boa...,academic interests,"[new, delhi, uttar, pradesh, police, recruitme..."
2,belgaum karnataka higher education minister c...,academic interests,"[belgaum, karnataka, high, education, minister..."
3,lincoln university has welcomed the governmen...,academic interests,"[lincoln, university, welcome, government, dec..."
4,new delhi the uttar pradesh public service com...,academic interests,"[new, delhi, uttar, pradesh, public, service, ..."
...,...,...,...
19995,the admission committee for professional under...,academic interests,"[admission, committee, professional, undergrad..."
19996,new delhi a virtual meeting of g education min...,academic interests,"[new, delhi, virtual, meeting, g, education, m..."
19997,jaipur anil a devoted fifth grade student of a...,academic interests,"[jaipur, anil, devote, fifth, grade, student, ..."
19998,jayanti bharadwaj final year psychology honou...,academic interests,"[jayanti, bharadwaj, final, year, psychology, ..."


In [41]:
final_df = pd.concat([em, embed], axis = 1)
final_df

In [42]:
final_df

Unnamed: 0,embeddings,text,target,tokens
0,"[[[tensor(-0.0398), tensor(0.0898), tensor(0.2...",last summer my granddaughter brooke played in ...,academic interests,"[last, summer, granddaughter, brooke, play, us..."
1,"[[[tensor(-0.0948), tensor(0.0705), tensor(0.0...",new delhi uttar pradesh police recruitment boa...,academic interests,"[new, delhi, uttar, pradesh, police, recruitme..."
2,"[[[tensor(-0.0959), tensor(0.0608), tensor(0.1...",belgaum karnataka higher education minister c...,academic interests,"[belgaum, karnataka, high, education, minister..."
3,"[[[tensor(0.1248), tensor(0.2531), tensor(0.22...",lincoln university has welcomed the governmen...,academic interests,"[lincoln, university, welcome, government, dec..."
4,"[[[tensor(-0.3586), tensor(0.0036), tensor(-0....",new delhi the uttar pradesh public service com...,academic interests,"[new, delhi, uttar, pradesh, public, service, ..."
...,...,...,...,...
19995,"[[[tensor(-0.1531), tensor(0.2153), tensor(-0....",the admission committee for professional under...,academic interests,"[admission, committee, professional, undergrad..."
19996,"[[[tensor(0.0786), tensor(0.2683), tensor(0.11...",new delhi a virtual meeting of g education min...,academic interests,"[new, delhi, virtual, meeting, g, education, m..."
19997,"[[[tensor(0.1589), tensor(-0.0967), tensor(0.1...",jaipur anil a devoted fifth grade student of a...,academic interests,"[jaipur, anil, devote, fifth, grade, student, ..."
19998,"[[[tensor(0.0722), tensor(0.2184), tensor(0.15...",jayanti bharadwaj final year psychology honou...,academic interests,"[jayanti, bharadwaj, final, year, psychology, ..."


In [46]:
final_df.to_csv('final_df.csv', index = False)

#### AUTOMOTIVE

In [52]:
embed2 = category_dfs['automotive'].copy()
embed2

Unnamed: 0,text,target,tokens
0,five stars extended vision poor not a secure f...,automotive,"[five, star, extend, vision, poor, secure, fit..."
1,good the air conditioner in my gti had a horri...,automotive,"[good, air, conditioner, gti, horrible, smell,..."
2,its working fine i really like the shutoff ti...,automotive,"[work, fine, really, like, shutoff, timer, war..."
3,did not work for us bummer deal dificult to mo...,automotive,"[work, us, bummer, deal, dificult, mount, vibr..."
4,does a great job truly great product a must ha...,automotive,"[great, job, truly, great, product, must, amaz..."
...,...,...,...
19995,overpriced with short life span five stars i w...,automotive,"[overprice, short, life, span, five, star, wis..."
19996,compact cooler great product proper v ac adapt...,automotive,"[compact, cooler, great, product, proper, v, a..."
19997,s top cleaner and conditioner yes and no amazi...,automotive,"[top, cleaner, conditioner, yes, amaze, stuff,..."
19998,does not fit does not fit my chrysler voyager...,automotive,"[fit, fit, chrysler, voyager, fit, dodge, stra..."


In [54]:
empty_list_mask = embed2['tokens'].apply(lambda x: len(x) == 0)
embed2 = embed2[~empty_list_mask]
embed2.reset_index(drop=True, inplace=True)

In [55]:
embed2['embeddings'] = embed2['tokens'].apply(generate_embeddings)
embed2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embed2['embeddings'] = embed2['tokens'].apply(generate_embeddings)


Unnamed: 0,text,target,tokens,embeddings
0,five stars extended vision poor not a secure f...,automotive,"[five, star, extend, vision, poor, secure, fit...","[[[tensor(-0.5493), tensor(-0.2083), tensor(0...."
1,good the air conditioner in my gti had a horri...,automotive,"[good, air, conditioner, gti, horrible, smell,...","[[[tensor(-0.2094), tensor(0.3140), tensor(-0...."
2,its working fine i really like the shutoff ti...,automotive,"[work, fine, really, like, shutoff, timer, war...","[[[tensor(-0.1737), tensor(-0.0404), tensor(0...."
3,did not work for us bummer deal dificult to mo...,automotive,"[work, us, bummer, deal, dificult, mount, vibr...","[[[tensor(0.0072), tensor(0.4846), tensor(-0.1..."
4,does a great job truly great product a must ha...,automotive,"[great, job, truly, great, product, must, amaz...","[[[tensor(-0.0311), tensor(0.2784), tensor(-0...."
...,...,...,...,...
19994,overpriced with short life span five stars i w...,automotive,"[overprice, short, life, span, five, star, wis...","[[[tensor(0.1238), tensor(-0.1887), tensor(0.0..."
19995,compact cooler great product proper v ac adapt...,automotive,"[compact, cooler, great, product, proper, v, a...","[[[tensor(-0.6659), tensor(0.2869), tensor(0.0..."
19996,s top cleaner and conditioner yes and no amazi...,automotive,"[top, cleaner, conditioner, yes, amaze, stuff,...","[[[tensor(-0.0725), tensor(0.1614), tensor(-0...."
19997,does not fit does not fit my chrysler voyager...,automotive,"[fit, fit, chrysler, voyager, fit, dodge, stra...","[[[tensor(-0.6109), tensor(-0.0556), tensor(0...."


In [60]:
final_df = pd.concat([final_df, embed2], axis = 0)

In [61]:
final_df

Unnamed: 0,embeddings,text,target,tokens
0,"tensor([[[-0.0398, 0.0898, 0.2421, ..., -0....",last summer my granddaughter brooke played in ...,academic interests,"['last', 'summer', 'granddaughter', 'brooke', ..."
1,"tensor([[[-0.0948, 0.0705, 0.0733, ..., -0....",new delhi uttar pradesh police recruitment boa...,academic interests,"['new', 'delhi', 'uttar', 'pradesh', 'police',..."
2,"tensor([[[-0.0959, 0.0608, 0.1507, ..., -0....",belgaum karnataka higher education minister c...,academic interests,"['belgaum', 'karnataka', 'high', 'education', ..."
3,"tensor([[[ 0.1248, 0.2531, 0.2269, ..., -0....",lincoln university has welcomed the governmen...,academic interests,"['lincoln', 'university', 'welcome', 'governme..."
4,"tensor([[[-3.5864e-01, 3.5984e-03, -3.4623e-0...",new delhi the uttar pradesh public service com...,academic interests,"['new', 'delhi', 'uttar', 'pradesh', 'public',..."
...,...,...,...,...
19994,"[[[tensor(0.1238), tensor(-0.1887), tensor(0.0...",overpriced with short life span five stars i w...,automotive,"[overprice, short, life, span, five, star, wis..."
19995,"[[[tensor(-0.6659), tensor(0.2869), tensor(0.0...",compact cooler great product proper v ac adapt...,automotive,"[compact, cooler, great, product, proper, v, a..."
19996,"[[[tensor(-0.0725), tensor(0.1614), tensor(-0....",s top cleaner and conditioner yes and no amazi...,automotive,"[top, cleaner, conditioner, yes, amaze, stuff,..."
19997,"[[[tensor(-0.6109), tensor(-0.0556), tensor(0....",does not fit does not fit my chrysler voyager...,automotive,"[fit, fit, chrysler, voyager, fit, dodge, stra..."


In [62]:
final_df.to_csv('final_df.csv', index = False)

#### BOOKS AND LITERATURE

In [66]:
def generate_embeddings_batch(token):
    token_ids = [tokenizer.encode(sent, add_special_tokens=True, max_length=512, truncation=True) for sent in token]

    max_seq_length = max(len(tokens) for tokens in token_ids)
    token_ids = [tokens + [tokenizer.pad_token_id] * (max_seq_length - len(tokens)) for tokens in token_ids]
    token_ids = torch.tensor(token_ids, dtype=torch.long)

    with torch.no_grad():
        outputs = model(token_ids, return_dict=True)
        token_embeddings = outputs.last_hidden_state
    token_embeddings = token_embeddings.numpy()
    
    return token_embeddings

In [67]:
embed3 = category_dfs['books and literature'].copy()
empty_list_mask = embed3['tokens'].apply(lambda x: len(x) == 0)
embed3 = embed3[~empty_list_mask]
embed3.reset_index(drop=True, inplace=True)

In [68]:
batch_size = 32
batches = [embed3['tokens'][i:i + batch_size] for i in range(0, len(embed3), batch_size)]

embeddings_list = []

for batch in batches:
    token_embeddings_batch = generate_embeddings_batch(batch)
    embeddings_list.append(token_embeddings_batch)

token_embeddings_all = np.concatenate(embeddings_list, axis=0)
embed2['embeddings'] = token_embeddings_all.tolist()

RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 46301184 bytes.

In [71]:
len(embeddings_list)

232