# Import Data

In [None]:
import json

# Load the Urban Dictionary entries that the cells below annotate.
# NOTE(review): this reads the same file name that the last cell of this notebook
# writes ('urban_dict_data_cleaned_emo.json') — confirm this is the intended
# input and not a leftover from a previous run.
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default encoding.
with open('urban_dict_data_cleaned_emo.json', 'r', encoding='utf-8') as file:
    urban_dict_data = json.load(file)

# Sentiment Analysis Model

In [None]:
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

# Load the tokenizer, config, and sequence-classification model for the
# sentiment checkpoint.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the weights to the chosen device and make inference mode explicit
# (dropout disabled). Both calls return the module itself, so the trailing
# semicolon keeps the notebook from echoing the whole architecture as output.
model.to(device).eval();

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# Report which device the sentiment model was placed on.
print(str(device))

cuda


# Sentiment Classification

In [None]:
def classify_sentiment_batch(sentences, batch_size=32):
    """Label each sentence with its most probable sentiment class.

    The sentences are processed in consecutive slices of ``batch_size``. Each
    slice is tokenized with the module-level ``tokenizer`` (padded, truncated
    to 512 tokens), moved to the module-level ``device`` and passed through the
    module-level ``model``; class ids are mapped to names via
    ``config.id2label``.

    Args:
        sentences: Sliceable sequence of input texts.
        batch_size: Number of sentences sent through the model per step.

    Returns:
        A ``(labels, scores)`` tuple of two equal-length lists: the winning
        label of every sentence and its softmax probability rounded to four
        decimals.
    """
    all_labels, all_scores = [], []

    for i in range(0, len(sentences), batch_size):
        print(f"Starting batch {i}", flush=True)

        # Tokenize this slice and place every tensor on the inference device.
        tokens = tokenizer(
            sentences[i:i + batch_size],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt',
        )
        model_inputs = {name: tensor.to(device) for name, tensor in tokens.items()}

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            logits = model(**model_inputs).logits

        # Softmax is computed on a CPU copy: one probability row per sentence.
        probabilities = softmax(logits.detach().cpu().numpy(), axis=-1)

        # Keep only the winning class (and its probability) of every row.
        for row in probabilities:
            all_labels.append(config.id2label[np.argmax(row)])
            all_scores.append(np.round(float(np.max(row)), 4))

    return all_labels, all_scores

In [None]:
# Collect, for every term, the definition and example text stored in the first
# item of its "top_5_entries" list.
definitions = [term["top_5_entries"][0]["definition"] for term in urban_dict_data.values()]
examples = [term["top_5_entries"][0]["example"] for term in urban_dict_data.values()]

# Show the first definition/example pair as a quick sanity check.
print(definitions[0], examples[0], sep="\n")

A versatile declaration, originating (more or less) in hip-hop culture.

"Word" has no single meaning, but is used to convey a casual sense of affirmation, acknowledgement, agreement, or to indicate that something has impressed you favorably.

Its usage among young blacks has been parodied ad nauseam among clueless suburban whites.

"Come on, man, we're going to the store."
"Word."



In [None]:
# Run the sentiment model over both text fields. Only the labels (first element
# of the returned pair) are kept; the confidence scores are discarded.
definition_labels = classify_sentiment_batch(definitions, batch_size=32)[0]
example_labels = classify_sentiment_batch(examples, batch_size=32)[0]

Starting batch 0
Starting batch 32
Starting batch 64
Starting batch 96
Starting batch 128
Starting batch 160
Starting batch 192
Starting batch 224
Starting batch 256
Starting batch 288
Starting batch 320
Starting batch 352
Starting batch 384
Starting batch 416
Starting batch 448
Starting batch 480
Starting batch 512
Starting batch 544
Starting batch 576
Starting batch 608
Starting batch 640
Starting batch 672
Starting batch 704
Starting batch 736
Starting batch 768
Starting batch 800
Starting batch 832
Starting batch 864
Starting batch 896
Starting batch 928
Starting batch 960
Starting batch 992
Starting batch 1024
Starting batch 1056
Starting batch 1088
Starting batch 1120
Starting batch 1152
Starting batch 1184
Starting batch 1216
Starting batch 1248
Starting batch 1280
Starting batch 1312
Starting batch 1344
Starting batch 1376
Starting batch 1408
Starting batch 1440
Starting batch 1472
Starting batch 1504
Starting batch 1536
Starting batch 1568
Starting batch 1600
Starting batch 16

In [None]:
# Preview the first 20 sentiment labels of each field, one list per line.
print(definition_labels[:20], example_labels[:20], sep="\n")

['neutral', 'negative', 'neutral', 'neutral', 'neutral', 'negative', 'negative', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'negative', 'negative', 'neutral', 'negative', 'negative']
['neutral', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'neutral', 'negative', 'positive', 'negative', 'neutral', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'neutral', 'neutral']


In [None]:
# Attach the sentiment labels to the first entry of every term.
# The label lists were produced by iterating urban_dict_data in this same
# order, so position i of each list belongs to the i-th term. Fail loudly if
# the sizes ever drift apart instead of raising halfway through the loop or
# leaving some terms unlabeled.
assert len(definition_labels) == len(example_labels) == len(urban_dict_data), \
    "label lists must contain exactly one entry per term"

for value, definition_label, example_label in zip(
        urban_dict_data.values(), definition_labels, example_labels):
    top_entry = value["top_5_entries"][0]
    top_entry["definition_sentiment_label"] = definition_label
    top_entry["example_sentiment_label"] = example_label

In [None]:
# Write the sentiment-annotated data to disk as indented JSON.
with open("urban_dict_data_cleaned_sent.json", mode="w") as json_file:
    json.dump(urban_dict_data, fp=json_file, indent=4)

# Hand the saved file to the browser as a download (Colab-only helper).
from google.colab import files
files.download("urban_dict_data_cleaned_sent.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Emotion Classification

In [None]:
import json

# Read the sentiment-annotated dataset back from disk so the emotion section
# can start from the saved file.
with open('urban_dict_data_cleaned_sent.json') as sent_file:
    urban_dict_data = json.loads(sent_file.read())

In [None]:
from transformers import pipeline

import torch

# Build the emotion text-classification pipeline.
# * top_k=None asks for the score of every label; it replaces the deprecated
#   return_all_scores=True flag.
# * The pipeline runs on the first GPU when CUDA is available and on the CPU
#   (-1) otherwise, instead of unconditionally requiring GPU 0.
pipe = pipeline("text-classification",
                model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest",
                top_k=None,
                device=0 if torch.cuda.is_available() else -1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
def classify_emotion_batch(sentences, batch_size=100):
    """Return the highest-scoring emotion label for every sentence.

    The sentences are fed to the module-level ``pipe`` in consecutive slices of
    ``batch_size``. For each sentence ``pipe`` is expected to yield a list of
    ``{'label': ..., 'score': ...}`` dicts, one per emotion; the label with the
    largest score is kept.

    Args:
        sentences: Sliceable sequence of input texts.
        batch_size: Number of sentences handed to ``pipe`` per call.

    Returns:
        A list with one label per input sentence, in input order.
    """
    labels = []

    for i in range(0, len(sentences), batch_size):
        print(f"Starting batch {i}", flush=True)
        batch = sentences[i:i+batch_size]

        # Truncate at the token level (512 tokens, the same cap that
        # classify_sentiment_batch applies). A character limit imposed by the
        # caller does not bound the token count, and an over-long input would
        # otherwise make the model raise.
        results = pipe(batch, truncation=True, max_length=512)

        # Keep only the top-scoring label of each sentence.
        labels.extend(
            max(result, key=lambda x: x['score'])['label'] for result in results
        )

    return labels

In [None]:
# Gather the text fields for emotion classification, capped by character count
# (500 for definitions, 400 for examples) before they are passed on.
definitions = [term["top_5_entries"][0]["definition"][:500] for term in urban_dict_data.values()]
examples = [term["top_5_entries"][0]["example"][:400] for term in urban_dict_data.values()]

# Show the first definition/example pair as a quick sanity check.
print(definitions[0], examples[0], sep="\n")

A versatile declaration, originating (more or less) in hip-hop culture.

"Word" has no single meaning, but is used to convey a casual sense of affirmation, acknowledgement, agreement, or to indicate that something has impressed you favorably.

Its usage among young blacks has been parodied ad nauseam among clueless suburban whites.

"Come on, man, we're going to the store."
"Word."



In [None]:
# Emotion label for every definition, processed 100 texts at a time.
definition_labels = classify_emotion_batch(sentences=definitions, batch_size=100)

Starting batch 0
Starting batch 100
Starting batch 200
Starting batch 300
Starting batch 400
Starting batch 500
Starting batch 600
Starting batch 700
Starting batch 800
Starting batch 900
Starting batch 1000


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Starting batch 1100
Starting batch 1200
Starting batch 1300
Starting batch 1400
Starting batch 1500
Starting batch 1600
Starting batch 1700
Starting batch 1800
Starting batch 1900
Starting batch 2000
Starting batch 2100
Starting batch 2200
Starting batch 2300
Starting batch 2400
Starting batch 2500
Starting batch 2600
Starting batch 2700
Starting batch 2800
Starting batch 2900
Starting batch 3000
Starting batch 3100
Starting batch 3200
Starting batch 3300
Starting batch 3400
Starting batch 3500
Starting batch 3600
Starting batch 3700
Starting batch 3800
Starting batch 3900
Starting batch 4000
Starting batch 4100
Starting batch 4200
Starting batch 4300
Starting batch 4400
Starting batch 4500
Starting batch 4600
Starting batch 4700
Starting batch 4800
Starting batch 4900
Starting batch 5000
Starting batch 5100
Starting batch 5200
Starting batch 5300
Starting batch 5400
Starting batch 5500
Starting batch 5600
Starting batch 5700
Starting batch 5800
Starting batch 5900
Starting batch 6000


In [None]:
# Emotion label for every usage example, processed 100 texts at a time.
example_labels = classify_emotion_batch(sentences=examples, batch_size=100)

Starting batch 0
Starting batch 100
Starting batch 200
Starting batch 300
Starting batch 400
Starting batch 500
Starting batch 600
Starting batch 700
Starting batch 800
Starting batch 900
Starting batch 1000
Starting batch 1100
Starting batch 1200
Starting batch 1300
Starting batch 1400
Starting batch 1500
Starting batch 1600
Starting batch 1700
Starting batch 1800
Starting batch 1900
Starting batch 2000
Starting batch 2100
Starting batch 2200
Starting batch 2300
Starting batch 2400
Starting batch 2500
Starting batch 2600
Starting batch 2700
Starting batch 2800
Starting batch 2900
Starting batch 3000
Starting batch 3100
Starting batch 3200
Starting batch 3300
Starting batch 3400
Starting batch 3500
Starting batch 3600
Starting batch 3700
Starting batch 3800
Starting batch 3900
Starting batch 4000
Starting batch 4100
Starting batch 4200
Starting batch 4300
Starting batch 4400
Starting batch 4500
Starting batch 4600
Starting batch 4700
Starting batch 4800
Starting batch 4900
Starting bat

In [None]:
# Preview the first 20 emotion labels of each field, one list per line.
print(definition_labels[:20], example_labels[:20], sep="\n")

['joy', 'disgust', 'joy', 'anger', 'joy', 'anger', 'anger', 'disgust', 'anticipation', 'anticipation', 'joy', 'disgust', 'anger', 'joy', 'anger', 'anger', 'anger', 'anticipation', 'fear', 'sadness']
['anticipation', 'sadness', 'surprise', 'disgust', 'joy', 'anger', 'anger', 'joy', 'anger', 'joy', 'anger', 'joy', 'anger', 'joy', 'disgust', 'anger', 'joy', 'anger', 'anger', 'anticipation']


In [None]:
# Both counts should match: one label per term for each field.
print(len(definition_labels), len(example_labels), sep="\n")

77949
77949


In [None]:
# Attach the emotion labels to the first entry of every term.
# The label lists were produced by iterating urban_dict_data in this same
# order, so position i of each list belongs to the i-th term. Fail loudly if
# the sizes ever drift apart instead of raising halfway through the loop or
# leaving some terms unlabeled.
assert len(definition_labels) == len(example_labels) == len(urban_dict_data), \
    "label lists must contain exactly one entry per term"

for value, definition_label, example_label in zip(
        urban_dict_data.values(), definition_labels, example_labels):
    top_entry = value["top_5_entries"][0]
    top_entry["definition_emotion_label"] = definition_label
    top_entry["example_emotion_label"] = example_label

In [None]:
# Write the emotion-annotated data to disk as indented JSON.
with open("urban_dict_data_cleaned_emo.json", mode="w") as json_file:
    json.dump(urban_dict_data, fp=json_file, indent=4)

# Hand the saved file to the browser as a download (Colab-only helper).
from google.colab import files
files.download("urban_dict_data_cleaned_emo.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>