In [1]:

import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from scipy.special import softmax
import csv
import urllib.request



  from .autonotebook import tqdm as notebook_tqdm
2022-12-09 09:02:51.960758: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# twitter-roberta-base-emotion 
Model card: https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion

Benchmarks: https://arxiv.org/pdf/2010.12421.pdf

Tasks: emoji, emotion, hate, irony, offensive, sentiment, stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary


## Different models


### Detect offensive language

In [2]:
## Task selection: offensive-language detection

task = 'offensive'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"


## Fetch the label mapping published for this task

mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    # Read the whole response while the connection is open; the result is a
    # plain list of lines, so parsing can safely happen after the `with`.
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')

# Keep the second tab-separated field of every row that has one
# (rows with fewer than two fields, e.g. a trailing empty line, are skipped).
labels_off = [
    fields[1]
    for fields in csvreader
    if len(fields) > 1
]
print(labels_off)


## Load the tokenizer and the classification model for this task

tokenizer_off = AutoTokenizer.from_pretrained(MODEL)
model_off = AutoModelForSequenceClassification.from_pretrained(MODEL)

['not-offensive', 'offensive']


In [3]:
# Function for predicting offensive language

def predict_offensive(text, labels=None):
    """Score a text with the offensive-language classifier.

    Args:
        text: Input passed to ``tokenizer_off``.
        labels: Sequence of class names where ``labels[i]`` names the i-th
            score produced by ``model_off``. When omitted (``None``), the
            notebook-level ``labels_off`` list is used.

    Returns:
        dict mapping each label to its softmax score.

    Raises:
        ValueError: If ``labels`` does not contain exactly one name per score.
    """
    if labels is None:
        labels = labels_off

    # Encode the text as PyTorch tensors
    encoded_input = tokenizer_off(text, return_tensors='pt')

    # Model output
    output = model_off(**encoded_input)

    # First row of the model's first output (the scores for the single input),
    # detached from the autograd graph and converted to a numpy array
    scores = output[0][0].detach().numpy()

    # Normalize the raw scores so they sum to 1
    scores = softmax(scores)

    # Honor the caller-supplied labels instead of silently reading a global;
    # a length mismatch would otherwise mislabel or drop scores.
    if len(labels) != len(scores):
        raise ValueError(
            f"expected {len(scores)} labels, got {len(labels)}"
        )

    # Pair each label with its score
    return dict(zip(labels, scores))

### Predict emotions

In [4]:
## Task selection: emotion recognition

task = 'emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"


## Fetch the label mapping published for this task

mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    # Read the whole response while the connection is open; the result is a
    # plain list of lines, so parsing can safely happen after the `with`.
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')

# Keep the second tab-separated field of every row that has one
# (rows with fewer than two fields, e.g. a trailing empty line, are skipped).
labels_emo = [
    fields[1]
    for fields in csvreader
    if len(fields) > 1
]
print(labels_emo)


## Load the tokenizer and the classification model for this task

tokenizer_emo = AutoTokenizer.from_pretrained(MODEL)
model_emo = AutoModelForSequenceClassification.from_pretrained(MODEL)

['anger', 'joy', 'optimism', 'sadness']


In [12]:
# Function for predicting emotions

def predict_emotions(text, labels=None):
    """Score a text with the emotion classifier.

    Args:
        text: Input passed to ``tokenizer_emo``.
        labels: Sequence of class names where ``labels[i]`` names the i-th
            score produced by ``model_emo``. When omitted (``None``), the
            notebook-level ``labels_emo`` list is used.

    Returns:
        dict mapping each label to its softmax score.

    Raises:
        ValueError: If ``labels`` does not contain exactly one name per score.
    """
    if labels is None:
        labels = labels_emo

    # Encode the text as PyTorch tensors
    encoded_input = tokenizer_emo(text, return_tensors='pt')

    # Model output
    output = model_emo(**encoded_input)

    # First row of the model's first output (the scores for the single input),
    # detached from the autograd graph and converted to a numpy array
    scores = output[0][0].detach().numpy()

    # Normalize the raw scores so they sum to 1
    scores = softmax(scores)

    # Honor the caller-supplied labels instead of silently reading a global;
    # a length mismatch would otherwise mislabel or drop scores.
    if len(labels) != len(scores):
        raise ValueError(
            f"expected {len(scores)} labels, got {len(labels)}"
        )

    # Pair each label with its score
    return dict(zip(labels, scores))

### Detect positive / negative / neutral sentiment

In [13]:
## Task selection: sentiment analysis

task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"


## Fetch the label mapping published for this task

mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    # Read the whole response while the connection is open; the result is a
    # plain list of lines, so parsing can safely happen after the `with`.
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')

# Keep the second tab-separated field of every row that has one
# (rows with fewer than two fields, e.g. a trailing empty line, are skipped).
labels_sent = [
    fields[1]
    for fields in csvreader
    if len(fields) > 1
]
print(labels_sent)


## Load the tokenizer and the classification model for this task

tokenizer_sent = AutoTokenizer.from_pretrained(MODEL)
model_sent = AutoModelForSequenceClassification.from_pretrained(MODEL)

['negative', 'neutral', 'positive']


Downloading: 100%|██████████████████████████████████████████████████████████████| 747/747 [00:00<00:00, 190kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████| 899k/899k [00:00<00:00, 1.43MB/s]
Downloading: 100%|████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 895kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 109kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████| 499M/499M [00:22<00:00, 21.7MB/s]


In [16]:
# Function for predicting sentiment

def predict_sentiment(text, labels=None):
    """Score a text with the sentiment classifier.

    Args:
        text: Input passed to ``tokenizer_sent``.
        labels: Sequence of class names where ``labels[i]`` names the i-th
            score produced by ``model_sent``. When omitted (``None``), the
            notebook-level ``labels_sent`` list is used.

    Returns:
        dict mapping each label to its softmax score.

    Raises:
        ValueError: If ``labels`` does not contain exactly one name per score.
    """
    if labels is None:
        labels = labels_sent

    # Encode the text as PyTorch tensors
    encoded_input = tokenizer_sent(text, return_tensors='pt')

    # Model output
    output = model_sent(**encoded_input)

    # First row of the model's first output (the scores for the single input),
    # detached from the autograd graph and converted to a numpy array
    scores = output[0][0].detach().numpy()

    # Normalize the raw scores so they sum to 1
    scores = softmax(scores)

    # Honor the caller-supplied labels instead of silently reading a global;
    # a length mismatch would otherwise mislabel or drop scores.
    if len(labels) != len(scores):
        raise ValueError(
            f"expected {len(scores)} labels, got {len(labels)}"
        )

    # Pair each label with its score
    return dict(zip(labels, scores))

# Test predictions

In [49]:
# User input
# Sample sentence that the prediction cells below pass to each predict_* function.
text = "I want to have sex but my partner does not want to"

In [43]:
# Score `text` with the offensive-language model; the bare name on the last
# line makes the notebook display the returned dict.
# NOTE(review): this cell's execution count (43) is lower than that of the cell
# assigning `text` (49), so the output shown below may belong to an earlier
# value of `text` — re-run after a kernel restart to confirm.
offensive = predict_offensive(text, labels_off)
offensive

{'not-offensive': 0.47435132, 'offensive': 0.52564865}

In [51]:
# Score `text` with the emotion model; the bare name on the last line makes
# the notebook display the returned dict.
emotions = predict_emotions(text, labels_emo)
emotions

{'anger': 0.12061143,
 'joy': 0.031667437,
 'optimism': 0.021350257,
 'sadness': 0.8263709}

In [50]:
# Score `text` with the sentiment model; the bare name on the last line makes
# the notebook display the returned dict.
sentiment = predict_sentiment(text, labels_sent)
sentiment

{'negative': 0.6622486, 'neutral': 0.3186334, 'positive': 0.019117936}