# Tagging

In [1]:
from pydantic import BaseModel
from llama_index.program import OpenAIPydanticProgram, LLMTextCompletionProgram
from llama_index.llms import OpenAI, MistralAI
from llama_index.prompts import PromptTemplate
from llama_index.output_parsers import PydanticOutputParser

import pandas as pd
from dotenv import dotenv_values

Set env var OPENAI_API_KEY, MISTRAL_API_KEY or load from a .env file

In [2]:
config = dotenv_values()
OPENAI_API_KEY = config["OPENAI_API_KEY"]
MISTRAL_API_KEY = config["MISTRAL_API_KEY"]

## Load Datasets

Load Kaggle's Twitter hate speech dataset

In [3]:
!mkdir -p data
!wget "https://drive.google.com/uc?export=download&id=1TjFzX30QVUFz1Fl_8Ei_liWomvgy0yxY" -O data/tweets.csv 

--2023-12-20 11:49:44--  https://drive.google.com/uc?export=download&id=1TjFzX30QVUFz1Fl_8Ei_liWomvgy0yxY
Resolving drive.google.com (drive.google.com)... 173.194.73.102, 173.194.73.100, 173.194.73.113, ...
Connecting to drive.google.com (drive.google.com)|173.194.73.102|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0g-30-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/ntt47qtfsigf0649q5oocb7bfv2v51tl/1703051325000/04295854648265356048/*/1TjFzX30QVUFz1Fl_8Ei_liWomvgy0yxY?e=download&uuid=9ee92b48-462e-457f-8566-a0b18ab06e62 [following]
--2023-12-20 11:49:46--  https://doc-0g-30-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/ntt47qtfsigf0649q5oocb7bfv2v51tl/1703051325000/04295854648265356048/*/1TjFzX30QVUFz1Fl_8Ei_liWomvgy0yxY?e=download&uuid=9ee92b48-462e-457f-8566-a0b18ab06e62
Resolving doc-0g-30-docs.googleusercontent.com (doc-0g-30-docs.googleusercontent.com)... 172.217.169.65
Connec

In [4]:
df = pd.read_csv("data/tweets.csv", sep=",")
df.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [11]:
# Randomly select 10 samlpes and create list of str
df_elements = df.sample(n=10)
tweets_list = df_elements['tweet'].to_list()
tweets_list

['oh god #ripjulimuffn  ',
 'you always waiting on a damn text ð\x9f\x98\x82 ',
 '@user be careful with @user advice. he has sold his soul.  #faust #blacklivesmatter',
 'when you have no actual argument for your hatred of rm so you reso to name calling.   #raiders  ',
 "akarinselect pick-up ã\x80\x8c  mother's day...    @user #akarinselect #ã\x81\x82ã\x81\x8bã\x82\x8aã\x82\x93ã\x82»ã\x83¬ã\x82¯ã\x83\x88 ",
 'thursday - a mixed day. #happy  ',
 '23 times #degrassi got #waytoofuckingreal #real #memories   ',
 'when you are making piece of a you should do it with love or not at all â\x9d¤ï¸\x8fð\x9f\x8c¬ #a #handmade #nofarszigeti   ',
 'nude sex model of teen   pussy  ',
 '  bihdaygs   bihdaygs   bihdaygs   bihdaygs   bihdaygs   bihdaygs']

## Define output schema

We use a Pydantic schema to specify the required properties and types.

In [12]:
class Tag(BaseModel):
    """Data model for tagging."""

    text: str
    language: str
    sentiment: str
    toxicity: float
    hate: float
    hate_threatening: float
    violence: float

## Tagging with OpenAI

Define openai pydantic program

In [13]:
openai_prompt_template = """\
I request you to divide the user comment below into aspects and perform sentiment analysis and give the result in sentiment analysis json format. Just return result in json format. Don't add any other comments.

Note: The toxicity, hate, hate_threatening and violence scores should be in the range (0.1 - 1). The scores should be a float.
The sentiment label should be either ['positive', 'negative', 'neutral'].
The text label should be the comment itself.
The language label should be a full name of the language in which the comment is written.

comment: {user_comment}\
"""

openai_model_name = "gpt-3.5-turbo-1106"

openai_program = OpenAIPydanticProgram.from_defaults(
    output_cls=Tag,
    llm=OpenAI(model=openai_model_name, api_key=OPENAI_API_KEY),
    prompt_template_str=openai_prompt_template,
    verbose=True,
)

Run program to get tagging output.

In [14]:
# from llama_index.response.notebook_utils import display_response

openai_output = openai_program(user_comment="I love this world!")

Function call: Tag with args: {"text":"I love this world!","language":"English","sentiment":"positive","toxicity":0.1,"hate":0.1,"hate_threatening":0.1,"violence":0.1}


In [15]:
openai_output.model_dump()

{'text': 'I love this world!',
 'language': 'English',
 'sentiment': 'positive',
 'toxicity': 0.1,
 'hate': 0.1,
 'hate_threatening': 0.1,
 'violence': 0.1}

Run the tagging program on dataset

In [16]:
openai_tagged_list = []
errors = []
for tweet in tweets_list:
    # from llama_index.response.notebook_utils import display_response

    try:
        output = openai_program(user_comment=tweet)
        openai_tagged_list.append(output)
    except Exception as e:
        errors.append(tweet)

Function call: Tag with args: {"text":"oh god #ripjulimuffn","language":"English","sentiment":"neutral","toxicity":0.1,"hate":0.1,"hate_threatening":0.1,"violence":0.1}
Function call: Tag with args: {"text":"you always waiting on a damn text \f0\fs24 \f1\fs20 ð\f0\fs24 \f1\fs20 \f0\fs24 \f1\fs20","language":"English","sentiment":"positive","toxicity":0.2,"hate":0.1,"hate_threatening":0.1,"violence":0.1}
Function call: Tag with args: {"text":"@user be careful with @user advice. he has sold his soul.  #faust #blacklivesmatter","language":"English","sentiment":"negative","toxicity":0.7,"hate":0.3,"hate_threatening":0.1,"violence":0.2}
Function call: Tag with args: {"text":"when you have no actual argument for your hatred of rm so you reso to name calling. #raiders","language":"English"}
Function call: Tag with args: {"text":"akarinselect pick-up ã  mother's day...    @user #akarinselect #ããããã»ã¬ã¯ã","language":"Japanese","sentiment":"neutral","toxicity":0.1,"hate":0.1,"h

In [17]:
for tagged_tweet in openai_tagged_list:
    print(tagged_tweet.model_dump_json(indent=2))

{
  "text": "oh god #ripjulimuffn",
  "language": "English",
  "sentiment": "neutral",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "you always waiting on a damn text \f0\fs24 \f1\fs20 ð\f0\fs24 \f1\fs20 \f0\fs24 \f1\fs20",
  "language": "English",
  "sentiment": "positive",
  "toxicity": 0.2,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "@user be careful with @user advice. he has sold his soul.  #faust #blacklivesmatter",
  "language": "English",
  "sentiment": "negative",
  "toxicity": 0.7,
  "hate": 0.3,
  "hate_threatening": 0.1,
  "violence": 0.2
}
{
  "text": "akarinselect pick-up ã  mother's day...    @user #akarinselect #ããããã»ã¬ã¯ã",
  "language": "Japanese",
  "sentiment": "neutral",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "thursday - a mixed day. #happy",
  "language": "English",
  "sentiment": "positive",
  "toxicity": 0.2,
  "ha

## Tagging with Mixtral 8x7

In [18]:
mixtral_prompt_template = """
<s> [INST] I request you to divide the user comment below into aspects and perform sentiment analysis and give the result in sentiment analysis json format. Just return result in json format. Don't add any other comments.

Note: The toxicity, hate, hate_threatening and violence scores should be in the range (0.1 - 1). The scores should be a float.
The sentiment label should be either ['positive', 'negative', 'neutral'].
The text label should be the comment itself. 
The language label should be a full name of the language in which the comment is written. [/INST] </s> 
[INST] comment: {user_comment} 
Answer: [/INST]
"""

mixtral_prompt_tmpl = PromptTemplate(mixtral_prompt_template)

mixtral_model_name = "mistral-small"

mixtral_program = LLMTextCompletionProgram(
    output_parser=PydanticOutputParser(Tag),
    prompt=mixtral_prompt_tmpl,
    llm=MistralAI(model=mixtral_model_name, api_key=MISTRAL_API_KEY),
    verbose=True,
)

Run program to get tagging output.

In [19]:
mixtral_output = mixtral_program(user_comment="I love this world!")

In [20]:
mixtral_output.model_dump()

{'text': 'I love this world!',
 'language': 'English',
 'sentiment': 'positive',
 'toxicity': 0.1,
 'hate': 0.1,
 'hate_threatening': 0.1,
 'violence': 0.1}

Run the tagging program on dataset

In [21]:
mixtral_tagged_list = []
errors = []
for tweet in tweets_list:
    # from llama_index.response.notebook_utils import display_response

    try:
        output = mixtral_program(user_comment=tweet)
        mixtral_tagged_list.append(output)
    except Exception as e:
        errors.append(tweet)

In [22]:
for tagged_tweet in mixtral_tagged_list:
    print(tagged_tweet.model_dump_json(indent=2))

{
  "text": "oh god #ripjulimuffn",
  "language": "English",
  "sentiment": "negative",
  "toxicity": 0.2,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "you always waiting on a damn text ð",
  "language": "English",
  "sentiment": "neutral",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "@user be careful with @user advice. he has sold his soul.  #faust #blacklivesmatter",
  "language": "English",
  "sentiment": "negative",
  "toxicity": 0.2,
  "hate": 0.1,
  "hate_threatening": 0.0,
  "violence": 0.0
}
{
  "text": "when you have no actual argument for your hatred of rm so you resort to name calling.   #raiders",
  "language": "English",
  "sentiment": "negative",
  "toxicity": 0.3,
  "hate": 0.2,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "akarinselect pick-up ã mother's day... @user #akarinselect #ããããã»ã¬ã¯ã",
  "language": "Japanese",
  "sentiment": "neutral",
  "toxicity": 