# Tagging

In [4]:
import os
from typing import Literal

import pandas as pd
from dotenv import dotenv_values
from llama_index.llms import OpenAI, MistralAI
from llama_index.output_parsers import PydanticOutputParser
from llama_index.program import OpenAIPydanticProgram, LLMTextCompletionProgram
from llama_index.prompts import PromptTemplate
from pydantic import BaseModel, Field

Set the `OPENAI_API_KEY` and `MISTRAL_API_KEY` environment variables, or load them from a `.env` file.

In [5]:
# Resolve the API keys from either the process environment or a local .env
# file. `dotenv_values()` only parses the .env file, so the environment is
# merged in explicitly; .env entries take precedence when both define a key.
config = {**os.environ, **dotenv_values()}
# A missing key raises KeyError here, before any LLM client is constructed.
OPENAI_API_KEY = config["OPENAI_API_KEY"]
MISTRAL_API_KEY = config["MISTRAL_API_KEY"]

## Load Datasets

Load Kaggle's Twitter hate speech dataset

In [7]:
# Read the comma-separated tweet file into a DataFrame and preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer="./data/test_tweets_anuFYb8.csv", delimiter=",")
df.head(n=5)

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [25]:
# Randomly select 10 samples and collect their tweet text as a list of str.
# A fixed random_state keeps the selection identical across
# "Restart Kernel -> Run All", so the tagging results below are reproducible.
df_elements = df.sample(n=10, random_state=42)
tweets_list = df_elements['tweet'].to_list()
tweets_list

["@user #people aren't protesting #trump because a #republican won-they do so because trump has fuhered  &amp;â\x80¦ ",
 "soon i'll be one! ",
 "i will miss you. the way we use to be especially the nights being wid you. our memories that we've shared each everyday.   #emotional   ð\x9f\x98\xad",
 '@user &lt;3 listen to my most beautiful best friend  @user  singing with her most amazing voice &lt;3 &lt;3 &lt;3 &lt;3 21 ',
 "can't believe i've just been on stage with @user and to top it off the pain i've had for 18 months is gone!   #derrenmiracle",
 'i am growing. #i_am #positive #affirmation     ',
 'always seems a shame to build something so nice then box it back down again. good to see you all again ',
 'time feels as if it is moving backwards.  ',
 '"i never knew of a morning in #africa when i woke up and was not  ." we are booking #kenya fall #travel trips! ',
 'we are so close to our booked wedding dates - cannot wait to share the pictures with u!   #beautifulhighbarn ']

## Define output schema

We use a Pydantic schema to specify the required properties and types.

In [11]:
class Tag(BaseModel):
    """Structured tagging result for a single comment.

    The field constraints mirror what the tagging prompts in this notebook
    ask the model for, so a reply that drifts from that contract (an unknown
    sentiment label, a score outside [0, 1]) fails validation instead of
    silently entering the results.
    """

    # The comment that was tagged, echoed back by the model.
    text: str = Field(description="The comment itself.")
    # Restricted to the three labels the prompts allow.
    sentiment: Literal["positive", "negative", "neutral"] = Field(
        description="Overall sentiment of the comment."
    )
    # The prompts request scores in 0.1-1; the lower bound is relaxed to 0 so
    # a model that reports "none" as 0.0 is not rejected.
    toxicity: float = Field(ge=0.0, le=1.0, description="Toxicity score.")
    hate: float = Field(ge=0.0, le=1.0, description="Hate score.")
    hate_threatening: float = Field(
        ge=0.0, le=1.0, description="Threatening-hate score."
    )
    violence: float = Field(ge=0.0, le=1.0, description="Violence score.")

## Tagging with OpenAI

Define the OpenAI Pydantic program.

In [15]:
# Model used for the OpenAI-backed tagger.
openai_model_name = "gpt-3.5-turbo-1106"

# Instruction prompt; `{user_comment}` is filled in on every program call.
openai_prompt_template = """\
I request you to divide the user comment below into aspects and perform sentiment analysis and give the result in sentiment analysis json format. Just return result in json format. Don't add any other comments.

Note: The toxicity, hate, hate_threatening and violence scores should be in the range (0.1 - 1). The scores should be a float.
The sentiment label should be either ['positive', 'negative', 'neutral'].
The text label should be the comment itself.

comment: {user_comment}\
"""

# Program that sends the prompt to the OpenAI model and returns the reply
# as a `Tag` instance; verbose mode prints each function call it receives.
openai_program = OpenAIPydanticProgram.from_defaults(
    prompt_template_str=openai_prompt_template,
    llm=OpenAI(api_key=OPENAI_API_KEY, model=openai_model_name),
    output_cls=Tag,
    verbose=True,
)

Run program to get tagging output.

In [16]:
# Smoke-test the OpenAI program on a single comment before running it on
# the sampled tweets.
openai_output = openai_program(user_comment="I love this world!")

Function call: Tag with args: {"text":"I love this world!","sentiment":"positive","toxicity":0.1,"hate":0.1,"hate_threatening":0.1,"violence":0.1}


In [17]:
# Show the program's result as a plain dict.
openai_output.model_dump()

{'text': 'I love this world!',
 'sentiment': 'positive',
 'toxicity': 0.1,
 'hate': 0.1,
 'hate_threatening': 0.1,
 'violence': 0.1}

Run the tagging program on the dataset

In [26]:
# Tag every sampled tweet with the OpenAI program.
# The run is best-effort: a tweet whose call or parsing fails is recorded in
# `errors` and the loop moves on to the next one.
openai_tagged_list = []
errors = []
for tweet in tweets_list:
    try:
        output = openai_program(user_comment=tweet)
    except Exception as e:
        errors.append(tweet)
        # Report the cause instead of discarding it, so failures are visible
        # in the cell output and can be diagnosed.
        print(f"Tagging failed for {tweet!r}: {e!r}")
    else:
        openai_tagged_list.append(output)

Function call: Tag with args: {"text":"@user #people aren't protesting #trump because a #republican won-they do so because trump has fuhered  &amp;\n","sentiment":"negative","toxicity":0.3,"hate":0.2,"hate_threatening":0.1,"violence":0.1}
Function call: Tag with args: {"text":"soon i'll be one!","sentiment":"positive","toxicity":0.1,"hate":0.1,"hate_threatening":0.1,"violence":0.1}
Function call: Tag with args: {"text":"i will miss you. the way we use to be especially the nights being wid you. our memories that we've shared each everyday.   #emotional   ð­","sentiment":"positive","toxicity":0.1,"hate":0.1,"hate_threatening":0.1,"violence":0.1}
Function call: Tag with args: {"text":"@user &lt;3 listen to my most beautiful best friend  @user  singing with her most amazing voice &lt;3 &lt;3 &lt;3 &lt;3 21","sentiment":"positive","toxicity":0.1,"hate":0.1,"hate_threatening":0.1,"violence":0.1}
Function call: Tag with args: {"text":"can't believe i've just been on stage with @user and to 

In [29]:
# Pretty-print each OpenAI tagging result as indented JSON, one object per
# tweet.
for tagged_tweet in openai_tagged_list:
    rendered = tagged_tweet.model_dump_json(indent=2)
    print(rendered)

{
  "text": "@user #people aren't protesting #trump because a #republican won-they do so because trump has fuhered  &amp;\n",
  "sentiment": "negative",
  "toxicity": 0.3,
  "hate": 0.2,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "soon i'll be one!",
  "sentiment": "positive",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "i will miss you. the way we use to be especially the nights being wid you. our memories that we've shared each everyday.   #emotional   ð­",
  "sentiment": "positive",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "@user &lt;3 listen to my most beautiful best friend  @user  singing with her most amazing voice &lt;3 &lt;3 &lt;3 &lt;3 21",
  "sentiment": "positive",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "can't believe i've just been on stage with @user and to top it off the pain i've had for 18 months is 

## Tagging with Mixtral 8x7B

In [30]:
# Model used for the Mistral-backed tagger.
mixtral_model_name = "mistral-small"

# Instruction prompt; `{user_comment}` is filled in on every program call.
mixtral_prompt_template = """\
I request you to divide the user comment below into aspects and perform sentiment analysis and give the result in sentiment analysis json format. Just return result in json format. Don't add any other comments.

Note: The toxicity, hate, hate_threatening and violence scores should be in the range (0.1 - 1). The scores should be a float.
The sentiment label should be either ['positive', 'negative', 'neutral'].
The text label should be the comment itself.

comment: {user_comment}\
"""

mixtral_prompt_tmpl = PromptTemplate(mixtral_prompt_template)

# Text-completion program: the model's reply is handed to a
# PydanticOutputParser built for `Tag`.
mixtral_program = LLMTextCompletionProgram(
    prompt=mixtral_prompt_tmpl,
    llm=MistralAI(api_key=MISTRAL_API_KEY, model=mixtral_model_name),
    output_parser=PydanticOutputParser(Tag),
    verbose=True,
)

Run program to get tagging output.

In [31]:
# Smoke-test the Mixtral program on a single comment before running it on
# the sampled tweets.
mixtral_output = mixtral_program(user_comment="I love this world!")

In [32]:
# Show the program's result as a plain dict.
mixtral_output.model_dump()

{'text': 'I love this world!',
 'sentiment': 'positive',
 'toxicity': 0.1,
 'hate': 0.1,
 'hate_threatening': 0.1,
 'violence': 0.1}

Run the tagging program on the dataset

In [33]:
# Tag every sampled tweet with the Mixtral program.
# The run is best-effort: a tweet whose call or parsing fails is recorded in
# `errors` and the loop moves on to the next one.
# NOTE: `errors` is reset here, so it only holds failures from this run.
mixtral_tagged_list = []
errors = []
for tweet in tweets_list:
    try:
        output = mixtral_program(user_comment=tweet)
    except Exception as e:
        errors.append(tweet)
        # Report the cause instead of discarding it, so failures are visible
        # in the cell output and can be diagnosed.
        print(f"Tagging failed for {tweet!r}: {e!r}")
    else:
        mixtral_tagged_list.append(output)

In [34]:
# Pretty-print each Mixtral tagging result as indented JSON, one object per
# tweet.
for tagged_tweet in mixtral_tagged_list:
    rendered = tagged_tweet.model_dump_json(indent=2)
    print(rendered)

{
  "text": "@user #people aren't protesting #trump because a #republican won-they do so because trump has fuhered  &amp;â¦",
  "sentiment": "negative",
  "toxicity": 0.3,
  "hate": 0.2,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "soon i'll be one!",
  "sentiment": "positive",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "i will miss you. the way we use to be especially the nights being wid you. our memories that we've shared each everyday.   #emotional   ð­",
  "sentiment": "positive",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "@user <3 listen to my most beautiful best friend @user singing with her most amazing voice <3 <3 <3 <3 21",
  "sentiment": "positive",
  "toxicity": 0.1,
  "hate": 0.1,
  "hate_threatening": 0.1,
  "violence": 0.1
}
{
  "text": "can't believe i've just been on stage with @user and to top it off the pain i've had for 18 months is gone!   #derrenm