# Summary Steps
0. Load the env file
1. Load the tweetQA dataset from HuggingFace.
2. Create a model to communicate with ChatGPT.
3. Set up ChatPromptTemplate for Q & A using the tweet
4. Test the response
5. Try running in batches 
6. Evaluate the results using BLEU, METEOR and ROUGE

## Step 0 - Load the .env file located in the same directory as this notebook

In [26]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its entries into the process environment.
_ = load_dotenv(find_dotenv())
# Direct indexing raises KeyError right here if either credential is missing.
# NOTE(review): openai_api_key is not passed to ChatOpenAI in the visible cells;
# presumably the client reads OPENAI_API_KEY from the environment itself - confirm.
openai_api_key = os.environ["OPENAI_API_KEY"]
hf_token = os.environ["HF_TOKEN"]

## Step 1: Load the tweetQA dataset from HuggingFace

from huggingface_hub import login
# Log in to the Hugging Face Hub with the token read from the HF_TOKEN environment variable.
login(token=hf_token)

Load the dataset and save the validation split to validate.json.

In [19]:
from datasets import load_dataset, config

ds = load_dataset("ucsbnlp/tweet_qa")
# Show overall dataset
print(ds)

# Show the validation set
ds_validation = ds['validation']
print(ds_validation)

# to_json serialises the split batch by batch, and the default batch size
# printed here is smaller than the validation split. With lines=False the file
# has to be one JSON array so that json.load can read it back, so the whole
# split is written as a single batch. The batch size is derived from the data
# instead of a hard-coded number that would stop working for a larger split.
print(config.DEFAULT_MAX_BATCH_SIZE)
ds_validation.to_json("validate.json", lines=False, batch_size=len(ds_validation))


DatasetDict({
    train: Dataset({
        features: ['Question', 'Answer', 'Tweet', 'qid'],
        num_rows: 10692
    })
    validation: Dataset({
        features: ['Question', 'Answer', 'Tweet', 'qid'],
        num_rows: 1086
    })
    test: Dataset({
        features: ['Question', 'Answer', 'Tweet', 'qid'],
        num_rows: 1979
    })
})
Dataset({
    features: ['Question', 'Answer', 'Tweet', 'qid'],
    num_rows: 1086
})
1000


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

332180

## Step 2 - Create a model to communicate with ChatGPT

In [27]:
from langchain_openai import ChatOpenAI

# gpt-4o-mini is used to keep API cost low.
# NOTE(review): "cheapest" reflects pricing at the time of writing - re-check before reuse.
# temperature=0 is set because a precise answer is wanted for a higher score.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

## Step 3 - Set up ChatPromptTemplate for Q & A using the tweet

In [10]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import (
    HumanMessage,
    AIMessage
)
# Chat prompt: a fixed system instruction followed by whatever messages the
# caller supplies under the "messages" key when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            # The two adjacent literals are concatenated into one instruction string.
            "You will be answering questions based on a tweet. Give the answer related to the question and tweet only. "
            "Give a precise answer, no need to answer in a sentence",
        ),
        # Filled from the "messages" entry of the input dict.
        MessagesPlaceholder(variable_name="messages"),
    ]
)

# Pipe the rendered prompt into the chat model.
chain = prompt | model

### Step 4 - Test the response

In [9]:
# Hand-written example: the tweet and the question are joined with a comma
# into a single human message.
test_message = ",".join((
    "Tweet:The police dogs who patrol the area are some of my favorite faces to see around the Capitol. # Sen. Al Franken (@SenFranken) August 26, 2017",
    "Question:who are some of Franken's favorite faces to see?",
))
response = chain.invoke({"messages": [HumanMessage(content=test_message)]})

response.content

'The police dogs who patrol the area.'

In [13]:
# Second hand-written example, built the same way: tweet and question joined
# with a comma into a single human message.
test_message = ",".join((
    "Tweet:Our prayers are with the students, educators & families at Independence High School & all the first responders on the scene. #PatriotPride\u2014 Doug Ducey (@dougducey) February 12, 2016",
    "Question:at which school were first responders on the scene for?",
))
response = chain.invoke({"messages": [HumanMessage(content=test_message)]})

response.content

'Independence High School'

Prepare the prediction dataset. For now this is a copy of the validation split, used to sanity-check the evaluation function.

In [20]:
# Written from the same validation split as validate.json. The whole split is
# exported as a single batch so that, with lines=False, the file is one JSON
# array; the batch size comes from the data instead of a hard-coded number.
ds_validation.to_json("prediction.json", lines=False, batch_size=len(ds_validation))

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

332180

In [10]:
# Export the same split again using to_json's default settings.
# NOTE(review): validate2.json is not read anywhere in the visible cells - confirm it is still needed.
ds_validation.to_json("validate2.json")

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

332178

In [11]:
# Build the message from the first validation row: tweet and question joined
# with a comma, with a "?" appended to the question.
test_message = ",".join((
    f"Tweet:{ds_validation[0]['Tweet']}",
    f"Question:{ds_validation[0]['Question']}?",
))
print(test_message)
response = chain.invoke({"messages": [HumanMessage(content=test_message)]})

response.content

Tweet:This forecast is deflated as much as New England Patriots footballs! I apologize. W NJ has the most to lose. Dave Curren (@DaveCurren) January 27, 2015,Question:who has the most to lose?


'W NJ'

In [13]:
# Build one message per row for the first two validation rows.
test_message1 = ",".join((
    f"Tweet:{ds_validation[0]['Tweet']}",
    f"Question:{ds_validation[0]['Question']}?",
))
print(test_message1)

test_message2 = ",".join((
    f"Tweet:{ds_validation[1]['Tweet']}",
    f"Question:{ds_validation[1]['Question']}?",
))
print(test_message2)

# Send both inputs in a single batch call; one response comes back per input.
responses = chain.batch(
    [{"messages": [HumanMessage(content=text)]} for text in (test_message1, test_message2)]
)

for response in responses:
    print(response.content)

Tweet:This forecast is deflated as much as New England Patriots footballs! I apologize. W NJ has the most to lose. Dave Curren (@DaveCurren) January 27, 2015,Question:who has the most to lose?
Tweet:This forecast is deflated as much as New England Patriots footballs! I apologize. W NJ has the most to lose. Dave Curren (@DaveCurren) January 27, 2015,Question:what is deflated as much as the new england patriot footballs?
W NJ
The forecast.


In [14]:
# Show each answer followed by its Python type, one per line.
for response in responses:
    print(response.content, type(response.content), sep="\n")

W NJ
<class 'str'>
The forecast.
<class 'str'>


## Step 6 - Define the evaluation functions (BLEU, METEOR and ROUGE)

In [22]:
import string
import re
import json

# import nltk
# nltk.download()


from nltk.translate.bleu_score import sentence_bleu
import numpy as np

from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge

# Importing the statistics module
from statistics import mean

def normalize_answer(s):
    """Return a canonical form of an answer string for metric comparison.

    The steps run in this order: lower-case the text, drop every character in
    string.punctuation, replace the stand-alone words "a", "an" and "the" with
    a space, then collapse all whitespace runs to single spaces and strip the
    ends.

    Args:
        s: The answer text to normalise.

    Returns:
        The normalised string (may be empty).
    """
    lowered = s.lower()

    punctuation = set(string.punctuation)
    without_punctuation = ''.join(ch for ch in lowered if ch not in punctuation)

    # Articles are only removed as whole words; "theory" keeps its "the".
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)

    # split() with no argument discards leading, trailing and repeated whitespace.
    return ' '.join(without_articles.split())

# Module-level scorer instances, created once and reused by ans_score for every answer.
# NOTE(review): Meteor() presumably starts the external METEOR backend - confirm its runtime requirements are installed.
meteor_scorer = Meteor()
rouge_scorer = Rouge()

def ans_score(ans, gold_list):
    """Score one predicted answer against its reference answers.

    The prediction and every reference are passed through normalize_answer
    before scoring.

    Args:
        ans: The predicted answer string.
        gold_list: Iterable of reference answer strings.
            (Assumed to be a list of strings - a bare string would be
            iterated character by character; verify against the caller.)

    Returns:
        A list [bleu, meteor, rouge], where bleu uses unigram weights only.
    """
    hypothesis = normalize_answer(ans)
    references = [normalize_answer(ref) for ref in gold_list]

    # weights=(1, 0, 0, 0) restricts BLEU to unigram precision.
    bleu = sentence_bleu(
        [ref.split() for ref in references],
        hypothesis.split(),
        weights=(1, 0, 0, 0),
    )

    # Both scorers take {id: references} and {id: [hypothesis]} and return a
    # pair; only the first element is kept.
    meteor, _ = meteor_scorer.compute_score({0: references}, {0: [hypothesis]})
    rouge, _ = rouge_scorer.compute_score({0: references}, {0: [hypothesis]})
    return [bleu, meteor, rouge]

def evaluate(test_annotation_file, user_annotation_file, phase_codename, **kwargs):
    """Score a prediction file against a gold file with BLEU-1, METEOR and ROUGE.

    Both files must contain a JSON array of objects that each have a 'qid' and
    an 'Answer' key. Every gold qid is looked up in the predictions; when a
    predicted 'Answer' is a list, only its first element is scored.

    Args:
        test_annotation_file: Path to the gold (reference) JSON file.
        user_annotation_file: Path to the prediction JSON file.
        phase_codename: Accepted for interface compatibility; not used here.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        dict of the form
        {'result': [{'test_split': {'BLEU-1': ..., 'METEOR': ..., 'ROUGE': ...}}]}
        holding the mean of each metric over all gold qids. The same means are
        also printed.

    Raises:
        KeyError: If a gold qid has no entry in the prediction file.
    """
    # Context managers close both files even if parsing fails. The encoding is
    # stated explicitly so reading does not depend on the platform default.
    with open(test_annotation_file, encoding="utf-8") as gold_fh:
        gold = json.load(gold_fh)
    with open(user_annotation_file, encoding="utf-8") as pred_fh:
        pred = json.load(pred_fh)

    idx2gold = {item['qid']: item['Answer'] for item in gold}
    idx2pred = {item['qid']: item['Answer'] for item in pred}

    idx2scores = {}
    for id_, gold_answers in idx2gold.items():
        pred_ans = idx2pred[id_]
        # A prediction may be given as a list of answers; only the first is scored.
        if isinstance(pred_ans, list):
            pred_ans = pred_ans[0]
        idx2scores[id_] = ans_score(pred_ans, gold_answers)

    # Each score entry is [bleu, meteor, rouge]; average each column once.
    mean_bleu = np.mean([item[0] for item in idx2scores.values()])
    mean_meteor = np.mean([item[1] for item in idx2scores.values()])
    mean_rouge = np.mean([item[2] for item in idx2scores.values()])
    print({'BLEU': mean_bleu, 'METEOR': mean_meteor, 'ROUGE': mean_rouge})

    output = {}
    output['result'] = [
        {
            'test_split': {
                'BLEU-1': mean_bleu,
                'METEOR': mean_meteor,
                'ROUGE': mean_rouge,
            }
        }
    ]

    return output

In [23]:
# Sanity-check evaluate(): prediction.json was exported from the same validation
# split as validate.json, so every metric is expected to be 1.0.
# NOTE(review): the codename says "ChatGPT3.5" while the model created earlier is
# gpt-4o-mini; evaluate() does not use this argument.
evaluate("validate.json","prediction.json", "ChatGPT3.5")

{'BLEU': 1.0, 'METEOR': 1.0, 'ROUGE': 1.0}


{'result': [{'test_split': {'BLEU-1': 1.0, 'METEOR': 1.0, 'ROUGE': 1.0}}]}