# Summary Steps
0. Load the env file
1. Load the tweetQA dataset from HuggingFace.
2. Create a model to communicate with ChatGPT.
3. Set up ChatPromptTemplate for Q & A using the tweet
4. Test the response
5. Try running in batches 
6. Evaluate the results using BLEU, METEOR and ROUGE

## Step 0 - Load the .env file located in the same directory as this notebook

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its entries into the process environment;
# the return value is not needed, so it is bound to `_`.
_ = load_dotenv(find_dotenv())
# Indexing os.environ (instead of .get) raises KeyError immediately if the key is missing.
openai_api_key = os.environ["OPENAI_API_KEY"]
# Passed to the Hugging Face login call in Step 1.
hf_token = os.environ["HF_TOKEN"]

## Step 1: Load the tweetQA dataset from HuggingFace

from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the token read from the environment in Step 0.
login(token=hf_token)

Load the dataset and save the validation split to validate.json

In [2]:
from datasets import load_dataset, config
import copy
ds = load_dataset("ucsbnlp/tweet_qa")
# Show overall dataset
print(ds)

# Keep only the validation split; predictions are scored against it later
ds_validation = ds['validation']

# The export is written batch by batch, and a batch_size smaller than the split
# leads to a JSON error when validate.json is read back. Writing the whole split
# as one batch avoids that for any split size instead of relying on a fixed 2000.
ds_validation.to_json("validate.json", lines=False, batch_size=max(len(ds_validation), 1))


DatasetDict({
    train: Dataset({
        features: ['Question', 'Answer', 'Tweet', 'qid'],
        num_rows: 10692
    })
    validation: Dataset({
        features: ['Question', 'Answer', 'Tweet', 'qid'],
        num_rows: 1086
    })
    test: Dataset({
        features: ['Question', 'Answer', 'Tweet', 'qid'],
        num_rows: 1979
    })
})


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

332180

## Step 2 - Create a model to communicate with ChatGPT

In [3]:
from langchain_openai import ChatOpenAI

# Use gpt-4o-mini as this is the cheapest
# Set temperature to 0 as we want a precise answer for higher score
# NOTE(review): no API key is passed explicitly; presumably ChatOpenAI picks up
# OPENAI_API_KEY from the environment loaded in Step 0 — confirm.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

## Step 3 - Set up ChatPromptTemplate for Q & A using the tweet

In [7]:
# Prompt template worked for single tweet-question pair
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import (
    HumanMessage,
    AIMessage
)
# Single-pair prompt and chain. They carry their own names so that running this
# cell does not rebind `prompt`/`chain`: the multi-pair cell defines those, and
# batch_prediction reads `chain` expecting the multi-pair instructions.
single_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You will be answering questions based on a tweet. Give the answer related to the question and tweet only. "
            "Give a precise answer, no need to answer in a sentence",
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

single_chain = single_prompt | model

In [4]:
# Prompt template worked for multiple tweet-question pair
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import (
    HumanMessage,
    AIMessage
)
# System prompt for the multi-pair protocol: it describes the input layout
# (count, Tweet:/Question: prefixes, --next-- separator) and asks for the
# answers back in question order, joined with '#'.
# The fragments are adjacent string literals; each one ends with its own
# sentence break so consecutive instructions do not run into each other.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You will be answering questions based on tweets. The total count of tweet and question pairs will be indicated at the beginning. "
            "Example, count:10 indicates there are 10 tweets and 10 questions in total, so I will expect 10 answers. "
            "The tweet will start with Tweet: . The question will start with Question: . "
            "Each tweet and question pair will be separated by the exact keyword --next-- . "
            "Return the answer for each question sorted according to the order of questions, separated with symbol #. No need to number the answer. "
            "Give a precise answer, no need to answer in a sentence",
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

chain = prompt | model

### Step 3 - 1 : Create helper functions

The helper function does the following:
1. Builds one combined message from a slice of the validation dataset
2. Gets the response from the ChatGPT model with a single LangChain `invoke` call
3. Splits the response into answers and stores them

In [5]:
# importing datetime module
from datetime import datetime
from timeit import default_timer as timer

# Batch prediction function 
def batch_prediction(data, results, start_idx:int, batch_size:int):
    """Answer a slice of tweet/question pairs with one model call and store the predictions.

    Rows ``start_idx`` .. ``start_idx + batch_size - 1`` of ``data`` are packed
    into a single message, sent through the module-level ``chain``, and the
    reply is split on ``#`` into one answer per row.

    Args:
        data: Indexable, sized collection of rows; each row is a mapping with
            at least the keys ``'Tweet'`` and ``'Question'``.
        results: List extended in place with a copy of every answered row whose
            ``'Answer'`` has been replaced by the model's answer.
        start_idx: Index of the first row to send.
        batch_size: Number of rows requested; the slice is clipped at the end
            of ``data``.

    Returns:
        None. Timing, the raw reply, token usage and the answer/row counts are
        printed as a side effect.
    """
    # Clip the slice so a batch that runs past the end of the data does not raise IndexError
    stop_idx = min(start_idx + batch_size, len(data))
    # Read from the `data` argument (not a global) and copy each row so the
    # caller's rows are never mutated
    rows = [dict(data[i]) for i in range(start_idx, stop_idx)]
    if not rows:
        print(f"No rows to predict for start_idx={start_idx}, batch_size={batch_size}")
        return

    # The separator must be exactly the keyword announced in the system prompt
    pair_texts = [f"[Tweet:{row['Tweet']},Question:{row['Question']}?]" for row in rows]
    test_message = f"count:{len(rows)} " + "--next--".join(pair_texts)

    start = timer()
    response = chain.invoke({"messages": [HumanMessage(content=test_message)]})
    end = timer()
    print(f"Query took {end - start} seconds")
    print(response.content)
    # usage_metadata is optional on the response message, so only print it when present
    if response.usage_metadata:
        print(response.usage_metadata["total_tokens"])

    answers = [answer.strip() for answer in response.content.strip().split("#")]
    print(len(answers), len(rows))
    if len(answers) != len(rows):
        print(
            f"Warning: expected {len(rows)} answers but received {len(answers)}; "
            "only rows paired with an answer are stored and alignment may be off"
        )

    # zip stops at the shorter sequence: a row without a model answer is left
    # out instead of being stored with its gold answer, which would be scored
    # as a perfect prediction
    answered_rows = []
    for row, answer in zip(rows, answers):
        row['Answer'] = answer
        answered_rows.append(row)

    results.extend(answered_rows)
    

Create the list that stores the results, reloading any previously saved predictions

In [15]:
pred_results = []

# Reload previous result
import json
try:
    # The context manager closes the file as soon as it has been parsed
    with open('prediction.json') as prev_file:
        prev_results = json.load(prev_file)
except FileNotFoundError:
    # Fresh run: nothing has been saved yet, so start from an empty list
    prev_results = []
print(len(prev_results))
pred_results.extend(prev_results)
# Keep only the first 120 stored predictions; anything saved after them is discarded
pred_results = pred_results[:120]
print(len(pred_results))



170
120


Call the function to obtain the list of responses

In [109]:
# Predict validation rows 80-119 in a single request and append them to pred_results
batch_prediction(ds_validation, pred_results, start_idx=80, batch_size=40)


Query took 2.858128200052306 seconds
Olympics#USA#Do Ya Think I'm Sexy#republicrecords#the Internet#Net Neutrality#Usher, Jimmy Fallon, Hamilton Musical cast#office workers#San Bernardino#Nate#14th#Golden Globes#black#unknown#sadness#1 dollar#@smosh#Erdogan#abuse#two scarves and a jacket#June#Patricia Arquette#half her life#Ellen DeGeneres and Portia#ellentube#David Levitz#stick together#protect DREAMERS#Sundance#unknown#Dunkirk#unknown#Sanders#94 crime bill#Grace VanderWaal#made her famous#Ryan#jazz#airwaves#write a book or start a podcast
2813
40 40


In [12]:
# Showing failed response: a batch of 100 pairs sent in one request.
# The reply printed below stops mid-answer (it ends with "#musi").
batch_prediction(ds_validation, pred_results, start_idx=170, batch_size=100)


Query took 7.158819200005382 seconds
Zac Efron#cheek#Dave Majewski#not surprised#thinkpieces#FyreFestival#not equal#year and a half#record#Sweden#FoxNews#four-shot lead#No. 17#emotions#multiple offers#Iowa flag#midfield#Ask Sheev#protect the American people#helping economy#Paul, Noma and Cherrelle#Harry Potter Play#official statement#heartbroken#Joss Whedon#BATGIRL#Nashville#Taylor#section 104#Mike Green#three-year deal#long-term deal#PR team#Zigi#crowd's mouths#prayers & hymns#boogieman#Alexander Pettyton#favorite part#PCL#hating on nomaj#the elephant#nervous#Boston#city#Team USA#lockerroom#Sterling K. Brown#proud#The Tig#sad#restroom#2014#LA#reaction#This Is What The Truth Feels Like TOUR#link#upper hand#Beyhive#tattoo#LeBron James#America#amber waves of grain#Ann Coulter#Delta#Fred Armisen#Difficult People#Governor Jay Nixon#fair#Carlton#Hotline Bling#Shia LaBeouf#surveillance images#December 29, 2017#shower#Saturday Night Live#flats#Gal Gadot#Bryzgalov#NHLAllStar#Selena & Abel#musi

Dumping result to json

In [16]:
import json
# Write the accumulated predictions to disk; the evaluation step reads this file by name
with open("prediction_120.json", "w") as final:
    json.dump(pred_results, final)

## Step 6: Evaluation

In [13]:
import string
import re
import json

# import nltk
# nltk.download()


from nltk.translate.bleu_score import sentence_bleu
import numpy as np

from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge

# Importing the statistics module
from statistics import mean

def normalize_answer(s):
    """Canonicalise an answer string before scoring.

    Steps, in order: lower-case, drop every ``string.punctuation`` character,
    blank out the standalone words "a", "an" and "the", then collapse runs of
    whitespace into single spaces (which also trims both ends).
    """
    lowered = s.lower()

    punctuation = set(string.punctuation)
    without_punct = ''.join(filter(lambda ch: ch not in punctuation, lowered))

    # Articles are replaced by a space so neighbouring words are not glued together
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct)

    tokens = without_articles.split()
    return ' '.join(tokens)

# Scorer instances are created once at module level; ans_score reuses them for
# every answer instead of constructing new ones per call.
meteor_scorer = Meteor()
rouge_scorer = Rouge()

def ans_score(ans, gold_list):
    """Score one predicted answer against its reference answers.

    The prediction and every reference go through normalize_answer first.
    BLEU uses unigram weights only; METEOR and ROUGE come from the shared
    module-level scorers, each called with a single entry keyed by 0.

    Returns:
        A list ``[bleu, meteor, rouge]``.
    """
    hypothesis = normalize_answer(ans)
    references = list(map(normalize_answer, gold_list))

    reference_tokens = [reference.split() for reference in references]
    bleu = sentence_bleu(reference_tokens, hypothesis.split(), weights=(1,0,0,0))

    # compute_score returns a pair; only its first element is kept
    meteor, _meteor_rest = meteor_scorer.compute_score({0:references}, {0:[hypothesis]})
    rouge, _rouge_rest = rouge_scorer.compute_score({0:references}, {0:[hypothesis]})
    return [bleu, meteor, rouge]

def evaluate(test_annotation_file, user_annotation_file, phase_codename, **kwargs):
    """Score a prediction file against a gold file and return the mean metrics.

    Both files are JSON arrays of objects carrying ``'qid'`` and ``'Answer'``.
    Only qids present in both files are scored; a prediction stored as a list
    contributes its first element.

    Args:
        test_annotation_file: Path to the gold JSON file.
        user_annotation_file: Path to the prediction JSON file.
        phase_codename: Accepted for interface compatibility; not used here.
        **kwargs: Accepted and ignored.

    Returns:
        ``{'result': [{'test_split': {'BLEU-1': ..., 'METEOR': ..., 'ROUGE': ...}}]}``.
        The three values are NaN when no qid appears in both files.
    """
    # Context managers close both files right after parsing
    with open(test_annotation_file) as gold_fh:
        gold = json.load(gold_fh)
    with open(user_annotation_file) as pred_fh:
        pred = json.load(pred_fh)

    idx2gold = {item['qid']:item['Answer'] for item in gold}
    idx2pred = {item['qid']:item['Answer'] for item in pred}

    idx2scores = {}
    for id_, gold_ans in idx2gold.items():
        # Skip when prediction results not available
        if id_ not in idx2pred:
            continue
        pred_ans = idx2pred[id_]
        if isinstance(pred_ans, list):
            pred_ans = pred_ans[0]
        idx2scores[id_] = ans_score(pred_ans, gold_ans)

    if idx2scores:
        # Transpose [bleu, meteor, rouge] rows into one column per metric and
        # average each column once
        bleu_mean, meteor_mean, rouge_mean = (
            np.mean(metric_values) for metric_values in zip(*idx2scores.values())
        )
    else:
        # Nothing to average: report NaN explicitly rather than letting
        # np.mean([]) emit a RuntimeWarning with no context
        print("Warning: no qid is present in both files; metrics are NaN")
        bleu_mean = meteor_mean = rouge_mean = np.float64('nan')

    print({'BLEU': bleu_mean, 'METEOR': meteor_mean, 'ROUGE': rouge_mean})

    output = {}
    output['result'] = [
    {'test_split':
        {
        'BLEU-1': bleu_mean,
        'METEOR': meteor_mean,
        'ROUGE': rouge_mean
        }
    }
    ]

    return output

In [17]:
# Score the saved predictions against the validation gold answers.
# The third argument is not read inside evaluate(); here it only labels the run.
evaluate("validate.json","prediction_120.json", "ChatGPT4o-mini")

{'BLEU': 0.7354816502173205, 'METEOR': 0.6962335040048635, 'ROUGE': 0.7723241380264703}


{'result': [{'test_split': {'BLEU-1': 0.7354816502173205,
    'METEOR': 0.6962335040048635,
    'ROUGE': 0.7723241380264703}}]}