In [1]:
%load_ext dotenv
%dotenv

In [2]:
import openai
import os
import pandas as pd
import stanza
import time
from datasets import load_dataset
from tqdm import tqdm

In [3]:
# Only needs to be run once: downloads the English Stanza models.
# stanza.download('en')

In [42]:
# Load the CommonsenseQA split named by `split_name`. `split_name` is also
# reused further down when building the input/output CSV file names.
split_name = 'test'
en_csqa = load_dataset('commonsense_qa', split=split_name)

Found cached dataset commonsense_qa (/mnt/nas2/kikiputri/cache/commonsense_qa/default/1.0.0/28d68f56649a7f0c23bc68eae850af914aa03f95f810011ae8cf58cc5ff5051b)


### Concept Relevancy Classifier

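#### Post-Process

Filters the questions using the step-1 relevancy CSV that the *Model Predict* section below writes, so that section must have been run at least once beforehand.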

In [61]:
# English Stanza pipeline restricted to tokenization and lemmatization; it is
# called by `get_options_lemma` to lemmatize answer options.
# NOTE(review): `device=7` is a hard-coded device index — confirm it exists on
# the machine running this notebook.
nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma', device=7)

2023-07-07 17:28:20 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-07-07 17:28:20 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| lemma     | combined |

2023-07-07 17:28:20 INFO: Using device: 7
2023-07-07 17:28:20 INFO: Loading: tokenize
2023-07-07 17:28:20 INFO: Loading: lemma
2023-07-07 17:28:20 INFO: Done loading processors!


In [9]:
def get_options_lemma(text):
    """Return `text` with every word replaced by its lemma, space-separated.

    The text is run through the module-level Stanza pipeline `nlp`; lemmas are
    collected sentence by sentence, in word order, and joined with one space.
    """
    lemmatized_words = []
    for sentence in nlp(text).sentences:
        for token in sentence.words:
            lemmatized_words.append(token.lemma)
    return ' '.join(lemmatized_words)

In [51]:
# Read the step-1 relevancy table for `lang_name` (presumably a language code).
# The first CSV column becomes the index (`index_col=0`); the filtering loop
# below looks rows up by question id through that index.
# NOTE: `out_parent_dir` is reassigned by later cells, so its value depends on
# which cell ran last.
out_parent_dir = "/mnt/nas2/kikiputri/id-csqa/dataset/relevancy/"
lang_name = "su"
relevancy_df = pd.read_csv(f"{out_parent_dir}{split_name}_{lang_name}_step1.csv", index_col=0)

In [52]:
# Preview the relevancy table that was just read from CSV.
relevancy_df

Unnamed: 0_level_0,q_concept,option_a,option_b,option_c,option_d,option_e,names,answer
q_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
90b30172e645ff91f7171a048582eb8b,yes,,,,,,[],
000990552527b1353f98f1e1a7dfc643,no,,,,,,[],
dca0f2859f3c3dd43a9b2bfeff4936a8,yes,,,,,,[],
8795a949b39702af0e452c9e1229046d,yes,,,,,,[],
1f74ea1f73b9f5d91a665b4d90218a6e,yes,,,,,,[],
...,...,...,...,...,...,...,...,...
3abf430c8338c3a4cdaa3e26b96bcae2,yes,,,,,,[],
fb46652b6016be675e301fafe03222f3,yes,,,,,,[],
27a3f39930a7383a9723897eb0e88f20,no,,,,,,"[('John', 'no'), ('Johnson', 'no'), ('Johnson'...",
c9a82c294ae81ca5f2b4dd7f4c031310,yes,,,,,,[],


In [53]:
# Terms used to drop questions from the general set. The filtering loop tests
# each term with a plain substring check against the lower-cased question and
# option text, so short terms such as "sex" also match inside longer words.
excluded_concepts = [
    "sex", "vagina", "penis", "prostitute", "kiss", "killing people", "christian", "islam",
    "committing murder", "affair", "alcohol", "church", "mosque", "drug dealer", "gay",
    "lesbian", "lgbt", "lgbtq"
]

In [54]:
# Keep only the questions that pass every relevancy, offensiveness and
# ambiguity check.
general_questions = []
for item in tqdm(en_csqa):
    q_id = item['id']

    # The `names` cell holds the text form of a list of (name, verdict) pairs;
    # any quoted 'no' inside it marks the question as having an irrelevant name.
    is_name_irrelevant = "'no'" in relevancy_df.loc[q_id, 'names']

    # An option explicitly judged 'no' disqualifies the question.
    option_verdicts = [
        relevancy_df.loc[q_id, column]
        for column in ('option_a', 'option_b', 'option_c', 'option_d', 'option_e')
    ]
    is_option_irrelevant = 'no' in option_verdicts

    # The question concept itself must have been judged 'yes'.
    is_concept_relevant = relevancy_df.loc[q_id, 'q_concept'] == 'yes'

    # Offensive if any excluded term occurs in the question or in the options.
    option_texts = item['choices']['text']
    searched_texts = (item['question'].lower(), ' '.join(option_texts).lower())
    is_offensive = any(
        term in text for text in searched_texts for term in excluded_concepts
    )

    # Ambiguous if two options collapse to the same lemmatized string.
    lemmatized_options = [get_options_lemma(option) for option in option_texts]
    is_options_ambiguous = len(set(lemmatized_options)) < len(lemmatized_options)

    rejected = (
        is_name_irrelevant or is_option_irrelevant or not is_concept_relevant
        or is_offensive or is_options_ambiguous
    )
    if not rejected:
        general_questions.append(item)

100%|██████████| 1140/1140 [00:36<00:00, 30.84it/s]


In [55]:
# Number of questions that passed every filter.
len(general_questions)

621

In [56]:
# Tabulate the retained questions so they can be inspected and saved as CSV.
general_q_df = pd.DataFrame(general_questions)

In [57]:
# Preview the retained questions.
general_q_df

Unnamed: 0,id,question,question_concept,choices,answerKey
0,90b30172e645ff91f7171a048582eb8b,"The townhouse was a hard sell for the realtor,...",townhouse,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",
1,dca0f2859f3c3dd43a9b2bfeff4936a8,What were the kids doing as they looked up at ...,kids,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",
2,8795a949b39702af0e452c9e1229046d,The person taught an advanced class only for who?,person,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",
3,1f74ea1f73b9f5d91a665b4d90218a6e,What is a likely consequence of ignorance of r...,ignorance,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",
4,0b7734f608c188350573247e3ef2a00d,After graduating the dentist set up his dental...,dental office,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",
...,...,...,...,...,...
616,815a8367d08a14f150a6c777ad7f789a,Where would you put a laptop computer if you w...,computer,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",
617,9082b65f2bc5328ea991f734f930ddb5,"If children were in a gym, would they be doing?",children,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",
618,3abf430c8338c3a4cdaa3e26b96bcae2,What is a place where people live that has dis...,dishes,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",
619,fb46652b6016be675e301fafe03222f3,"The situation was causing anger, but his wife ...",anger,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",


In [58]:
# Write the retained questions to the `data_result/` sub-directory without the
# positional index.
# NOTE: this rebinds `out_parent_dir` to a different directory than the one used
# when reading the step-1 CSV.
out_parent_dir = "/mnt/nas2/kikiputri/id-csqa/dataset/relevancy/data_result/"
general_q_df.to_csv(f"{out_parent_dir}{split_name}_general_{lang_name}.csv", index=False)

#### Model Predict

In [None]:
# English Stanza pipeline with tokenization and named-entity recognition; it is
# called by `extract_names` and `extract_locations`.
# NOTE(review): `device=7` is a hard-coded device index — confirm it exists on
# the machine running this notebook.
ner_pipeline = stanza.Pipeline('en', processors='tokenize,ner', device=7)

In [None]:
def extract_names(sentence):
    """Return the text of every entity in `sentence` tagged as PERSON.

    Entities come from the module-level `ner_pipeline` and are returned in the
    order the pipeline reports them.
    """
    person_mentions = []
    for entity in ner_pipeline(sentence).ents:
        if entity.type == 'PERSON':
            person_mentions.append(entity.text)
    return person_mentions


def extract_locations(sentence):
    """Return the text of every entity in `sentence` tagged as GPE or LOC.

    Entities come from the module-level `ner_pipeline` and are returned in the
    order the pipeline reports them.
    """
    location_mentions = []
    for entity in ner_pipeline(sentence).ents:
        if entity.type in ('GPE', 'LOC'):
            location_mentions.append(entity.text)
    return location_mentions

In [None]:
# Credentials are read from environment variables rather than hard-coded.
openai.api_key = os.environ['OPENAI_API_KEY']
openai.organization = os.environ['OPENAI_UILAB_KEY']

# Prompt -> response cache read and written by `get_openai_relevancy`.
# Create it only when it does not exist yet: a fresh kernel gets an empty cache
# (instead of a NameError on the first request), while re-running this cell
# keeps the answers that were already cached.
try:
    response_history
except NameError:
    response_history = {}

In [None]:
def get_input_prompt(concept, concept_type, location_name):
    """Build the yes/no relevancy question sent to the chat model.

    Args:
        concept: The text being asked about (a concept, an option or a name).
        concept_type: 'location' asks whether the concept is located in the
            region, 'name' asks whether the name is common there; any other
            value produces the generic "commonly found" question.
        location_name: Name of the region the question refers to.

    Returns:
        The question followed by the instruction to answer only 'yes' or 'no'.
    """
    if concept_type == 'location':
        question = f"Is {concept} located in {location_name}?"
    elif concept_type == 'name':
        question = f"Is the name \"{concept}\" common in {location_name}?"
    else:
        # NOTE(review): "Does ... commonly found" is ungrammatical, but the
        # wording is kept unchanged because the prompt text is also the key of
        # the response cache and of the saved history file.
        question = f"Does {concept} commonly found in {location_name}?"

    return f"{question} Answer with only 'yes' or 'no'."


def get_openai_chat_completion(input_prompt, model_name, temp=0.2):
    """Send `input_prompt` as a single user message and return the completion.

    Args:
        input_prompt: Text placed in the only message of the conversation.
        model_name: Value passed as the `model` of the request.
        temp: Sampling temperature passed to the request (default 0.2).

    Returns:
        Whatever `openai.ChatCompletion.create` returns for the request.
    """
    user_message = {'role': 'user', 'content': input_prompt}
    request_kwargs = {
        'model': model_name,
        'messages': [user_message],
        'temperature': temp,
    }
    return openai.ChatCompletion.create(**request_kwargs)


def get_openai_relevancy(input_prompt, model_name, max_retries=5, wait_seconds=60):
    """Return the model's lower-cased answer to `input_prompt`, using a cache.

    Answers are memoized in the module-level dict `response_history` (which
    must exist before the first call), keyed by the exact prompt text, so a
    repeated prompt never triggers a second request.

    Args:
        input_prompt: The question to send.
        model_name: Chat model to query.
        max_retries: How many times a rate-limited or unavailable request is
            retried before the error is re-raised.
        wait_seconds: Pause between attempts, in seconds.

    Returns:
        The stripped, lower-cased reply; the exact replies "yes." and "no."
        are normalized to "yes" and "no".

    Raises:
        openai.error.RateLimitError, openai.error.ServiceUnavailableError:
            if the request still fails after `max_retries` retries.
    """
    if input_prompt in response_history:
        return response_history[input_prompt]

    transient_errors = (
        openai.error.RateLimitError,
        openai.error.ServiceUnavailableError,
    )
    for attempt in range(max_retries + 1):
        try:
            completion = get_openai_chat_completion(input_prompt, model_name)
            break
        except transient_errors:
            # Give up only after the last allowed attempt; otherwise wait and
            # retry so one transient failure does not abort a long run.
            if attempt == max_retries:
                raise
            time.sleep(wait_seconds)

    response = completion.choices[0].message.content.strip().lower()

    if response in ["yes.", "no."]:
        response = response.replace(".", "")

    response_history[input_prompt] = response

    return response

In [None]:
# Inspect the first dataset item.
en_csqa[0]

In [None]:
# Ask the chat model, for every question, whether its concept, its
# location-like options and the person names it mentions are relevant to the
# target region. Results are collected column-wise in `relevancy_data`.
model_name = "gpt-3.5-turbo"
target_region = "West Java"
option_idxs = ['option_a', 'option_b', 'option_c', 'option_d', 'option_e']
relevancy_data = {
    column: [] for column in ['q_id', 'q_concept', *option_idxs, 'names', 'answer']
}

for item in tqdm(en_csqa):
    relevancy_data['q_id'].append(item['id'])

    # Question concept: always queried.
    concept_prompt = get_input_prompt(item['question_concept'], "other", target_region)
    relevancy_data['q_concept'].append(get_openai_relevancy(concept_prompt, model_name))

    # Options: queried only when NER finds a location in the option text;
    # otherwise the cell is left empty (None).
    for option_idx, choice in zip(option_idxs, item['choices']['text']):
        option_rel = None
        if len(extract_locations(choice)) > 0:
            option_prompt = get_input_prompt(choice, "location", target_region)
            option_rel = get_openai_relevancy(option_prompt, model_name)
        relevancy_data[option_idx].append(option_rel)

    # Person names found in the question, each paired with its verdict.
    names_rel = [
        (name, get_openai_relevancy(get_input_prompt(name, "name", target_region), model_name))
        for name in extract_names(item['question'])
    ]
    relevancy_data['names'].append(names_rel)

    relevancy_data['answer'].append(item['answerKey'])

In [None]:
# Turn the collected per-question results into a DataFrame.
# NOTE: this reuses the name `relevancy_df`, which the Post-Process section
# also assigns when it reads the step-1 CSV.
relevancy_df = pd.DataFrame(relevancy_data)

In [None]:
# Preview the freshly collected relevancy results.
relevancy_df

In [None]:
# Save the step-1 relevancy results. With `index=False` the first CSV column is
# `q_id`, which the Post-Process section reads back as the index (`index_col=0`).
# NOTE: the "su" part of the file name is hard-coded here, whereas the reading
# side builds it from `lang_name`.
out_parent_dir = "/mnt/nas2/kikiputri/id-csqa/dataset/relevancy/"
relevancy_df.to_csv(out_parent_dir + split_name + "_su_step1.csv", index=False)

In [None]:
# Tabulate the prompt/response cache: one row per cached prompt.
resp_history_df = pd.DataFrame({'prompt': response_history.keys(), 'response': response_history.values()})

In [None]:
# Preview the cached prompts and responses.
resp_history_df

In [None]:
# Save the prompt/response cache as CSV. `out_parent_dir` is whatever the most
# recently executed cell assigned to it.
resp_history_df.to_csv(out_parent_dir + "gpt-3.5-history-su-230704.csv", index=False)