In [1]:
%load_ext dotenv
%dotenv

In [2]:
import openai
import os
import requests
import pandas as pd
import stanza
import time
from datasets import load_dataset
from tqdm import tqdm

In [3]:
# Only needs to be run one time
# stanza.download('en', model_dir=os.environ['HF_HOME'])

In [4]:
# Dataset split handled by this run; the same name is reused in the output file names below.
split_name = "test"
en_csqa = load_dataset("commonsense_qa", split=split_name)

### Concept Relevancy Classifier

#### Post-Process

In [None]:
# English Stanza pipeline limited to tokenization + lemmatization, with models looked up under HF_HOME.
# NOTE(review): download_method=None presumably prevents Stanza from re-downloading models — confirm.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize,lemma",
    device=0,
    model_dir=os.environ["HF_HOME"],
    download_method=None,
)

In [None]:
def get_options_lemma(text):
    """Return the lemmas of all words in ``text`` joined by single spaces.

    ``text`` is run through the module-level ``nlp`` pipeline; the ``lemma``
    of every word of every sentence is collected in document order.
    """
    parsed = nlp(text)
    pieces = []
    for sentence in parsed.sentences:
        for token in sentence.words:
            pieces.append(token.lemma)
    return ' '.join(pieces)

In [None]:
out_parent_dir = "/mnt/nas2/kikiputri/id-csqa/dataset/relevancy_ensemble/"
lang_name = "su"


def _parse_q_concept(cell):
    """Turn a stringified list such as "['yes', 'no']" back into a list of strings."""
    # Drop the surrounding brackets, remove the quote characters, then split on ", ".
    inner = cell[1:-1]
    return inner.replace("'", '').split(', ')


# The first CSV column becomes the index, so rows can be looked up by that value via .loc.
relevancy_df = pd.read_csv(
    f"{out_parent_dir}{split_name}_{lang_name}_relevancy.csv",
    index_col=0,
    converters={'q_concept': _parse_q_concept},
)

In [None]:
# Display the loaded relevancy table for a visual check.
relevancy_df

In [None]:
# Terms that disqualify a question; the filtering loop matches them as substrings
# of the lower-cased question text and of the lower-cased, space-joined options.
excluded_concepts = [
    "sex",
    "vagina",
    "penis",
    "prostitute",
    "kiss",
    "copulating",
    "procreating",
    "killing people",
    "committing murder",
    "affair",
    "drug dealer",
    "terrorists",
    "terrorist",
]

In [None]:
# Sort every question into one of two buckets:
#   general_questions  - no person names, no location options, >= 4 "yes" concept votes
#   rephrase_questions - >= 4 "no" concept votes, or a person name, or a location option
# Questions hitting an excluded concept or with duplicate (lemmatized) options are skipped,
# and so are questions that satisfy neither bucket's condition.
general_questions, rephrase_questions = [], []
option_columns = ['option_a', 'option_b', 'option_c', 'option_d', 'option_e']
for item in tqdm(en_csqa):
    q_id = item['id']

    # Offensiveness filter: substring match against question text and joined options
    question_lower = item['question'].lower()
    options_str = ' '.join(item['choices']['text']).lower()
    if any(ex in question_lower or ex in options_str for ex in excluded_concepts):
        continue

    # Ambiguity filter: two options collapsing to the same lemma string
    options_lemma = [get_options_lemma(option) for option in item['choices']['text']]
    if len(set(options_lemma)) < len(options_lemma):
        continue

    # 'names' is compared as text; anything other than the literal "[]" counts as a name hit
    is_name_irrelevant = relevancy_df.loc[q_id, 'names'] != "[]"

    # An option marked 'no' is an irrelevant option
    option_flags = [relevancy_df.loc[q_id, column] for column in option_columns]
    is_option_irrelevant = 'no' in option_flags

    # Tally the per-prompt votes on the question concept
    concept_votes = relevancy_df.loc[q_id, 'q_concept']
    yes_count = concept_votes.count('yes')
    no_count = concept_votes.count('no')

    has_irrelevancy = is_name_irrelevant or is_option_irrelevant
    if yes_count >= 4 and not has_irrelevancy:
        general_questions.append(item)
    elif no_count >= 4 or has_irrelevancy:
        # Record which checks triggered the rephrase decision
        item['concept'] = no_count >= 4
        item['name'] = is_name_irrelevant
        item['option'] = is_option_irrelevant
        rephrase_questions.append(item)

In [None]:
# Bucket sizes next to the total number of questions in the split.
len(general_questions), len(rephrase_questions), len(en_csqa)

In [None]:

# Share of the split (in percent, two decimals) that went to the general / rephrase buckets.
tuple(
    round(len(bucket) / len(en_csqa) * 100, 2)
    for bucket in (general_questions, rephrase_questions)
)

In [None]:
# One DataFrame per bucket, one row per question dict.
q_df_general, q_df_rephrase = (
    pd.DataFrame(rows) for rows in (general_questions, rephrase_questions)
)

In [None]:
# Display the questions flagged for rephrasing.
q_df_rephrase

In [None]:
# Write each bucket to "<split>_<bucket>_<lang>.csv" in the data_result directory.
out_parent_dir = "/mnt/nas2/kikiputri/id-csqa/dataset/relevancy_ensemble/data_result/"
for bucket_name, bucket_df in (("general", q_df_general), ("rephrase", q_df_rephrase)):
    bucket_df.to_csv(f"{out_parent_dir}{split_name}_{bucket_name}_{lang_name}.csv", index=False)

#### Stats

In [None]:
import os
import numpy as np

In [None]:
# Empty statistics table indexed by (language, split); the cells start out as NaN.
stat_splits = ['train', 'validation', 'test']
stat_langs = ['id', 'su']
lang_index = [lang_code for lang_code in stat_langs for _ in stat_splits]
split_index = [split_label for _ in stat_langs for split_label in stat_splits]
data_stat = pd.DataFrame(
    index=[np.array(lang_index), np.array(split_index)],
    columns=['general', 'rephrase', 'irr_concept', 'irr_location', 'irr_names'],
)

In [None]:
# Display the statistics table.
data_stat

In [None]:
# Fill data_stat from the exported "<split>_<q_type>_<lang>.csv" files.
out_parent_dir = "/mnt/nas2/kikiputri/id-csqa/dataset/relevancy_ensemble/data_result/"

# Flag column in a rephrase CSV -> statistics column that counts its True values.
irrelevancy_columns = {'concept': 'irr_concept', 'option': 'irr_location', 'name': 'irr_names'}

for file_name in tqdm(os.listdir(out_parent_dir)):
    name_only, extension = os.path.splitext(file_name)
    name_parts = name_only.split('_')
    # Ignore anything that is not a "<split>_<q_type>_<lang>.csv" export
    # (other files in the directory would otherwise break the unpacking below).
    if extension != '.csv' or len(name_parts) != 3:
        continue
    split, q_type, lang = name_parts

    dat = pd.read_csv(os.path.join(out_parent_dir, file_name))

    # Single .loc call with a (row-tuple, column) key: the chained form
    # data_stat.loc[lang, split][q_type] = ... may assign to a temporary row
    # and leave data_stat untouched.
    data_stat.loc[(lang, split), q_type] = len(dat)

    if q_type == 'rephrase':
        for flag_column, stat_column in irrelevancy_columns.items():
            data_stat.loc[(lang, split), stat_column] = dat[flag_column].tolist().count(True)

In [None]:
# Display the statistics table.
data_stat

In [None]:
# Per-column totals over all rows under the 'id' language label.
data_stat.loc['id'].sum(axis=0)

In [None]:
# Per-column totals over all rows under the 'su' language label.
data_stat.loc['su'].sum(axis=0)

#### Model Predict

In [None]:
out_parent_dir = "/mnt/nas2/kikiputri/id-csqa/dataset/relevancy_ensemble/"
lang_name = "su"
location_context = "West Java"

# Seed the ConceptNet cache from a saved history file, keeping only keys without spaces.
# NOTE(review): the file name hard-codes "id" while lang_name is "su" — confirm this reuse is intended.
loc_history_df = pd.read_csv(out_parent_dir + "conceptnet-api-history-id-230718-25.csv")
loc_response_history = {
    prompt: response
    for prompt, response in zip(loc_history_df.prompt, loc_history_df.response)
    if ' ' not in prompt
}

# URI segments that mark the end node of an IsA edge as a location
location_concepts = ['administrative_region', 'country', 'city', 'province']
# Generic place words that are never treated as locations
excluded = ['city', 'town', 'park', 'country', 'province', 'countryside', 'village']

def is_conceptnet_location(option, timeout=30):
    """Tell whether ConceptNet lists ``option`` as a kind of location.

    The option's whitespace-separated words are joined with underscores to form
    the lookup key. Keys listed in ``excluded`` are never locations. Otherwise the
    answer comes from ``loc_response_history`` when cached, or from the ConceptNet
    ``/r/IsA`` edges of ``/c/en/<key>``: the option is a location when any edge's
    end-node URI contains one of ``location_concepts`` as a path segment. Fresh
    answers are stored in ``loc_response_history``.

    Args:
        option: Answer-option text.
        timeout: Seconds to wait for the ConceptNet API before giving up.

    Returns:
        The cached value, or a bool computed from the API response.

    Raises:
        requests.RequestException: on timeout, connection failure or an HTTP
            error status. Nothing is cached in that case, so a failed lookup is
            not remembered as "not a location".
    """
    from urllib.parse import quote

    option = '_'.join(option.split())

    if option in excluded:
        return False

    if option in loc_response_history:
        return loc_response_history[option]

    # Escape the key so characters such as '&' or '#' cannot corrupt the query string.
    res = requests.get(
        f"https://api.conceptnet.io/query?node=/c/en/{quote(option, safe='')}&rel=/r/IsA",
        timeout=timeout,
    )
    res.raise_for_status()

    is_location = any(
        concept in edge['end']['@id'].split('/')
        for edge in res.json()['edges']
        for concept in location_concepts
    )
    loc_response_history[option] = is_location
    return is_location

In [None]:
# English Stanza pipeline limited to tokenization + named-entity recognition.
ner_pipeline = stanza.Pipeline(
    'en',
    processors="tokenize,ner",
    device=0,
    model_dir=os.environ["HF_HOME"],
    download_method=None,
)

In [None]:
def extract_names(sentence):
    """Return the text of every entity in ``sentence`` whose NER type is PERSON."""
    person_mentions = []
    for entity in ner_pipeline(sentence).ents:
        if entity.type == 'PERSON':
            person_mentions.append(entity.text)
    return person_mentions


def extract_locations(sentence):
    """Return the text of every entity in ``sentence`` whose NER type is GPE or LOC."""
    location_mentions = []
    for entity in ner_pipeline(sentence).ents:
        if entity.type in ('GPE', 'LOC'):
            location_mentions.append(entity.text)
    return location_mentions

In [None]:
# Credentials come from the environment, never from the notebook itself.
openai.api_key = os.environ['OPENAI_API_KEY']
openai.organization = os.environ['OPENAI_UILAB_KEY']

# Prompt -> response cache loaded from the saved history file for this language.
resp_history_df = pd.read_csv(f"{out_parent_dir}gpt-3.5-history-{lang_name}-230718-25.csv")
response_history = {
    prompt: response
    for prompt, response in zip(resp_history_df.prompt, resp_history_df.response)
}

In [None]:
def get_input_prompts(concept, concept_type, location_name):
    """Build the five yes/no relevancy prompts for ``concept`` in ``location_name``.

    ``concept_type`` is accepted but not used. The wording is kept verbatim
    because ``get_openai_relevancy`` uses the full prompt text as its cache key.

    Returns:
        A list of five prompt strings, each ending with the answer instruction.
    """
    suffix = "Answer with only 'yes' or 'no'."
    questions = (
        f"Does {concept} commonly found in {location_name}?",
        f"Does people in {location_name} familiar with {concept}?",
        f"Can you find {concept} in {location_name}?",
        f"Is {concept} culturally relevant in {location_name}?",
        f"Suppose you are a person who live in {location_name}. Are you familiar with {concept}?",
    )
    return [f"{question} {suffix}" for question in questions]


def get_openai_chat_completion(input_prompt, model_name, temp=0.2):
    """Send ``input_prompt`` as a single user message and return the raw completion.

    Args:
        input_prompt: Text placed in the user message.
        model_name: Model identifier passed to the API.
        temp: Sampling temperature (default 0.2).
    """
    user_message = {'role': 'user', 'content': input_prompt}
    return openai.ChatCompletion.create(
        model=model_name,
        messages=[user_message],
        temperature=temp,
    )


def get_openai_relevancy(input_prompt, model_name):
    """Return the model's normalized answer to ``input_prompt``, using the cache first.

    A prompt already present in ``response_history`` is answered from it.
    Otherwise the model is queried; the reply is stripped and lower-cased,
    "yes." / "no." lose their trailing period, and the result is cached.
    """
    if input_prompt not in response_history:
        try:
            completion = get_openai_chat_completion(input_prompt, model_name)
        except Exception:
            # Wait a minute and retry once; a second failure propagates to the caller.
            time.sleep(60)
            completion = get_openai_chat_completion(input_prompt, model_name)

        answer = completion.choices[0].message.content.strip().lower()
        if answer in ("yes.", "no."):
            answer = answer[:-1]

        response_history[input_prompt] = answer

    return response_history[input_prompt]

In [None]:
# Display the first dataset item to show its fields.
en_csqa[0]

In [None]:
model_name = "gpt-3.5-turbo"
option_idxs = ['option_a', 'option_b', 'option_c', 'option_d', 'option_e']

# Column-oriented accumulator; key order fixes the column order of the resulting table.
relevancy_data = {
    column: [] for column in ['q_id', 'question', 'q_concept', *option_idxs, 'names']
}

for item in tqdm(en_csqa):
    relevancy_data['q_id'].append(item['id'])
    relevancy_data['question'].append(item['question'])

    # One model answer per prompt variant for the question concept
    concept_votes = []
    for prompt in get_input_prompts(item['question_concept'], "other", location_context):
        concept_votes.append(get_openai_relevancy(prompt, model_name))
    relevancy_data['q_concept'].append(concept_votes)

    # "no" means irrelevant (the option is a location); NER is tried first, ConceptNet second
    for option_idx, choice in zip(option_idxs, item['choices']['text']):
        has_location = bool(extract_locations(choice)) or is_conceptnet_location(choice)
        relevancy_data[option_idx].append("no" if has_location else "yes")

    # Person names mentioned in the question text
    relevancy_data['names'].append(extract_names(item['question']))

In [None]:
# Turn the column lists collected above into a table (one row per question).
relevancy_df = pd.DataFrame(relevancy_data)

In [None]:
# Display the freshly built relevancy table.
relevancy_df

In [None]:
# Persist the relevancy table as "<split>_<lang>_relevancy.csv" (no index column is written).
out_parent_dir = "/mnt/nas2/kikiputri/id-csqa/dataset/relevancy_ensemble/"
relevancy_df.to_csv(f"{out_parent_dir}{split_name}_{lang_name}_relevancy.csv", index=False)

In [None]:
# Snapshot both caches as two-column (prompt, response) tables.
resp_history_df = pd.DataFrame({
    'prompt': list(response_history.keys()),
    'response': list(response_history.values()),
})
loc_history_df = pd.DataFrame({
    'prompt': list(loc_response_history.keys()),
    'response': list(loc_response_history.values()),
})

In [None]:
# Display the model prompt/response cache table.
resp_history_df

In [None]:
# Display the ConceptNet lookup cache table.
loc_history_df

In [None]:
# Write both caches back to disk under the current language's file names.
history_suffix = f"{lang_name}-230718-25.csv"
resp_history_df.to_csv(f"{out_parent_dir}gpt-3.5-history-{history_suffix}", index=False)
loc_history_df.to_csv(f"{out_parent_dir}conceptnet-api-history-{history_suffix}", index=False)

In [None]:
# NOTE(review): this cell reads a line from the user via input() and echoes it; it does not
# use any of the pipeline's data. input() waits for keyboard entry, so an unattended
# "Run All" will pause here — confirm whether the cell is still needed.
name = input('What is your name?')
print("Hello", name)