In [2]:
%load_ext dotenv
%dotenv

In [3]:
import openai
import os
import requests
import pandas as pd
import stanza
import time
from datasets import load_dataset
from tqdm import tqdm
from pathlib import Path

In [None]:
# Only needs to be run one time
# stanza.download('en', model_dir=os.environ['HF_HOME'])

In [None]:
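# NOTE: the "Model Predict" cells further down read `en_csqa` and `split_name`.
# Uncomment and run these two lines (choosing the split you need) before running that
# section; otherwise it silently uses whatever an earlier loop left in the kernel.
# split_name = 'train'
# en_csqa = load_dataset('commonsense_qa', split=split_name)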

### Concept Relevancy Classifier

#### Post-Process

In [None]:
from nltk.corpus import wordnet as wn

In [None]:
wn.synsets('republican')[0].pos()

In [None]:
# Memo of every concept classified so far, keyed by its underscore-joined form.
adj_response_history = {}
# WordNet part-of-speech tags that count as "adjective or adverb".
adj_and_adv = [wn.ADJ, wn.ADJ_SAT, wn.ADV]


def is_adj_or_adv(concept):
    """Return True when WordNet's first synset for ``concept`` is an adjective or adverb.

    Whitespace-separated words are joined with underscores before the WordNet
    lookup. A concept with no synsets at all yields ``False``. Every answer is
    stored in ``adj_response_history`` and reused on later calls, so a concept
    is only looked up once.
    """
    concept = '_'.join(concept.split())

    # Serve repeated concepts from the memo instead of querying WordNet again.
    if concept in adj_response_history:
        return adj_response_history[concept]

    synsets = wn.synsets(concept)
    # Only the first synset is consulted; an empty result is never adj/adv.
    is_adj = bool(synsets) and synsets[0].pos() in adj_and_adv

    adj_response_history[concept] = is_adj
    return is_adj

In [None]:
# English Stanza pipeline with the tokenize and lemma processors; models are read from
# the directory named by the HF_HOME environment variable.
# NOTE(review): download_method=None presumably stops Stanza from fetching models at
# start-up, so the stanza.download cell must have been run once beforehand — confirm.
nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma', device=0, model_dir=os.environ['HF_HOME'], download_method=None)

In [None]:
def get_options_lemma(text):
    """Run ``text`` through the ``nlp`` pipeline and return its lemmas joined by single spaces."""
    lemmas = []
    for sentence in nlp(text).sentences:
        lemmas.extend(word.lemma for word in sentence.words)
    return ' '.join(lemmas)

In [None]:
# Ask ConceptNet for the edges around the English "body_part" node and collect the
# start node of each edge as a plain-text term.
res = requests.get("https://api.conceptnet.io/query?node=/c/en/body_part&limit=5000")
obj = res.json()

# Strip the English prefix; anything that still contains a '/' is discarded.
start_terms = (edge['start']['@id'].replace('/c/en/', '') for edge in obj['edges'])
body_parts = {term.replace('_', ' ') for term in start_terms if '/' not in term}

In [None]:
# Terms that disqualify a question: the filtering loop drops any item whose lower-cased
# question or joined option text contains one of these as a substring.
# NOTE(review): matching is by substring, so e.g. "sex" also matches inside longer
# words — confirm that is intended.
excluded_concepts = [
    "sex", "vagina", "penis", "prostitute", "kiss", "copulating", "procreating",
    "killing people", "committing murder", "affair", "drug dealer", "terrorists", "terrorist",
]
# Concepts treated as "general": an item whose question_concept contains one of these
# as a substring is considered general. The list is extended with the body-part terms
# collected from ConceptNet.
# NOTE(review): substring matching means "thing" also matches "something" — confirm.
general_concepts = [
    "human", "animal", "plant", "thing", "everyone", "people", "person"
] + list(body_parts)

In [None]:
# Split every CommonsenseQA item into a "general" set (usable as-is) and a "rephrase" set
# (needs rewriting) for each target language, and write both sets to CSV.
out_parent_dir = "../dataset/relevancy_ensemble/"
option_columns = ['option_a', 'option_b', 'option_c', 'option_d', 'option_e']

# DataFrame.to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(f"{out_parent_dir}data_result/", exist_ok=True)

for split_name in ["train", "validation", "test"]:
    en_csqa = load_dataset('commonsense_qa', split=split_name)

    # The offensiveness, option-ambiguity and generality checks only look at the English
    # item, so they run once per split instead of once per language.
    candidate_items = []
    for item in tqdm(en_csqa, desc=f"{split_name}: filtering"):
        # Check q concept offensiveness (substring match on question and options)
        question_lower = item['question'].lower()
        options_str = ' '.join(item['choices']['text']).lower()
        is_offensive = any(ex in question_lower or ex in options_str for ex in excluded_concepts)
        if is_offensive:
            continue

        # Check options ambiguity: two options that lemmatise to the same text
        options_lemma = [get_options_lemma(option) for option in item['choices']['text']]
        is_options_ambiguous = len(set(options_lemma)) != len(options_lemma)
        if is_options_ambiguous:
            continue

        # Check concept generality
        is_general_concepts = any(c in item['question_concept'] for c in general_concepts) or is_adj_or_adv(item['question_concept'])

        candidate_items.append((item, is_general_concepts))

    for lang_name in ["id", "su"]:
        # The first CSV column (q_id) becomes the index; q_concept is stored as the repr
        # of a list of strings and is parsed back into a list here.
        relevancy_df = pd.read_csv(f"{out_parent_dir}{split_name}_{lang_name}_relevancy.csv", index_col=0, converters={'q_concept': lambda x: x[1:-1].replace("'", '').split(', ')})

        general_questions, rephrase_questions = [], []
        for item, is_general_concepts in candidate_items:
            row = relevancy_df.loc[item['id']]

            # Check name relevancy: any extracted person name makes the item irrelevant
            is_name_irrelevant = row['names'] != "[]"

            # Check options relevancy: a "no" marks an option as irrelevant
            is_option_irrelevant = 'no' in [row[column] for column in option_columns]

            # Check q concept relevancy
            yes_count = row['q_concept'].count('yes')
            no_count = row['q_concept'].count('no')

            if not is_name_irrelevant and not is_option_irrelevant and (is_general_concepts or yes_count >= 4):
                general_questions.append(item)
            elif (not is_general_concepts and no_count >= 4) or is_name_irrelevant or is_option_irrelevant:
                # Copy before adding the flags: the same item dict is reused for the
                # other language and must not carry these extra keys over.
                rephrase_questions.append({
                    **item,
                    'concept': no_count >= 4,
                    'name': is_name_irrelevant,
                    'option': is_option_irrelevant,
                })
            # Items matching neither rule are left out of both files.

        q_df_general = pd.DataFrame(general_questions)
        q_df_rephrase = pd.DataFrame(rephrase_questions)

        q_df_general.to_csv(f"{out_parent_dir}data_result/{split_name}_general_{lang_name}.csv", index=False)
        q_df_rephrase.to_csv(f"{out_parent_dir}data_result/{split_name}_rephrase_{lang_name}.csv", index=False)

#### Stats

In [1]:
import os
import numpy as np

In [4]:
# Row labels for the stats table: every (language, split) pair, language-major.
index_pairs = [(lang, split) for lang in ['id', 'su'] for split in ['train', 'validation', 'test']]
lang_index = [lang for lang, _ in index_pairs]
split_index = [split for _, split in index_pairs]

# Empty table to be filled per (language, split) row.
stat_columns = ['general', 'rephrase', 'irr_concept', 'irr_location', 'irr_names']
data_stat = pd.DataFrame(index=[np.array(lang_index), np.array(split_index)], columns=stat_columns)

In [5]:
# Fill data_stat from the result files, whose names follow "<split>_<q_type>_<lang>.csv".
out_parent_dir = "../dataset/relevancy_ensemble/data_result/"
# Flag column in a rephrase file -> column of data_stat that receives its True count.
irrelevancy_columns = {'concept': 'irr_concept', 'option': 'irr_location', 'name': 'irr_names'}

for file_name in tqdm(os.listdir(out_parent_dir)):
    # Skip anything that is not a CSV so stray directory entries cannot break the unpack below.
    if not file_name.endswith('.csv'):
        continue

    name_only = file_name.replace('.csv', '')
    split, q_type, lang = name_only.split('_')

    dat = pd.read_csv(out_parent_dir + file_name)

    # Assign with a single .loc call: chained indexing (`.loc[...][...] = ...`) may write
    # to a temporary copy and leave data_stat unchanged.
    data_stat.loc[(lang, split), q_type] = len(dat)

    if q_type == 'rephrase':
        for flag_column, stat_column in irrelevancy_columns.items():
            data_stat.loc[(lang, split), stat_column] = dat[flag_column].tolist().count(True)

100%|██████████| 12/12 [00:00<00:00, 66.92it/s]


In [6]:
data_stat

Unnamed: 0,Unnamed: 1,general,rephrase,irr_concept,irr_location,irr_names
id,train,6040,2249,652,984,1035
id,validation,714,294,99,125,126
id,test,702,249,82,108,109
su,train,4697,2700,1245,984,1035
su,validation,546,353,171,125,126
su,test,564,294,132,108,109


In [7]:
data_stat.loc['id'].sum(axis=0)

general         7456
rephrase        2792
irr_concept      833
irr_location    1217
irr_names       1270
dtype: object

In [8]:
data_stat.loc['su'].sum(axis=0)

general         5807
rephrase        3347
irr_concept     1548
irr_location    1217
irr_names       1270
dtype: object

In [9]:
# Load the test-split "general" file for `id` and the test-split "rephrase" file for `su`
# so the following cells can look up question ids that appear in both.
id_general = pd.read_csv('../dataset/relevancy_ensemble/data_result/test_general_id.csv')
su_irrelevant = pd.read_csv('../dataset/relevancy_ensemble/data_result/test_rephrase_su.csv')

In [14]:
id_general.loc[id_general['id'] == '8d916be530b91e6269b1d475601ae7ab']

Unnamed: 0,id,question,question_concept,choices,answerKey
3,8d916be530b91e6269b1d475601ae7ab,What will you experience after doing housework...,doing housework,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",


In [17]:
su_irrelevant[su_irrelevant['id'].isin(id_general['id'].tolist())]

Unnamed: 0,id,question,question_concept,choices,answerKey,concept,name,option
2,1f74ea1f73b9f5d91a665b4d90218a6e,What is a likely consequence of ignorance of r...,ignorance,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",,True,False,False
42,8e9852f85771fceacf387d727b0772e5,"He liked the car and decided to buy it, he was...",getting,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",,True,False,False
53,5db6f0b5c7b4ed600010157d629a2f5f,Falling doesn't cause injury. Injury is caus...,falling,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",,True,False,False
68,cf1d1041b22f588ea187aee2e9b22bb8,"A child wants to survive, what does he need to...",child,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",,True,False,False
69,5dfee150d0435fefa4b3a5c2d292e378,What is the benefit to you of waiting for a bus?,waiting for,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",,True,False,False
71,f56f9a720f1140037d3f967b1a5ecd34,The minimalist author looked at his written wo...,written,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",,True,False,False
88,4f8eae6183aaad0c9abea351287c7b03,What do children love to do with scissors and ...,children,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",,True,False,False
98,544d1957d61d1f97e52d77e5b28673ac,What does a child learn in school to do?,child,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",,True,False,False
142,e6e46bddd22054d29e5418ecf08f2909,Where can you a letter opener and a document s...,letter opener,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",,True,False,False
151,93c929a5e786422facac96d31b07b3b2,Where would you find a trunk containing a spar...,trunk,"{'label': ['A', 'B', 'C', 'D', 'E'], 'text': [...",,True,False,False


#### Model Predict

In [None]:
out_parent_dir = "../dataset/relevancy_ensemble/"
lang_name = "su"
location_context = "West Java"

# Previously recorded ConceptNet answers, keyed by prompt; prompts containing a space are skipped.
loc_history_df = pd.read_csv(out_parent_dir + "conceptnet-api-history.csv")
loc_response_history = {
    prompt: response
    for prompt, response in zip(loc_history_df.prompt, loc_history_df.response)
    if ' ' not in prompt
}

activity_history_df = pd.read_csv(out_parent_dir + "conceptnet-api-activity-history.csv")
activity_response_history = {
    prompt: response
    for prompt, response in zip(activity_history_df.prompt, activity_history_df.response)
    if ' ' not in prompt
}

# IsA targets that mark an option as a location, and options that are never treated as one.
location_concepts = ['administrative_region', 'country', 'city', 'province']
excluded = ['city', 'town', 'park', 'country', 'province', 'countryside', 'village']

def is_conceptnet_location(option):
    """Tell whether ConceptNet lists ``option`` as an instance of a location concept.

    The option is underscore-joined first. Options in ``excluded`` are always
    ``False``; options already in ``loc_response_history`` return the stored
    answer. Otherwise the IsA edges are fetched from the ConceptNet API, and the
    answer (True if any edge's end id contains one of ``location_concepts`` as a
    path segment) is stored in ``loc_response_history`` before being returned.
    """
    option = '_'.join(option.split())

    if option in excluded:
        return False

    try:
        return loc_response_history[option]
    except KeyError:
        pass  # not answered yet; ask the API

    res = requests.get(f"https://api.conceptnet.io/query?node=/c/en/{option}&rel=/r/IsA")
    wanted = set(location_concepts)
    is_location = any(
        not wanted.isdisjoint(edge['end']['@id'].split('/'))
        for edge in res.json()['edges']
    )

    loc_response_history[option] = is_location
    return is_location


def is_conceptnet_activity(concept):
    """Tell whether ``concept`` should be prompted as an activity.

    The concept is underscore-joined first and answered from
    ``activity_response_history`` when present. Otherwise its IsA edges are
    fetched from the ConceptNet API: a concept with no edges at all counts as an
    activity, as does one whose edge end id has an ``activity`` path segment.
    Each new answer is stored and the whole history is rewritten to
    ``conceptnet-api-activity-history.csv`` under ``out_parent_dir``.
    """
    concept = '_'.join(concept.split())

    try:
        return activity_response_history[concept]
    except KeyError:
        pass  # not answered yet; ask the API

    res = requests.get(f"https://api.conceptnet.io/query?node=/c/en/{concept}&rel=/r/IsA")
    edges = res.json()['edges']

    # No IsA edges at all is treated as "activity".
    is_activity = len(edges) == 0 or any(
        'activity' in edge['end']['@id'].split('/') for edge in edges
    )

    activity_response_history[concept] = is_activity
    history_table = pd.DataFrame({
        'prompt': list(activity_response_history.keys()),
        'response': list(activity_response_history.values()),
    })
    history_table.to_csv(out_parent_dir + "conceptnet-api-activity-history.csv", index=False)

    return is_activity

In [None]:
len(loc_response_history), len(activity_response_history)

In [None]:
# English Stanza pipeline with the tokenize and ner processors, used by extract_names and
# extract_locations; models are read from the directory named by HF_HOME.
# NOTE(review): download_method=None presumably disables automatic model download — confirm.
ner_pipeline = stanza.Pipeline('en', processors='tokenize,ner', device=0, model_dir=os.environ['HF_HOME'], download_method=None)

In [None]:
def extract_names(sentence):
    """Return the text of every entity that ``ner_pipeline`` tags as PERSON in ``sentence``."""
    names = []
    for ent in ner_pipeline(sentence).ents:
        if ent.type == 'PERSON':
            names.append(ent.text)
    return names


def extract_locations(sentence):
    """Return the text of every entity that ``ner_pipeline`` tags as GPE or LOC in ``sentence``."""
    locations = []
    for ent in ner_pipeline(sentence).ents:
        if ent.type == 'GPE' or ent.type == 'LOC':
            locations.append(ent.text)
    return locations

In [None]:
# Credentials come from the environment (loaded by the dotenv cell at the top).
openai.api_key = os.environ['OPENAI_API_KEY']
openai.organization = os.environ['OPENAI_UILAB_KEY']

# Prompt -> response cache of earlier chat-model calls for this language, if one exists.
resp_history_file = Path(out_parent_dir + "gpt-3.5-history-"+ lang_name +"-230728.csv")
if not resp_history_file.is_file():
    print("Response history not found. Initializing new one...")
    response_history = {}
else:
    print("Response history found!")
    resp_history_df = pd.read_csv(resp_history_file)
    response_history = dict(zip(resp_history_df.prompt, resp_history_df.response))

In [None]:
def get_input_prompts(concept, concept_type, location_name):
    """Build the five yes/no prompts asking whether ``concept`` is relevant in ``location_name``.

    ``concept_type == "activity"`` selects the "can people do it" phrasing; any
    other value selects the "can it be found" phrasing. Every prompt ends with
    the same answer-format instruction. The wording is used verbatim as the
    response-history key, so it must not be edited casually.
    """
    suffix = "Answer with only 'yes' or 'no'."

    if concept_type == "activity":
        questions = (
            f"Can people {concept} in {location_name}?",
            f"Do people in {location_name} familiar with '{concept}' concept?",
            f"Can people in {location_name} {concept}?",
            f"Can {concept} be done in {location_name}?",
            f"Suppose you are a person who live in {location_name}. Can you {concept}?",
        )
    else:
        questions = (
            f"Does {concept} commonly found in {location_name}?",
            f"Do people in {location_name} familiar with '{concept}' concept?",
            f"Can people find {concept} in {location_name}?",
            f"Is {concept} easily found in {location_name}?",
            f"Suppose you are a person who live in {location_name}. Can you find {concept}?",
        )

    return [f"{question} {suffix}" for question in questions]


def get_openai_chat_completion(input_prompt, model_name, temp=0.2):
    """Send ``input_prompt`` as a single user message to ``model_name`` and return the completion.

    ``temp`` is forwarded as the sampling temperature.
    """
    user_message = {'role': 'user', 'content': input_prompt}
    return openai.ChatCompletion.create(
        model=model_name,
        messages=[user_message],
        temperature=temp,
    )


def get_openai_relevancy(input_prompt, model_name):
    """Return the chat model's lower-cased, stripped answer to ``input_prompt``, with caching.

    Answers already in ``response_history`` are returned without a request. A
    failed request is retried once after a 60-second pause. An answer of exactly
    "yes." or "no." loses its trailing period; anything else is kept as-is.
    Each new answer is stored and the full history is rewritten to the
    language-specific CSV under ``out_parent_dir``.
    """
    if input_prompt not in response_history:
        try:
            completion = get_openai_chat_completion(input_prompt, model_name)
        except Exception:
            # Any failure gets exactly one retry after waiting a minute.
            time.sleep(60)
            completion = get_openai_chat_completion(input_prompt, model_name)

        answer = completion.choices[0].message.content.strip().lower()
        if answer in ("yes.", "no."):
            answer = answer[:-1]

        response_history[input_prompt] = answer

        # Checkpoint after every new answer so an interrupted run keeps its progress.
        history_path = out_parent_dir + "gpt-3.5-history-"+ lang_name +"-230728.csv"
        pd.DataFrame({
            'prompt': list(response_history.keys()),
            'response': list(response_history.values()),
        }).to_csv(history_path, index=False)

    return response_history[input_prompt]

In [None]:
en_csqa[0]

In [None]:
model_name = "gpt-3.5-turbo"
option_idxs = ['option_a', 'option_b', 'option_c', 'option_d', 'option_e']
# One list per output column; each question appends exactly one entry to every list.
relevancy_data = {
    column: [] for column in ['q_id', 'question', 'q_concept', *option_idxs, 'names']
}

for item in tqdm(en_csqa):
    relevancy_data['q_id'].append(item['id'])
    relevancy_data['question'].append(item['question'])

    # One chat-model response per prompt about the question concept.
    prompt_type = "activity" if is_conceptnet_activity(item['question_concept']) else "other"
    rels = [
        get_openai_relevancy(input_prompt, model_name)
        for input_prompt in get_input_prompts(item['question_concept'], prompt_type, location_context)
    ]
    relevancy_data['q_concept'].append(rels)

    for option_idx, choice in zip(option_idxs, item['choices']['text']):
        # "no" means irrelevant: the option mentions or is a location.
        has_location = len(extract_locations(choice)) > 0 or is_conceptnet_location(choice)
        relevancy_data[option_idx].append("no" if has_location else "yes")

    # Person names found in the question text.
    relevancy_data['names'].append(extract_names(item['question']))

In [None]:
relevancy_df = pd.DataFrame(relevancy_data)

In [None]:
relevancy_df

In [None]:
# Written without the index so q_id is the first column of the file.
out_parent_dir = "../dataset/relevancy_ensemble/"
relevancy_path = out_parent_dir + split_name + "_" + lang_name + "_relevancy.csv"
relevancy_df.to_csv(relevancy_path, index=False)