In [1]:
%load_ext dotenv
%dotenv

In [2]:
import openai
import os
import requests
import pandas as pd
import stanza
import time
from nltk.corpus import wordnet as wn
from datasets import load_dataset
from tqdm import tqdm
from pathlib import Path

In [3]:
# Only needs to be run one time
# stanza.download('en', model_dir=os.environ['HF_HOME'])

In [4]:
# Shared Stanza English pipeline (tokenize, POS, lemma, NER) used by every helper below
# that calls `nlp_pipeline(...)`.
# Models are looked up in the directory named by the HF_HOME environment variable.
# NOTE(review): download_method=None presumably disables automatic model downloads, so the
# one-off `stanza.download` call in the previous cell must already have been run -- confirm.
# NOTE(review): device=0 presumably selects the first GPU -- confirm against the Stanza docs.
nlp_pipeline = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner', device=0, model_dir=os.environ['HF_HOME'], download_method=None)

2023-08-24 15:25:53 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| ner       | ontonotes |

2023-08-24 15:25:53 INFO: Using device: 0
2023-08-24 15:25:53 INFO: Loading: tokenize
2023-08-24 15:25:54 INFO: Loading: pos
2023-08-24 15:25:54 INFO: Loading: lemma
2023-08-24 15:25:55 INFO: Loading: ner
2023-08-24 15:25:55 INFO: Done loading processors!


In [5]:
# Collect the English ConceptNet terms that start an edge touching /c/en/body_part.
# They are later appended to `general_concepts`.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of as an obscure JSON/KeyError.
res = requests.get("https://api.conceptnet.io/query?node=/c/en/body_part&limit=5000", timeout=60)
res.raise_for_status()
obj = res.json()
body_parts = {
    term.replace('_', ' ')
    for term in (edge['start']['@id'].replace('/c/en/', '') for edge in obj['edges'])
    # Anything still containing '/' is not a plain "/c/en/<term>" id (e.g. it carries extra
    # path segments or belongs to another language), so it is skipped.
    if '/' not in term
}

In [6]:
# Terms that mark a question as offensive. The filtering loops test each term as a
# *substring* of the lower-cased question text and of the joined, lower-cased options.
excluded_concepts = [
    "sex",
    "vagina",
    "penis",
    "prostitute",
    "kiss",
    "copulating",
    "copulate",
    "procreating",
    "procreate",
    "killing people",
    "committing murder",
    "affair",
    "drug dealer",
    "terrorists",
    "terrorist",
]

# Concepts treated as relevant without asking the model: a handful of generic words plus
# every body part fetched from ConceptNet. These are also matched as substrings of the
# question concept by the loops that use them.
general_concepts = [
    "human",
    "animal",
    "plant",
    "thing",
    "everyone",
    "people",
    "person",
    *body_parts,
]

In [7]:
def get_senses(concept):
    """Return the set of WordNet part-of-speech tags found for ``concept``.

    Whitespace-separated words are joined with underscores before the lookup.
    The result is an empty set when WordNet returns no synset for the concept.
    """
    lookup_key = '_'.join(concept.split())
    return {synset.pos() for synset in wn.synsets(lookup_key)}


def has_multi_sense(concept):
    """Tell whether WordNet lists ``concept`` under more than one part of speech."""
    return len(get_senses(concept)) > 1


def extract_pos_by_word(sentence, word):
    """Return the set of UPOS tags that ``word`` takes inside ``sentence``.

    Args:
        sentence: Text to run through the shared ``nlp_pipeline``.
        word: Concept to look for. A multi-word concept may be written with
            underscores ("jewelry_box") or with spaces ("jewelry box").

    Returns:
        The UPOS tag of every token whose lemma or lower-cased surface form equals
        one of the words of the concept; an empty set when no token matches.
    """
    # Accept both separators. Splitting on '_' only left a space-separated concept as one
    # unsplittable entry that no single token could equal, so the result was always empty
    # (and callers that test `all(...)` over it then saw a vacuous True).
    targets = set(word.lower().replace('_', ' ').split())
    return {
        token.upos
        for sent in nlp_pipeline(sentence).sentences
        for token in sent.words
        if token.lemma in targets or token.text.lower() in targets
    }


# Part-of-speech tags regarded as "filler": the WordNet adjective / adverb constants
# followed by the UPOS tag names compared against `extract_pos_by_word` results.
# NOTE(review): 'ADP' appears twice; harmless for the membership tests below.
filler_tags = [
    wn.ADJ, wn.ADJ_SAT, wn.ADV,
    'ADJ', 'ADP', 'ADV', 'ADP', 'CCONJ', 'SCONJ', 'DET', 'PART', 'PUNCT'
]


def is_filler(concept, sentence):
    """Tell whether every part of speech found for ``concept`` is a filler tag.

    When WordNet lists a single part of speech (or none), the WordNet tags are judged.
    When it lists several, the tags the concept actually takes inside ``sentence`` are
    judged instead.

    NOTE(review): an empty tag set yields True (nothing contradicts "all filler"), so a
    concept unknown to WordNet is reported as filler -- confirm this is intended.
    """
    tags = get_senses(concept)
    if len(tags) >= 2:
        tags = extract_pos_by_word(sentence, concept)

    for tag in tags:
        if tag not in filler_tags:
            return False
    return True

### Concept Relevancy Classifier

#### Post-Process

In [8]:
def get_options_lemma(text):
    """Return ``text`` rewritten as the space-joined lemmas of its tokens.

    A token for which the pipeline produced no lemma keeps its surface text.
    """
    parsed = nlp_pipeline(text)
    return ' '.join(
        token.text if token.lemma is None else token.lemma
        for sentence in parsed.sentences
        for token in sentence.words
    )

In [9]:
# Sort every CommonsenseQA question into one of two CSVs per split:
#   * "<split>_general.csv"  -- no person name, no location option, and the concept is
#                               general or was voted relevant by at least 3 of its prompts;
#   * "<split>_rephrase.csv" -- something is irrelevant (concept, name and/or option); the
#                               added 'concept' / 'name' / 'option' columns say what.
# Questions that are offensive, have duplicate options after lemmatisation, or fit neither
# rule are written to neither file.
out_parent_dir = "../dataset/relevancy_context/"
lang_name = "id"
option_columns = ['option_a', 'option_b', 'option_c', 'option_d', 'option_e']

# Create the output folder before the long loop so the final `to_csv` calls cannot fail
# on a missing directory after all the work has been done.
result_dir = Path(out_parent_dir) / "data_result"
result_dir.mkdir(parents=True, exist_ok=True)

for split_name in ["train", "validation", "test"]:
    en_csqa = load_dataset('commonsense_qa', split=split_name)
    relevancy_df = pd.read_csv(
        f"{out_parent_dir}{split_name}_{lang_name}_relevancy.csv",
        index_col=0,
        # 'q_concept' is stored as the text of a list of strings; turn it back into a list.
        converters={'q_concept': lambda x: x[1:-1].replace("'", '').split(', ')},
    )

    general_questions, rephrase_questions = [], []
    for item in tqdm(en_csqa):
        q_id = item['id']

        # Drop questions mentioning an excluded term (substring match on question/options).
        question_str = item['question'].lower()
        options_str = ' '.join(item['choices']['text']).lower()
        is_offensive = any(ex in question_str or ex in options_str for ex in excluded_concepts)
        if is_offensive:
            continue

        # Drop questions in which two options share the same lemma sequence.
        options_lemma = [get_options_lemma(option) for option in item['choices']['text']]
        if len(set(options_lemma)) != len(options_lemma):
            continue

        # One row lookup instead of eight separate `.loc[q_id, column]` calls.
        row = relevancy_df.loc[q_id]

        # 'names' holds the text of a list; anything other than "[]" means a name was found.
        is_name_irrelevant = row['names'] != "[]"

        # An option marked 'no' was flagged as irrelevant by the prediction step.
        is_option_irrelevant = any(row[column] == 'no' for column in option_columns)

        # Votes of the concept prompts.
        yes_count = row['q_concept'].count('yes')
        no_count = row['q_concept'].count('no')

        # General concepts are never treated as irrelevant, whatever the votes say.
        is_general_concept = any(c in item['question_concept'] for c in general_concepts) or is_filler(item['question_concept'], item['question'])
        is_concept_irrelevant = not is_general_concept and no_count >= 3

        if not is_name_irrelevant and not is_option_irrelevant and (is_general_concept or yes_count >= 3):
            general_questions.append(item)
        elif is_concept_irrelevant or is_name_irrelevant or is_option_irrelevant:
            item['concept'] = is_concept_irrelevant
            item['name'] = is_name_irrelevant
            item['option'] = is_option_irrelevant
            rephrase_questions.append(item)

    q_df_general = pd.DataFrame(general_questions)
    q_df_rephrase = pd.DataFrame(rephrase_questions)

    q_df_general.to_csv(result_dir / f"{split_name}_general.csv", index=False)
    q_df_rephrase.to_csv(result_dir / f"{split_name}_rephrase.csv", index=False)

Found cached dataset commonsense_qa (/media/kiki/kiki_hdd/cache/commonsense_qa/default/1.0.0/28d68f56649a7f0c23bc68eae850af914aa03f95f810011ae8cf58cc5ff5051b)
100%|██████████| 9741/9741 [19:52<00:00,  8.17it/s]
Found cached dataset commonsense_qa (/media/kiki/kiki_hdd/cache/commonsense_qa/default/1.0.0/28d68f56649a7f0c23bc68eae850af914aa03f95f810011ae8cf58cc5ff5051b)
100%|██████████| 1221/1221 [02:27<00:00,  8.29it/s]
Found cached dataset commonsense_qa (/media/kiki/kiki_hdd/cache/commonsense_qa/default/1.0.0/28d68f56649a7f0c23bc68eae850af914aa03f95f810011ae8cf58cc5ff5051b)
100%|██████████| 1140/1140 [02:15<00:00,  8.41it/s]


#### Stats

In [10]:
import os
import numpy as np

In [11]:
# Empty summary table: one row per dataset split, one column per count.
stat_rows = np.array(['train', 'validation', 'test'])
stat_columns = ['general', 'rephrase', 'irr_concept', 'irr_location', 'irr_names']
data_stat = pd.DataFrame(index=stat_rows, columns=stat_columns)

In [12]:
# Fill `data_stat` from the "<split>_<type>.csv" files in the result folder.
# NOTE(review): this rebinds `out_parent_dir` to the result folder, so cells that expect
# the parent folder must reset it before being re-run.
out_parent_dir = "../dataset/relevancy_context/data_result/"

# Flag column in a rephrase CSV -> column of `data_stat` that receives its True count.
rephrase_flag_columns = {'concept': 'irr_concept', 'option': 'irr_location', 'name': 'irr_names'}

# Only CSV files are results; os.listdir also returns other entries (for example a
# ".ipynb_checkpoints" folder) that would break the "<split>_<type>" unpacking below.
result_files = sorted(f for f in os.listdir(out_parent_dir) if f.endswith('.csv'))
for file_name in tqdm(result_files):
    name_only = file_name[:-len('.csv')]
    split, q_type = name_only.split('_')

    dat = pd.read_csv(out_parent_dir + file_name)
    # Assign through a single `.loc[row, column]`. The chained form
    # `data_stat.loc[split][q_type] = ...` writes into an intermediate object and is not
    # guaranteed to update `data_stat` (it never does under pandas Copy-on-Write).
    data_stat.loc[split, q_type] = len(dat)

    if q_type == 'rephrase':
        for flag_column, stat_column in rephrase_flag_columns.items():
            data_stat.loc[split, stat_column] = dat[flag_column].tolist().count(True)

100%|██████████| 6/6 [00:00<00:00, 148.57it/s]


In [13]:
data_stat

Unnamed: 0,general,rephrase,irr_concept,irr_location,irr_names
train,7140,2162,445,984,1036
validation,882,274,68,125,126
test,841,236,51,108,109


In [14]:
data_stat.sum()

general         8863
rephrase        2672
irr_concept      564
irr_location    1217
irr_names       1271
dtype: object

#### Model Predict

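Run this section **before** the Post-Process and Stats sections above: it writes the `<split>_<lang>_relevancy.csv` file that Post-Process reads (note that the cell numbers restart at 8 here).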

In [8]:
# Folder that holds the history (cache) CSVs and receives the relevancy CSV.
out_parent_dir = "../dataset/relevancy_context/"
# Tag embedded in the output / history file names.
lang_name = "id"
# Location name inserted into every prompt sent to the model.
location_context = "Indonesia"

In [9]:
def extract_names(sentence):
    """Return the text of every entity the NER pipeline labels ``PERSON`` in ``sentence``."""
    people = []
    for entity in nlp_pipeline(sentence).ents:
        if entity.type == 'PERSON':
            people.append(entity.text)
    return people


def extract_locations(sentence):
    """Return the text of every entity the NER pipeline labels ``GPE`` or ``LOC``."""
    places = []
    for entity in nlp_pipeline(sentence).ents:
        if entity.type == 'GPE' or entity.type == 'LOC':
            places.append(entity.text)
    return places

In [10]:
# On-disk caches of earlier ConceptNet answers, one CSV per kind of question, each with a
# 'prompt' column (the looked-up term) and a 'response' column (the cached answer).
loc_history_filename = f"{out_parent_dir}conceptnet-api_location_history.csv"
activity_history_filename = f"{out_parent_dir}conceptnet-api_activity_history.csv"

loc_history_path = Path(loc_history_filename)
if not loc_history_path.is_file():
    print("Location history not found. Initializing new one...")
    loc_response_history = {}
else:
    print("Location history found!")
    loc_history_df = pd.read_csv(loc_history_filename)
    # Entries whose key contains a space are left out of the in-memory cache.
    loc_response_history = {
        prompt: response
        for prompt, response in zip(loc_history_df.prompt, loc_history_df.response)
        if ' ' not in prompt
    }

activity_history_path = Path(activity_history_filename)
if not activity_history_path.is_file():
    print("Activity history not found. Initializing new one...")
    activity_response_history = {}
else:
    print("Activity history found!")
    activity_history_df = pd.read_csv(activity_history_filename)
    # Same filtering as for the location cache.
    activity_response_history = {
        prompt: response
        for prompt, response in zip(activity_history_df.prompt, activity_history_df.response)
        if ' ' not in prompt
    }

# ConceptNet concepts that mark a term as a location when they appear as a path segment
# of the end node of one of its IsA edges.
location_concepts = ['administrative_region', 'country', 'city', 'province']
# Terms that `is_conceptnet_location` always reports as "not a location", without
# consulting the cache or ConceptNet.
excluded = ['city', 'town', 'park', 'country', 'province', 'countryside', 'village']

def is_conceptnet_location(option):
    """Tell whether ConceptNet describes ``option`` as a location.

    The option is normalised by joining its whitespace-separated words with underscores.
    Terms listed in ``excluded`` are never locations. Other terms are answered from
    ``loc_response_history`` when possible; otherwise the ConceptNet IsA edges of the term
    are fetched, and the term is a location when the end node of any edge contains one of
    ``location_concepts`` as a path segment. Every new answer is added to the cache and
    the whole cache is rewritten to ``loc_history_path``.

    Raises:
        requests.HTTPError: when ConceptNet answers with an error status. Nothing is
            cached in that case.
    """
    from urllib.parse import quote  # local import: the notebook's import cell lacks it

    option = '_'.join(option.split())

    if option in excluded:
        return False

    if option in loc_response_history:
        return loc_response_history[option]

    # Percent-encode the term so characters such as '&', '#' or '?' cannot change the
    # meaning of the query string; plain words are left untouched.
    res = requests.get(
        f"https://api.conceptnet.io/query?node=/c/en/{quote(option, safe='')}&rel=/r/IsA",
        timeout=30,
    )
    # Surface HTTP errors as such instead of as a JSON decoding error or a KeyError.
    res.raise_for_status()
    obj = res.json()

    is_location = any(
        concept in edge['end']['@id'].split('/')
        for edge in obj['edges']
        for concept in location_concepts
    )

    loc_response_history[option] = is_location
    resp_history_df = pd.DataFrame({'prompt': loc_response_history.keys(), 'response': loc_response_history.values()})
    resp_history_df.to_csv(loc_history_path, index=False)

    return is_location


def is_conceptnet_activity(concept, question):
    """Tell whether ``concept`` denotes an activity (something one does).

    Decision order:
      1. WordNet lists exactly one part of speech: activity iff that tag is 'v'.
      2. Otherwise the UPOS tags the concept takes inside ``question`` decide: activity
         when one of them is 'VERB'.
      3. If WordNet does not know the concept and step 2 found no verb, ConceptNet is
         asked (through ``activity_response_history``): activity when the end node of any
         IsA edge has 'activity' or 'intelligent_agent_activity' as a path segment.
         New answers are cached and the cache is rewritten to ``activity_history_path``.

    Raises:
        requests.HTTPError: when ConceptNet answers with an error status. Nothing is
            cached in that case.
    """
    from urllib.parse import quote  # local import: the notebook's import cell lacks it

    concept = '_'.join(concept.split())
    senses = get_senses(concept)

    if len(senses) == 1:
        return 'v' in senses

    is_activity = 'VERB' in extract_pos_by_word(question, concept)
    # Several WordNet tags, or a verb reading in the question: the POS tagging decides.
    if senses or is_activity:
        return is_activity

    if concept in activity_response_history:
        return activity_response_history[concept]

    # Percent-encode the term so characters such as '&', '#' or '?' cannot change the
    # meaning of the query string; plain words are left untouched.
    res = requests.get(
        f"https://api.conceptnet.io/query?node=/c/en/{quote(concept, safe='')}&rel=/r/IsA",
        timeout=30,
    )
    # Surface HTTP errors as such instead of as a JSON decoding error or a KeyError.
    res.raise_for_status()
    obj = res.json()

    for edge in obj['edges']:
        end_edge = edge['end']['@id'].split('/')
        if 'activity' in end_edge or 'intelligent_agent_activity' in end_edge:
            is_activity = True
            break

    activity_response_history[concept] = is_activity
    resp_history_df = pd.DataFrame({'prompt': activity_response_history.keys(), 'response': activity_response_history.values()})
    resp_history_df.to_csv(activity_history_path, index=False)

    return is_activity

Location history found!
Activity history found!


In [11]:
# OpenAI credentials are read from environment variables.
openai.api_key = os.environ['OPENAI_API_KEY']
openai.organization = os.environ['OPENAI_UILAB_KEY']
model_name = "gpt-3.5-turbo"

# Prompt -> final answer cache for the model, persisted as a CSV with 'prompt' and
# 'response' columns.
resp_history_filename = f"{out_parent_dir}{model_name}_history_{lang_name}_230823.csv"
resp_history_path = Path(resp_history_filename)
if not resp_history_path.is_file():
    print("Response history not found. Initializing new one...")
    response_history = {}
else:
    print("Response history found!")
    resp_history_df = pd.read_csv(resp_history_filename)
    response_history = {
        prompt: response
        for prompt, response in zip(resp_history_df.prompt, resp_history_df.response)
    }

Response history found!


In [12]:
def get_input_prompt(concept, question, concept_type, location_name):
    """Build the five yes/no prompts asking whether ``concept`` is relevant in ``location_name``.

    Args:
        concept: The question concept being judged.
        question: The question text; it is shown to the model (together with the concept)
            only when the concept has several WordNet parts of speech.
        concept_type: "activity" selects the "can one do it" wording; any other value
            selects the "can one find it" wording.
        location_name: Name of the location inserted into each prompt.

    Returns:
        A list of five prompt strings.

    The wording of the prompts is deliberately left untouched: the strings are the keys
    of the response cache.
    """
    closing = "Answer with 'yes' or 'no'."
    ambiguous = has_multi_sense(concept)
    # Ambiguous concepts get the question as context in front of every prompt.
    context = f"Text: {question}\nConcept: {concept}\n\n" if ambiguous else ""

    familiar = f"Do people in {location_name} familiar with '{concept}' concept?"
    exists = f"Is '{concept}' concept exist in {location_name}?"

    if concept_type == "activity":
        asks = [
            f"Can one '{concept}' in {location_name}?",
            familiar,
            exists,
            f"In {location_name}, can people {concept}?",
            f"Can '{concept}' be done in {location_name}?",
        ]
    else:
        # The first prompt says "one" with context and "people" without it.
        who = "one" if ambiguous else "people"
        asks = [
            f"Can {who} find '{concept}' in {location_name}?",
            familiar,
            exists,
            f"In {location_name}, can people find {concept}?",
            f"Can '{concept}' concept be found in {location_name}?",
        ]

    return [f"{context}{ask} {closing}" for ask in asks]


def get_openai_chat_completion(input_prompt, model_name, resp_num):
    """Send ``input_prompt`` as a single user message and return the raw completion.

    ``resp_num`` completions are requested in one call, each limited to one token and
    sampled at temperature 1.
    """
    user_message = {'role': 'user', 'content': input_prompt}
    request_args = {
        'model': model_name,
        'messages': [user_message],
        'temperature': 1,
        'max_tokens': 1,
        'n': resp_num,
    }
    return openai.ChatCompletion.create(**request_args)


def get_openai_relevancy(input_prompt, model_name, resp_num=15):
    """Return the majority answer ('yes', 'no' or 'neutral') of the model for a prompt.

    Cached prompts are answered from ``response_history``. Otherwise ``resp_num``
    completions are requested and the stripped, lower-cased answers equal to 'yes' and
    to 'no' are counted: the larger count wins and a tie gives 'neutral'. The result is
    cached and the whole cache is rewritten to ``resp_history_filename``.
    """
    try:
        return response_history[input_prompt]
    except KeyError:
        pass

    try:
        completion = get_openai_chat_completion(input_prompt, model_name, resp_num)
    except Exception:
        # Any failure: wait a minute and try once more; a second failure propagates.
        time.sleep(60)
        completion = get_openai_chat_completion(input_prompt, model_name, resp_num)

    responses = []
    for choice in completion.choices:
        responses.append(choice.message.content.strip().lower())
    assert len(responses) == resp_num

    yes_votes, no_votes = responses.count('yes'), responses.count('no')
    if yes_votes > no_votes:
        response_final = 'yes'
    elif no_votes > yes_votes:
        response_final = 'no'
    else:
        response_final = 'neutral'

    response_history[input_prompt] = response_final
    pd.DataFrame(
        {'prompt': response_history.keys(), 'response': response_history.values()}
    ).to_csv(resp_history_filename, index=False)

    return response_final

In [13]:
# Split processed by the prediction loop below; it also names the relevancy CSV that is saved.
# NOTE(review): the Post-Process section reads relevancy files for train, validation and
# test, so this section presumably has to be re-run once per split with `split_name`
# edited by hand -- confirm.
split_name = 'train'
en_csqa = load_dataset('commonsense_qa', split=split_name)

Found cached dataset commonsense_qa (/media/kiki/kiki_hdd/cache/commonsense_qa/default/1.0.0/28d68f56649a7f0c23bc68eae850af914aa03f95f810011ae8cf58cc5ff5051b)


In [14]:
# Build the relevancy table, one entry per non-offensive question:
#   q_concept  -- five 'yes' / 'no' / 'neutral' votes for the question concept;
#   option_a-e -- 'no' when the option contains or is a location, otherwise 'yes';
#   names      -- the person names found in the question.
option_idxs = ['option_a', 'option_b', 'option_c', 'option_d', 'option_e']
relevancy_data = {column: [] for column in ['q_id', 'question', 'q_concept', *option_idxs, 'names']}

for item in tqdm(en_csqa):
    question = item['question']
    concept = item['question_concept']
    choices = item['choices']['text']

    # Skip questions mentioning an excluded term (substring match on question/options).
    searched_texts = (question.lower(), ' '.join(choices).lower())
    if any(ex in text for text in searched_texts for ex in excluded_concepts):
        continue

    relevancy_data['q_id'].append(item['id'])
    relevancy_data['question'].append(question)

    # General or filler concepts are relevant by definition; the model is not asked.
    if any(c in concept for c in general_concepts) or is_filler(concept, question):
        concept_votes = ['yes'] * 5
    else:
        concept_type = "activity" if is_conceptnet_activity(concept, question) else "other"
        input_prompts = get_input_prompt(concept, question, concept_type, location_context)
        assert len(input_prompts) == 5
        concept_votes = [get_openai_relevancy(input_prompt, model_name) for input_prompt in input_prompts]
    relevancy_data['q_concept'].append(concept_votes)

    for option_idx, choice in zip(option_idxs, choices):
        # "no" means irrelevant: the option mentions, or is, a location.
        has_location = len(extract_locations(choice)) > 0 or is_conceptnet_location(choice)
        relevancy_data[option_idx].append("no" if has_location else "yes")

    relevancy_data['names'].append(extract_names(question))

100%|██████████| 9741/9741 [24:31<00:00,  6.62it/s]


In [15]:
relevancy_df = pd.DataFrame(relevancy_data)

In [16]:
relevancy_df

Unnamed: 0,q_id,question,q_concept,option_a,option_b,option_c,option_d,option_e,names
0,075e483d21c29a511267ef62bedc0461,The sanctions against the school were a punish...,"[yes, no, no, yes, yes]",yes,yes,yes,yes,yes,[]
1,61fe6e879ff18686d7552425a36344c8,Sammy wanted to go to where the people were. ...,"[yes, yes, yes, yes, yes]",yes,yes,yes,yes,yes,[Sammy]
2,4c1cb0e95b99f72d55c068ba0255c54d,To locate a choker not located in a jewelry bo...,"[yes, yes, yes, yes, yes]",yes,yes,yes,yes,yes,[]
3,02e821a3e53cb320790950aab4489e85,Google Maps and other highway and street GPS s...,"[yes, yes, yes, yes, yes]",no,no,yes,yes,yes,[]
4,23505889b94e880c3e89cff4ba119860,"The fox walked from the city into the forest, ...","[yes, no, yes, no, yes]",yes,yes,yes,yes,yes,[]
...,...,...,...,...,...,...,...,...,...
9516,f1b2a30a1facff543e055231c5f90dd0,What would someone need to do if he or she wan...,"[yes, yes, yes, yes, yes]",yes,yes,yes,yes,yes,[]
9517,a63b4d0c0b34d6e5f5ce7b2c2c08b825,Where might you find a chair at an office?,"[yes, yes, yes, yes, yes]",yes,yes,yes,yes,yes,[]
9518,22d0eea15e10be56024fd00bb0e4f72f,Where would you buy jeans in a place with a la...,"[yes, yes, yes, yes, yes]",yes,yes,yes,yes,yes,[]
9519,7c55160a4630de9690eb328b57a18dc2,John fell down the well. he couldn't believe ...,"[yes, yes, yes, yes, yes]",yes,yes,yes,no,yes,[John]


In [17]:
# Save the relevancy table for this split; the row index is not written.
out_parent_dir = "../dataset/relevancy_context/"
relevancy_csv_name = f"{out_parent_dir}{split_name}_{lang_name}_relevancy.csv"
relevancy_df.to_csv(relevancy_csv_name, index=False)