### Extract Property with Sentence

In [2]:
import os
import json
import jsonlines
import re
from tqdm import tqdm

In [None]:
# Gather every record from the *.json files directly under the input directory.
# NOTE: files are visited in os.listdir() order; a later cell addresses records by
# their 1-based position in `data`, so this order matters downstream.
dir_path = "dataset/processed/241212"
json_paths = [
    os.path.join(dir_path, filename)
    for filename in os.listdir(dir_path)
    if filename.endswith(".json")
]
data = []
for json_path in json_paths:
    with open(json_path, 'r') as f:
        data += json.load(f)
len(data)

In [4]:
# Turn each indexable record (noun_phrase, sentence, source) into a keyed dict.
data = [
    {"noun_phrase": line[0], "sentence": line[1], "source": line[2]}
    for line in data
]

In [None]:
# from openai import OpenAI
# import os
# import json

# template = """Given the sentence and noun phrase, explain the property of the noun phrase.

# ---
# Sentence: You know, we are all kind of like a deer in the headlights.
# Noun phrase: a deer in the headlights
# Property: scared
# ---
# Sentence: We saw like, it looked, I mean, it looked almost like a big bus and I don't really know what that was for but it was driving around as well.
# Noun phrase: a big bus
# Property: heavy
# ---
# Sentence: Image suddenly became as important as the music for artists.
# Noun phrase: music for artists
# Property: important
# ---
# Sentence: {sentence}.
# Noun phrase: {noun_phrase}
# Property: "
# """

# prompts = [dict(
#     custom_id = f"request-{i+1}",
#     method = "POST",
#     url = "/v1/chat/completions",
#     body = dict(
#         model = "gpt-4o-mini",
#         messages = [
#             {"role": "user", "content": template.format(**line)}
#         ],
#         max_tokens = 100,
#         n=10,
#     )
# ) for i, line in enumerate(data)]

# client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# save_path = "dataset/processed/241212/property_extraction__gpt4o_miny__chunk_{}.jsonl"

# for i in range(0, len(prompts), 50000):
#     save_path_chunk = save_path.format(i)
#     with open(save_path_chunk, 'w') as outfile:
#         for entry in prompts[i:i+50000]:
#             json.dump(entry, outfile)
#             outfile.write('\n')

#     batch_input_file = client.files.create(
#         file=open(save_path_chunk, "rb"),
#         purpose="batch"
#     )

#     batch_input_file_id = batch_input_file.id

#     client.batches.create(
#         input_file_id=batch_input_file_id,
#         endpoint="/v1/chat/completions",
#         completion_window="24h",
#         metadata={
#         "description": "extract property of noun phrase from sentence"
#         }
#     )

In [5]:
# Regex fragments for the markers that may wrap a property: *word*, **word**, "word".
# Raw strings keep the backslash without relying on invalid escape sequences.
emphasis_mark = [r"\*", r"\*\*", "\""]
colon_mark = ":"

class OpenAI_Output:
    """Index of OpenAI batch results found in a directory.

    Every file whose name starts with "batch" is read as JSON Lines. Each record is
    stored under the integer taken from its ``custom_id`` ("request-<n>" -> n), and the
    stored value is the list of message contents of all returned choices.
    """

    def __init__(self, dir_path):
        """Read all batch output files below ``dir_path`` into ``self.data``."""
        self.start_name = "batch"
        self.file_paths = []
        for filename in os.listdir(dir_path):
            if filename.startswith(self.start_name):
                self.file_paths.append(os.path.join(dir_path, filename))
        self.data = dict()
        for file_path in self.file_paths:
            with jsonlines.open(file_path) as reader:
                for obj in reader:
                    key = int(obj['custom_id'].split("-")[1])
                    val = [response['message']['content'] for response in obj['response']['body']['choices']]
                    self.data[key] = val

    def __len__(self):
        """Number of indexed requests."""
        return len(self.data)

    def __getitem__(self, key):
        """Return the stored outputs for ``key``, or an empty list if there are none."""
        return self.data.get(key, [])

    def extract_properties(self, key):
        """Parse candidate property words out of the model outputs stored under ``key``.

        Candidates are (a) the text after the last colon of a line and (b) any
        whitespace-free span wrapped in one of the emphasis marks. A candidate is kept
        when it is longer than 5 characters after stripping, contains no emphasis
        character, yields at least one ``[a-zA-Z-]+`` run (the first run is used), and
        that run does not contain "propert".

        Returns:
            list[str]: unique lower-cased properties, in order of first appearance
            (deterministic, unlike an unordered set).
        """
        candidates = []
        for output in self[key]:
            for sentence in output.split("\n"):
                # property is described after colon mark
                if colon_mark in sentence:
                    candidates.append(sentence.split(colon_mark)[-1].lower())
                # property is enclosed with emphasis mark
                for mark in emphasis_mark:
                    candidates.extend(re.findall(rf"{mark}([^\s{mark}]+?){mark}", sentence))

        # The literal characters behind the regex fragments ('*', '**', '"').
        bare_marks = [mark.replace("\\", "") for mark in emphasis_mark]

        properties = []
        for candidate in candidates:
            candidate = candidate.strip().lower()
            if len(candidate) <= 5:
                continue
            if any(mark in candidate for mark in bare_marks):
                continue
            # extract english characters with dash (-); keep the first run only
            words = re.findall(r'[a-zA-Z-]+', candidate)
            if not words:
                continue
            if 'propert' in words[0]:
                continue
            properties.append(words[0])

        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(properties))
    
# Index the batch result files stored in this directory; `openai_output` is used by
# the following cell to look up properties by 1-based request index.
dir_path = "dataset/processed/241212_result"
openai_output = OpenAI_Output(dir_path)

In [None]:
# Attach the 1-based request index and the properties parsed from the matching batch result.
data = [dict(line, index=i+1, properties=openai_output.extract_properties(i+1)) for i, line in enumerate(tqdm(data))]

# Remove duplicated (noun_phrase, sentence) pairs, keeping the first occurrence.
# A set of tuples gives O(1) membership tests and cannot confuse pairs whose plain
# concatenation happens to be equal.
seen_pairs = set()
new_data = []
for line in tqdm(data):
    pair = (line['noun_phrase'], line['sentence'])
    if pair not in seen_pairs:
        seen_pairs.add(pair)
        new_data.append(line)

# re-indexing data
new_data = [dict(line, index=i+1) for i, line in enumerate(new_data)]
new_data[100]

In [8]:
# Persist the de-duplicated records as JSON Lines (one JSON document per line).
with open("dataset/processed/241212_property_extraction.jsonl", 'w') as outfile:
    for record in new_data:
        outfile.write(json.dumps(record) + '\n')

### Make commonsense statement (Outdated)

In [None]:
import jsonlines
import transformers
import torch
from tqdm import tqdm
import json

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [15]:
# Read the property-extraction records back from JSON Lines.
fpath = "dataset/processed/241212_property_extraction.jsonl"

with jsonlines.open(fpath) as f:
    data = list(f)

In [None]:
# Question formatter

model_name = 'domenicrosati/question_converter-3b'

config = transformers.AutoConfig.from_pretrained(model_name)
format_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
format_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config)
format_model.to(device)
print('decoder start token id:', format_model.config.decoder_start_token_id)

In [None]:
@torch.no_grad()
def convert_question(inputs, batch_size=128):
    """Turn (noun_phrase, properties) pairs into generated statements.

    For every property ``p`` of a noun phrase the text
    ``"How is {noun_phrase} generally? </s> {p}"`` is fed to the question-converter
    model in batches of ``batch_size``.

    Args:
        inputs: iterable of ``(noun_phrase, properties)`` pairs, where ``properties``
            is a sized iterable of strings.
        batch_size: number of prompts generated per forward pass.

    Returns:
        list[list[str]]: one list per input pair, holding one decoded statement per
        property, in the same order.
    """
    # `accumulate` is not imported by the visible notebook cells; import it locally so
    # the regrouping step below does not raise NameError.
    from itertools import accumulate

    sentences = []
    chunk = []  # number of prompts contributed by each input pair

    for noun_phrase, properties in inputs:
        for p in properties:
            sentences.append(f"How is {noun_phrase} generally? </s> {p}")
        chunk.append(len(properties))

    statements = []

    for batch in tqdm(range(0, len(sentences), batch_size)):
        s = sentences[batch:batch+batch_size]
        input_ids = format_tokenizer.batch_encode_plus(s, return_tensors='pt', padding=True, truncation=True).input_ids.to(device)
        output = format_model.generate(input_ids=input_ids, max_length=256)
        d = format_tokenizer.batch_decode(output, skip_special_tokens=True)
        statements.extend(d)

    # Cut the flat statement list back into one slice per input pair.
    ends = list(accumulate(chunk))
    starts = [0] + ends[:-1]
    return [statements[i:j] for i, j in zip(starts, ends)]

# convert_question("a big bus", "scary")

In [None]:
# One (noun_phrase, properties) pair per record, converted in a single call.
inputs = [(line['noun_phrase'], line['properties']) for line in data]

outputs = convert_question(inputs)

# Store the statements on each record; a statement whose property has 2 or fewer
# characters is replaced by None.
for output, line in zip(outputs, data):
    line['statements'] = [
        s if len(p) > 2 else None
        for s, p in zip(output, line['properties'])
    ]

In [None]:
# Write the records, now carrying `statements`, as JSON Lines.
with open("dataset/processed/241212_statements.jsonl", 'w') as outfile:
    outfile.writelines(json.dumps(record) + '\n' for record in data)

### Filter statement by VERA

In [None]:
import jsonlines
import transformers
import torch
from tqdm import tqdm
import json

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [2]:
# Input of the VERA filtering stage: the property-extraction records.
fpath = "dataset/processed/241212_property_extraction.jsonl"

with jsonlines.open(fpath) as f:
    data = list(f)

In [None]:
# Load the VERA tokenizer and its T5 encoder; get_score() below relies on
# `vera_tokenizer`, `vera_model`, `linear` and `t` defined in this cell.
vera_tokenizer = transformers.AutoTokenizer.from_pretrained('liujch1998/vera')
vera_model = transformers.T5EncoderModel.from_pretrained('liujch1998/vera')

vera_model.to(device)

# Build a 1-output linear head whose parameters are read out of the shared embedding
# matrix: row 32099 supplies the weight vector and element [32098, 0] the bias.
# NOTE(review): these row indices are presumably fixed by the checkpoint — confirm
# against the VERA model card.
vera_model.D = vera_model.shared.embedding_dim
linear = torch.nn.Linear(vera_model.D, 1, dtype=vera_model.dtype)
linear.weight = torch.nn.Parameter(vera_model.shared.weight[32099, :].unsqueeze(0))
linear.bias = torch.nn.Parameter(vera_model.shared.weight[32098, 0].unsqueeze(0))
vera_model.eval()
# Scalar the logits are divided by before the sigmoid in get_score().
t = vera_model.shared.weight[32097, 0].item() # temperature for calibration

def get_score(statements):
    """Score each statement with VERA and return calibrated plausibility values.

    Args:
        statements: iterable whose items are statement strings; any non-string item
            (e.g. None) is scored 0.0 without running the model.

    Returns:
        list[float]: one score per item, in input order.
    """
    def score_one(statement):
        # Encode a single statement (truncated to 128 tokens) on the model's device.
        encoded = vera_tokenizer.batch_encode_plus([statement], return_tensors='pt', padding='longest', truncation='longest_first', max_length=128)
        input_ids = encoded.input_ids.to(vera_model.device)

        with torch.no_grad():
            # Hidden state at the last position feeds the linear head.
            hidden = vera_model(input_ids).last_hidden_state[:, -1, :]
            logit = linear(hidden).squeeze(-1)
            # Temperature-scale, then squash to (0, 1).
            calibrated = (logit / t).sigmoid()
        return float(calibrated[0])

    return [
        score_one(statement) if isinstance(statement, str) else float(0.0)
        for statement in statements
    ]

In [None]:
# Score "The <noun phrase> is generally <property> ." for every property of every
# record; properties with 2 or fewer characters are passed as None (scored 0.0).
scored = []
for line in tqdm(data):
    statements = [
        f"The {line['noun_phrase']} is generally {prop} ." if len(prop) > 2 else None
        for prop in line['properties']
    ]
    scored.append(dict(line, scores=get_score(statements)))
data = scored

with open("dataset/processed/241212_vera.jsonl", 'w') as outfile:
    for line in data:
        outfile.write(json.dumps(line) + '\n')

### Post-processing

In [1]:
import jsonlines
from tqdm import tqdm
import json
import pandas as pd

In [4]:
# Read the VERA-scored records into a DataFrame (one JSON document per line).
# NOTE(review): the scoring cell writes "dataset/processed/241212_vera.jsonl" while
# this cell reads from "data/pipeline/"; presumably the file was moved — confirm.
fpath = "data/pipeline/241212_vera.jsonl"

df = pd.read_json(fpath, lines=True)

In [None]:
def _confident_properties(row):
    """Return the row's properties whose paired score exceeds 0.7."""
    return [prop for prop, score in zip(row['properties'], row['scores']) if score > 0.7]

df['processed_properties'] = df.apply(_confident_properties, axis=1)
# Total number of surviving properties across all rows.
df['processed_properties'].apply(len).sum()

In [6]:
# One row per surviving property: drop the helper columns, explode the list column,
# discard rows that had no surviving property, and renumber the index.
df = (
    df.drop(columns=['index', 'properties', 'scores'])
    .explode('processed_properties')
    .rename(columns={'processed_properties': 'property'})
    .dropna(subset=['property'])
    .reset_index(drop=True)
)

In [None]:
df.iloc[0]

In [7]:
df.to_json("data/pipeline/241212_vera_processed.jsonl", orient='records', lines=True)

### (...) Make Canceled property

In [None]:
import pandas as pd
import jsonlines
import re
import os
from tqdm import tqdm
tqdm.pandas()

# Load the seed table and keep only the rows selected by its `flag` column
# (assumes `flag` is boolean — TODO confirm), then show the resulting shape.
df = pd.read_csv("data/pipeline/emergent_seeds_flag.csv")
df = df[df.flag]
df.shape

In [None]:
import spacy
import sys
from data.MAPSKB.models import MAPSKB

# Knowledge base queried by get_property() below.
fpath = "data/MAPSKB/MAPS-KB.csv"
mapskb = MAPSKB(fpath)

# spaCy pipeline used for lemmatization; the last line displays the lemma of a
# sample word as a quick check.
nlp = spacy.load("en_core_web_sm")
nlp("veiled")[0].lemma_

def get_property(concept):
    """Look up the MAPS-KB properties of a concept, retrying with its lemma.

    The concept is queried as given; if that yields nothing, the lemma of its first
    spaCy token is queried instead.

    Args:
        concept: concept string to look up.

    Returns:
        Whatever ``mapskb.SI`` returns for the first query that has results, or the
        (empty) result of the last query that was made.
    """
    properties = mapskb.SI(concept)
    if len(properties) > 0:
        return properties

    doc = nlp(concept)
    if len(doc) == 0:
        # Nothing to lemmatize (e.g. empty string): indexing doc[0] would raise
        # IndexError, so return the empty first result instead.
        return properties
    return mapskb.SI(doc[0].lemma_)

In [3]:
# from openai import OpenAI
# import os
# import json

# template = """What is the property of the concept?

# ---
# Concept: car
# Property: fast
# ---
# Concept: apple
# Property: round
# ---
# Concept: needle
# Property: sharp
# ---
# Concept: {concept}
# Property: "
# """

# prompts = [dict(
#     custom_id = f"{concept}",
#     method = "POST",
#     url = "/v1/chat/completions",
#     body = dict(
#         model = "gpt-4o-mini",
#         messages = [
#             {"role": "user", "content": template.format(concept=concept)}
#         ],
#         max_tokens = 100,
#         n=10,
#     )
# ) for concept in concepts]

# client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# save_path = "data/pipeline/241227/property_extraction__gpt4o_miny__chunk_{}.jsonl"

# for i in range(0, len(prompts), 50000):
#     save_path_chunk = save_path.format(i)
#     with open(save_path_chunk, 'w') as outfile:
#         for entry in prompts[i:i+50000]:
#             json.dump(entry, outfile)
#             outfile.write('\n')

#     batch_input_file = client.files.create(
#         file=open(save_path_chunk, "rb"),
#         purpose="batch"
#     )

#     batch_input_file_id = batch_input_file.id

#     client.batches.create(
#         input_file_id=batch_input_file_id,
#         endpoint="/v1/chat/completions",
#         completion_window="24h",
#         metadata={
#         "description": "extract property of concept"
#         }
#     )

In [3]:
# emphasis_mark = ["\*", "\*\*", "\""]
# colon_mark = ":"

# class OpenAI_Output:
#     def __init__(self, dir_path):
#         self.start_name = "batch"
#         self.file_paths = []
#         for filename in os.listdir(dir_path):
#             if filename.startswith(self.start_name):
#                 self.file_paths.append(os.path.join(dir_path, filename))
#         self.data = dict()
#         for file_path in self.file_paths:
#             with jsonlines.open(file_path) as reader:
#                 for obj in reader:
#                     key = obj['custom_id']
#                     val = [response['message']['content'] for response in obj['response']['body']['choices']]
#                     self.data[key] = val
#     def __len__(self):
#         return len(self.data)
        
#     def __getitem__(self, key):
#         if key in self.data:
#             return self.data[key]
#         else:
#             return []
    
#     def extract_properties(self, key):
#         outputs = self[key]
#         properties = []
#         for output in outputs:
#             sentences = output.split("\n")
#             for sentence in sentences:
#                 # property is described after colon mark
#                 if colon_mark in sentence:
#                     property = sentence.split(colon_mark)[-1]
#                     properties.append(property.lower())
#                 # property is enclosed with emphasis mark
#                 for mark in emphasis_mark:
#                     property = re.findall(f"{mark}([^\s{mark}]+?){mark}", sentence)
#                     if property:
#                         properties.extend(property)
        
#         # extract english characters with dash (-)
#         properties = [property.strip().lower() for property in properties if len(property.strip()) > 5]
#         properties = [property for property in properties if not any([mark for mark in emphasis_mark if mark.replace("\\", "") in property])]
#         properties = [re.findall(r'[a-zA-Z-]+', property) for property in properties]
#         properties = [property[0] for property in properties if property]
#         properties = [property for property in properties if 'propert' not in property]
#         properties = list(set(properties))
#         return properties
    
# dir_path = "data/pipeline/241227_result"
# openai_output = OpenAI_Output(dir_path)

In [None]:
# Look up properties for the root and for the modifier of every row
# (progress_apply comes from the earlier tqdm.pandas() call).
df['root_properties'] = df['root'].progress_apply(get_property)
df['modifier_properties'] = df['modifier'].progress_apply(get_property)

In [9]:
# Create a new DataFrame to store the expanded rows
expanded_rows = []

# Iterate over each row in the original DataFrame
for _, row in df.iterrows():
    # Create a new row for the root properties
    for root_property in row['root_properties']:
        new_row = row.copy()
        new_row['property'] = root_property
        new_row['root_rel'] = None
        new_row['modifier_rel'] = None
        new_row['noun_phrase_rel'] = None
        new_row['individual_max'] = None
        new_row['ccpt_score'] = None
        expanded_rows.append(new_row)

    # Create a new row for the modifier properties
    for modifier_property in row['modifier_properties']:
        new_row = row.copy()
        new_row['property'] = modifier_property
        new_row['root_rel'] = None
        new_row['modifier_rel'] = None
        new_row['noun_phrase_rel'] = None
        new_row['individual_max'] = None
        new_row['ccpt_score'] = None
        expanded_rows.append(new_row)

# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)

# Display the new DataFrame
expanded_df.reset_index(drop=True, inplace=True)
expanded_df.drop(columns=['root_properties', 'modifier_properties'], inplace=True)
expanded_df.to_csv("data/pipeline/241227_expanded.csv", index=False)

### Formatting

In [None]:
# Records that passed the previous filtering step.
fpath = "dataset/processed/passed.jsonl"

with jsonlines.open(fpath) as f:
    data = list(f)

In [None]:
from tqdm import tqdm
import pandas as pd

# Convert each passed record into {noun_phrase, root, modifier, property, type}.
# The attribute is held in `attribute` rather than `property`, so the builtin
# `property` is not rebound in the notebook namespace.
result = []
for line in tqdm(data):
    noun_phrase = line['key'].replace("_", " ")
    attribute = line['attribute']

    # First merged token of the parsed phrase; `root` and `C` are custom extensions
    # set elsewhere (presumably by merge_phrases — verify).
    token = merge_phrases(nlp(noun_phrase))[0]
    root = token._.root
    concepts = token._.C

    # Skip phrases whose root is not among the extracted concepts.
    if root not in concepts:
        continue

    # Skip phrases without any concept besides the root; otherwise use the first one.
    modifiers = [concept for concept in concepts if concept != root]
    if len(modifiers) == 0:
        continue

    result.append(dict(
        noun_phrase=noun_phrase,
        root=root,
        modifier=modifiers[0],
        property=attribute,
        type=""
    ))

In [None]:
# Save the formatted records as JSON Lines.
with open("dataset/add.jsonl", 'w') as outfile:
    outfile.writelines(json.dumps(entry) + '\n' for entry in result)

### Conceptualization

In [None]:
import jsonlines
import json

# Annotated noun phrases to conceptualize.
fpath = "dataset/conceptual_combination.jsonl"

with jsonlines.open(fpath) as file:
    data = [record for record in file.iter()]

# Raw samples, keyed by their first field with newlines replaced by spaces.
fpath = "dataset/processed/sample_0904.json"
with open(fpath) as f:
    raw_records = json.load(f)
raw = {record[0].replace("\n", " "): record for record in raw_records}

In [None]:
import json
import re

def capitalize_sentences(text):
    """Capitalize every sentence of ``text``.

    Sentences are split after '.', '!' or '?' followed by one or more spaces and are
    re-joined with a single space. ``str.capitalize`` also lower-cases the remainder
    of each sentence.
    """
    # Split the text into sentences using regex (looking for punctuation followed by a space)
    sentences = re.split(r'(?<=[.!?]) +', text)

    # Capitalize each sentence and join them back together with a space
    return ' '.join(sentence.capitalize() for sentence in sentences)

def sent_noun_phrase_indexing(sent, noun_phrase):
    """Mark the first occurrence of ``noun_phrase`` in ``sent`` with square brackets.

    Matching is case-insensitive (both strings are lower-cased first).

    Returns:
        tuple[str, str]: the sentence-capitalized text with the phrase wrapped in
        ``[...]`` and the capitalized noun phrase. If the phrase does not occur in the
        sentence, the sentence is returned without brackets instead of being sliced
        at the -1 that ``str.find`` returns.
    """
    sent = sent.lower()
    noun_phrase = noun_phrase.lower()
    index = sent.find(noun_phrase)
    if index != -1:
        end = index + len(noun_phrase)
        sent = sent[:index] + '[' + sent[index:end] + ']' + sent[end:]
    return capitalize_sentences(sent), noun_phrase.capitalize()

# Few-shot prompt; {sent} is the bracket-marked sentence, {noun_phrase} the phrase.
template = """Following the given examples, you are required to conceptualize the instance (enclosed by []) in the last given noun phrase into abstract concepts. The concept should still fit into the instance’s original sentence. Make sure that the generated abstract concepts are general and not simply hypernyms of the instance.
---
Noun phrase <1>: Julia looks nervous like a firefly in front of a chameleon. [A firefly in front of a chameleon] can be conceptualized as Prey Facing Predator.
Noun phrase <2>: All up together like [a brown apple] when he is dried up, like this way! [A brown apple] can be conceptualized as Unusually Colored Object.
Noun phrase <3>: You're not going to go storming in there like [a bull in a china shop again]. [A bull in a china shop] can be conceptualized as Unlikely Object in a Certain Place.
Noun phrase <4>: Our economy will be as stable as [an apple on a toothpick]. [An apple on a toothpick] can be conceptualized as Object in an Unstable State.
Noun phrase <5>: {sent}, [{noun_phrase}] can be conceptualized as"""

# Step 1: (marked sentence, capitalized phrase, filled prompt) for every record.
prompt_triples = []
for line in data:
    noun_phrase = line['noun_phrase']
    sent, noun_phrase = sent_noun_phrase_indexing(raw[noun_phrase][-1], noun_phrase)
    prompt_triples.append((sent, noun_phrase, template.format(sent=sent, noun_phrase=noun_phrase)))

# Step 2: one Batch API request per prompt; custom_id is "request-<n>-<phrase>" with
# spaces in the phrase replaced by underscores.
prompts = []
for number, (sent, noun_phrase, prompt) in enumerate(prompt_triples, start=1):
    prompts.append({
        "custom_id": "request-{i}-{np}".format(i=number, np=noun_phrase.replace(" ", "_")),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            "max_tokens": 100,
            "n": 10,
            "temperature": 0.7,
            "top_p": 0.8,
        },
    })

In [None]:
from openai import OpenAI
import os

# Write the batch requests as JSON Lines, upload the file and start a batch job.
with open("dataset/processed/conceptualization_chatgpt.jsonl", 'w') as outfile:
    for entry in prompts:
        json.dump(entry, outfile)
        outfile.write('\n')

# Read the key from the environment (never hard-code it); the original
# `OpenAI(api_key=)` was a syntax error.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Use a context manager so the upload handle is closed once the file is sent.
with open("dataset/processed/conceptualization_chatgpt.jsonl", "rb") as batch_file:
    batch_input_file = client.files.create(
      file=batch_file,
      purpose="batch"
    )

batch_input_file_id = batch_input_file.id

client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "extract property of noun phrase from sentence"
    }
)

In [None]:
import jsonlines
import json

# Map each noun phrase (lower-cased, underscores turned back into spaces) to the
# list of message contents returned for its request.
model_output = dict()
with jsonlines.open("dataset/processed/result/batch_WW0jFjwNYDuGwLkDzNkZ6g3x_output.jsonl") as f:
    for line in f.iter():
        # custom_id has the form "request-<n>-<phrase>". Split at most twice so that
        # hyphens inside the phrase itself are kept; taking the last '-' segment
        # would truncate hyphenated phrases and break the lookup below.
        phrase_key = line['custom_id'].split("-", 2)[-1]
        model_output[phrase_key.lower().replace("_", " ")] = [choice['message']['content'] for choice in line['response']['body']['choices']]

for line in data:
    line['conceptualization'] = model_output[line['noun_phrase'].lower()]

In [None]:
# ensure_ascii=False stores non-ASCII characters (e.g. Korean) as-is instead of
# \u-escaping them; each record goes on its own line (JSON Lines).
with open("dataset/conceptual_combination_conceptualization.jsonl", "w", encoding="utf-8") as f:
    for record in data:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

In [None]:
import jsonlines

# Reload the conceptualization records saved by the previous cell.
with jsonlines.open("dataset/conceptual_combination_conceptualization.jsonl") as f:
    data = list(f.iter())

In [None]:
from tqdm import tqdm
import jsonlines

import json
import os

filtered_path = "dataset/conceptual_combination_conceptualization_filtered.jsonl"

# The visible cells never write the filtered file, and the original cell threw the
# in-memory filtering away by re-reading that file. Compute and save the filtered
# records only when the file is missing, so an existing file is reused untouched.
if not os.path.exists(filtered_path):
    prompt = "{noun_phrase} is {category}"
    for line in tqdm(data):
        noun_phrase = line['noun_phrase'].capitalize()
        categories = [cat.lower() for cat in line['conceptualization']]

        # Keep the categories whose "<Noun phrase> is <category>" statement scores above 0.9.
        scores = get_score([prompt.format(noun_phrase=noun_phrase, category=cat) for cat in categories])
        line['conceptualization'] = [cat for cat, score in zip(categories, scores) if score > 0.9]

    with open(filtered_path, "w", encoding="utf-8") as outfile:
        for line in data:
            json.dump(line, outfile, ensure_ascii=False)
            outfile.write("\n")

data = []
with jsonlines.open(filtered_path) as f:
    for line in f.iter():
        data.append(line)

### Instantiation

In [None]:
def read_jsonlines(fpath):
    """Read the JSON Lines file at ``fpath`` and return its records as a list."""
    import jsonlines  # third-party package: install it with `pip install jsonlines`
    with jsonlines.open(fpath) as read_file:
        return list(read_file.iter())

In [None]:
import json
import re

def capitalize_sentences(text):
    """Capitalize every sentence of ``text``.

    Sentences are split after '.', '!' or '?' followed by one or more spaces and are
    re-joined with a single space. ``str.capitalize`` also lower-cases the remainder
    of each sentence.
    """
    # Split the text into sentences using regex (looking for punctuation followed by a space)
    sentences = re.split(r'(?<=[.!?]) +', text)

    # Capitalize each sentence and join them back together with a space
    return ' '.join(sentence.capitalize() for sentence in sentences)

def sent_noun_phrase_indexing(sent, noun_phrase):
    """Mark the first occurrence of ``noun_phrase`` in ``sent`` with square brackets.

    Matching is case-insensitive (both strings are lower-cased first).

    Returns:
        tuple[str, str]: the sentence-capitalized text with the phrase wrapped in
        ``[...]`` and the capitalized noun phrase. If the phrase does not occur in the
        sentence, the sentence is returned without brackets instead of being sliced
        at the -1 that ``str.find`` returns.
    """
    sent = sent.lower()
    noun_phrase = noun_phrase.lower()
    index = sent.find(noun_phrase)
    if index != -1:
        end = index + len(noun_phrase)
        sent = sent[:index] + '[' + sent[index:end] + ']' + sent[end:]
    return capitalize_sentences(sent), noun_phrase.capitalize()

# Few-shot prompt; {sent} is the bracket-marked sentence, {noun_phrase} the phrase.
template = """Following the given examples, you are required to conceptualize the instance (enclosed by []) in the last given noun phrase into abstract concepts. The concept should still fit into the instance’s original sentence. Make sure that the generated abstract concepts are general and not simply hypernyms of the instance.
---
Noun phrase <1>: Julia looks nervous like a firefly in front of a chameleon. [A firefly in front of a chameleon] can be conceptualized as Prey Facing Predator.
Noun phrase <2>: All up together like [a brown apple] when he is dried up, like this way! [A brown apple] can be conceptualized as Unusually Colored Object.
Noun phrase <3>: You're not going to go storming in there like [a bull in a china shop again]. [A bull in a china shop] can be conceptualized as Unlikely Object in a Certain Place.
Noun phrase <4>: Our economy will be as stable as [an apple on a toothpick]. [An apple on a toothpick] can be conceptualized as Object in an Unstable State.
Noun phrase <5>: {sent}, [{noun_phrase}] can be conceptualized as"""

prompts = []

# Only records that kept at least one conceptualization are used.
data = read_jsonlines("dataset/conceptual_combination_conceptualization_filtered.jsonl")
data = [line for line in data if len(line['conceptualization']) > 0]

# (The unused local `categories` of the original loop has been removed.)
for line in data:
    noun_phrase = line['noun_phrase']
    sent = raw[noun_phrase][-1]

    sent, noun_phrase = sent_noun_phrase_indexing(sent, noun_phrase)
    prompts.append((sent, noun_phrase, template.format(sent=sent, noun_phrase=noun_phrase)))

# One Batch API request per prompt; custom_id is "request-<n>-<phrase>" with spaces
# in the phrase replaced by underscores.
prompts = [dict(
    custom_id = "request-{i}-{np}".format(i=i+1, np=noun_phrase.replace(" ", "_")),
    method = "POST",
    url = "/v1/chat/completions",
    body = dict(
        model = "gpt-4o-mini",
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens = 100,
        n=10,
        temperature=0.7,
        top_p=0.8,
    )
) for i, (sent, noun_phrase, prompt) in enumerate(prompts)]

# NOTE(review): this overwrites the request file written by the Conceptualization
# section — confirm that reusing the same path is intended.
with open("dataset/processed/conceptualization_chatgpt.jsonl", 'w') as outfile:
    for entry in prompts:
        json.dump(entry, outfile)
        outfile.write('\n')

### Filter NER

In [None]:
import jsonlines
import json
from tqdm import tqdm
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import pipeline

# load tagger (an English part-of-speech model)
tagger = SequenceTagger.load("flair/pos-english")

def entity_tag(sentence_list) :
    """Wrap the text in a flair Sentence, run the tagger on it and return it."""
    tagged = Sentence(sentence_list)
    tagger.predict(tagged)
    return tagged

# Read all annotated phrases.
with jsonlines.open('dataset/conceptual_combination_type_annotated.jsonl', "r") as fp:
    phrases = list(fp.iter(type=dict))
cnt = len(phrases)

# Split the phrases by whether any token is tagged NNP, NNPS or FW.
ner_phrases = []
not_ner_phrases = []
for line in tqdm(phrases):
    tags = {token.tag for token in entity_tag(line['noun_phrase'])}
    if tags & {'NNPS', 'FW', 'NNP'}:
        ner_phrases.append(line)
    else:
        not_ner_phrases.append(line)
ner_tag_cnt = len(ner_phrases)

In [None]:
len(not_ner_phrases), len(ner_phrases)

In [None]:
# Save the phrases that were not flagged by the tag filter.
with open("dataset/processed/conceptualization_chatgpt_ner_filtered.jsonl", 'w') as outfile:
    outfile.writelines(json.dumps(entry) + '\n' for entry in not_ner_phrases)

### Review Result

In [None]:
import jsonlines
import json

# Records to review.
fpath = "dataset/processed/intermediate_vera.jsonl"

with jsonlines.open(fpath) as file:
    data = [record for record in file.iter()]

# Raw samples, keyed by their first field with newlines replaced by spaces.
fpath = "dataset/processed/sample_0904.json"
with open(fpath) as f:
    raw_records = json.load(f)
raw = {record[0].replace("\n", " "): record for record in raw_records}

In [None]:
# Attach to every record the last field of its raw sample, looked up by noun phrase.
for line in data:
    line['sentence'] = raw[line['noun_phrase']][-1]

In [None]:
threshold = 0.7

def identify_property_type(line):
    """Label ``line`` in place as 'component', 'phrase' or None and return it.

    ``line['output_root']`` and ``line['output_modifier']`` are (answer, score)
    pairs. An answer only counts when its score exceeds ``threshold``; it is then
    read as True if it contains "Yes" and False otherwise.

    * any counted "Yes"            -> type 'component'
    * both counted and both "no"   -> type 'phrase'
    * otherwise                    -> type None

    Newlines in ``line['sentence']`` are replaced by spaces as well.
    """
    def verdict(output):
        # None means "not confident enough to count".
        if output[1] > threshold:
            return "Yes" in output[0]
        return None

    verdicts = [verdict(line['output_root']), verdict(line['output_modifier'])]

    if True in verdicts:
        label = 'component'
    elif verdicts.count(False) == 2:
        label = 'phrase'
    else:
        label = None
    line['type'] = label
    line['sentence'] = line['sentence'].replace("\n", " ")

    return line

def contains_digit(s):
    """Return True if any character of ``s`` is a digit."""
    for char in s:
        if char.isdigit():
            return True
    return False

# Label every record first, then keep the 'phrase' ones whose noun phrase has no digit.
labelled = [identify_property_type(line) for line in data]
result = [
    line
    for line in labelled
    if line['type'] == 'phrase' and not contains_digit(line['noun_phrase'])
]

In [None]:
from cleantext.sklearn import CleanTransformer

# Normalise each noun phrase with clean-text (lower-casing on, punctuation kept).
cleaner = CleanTransformer(no_punct=False, lower=True)

for line in result:
    cleaned = cleaner.transform([line['noun_phrase']])
    line['noun_phrase'] = cleaned[0]

In [None]:
from collections import Counter

Counter([line['type'] for line in result]).most_common()

In [None]:
# Concepts to exclude: a record is dropped when its root or its modifier is listed.
pass_concepts = ["lot", "black", "cock", "negro", "sex", "fking", "bunch"]

def _is_blocked(line):
    """True when the record's root or modifier is one of `pass_concepts`."""
    return line['root'] in pass_concepts or line['modifier'] in pass_concepts

result = [line for line in result if not _is_blocked(line)]

In [None]:
len(result)

In [None]:
# Save the surviving 'phrase' records as JSON Lines.
with open("dataset/processed/intermediate_vera_phrase.jsonl", 'w') as outfile:
    for record in result:
        outfile.write(json.dumps(record) + '\n')

In [None]:
# Reload the 'phrase' records written above.
fpath = "dataset/processed/intermediate_vera_phrase.jsonl"

with jsonlines.open(fpath) as file:
    data = [record for record in file.iter()]

In [None]:
def _mentions_while(answer):
    """True when the answer text contains 'while' (case-insensitive)."""
    return 'while' in answer.lower()

# Drop records whose modifier or root answer contains "while".
data = [
    line for line in data
    if not _mentions_while(line['output_modifier'][0])
    and not _mentions_while(line['output_root'][0])
]
len(data)

In [None]:
# Keep only records whose property shares no (lower-cased, whitespace-split) word
# with the noun phrase.
data = [
    line for line in data
    if set(line['noun_phrase'].lower().split()).isdisjoint(line['property'].lower().split())
]
len(data)

In [None]:
# Save the records that passed both filters as JSON Lines.
with open("dataset/processed/intermediate_vera_phrase_whilefilter.jsonl", 'w') as outfile:
    outfile.writelines(json.dumps(entry) + '\n' for entry in data)

### Add canceled property

In [None]:
import jsonlines

# Extract phrase property and get canceled properties
with jsonlines.open("dataset/processed/intermediate_vera_phrase_whilefilter_filtered.jsonl") as f:
    data = list(f)
len(data)

In [None]:
from src.model.kb_model import MAPSKB

kb = MAPSKB("src/model/MAPS-KB.csv")

In [None]:
import os
import transformers
import torch
from tqdm import tqdm
import json
from torch import nn
import numpy as np
import openai
import backoff
import copy
from datasets import load_dataset
import random
import backoff
import openai
from openai import OpenAI
import base64
import os, sys, pathlib, json, pdb
import concurrent.futures

class ParallelGPT():
    """Send many chat-completion requests to one OpenAI model concurrently.

    The API key is read from the ``OPENAI_API_KEY`` environment variable.
    """

    def __init__(self, model_id):
        """Remember the model id and create the OpenAI client."""
        self.model_id = model_id
        self.client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

    # `openai.APITimeoutError` is used instead of `openai.Timeout`: in the 1.x client
    # `openai.Timeout` is the HTTP timeout configuration type rather than an exception
    # class, and listing a non-exception class in the retry tuple raises TypeError
    # as soon as any exception has to be matched against it.
    # NOTE(review): no max_tries is set, so BadRequestError is retried indefinitely.
    @backoff.on_exception(backoff.expo, (openai.RateLimitError, openai.APIError, openai.APITimeoutError, openai.BadRequestError, openai.APIConnectionError, openai.InternalServerError))
    def completion_with_backoff(self, **kwargs):
        """Call ``chat.completions.create`` with exponential-backoff retries."""
        return self.client.chat.completions.create(**kwargs)

    @staticmethod
    def _encode_image(image_path):
        """Return the base64 text of the file at ``image_path``.

        The original code called an ``encode_image`` helper that is not defined in the
        visible cells. Assumes ``image_path`` is a path to a JPEG file, since the
        result is embedded in a ``data:image/jpeg`` URL — TODO confirm.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def _build_content(self, prompt, image_path=None):
        """Build the user message content: plain text, or text plus an inline image."""
        if image_path is None:
            return prompt
        base64_image = self._encode_image(image_path)
        return [
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
            },
        ]

    def generate(self, text, image=None, max_new_tokens=2048, temperature=1, num_return_sequences=1, **kwargs):
        """Run one chat completion per prompt in parallel threads.

        Args:
            text: a prompt string or a list of prompt strings.
            image: optional image (or list of images, one per prompt) sent with the text.
            max_new_tokens: forwarded as ``max_tokens``.
            temperature: sampling temperature.
            num_return_sequences: number of choices requested per prompt (``n``).
            **kwargs: extra arguments forwarded to the completion call.

        Returns:
            dict: ``{'responses': [[str, ...], ...], 'completions': [...]}`` where both
            lists follow the order of ``text``; each inner list holds the message
            contents of the ``num_return_sequences`` choices.
        """
        if isinstance(text, str):
            text = [text]
        if image is not None:
            if isinstance(image, str):
                image = [image]
            assert len(text) == len(image)

        def request(idx):
            # The image, if any, is encoded inside the worker thread.
            image_path = None if image is None else image[idx]
            return self.completion_with_backoff(
                model=self.model_id,
                messages=[
                    {"role": "user", "content": self._build_content(text[idx], image_path)}
                ],
                max_tokens=max_new_tokens,
                temperature=temperature,
                n=num_return_sequences,
                **kwargs
            )

        # executor.map yields results in submission order, so no re-sorting is needed.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            completions = list(executor.map(request, range(len(text))))

        responses = [
            [completion.choices[i].message.content for i in range(num_return_sequences)]
            for completion in completions
        ]
        return {'responses': responses, 'completions': completions}

In [None]:
def get_properties(line):
    """Build the few-shot attribute-listing prompts for one record.

    Args:
        line: Mapping that provides the 'root' and 'modifier' concepts.

    Returns:
        A two-element list: the prompt for the root concept followed by the
        prompt for the modifier concept.
    """
    prompt_template = """List the properties of given concept. Return the response in following template: **{{property_1}}**, **{{property_2}}**, **{{property_3}}**, ..

---
Concept: Apple
Attribute: **Red**, **Round**

Concept: Winter
Attribute: **Cold**, **Snowy**, **Dry**

Concept: {concept}
Attribute: """

    prompts = []
    # Root first, modifier second: downstream code relies on this ordering.
    for role in ('root', 'modifier'):
        prompts.append(prompt_template.format(concept=line[role]))
    return prompts
    
def get_component_property(concept, attributes):
    """Keep the attributes that score strongly for `concept`.

    Each attribute is turned into a statement and scored with `get_score`
    (defined elsewhere in the notebook; presumably returns one score per
    statement, in order — confirm). Attributes whose score is above 0.7
    are returned in their original order.

    Args:
        concept: Concept text inserted into every statement.
        attributes: Indexable sequence of candidate attribute strings.

    Returns:
        List of the attributes that passed the threshold; [] for empty input.
    """
    if len(attributes) == 0:
        return []

    prompt = "\"{concept}\" can be characterized by being/having \"{attribute}\"."
    statements = [
        prompt.format(concept=concept, attribute=attribute)
        for attribute in attributes
    ]

    kept = []
    for position, score in enumerate(get_score(statements)):
        if score > 0.7:
            kept.append(attributes[position])
    return kept

# Flatten to one prompt list: every record contributes two consecutive prompts
# (root first, then modifier), as returned by get_properties.
model_inputs = [prompt for line in data for prompt in get_properties(line)]
model = ParallelGPT(model_id="gpt-4o-mini")
# Five completions are requested per prompt; only the 'responses' entry of the
# returned dict is kept.
results = model.generate(model_inputs, num_return_sequences=5)['responses']


In [None]:
import re

def pairwise(lst):
    """Cut a sequence into consecutive, non-overlapping chunks of two.

    The final chunk holds a single element when the length is odd.
    """
    chunks = []
    for start in range(0, len(lst), 2):
        chunks.append(lst[start:start + 2])
    return chunks

# Regroup the flat per-prompt responses into [root_responses, modifier_responses]
# per record. NOTE: `results` is overwritten, so re-running this cell would
# pair the already paired list again.
results = pairwise(results)
for line, result in zip(data, results):
    # Collect every **bold** attribute from all sampled responses, lower-cased
    # and de-duplicated (set order is arbitrary).
    line['root_component'] = list({
        att.lower()
        for r in result[0]
        for att in re.findall(r"\*\*(.*?)\*\*", r)
    })
    line['modifier_component'] = list({
        att.lower()
        for r in result[1]
        for att in re.findall(r"\*\*(.*?)\*\*", r)
    })

# Persist the annotated records as JSON Lines (one object per line).
with open("dataset/processed/intermediate_vera_phrase_whilefilter_filtered_component.jsonl", 'w') as outfile:
    for entry in data:
        outfile.write(json.dumps(entry) + '\n')

In [None]:
import jsonlines

# Reload the component-annotated records (JSON Lines) before scoring them
# against the full noun phrase.
with jsonlines.open("dataset/processed/intermediate_vera_phrase_whilefilter_filtered_component.jsonl") as f:
    data = list(f)

def get_cancelled_property(noun_phrase, attributes):
    """Return the attributes that score weakly for the whole noun phrase.

    Every attribute is phrased as a statement about `noun_phrase` and scored
    with `get_score` (defined elsewhere in the notebook; presumably one score
    per statement, in order — confirm). Attributes scoring below 0.2 are
    treated as cancelled and returned in their original order.

    Args:
        noun_phrase: Phrase text inserted into every statement.
        attributes: Indexable sequence of candidate attribute strings.

    Returns:
        List of cancelled attributes; [] when `attributes` is empty.
    """
    if len(attributes) == 0:
        return []

    prompt = "\"{concept}\" can be characterized by being/having \"{attribute}\"."
    statements = []
    for attribute in attributes:
        statements.append(prompt.format(concept=noun_phrase, attribute=attribute))

    cancelled = []
    for position, score in enumerate(get_score(statements)):
        if score < 0.2:
            cancelled.append(attributes[position])
    return cancelled

# For each record, find which root / modifier attributes no longer hold for the
# combined noun phrase.
for line in data:
    noun_phrase = line['noun_phrase']
    line['root_canceled'] = get_cancelled_property(noun_phrase, line['root_component'])
    line['modifier_canceled'] = get_cancelled_property(noun_phrase, line['modifier_component'])

# Persist the result as JSON Lines (one object per line).
with open("dataset/processed/intermediate_vera_phrase_whilefilter_filtered_component_canceled.jsonl", 'w') as outfile:
    for entry in data:
        outfile.write(json.dumps(entry) + '\n')

In [None]:
import jsonlines

# Load the type-annotated dataset (JSON Lines) and show how many records it holds.
with jsonlines.open("dataset/conceptual_combination_type_annotated_add_component_filtered2.jsonl") as f:
    data = list(f)
len(data)

In [None]:
import json
# Write `data` back to the annotated dataset file as JSON Lines; the file is
# opened in 'w' mode, so its previous content is replaced.
with open("dataset/conceptual_combination_type_annotated_add_component_filtered2.jsonl", 'w') as outfile:
    for entry in data:
        outfile.write(json.dumps(entry) + '\n')

### Get relation

In [None]:
from datasets import load_dataset
from ast import literal_eval

# Load the ConceptNet train split into pandas and keep only rows whose 'lang'
# column is 'en'.
df = load_dataset("conceptnet5/conceptnet5", split="train")
df = df.to_pandas()
df = df[df['lang'] == 'en']

# Reduce each argument to the last '/'-separated segment of its path.
df['arg1'] = df['arg1'].apply(lambda x: x.split("/")[-1])
df['arg2'] = df['arg2'].apply(lambda x: x.split("/")[-1])

# 'extra_info' is parsed as a Python literal to read its 'weight' entry;
# edges with weight below 1.0 are dropped.
df['weight'] = df['extra_info'].apply(lambda x: literal_eval(x)['weight'])
df = df[df['weight'] >= 1.0]

In [None]:
def get_all_edges(concept):
    """Select every row of the global `df` that has `concept` as 'arg1' or 'arg2'."""
    is_first_arg = df['arg1'] == concept
    is_second_arg = df['arg2'] == concept
    return df[is_first_arg | is_second_arg]

def format_series_to_triplet(df):
    """Collapse an edge table into a de-duplicated list of (arg1, rel, arg2) tuples.

    Args:
        df: DataFrame with 'arg1', 'rel' and 'arg2' columns.

    Returns:
        List of unique triplets. The order is unspecified because the rows
        pass through a set. A frame without rows yields [].
    """
    # Zip the columns instead of using row-wise DataFrame.apply: on a frame
    # with no rows, apply(axis=1) hands back a DataFrame (which has no
    # .to_list()), and column zipping is also cheaper than a per-row lambda.
    return list(set(zip(df['arg1'], df['rel'], df['arg2'])))

def get_common_edge(row):
    """Find the edges linking a record's root and modifier concepts.

    Looks for a direct (first-hop) edge between the two concepts; if there is
    none, falls back to edges that reach a neighbour shared by both concepts
    (second hop).

    Args:
        row: Mapping providing the 'root' and 'modifier' concepts.

    Returns:
        A single-entry dict keyed by "<root>_<modifier>" whose value is a
        (triplets, shared_neighbours) tuple:
        - direct edge found: (triplets of the direct edges, None)
        - shared neighbours found: (root-side triplets + modifier-side
          triplets, set of shared neighbours)
        - otherwise: ([], None)
    """
    root, modifier = row['root'], row['modifier']
    pair_key = f"{root}_{modifier}"

    root_edges = get_all_edges(root)
    modifier_edges = get_all_edges(modifier)

    # First hop: any edge of the root that has the modifier on either end.
    touches_modifier = (root_edges['arg1'] == modifier) | (root_edges['arg2'] == modifier)
    direct_edges = root_edges[touches_modifier]
    if direct_edges.shape[0] > 0:
        return {pair_key: (format_series_to_triplet(direct_edges), None)}

    # Second hop: nodes that appear in the edge lists of both concepts.
    root_nodes = set(root_edges[['arg1', 'arg2']].to_numpy().flatten())
    modifier_nodes = set(modifier_edges[['arg1', 'arg2']].to_numpy().flatten())
    shared = root_nodes & modifier_nodes
    if len(shared) == 0:
        return {pair_key: ([], None)}

    def edges_touching_shared(edges):
        # Rows where at least one endpoint is a shared neighbour.
        first_hit = edges['arg1'].apply(lambda x: x in shared)
        second_hit = edges['arg2'].apply(lambda x: x in shared)
        return edges[first_hit | second_hit]

    root_side = edges_touching_shared(root_edges)
    modifier_side = edges_touching_shared(modifier_edges)
    triplets = format_series_to_triplet(root_side) + format_series_to_triplet(modifier_side)
    return {pair_key: (triplets, shared)}

### Get Google-Ngram-Corpus

In [None]:
import pandas as pd
    
# Load the n-gram CSV (presumably a Google Books 5-gram sample — confirm).
# NOTE(review): this rebinds the global `df`, which get_all_edges reads; after
# this cell runs, the ConceptNet lookups operate on this frame instead.
df = pd.read_csv("dataset/google1M5G.csv")

In [None]:
import sys
sys.path.append("conceptual_combination")
from dataset.processed.get_nounphrase import *

In [None]:
from tqdm import tqdm

tqdm.pandas()

In [None]:
def f(x):
    """Drop the tokens of a space-separated string that contain special characters.

    A token is kept only when it contains nothing but ASCII letters and digits.
    The match is negated on purpose: without the negation only the tokens
    *with* special characters survive, leaving almost no text for the concept
    extraction that follows.

    Args:
        x: Text whose tokens are separated by single spaces.

    Returns:
        The surviving tokens re-joined with single spaces.
    """
    tokens = x.split(" ")
    return " ".join(s for s in tokens if not re.search(r'[^a-zA-Z0-9\s]', s))

# Imported here because this cell is the first one to use multiprocessing; the
# only other import of it sits in a later cell, so a fresh Restart & Run All
# would otherwise fail with a NameError.
import multiprocessing

# Clean column '0' with `f` in 20 worker processes. imap yields results in
# input order, so the list lines up with the frame's rows.
with multiprocessing.Pool(20) as p:
    df['0'] = list(tqdm(p.imap(f, df['0']), total=df.shape[0]))
    # All results are already collected; let the workers exit cleanly before
    # the context manager tears the pool down.
    p.close()
    p.join()

In [None]:
import multiprocessing
from tqdm import tqdm

def flatten(data):
    """Concatenate the items of every inner iterable into one flat list."""
    flat = []
    for row in data:
        flat.extend(row)
    return flat

def f(x):
    """Collect the concepts attached to every token of `x` into one flat list.

    `nlp` comes from outside this cell, and `token._.C` is presumably a custom
    token extension holding a list of concepts — confirm against
    get_nounphrase.
    """
    concepts = []
    for token in nlp(x):
        concepts.extend(token._.C)
    return concepts
    
# Run the concept extractor `f` over column '0' in 20 worker processes and
# store the per-row concept lists in a new 'concepts' column. imap yields
# results in input order, so they line up with the frame's rows.
with multiprocessing.Pool(20) as p:
    df['concepts'] = list(tqdm(p.imap(f, df['0']), total=df.shape[0]))
    p.close()
    p.join()

In [None]:
# Print every row whose text in column '0' mentions 'soccer'.
# The check must look at the row's text: `i` is the index label (an integer
# for a frame loaded by read_csv without index_col), and `'soccer' in i` on an
# integer raises TypeError.
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    text = row['0']
    # Skip non-string cells (e.g. NaN) instead of failing on them.
    if isinstance(text, str) and 'soccer' in text:
        print(row)

In [None]:
# Display the current state of the frame (rich repr of the last expression).
df

In [None]:
# Find the n-grams whose extracted concepts cover all concepts of a query phrase.
# The query is bound to `query_phrase` rather than `np`, so the conventional
# numpy alias is never clobbered by a string in the kernel namespace.
query_phrase = "American footballs"
concepts = merge_phrases(nlp(query_phrase))[0]._.C
print(concepts)

# Build the query set once instead of once per row inside the lambda.
query_concepts = set(concepts)
df[df['concepts'].apply(lambda x: query_concepts.issubset(set(x)))]

In [None]:
# data = "0 1 2 3 4 5 6 7 8 9 _ADJ_ _ADP_ _ADV_ _CONJ_ _DET_ _NOUN_ _NUM_ _PRON_ _PRT_ _VERB_ a_ aa ab ac ad ae af ag ah ai aj ak al am an ao ap aq ar as at au av aw ax ay az b_ ba bb bc bd be bf bg bh bi bj bk bl bm bn bo bp bq br bs bt bu bv bw bx by bz c_ ca cb cc cd ce cf cg ch ci cj ck cl cm cn co cp cq cr cs ct cu cv cw cx cy cz d_ da db dc dd de df dg dh di dj dk dl dm dn do dp dq dr ds dt du dv dw dx dy dz e_ ea eb ec ed ee ef eg eh ei ej ek el em en eo ep eq er es et eu ev ew ex ey ez f_ fa fb fc fd fe ff fg fh fi fj fk fl fm fn fo fp fq fr fs ft fu fv fw fx fy fz g_ ga gb gc gd ge gf gg gh gi gj gk gl gm gn go gp gq gr gs gt gu gv gw gx gy gz h_ ha hb hc hd he hf hg hh hi hj hk hl hm hn ho hp hq hr hs ht hu hv hw hx hy hz i_ ia ib ic id ie if ig ih ii ij ik il im in io ip iq ir is it iu iv iw ix iy iz j_ ja jb jc jd je jf jg jh ji jj jk jl jm jn jo jp jq jr js jt ju jv jw jx jy jz k_ ka kb kc kd ke kf kg kh ki kj kk kl km kn ko kp kq kr ks kt ku kv kw kx ky kz l_ la lb lc ld le lf lg lh li lj lk ll lm ln lo lp lq lr ls lt lu lv lw lx ly lz m_ ma mb mc md me mf mg mh mi mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz n_ na nb nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw nx ny nz o_ oa ob oc od oe of og oh oi oj ok ol om on oo op oq or os ot other ou ov ow ox oy oz p_ pa pb pc pd pe pf pg ph pi pj pk pl pm pn po pp pq pr ps pt pu punctuation pv pw px py pz q_ qa qb qc qd qe qf qg qh qi qj ql qm qn qo qp qq qr qs qt qu qv qw qx qy qz r_ ra rb rc rd re rf rg rh ri rj rk rl rm rn ro rp rq rr rs rt ru rv rw rx ry rz s_ sa sb sc sd se sf sg sh si sj sk sl sm sn so sp sq sr ss st su sv sw sx sy sz t_ ta tb tc td te tf tg th ti tj tk tl tm tn to tp tq tr ts tt tu tv tw tx ty tz u_ ua ub uc ud ue uf ug uh ui uj uk ul um un uo up uq ur us ut uu uv uw ux uy uz v_ va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz w_ wa wb wc wd we wf wg wh wi wj wk wl wm wn wo wp wq wr ws wt wu wv ww wx wy wz x_ xa xb xc xd xe xf xg xh 
# xi xj xk xl xm xn xo xp xq xr xs xt xu xv xw xx xy xz y_ ya yb yc yd ye yf yg yh yi yj yk yl ym yn yo yp yq yr ys yt yu yv yw yx yy yz z_ za zb zc zd ze zf zg zh zi zj zk zl zm zn zo zp zq zr zs zt zu zv zw zx zy zz"

# data = data.split()

In [None]:
# from google_ngram_downloader import readline_google_store
# from tqdm import tqdm
# import multiprocessing

# urls = [f"http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-5gram-20120701-{d}.gz" for d in data]
# fnames = [f"googlebooks-eng-all-5gram-20120701-{d}.gz" for d in data]
# data = list(zip(urls, fnames))

# with multiprocessing.Pool(10) as p:
#     data = list(tqdm(p.imap(get_google_ngram_corpus, data), total=len(data)))
#     p.close()
#     p.join()