In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import json
from openai import OpenAI
import nltk
import fasttext

client = OpenAI(api_key = open('/mnt/qb/work/eickhoff/esx208/openai_keys/health_nlp_key.txt').read())
ft_model = fasttext.load_model('/mnt/qb/work/eickhoff/esx208/data/fasttext_models/fasttext-en-vectors/model.bin')



In [2]:
def find_semantically_similar_terms_batches(query_term_table, openai_client, batch_size=5, word_num=3):
    prompt = '''
You will be given a user query and a single word from it. For a given word you should generate {word_num} semantically similar words given the query as a context.
The generated words must not exist in the query. Please sort them from the most similar to the least.
Your output should only be a list of dicts as in the example. It should be readable with the json.loads() function. 
Please generate similar words only to the word in the field "word".
Example input: 
[
    {{
        "query": "define preventive",
        "word": "preventive"
    }},
    {{
        "query": "here there be dragons comic",
        "word": "comic"
    }},
]
Example output:
[
    {{
        "query": "define preventive",
        "word": "preventive",
        "synonyms": ["prophylactic", "preemptive", "averting"]
    }},
    {{
        "query": "here there be dragons comic",
        "word": "comic",
        "synonyms": ["manga", "graphic", "cartoon"]
    }},
]
Input:
{input}
Output:
'''
    output = []
    curr_input = []
    for idx, line in tqdm(query_term_table.iterrows(), total=query_term_table.shape[0]):
        curr_input.append({
            "query": line['query'],
            "term": line['inj_term']
        })
        if len(curr_input) == batch_size or idx == query_term_table.shape[0] - 1:
            completion = openai_client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                    "role": "user",
                    "content": prompt.format(input=json.dumps(curr_input, indent=4), word_num=word_num)
                    }
                ]
            )
            try:
                output += json.loads(completion.choices[0].message.content)
            except:
                pass
            curr_input = []
    return output

Let's load TFC1 data and try to replace appended words with their syunonyms using GPT4-o. We will then save the resulting dataset as STMC1 data

In [3]:
tfc1_data = pd.read_csv('/mnt/qb/work/eickhoff/esx208/MechIR/data/TFC1-data.tsv.gz', sep='\t')
tfc1_data_perturbed = tfc1_data[tfc1_data['perturbed'] == True]
tfc1_data_perturbed['inj_term'] = tfc1_data_perturbed['text'].apply(lambda text: text.split(' ')[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tfc1_data_perturbed['inj_term'] = tfc1_data_perturbed['text'].apply(lambda text: text.split(' ')[-1])


In [4]:
semantically_similar_terms_batches = find_semantically_similar_terms_batches(tfc1_data_perturbed[['query', 'inj_term']].drop_duplicates(), client, batch_size=5, word_num=3)

  0%|          | 0/337 [00:00<?, ?it/s]

In [5]:
def select_best_semantically_similar_pair_fasttext(semantically_similar_terms, ft_model):
    semantically_similar_pairs = dict()
    
    for item in tqdm(semantically_similar_terms):
        tokenized = nltk.word_tokenize(item['query'])
        tokenized = [term.lower() for term in tokenized if term.isalpha()]
        similar_words = [term.lower() for term in item['synonyms'] if not term.lower() in tokenized and not term.lower() in item['query']]
        
        if len(tokenized) > 0 and len(similar_words) > 0:
            query_vectors = np.stack([ft_model.get_word_vector(term) for term in tokenized])
            similar_words_vectors = np.stack([ft_model.get_word_vector(term) for term in similar_words])

            query_vectors /= np.linalg.norm(query_vectors, axis=-1)[:, np.newaxis]
            similar_words_vectors /= np.linalg.norm(similar_words_vectors, axis=-1)[:, np.newaxis]

            scores = query_vectors @ similar_words_vectors.T
            argmax = np.unravel_index(scores.argmax(), scores.shape)
            semantically_similar_pairs[(item['query'], 
                                        item['term'] if 'term' in item else item['word'])] = (tokenized[argmax[0]], similar_words[argmax[1]])
    return semantically_similar_pairs

def select_semantically_similar_pair_first_possible(semantically_similar_terms):
    semantically_similar_pairs = dict()
    for item in semantically_similar_terms:
        tokenized = nltk.word_tokenize(item['query'])
        tokenized = [term.lower() for term in tokenized if term.isalpha()]
        for word in item['synonyms']:
            if not word.lower() in tokenized:
                if 'word' in item:
                    semantically_similar_pairs[item['query']] = (item['word'], word)
                else:
                    semantically_similar_pairs[item['query']] = (item['term'], word)
    return semantically_similar_pairs

In [6]:
semantically_similar_pairs = select_best_semantically_similar_pair_fasttext(semantically_similar_terms_batches, ft_model)

  0%|          | 0/325 [00:00<?, ?it/s]

In [20]:
tfc1_data_perturbed_new = tfc1_data_perturbed.copy()
tfc1_data_perturbed_new = tfc1_data_perturbed_new[tfc1_data_perturbed_new.apply(lambda row: (row['query'], row['inj_term']) in semantically_similar_pairs, axis=1)]
tfc1_data_perturbed_new['synonym'] = tfc1_data_perturbed_new.apply(lambda row: semantically_similar_pairs[(row['query'], row['inj_term'])][1], axis=1)
tfc1_data_perturbed_new['inj_term'] = tfc1_data_perturbed_new.apply(lambda row: semantically_similar_pairs[(row['query'], row['inj_term'])][0], axis=1)
tfc1_data_perturbed_new['text'] = tfc1_data_perturbed_new.apply(lambda row: row['text'].rsplit(' ', 1)[0] + ' ' + row['synonym'], axis=1)

tfc1_data_non_perturbed = tfc1_data[tfc1_data['perturbed'] == False].copy()
tfc1_data_non_perturbed = tfc1_data_non_perturbed.merge(tfc1_data_perturbed[['qid', 'docno', 'inj_term']], on=['qid', 'docno'], how='inner')
tfc1_data_non_perturbed = tfc1_data_non_perturbed[tfc1_data_non_perturbed.apply(lambda row: (row['query'], row['inj_term']) in semantically_similar_pairs, axis=1)]
tfc1_data_non_perturbed['inj_term'] = None

stmc1_data = pd.concat([tfc1_data_non_perturbed, tfc1_data_perturbed_new], ignore_index=True)

In [21]:
stmc1_data.to_csv('/mnt/qb/work/eickhoff/esx208/MechIR/data/STMC1-data.tsv.gz', sep='\t', index=False)