In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import gensim
import nltk
import textdistance
from nltk.stem import WordNetLemmatizer
from scipy.stats import pearsonr, kendalltau, spearmanr
from itertools import permutations, product, combinations
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import ttest_ind, ttest_rel
from scipy.spatial.distance import cosine




In [2]:
# Read the compound spreadsheet, then keep only the rows whose AND_sentence
# cell is filled in.
df = pd.read_excel("/Volumes/My Passport/NOUN-NOUN-COMPOUNDS-V1/data/noun_noun_compounds/nn_compounds_with_continuation_words1.xlsx")
df = df.loc[df["AND_sentence"].notnull()]
# Name of the correlation metric; only assigned here, not used in the visible cells.
corr_metric = "kendalltau"

In [4]:
# Display name for every supported model identifier. The key order doubles as
# the canonical sort order: ``order_dict`` below is derived from it, so the two
# mappings can never drift apart.
model_name_map = {
    "bert-base-uncased": "BERT",
    "roberta-base": "RoBERTa",
    "openai-community/gpt2": "GPT2",
    "microsoft/phi-1": "Phi1",
    "meta-llama/Llama-3.2-1B": "LLaMA3",
    "bert-base-japanese": "BERT-Japanese",
    "distilroberta": "DistilRoBERTa",
    "xlm-mlm-xnli15-1024": "XLM",
    "xlnet-base-cased": "XLNet",
    "openai-community/openai-gpt": "GPT1",
}

# Rank (0-9) of each model identifier, following the key order above.
order_dict = {model_id: rank for rank, model_id in enumerate(model_name_map)}


def sort_df_by_model_order(df, keep_order_col = True, update_names = True):
    """Return a copy of ``df`` sorted by the canonical model order.

    Rows are ordered by the rank of their ``model`` value in ``order_dict``,
    then by the ``representation`` and ``Word representations processed . . .``
    columns when those columns are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``model`` column whose values are keys of ``order_dict``.
        The frame passed in is left untouched.
    keep_order_col : bool
        When False, the helper ``model_order`` column is dropped from the result.
    update_names : bool
        When True, add ``model_name`` (display name of ``model``) and, if a
        ``representation`` column exists, ``representation_name`` (display name
        when the value is a known model identifier, otherwise the raw value).

    Returns
    -------
    pandas.DataFrame
        A new, sorted frame.

    Raises
    ------
    KeyError
        If a ``model`` value is not a key of ``order_dict``.
    """
    # Work on a copy: the original implementation wrote ``model_order`` into
    # the caller's frame, where it stayed even when keep_order_col was False.
    df = df.copy()
    df["model_order"] = [order_dict[model_id] for model_id in df["model"]]

    optional_sort_cols = [
        col
        for col in ("representation", "Word representations processed . . .")
        if col in df.columns
    ]
    df = df.sort_values(["model_order"] + optional_sort_cols)

    if not keep_order_col:
        df = df.drop(columns="model_order")

    if update_names:
        df["model_name"] = [model_name_map[model_id] for model_id in df["model"]]
        if "representation" in df.columns:
            # Representations that are not model identifiers keep their raw value.
            df["representation_name"] = [model_name_map.get(rep, rep) for rep in df["representation"]]

    return df

In [5]:
def get_processed_sentence(word, original_sent, head_word=True):
    """Build a short probe sentence that presents ``word`` on its own.

    The frame is chosen from markers found in ``original_sent``:

    * ``head_word`` is true and the sentence contains 'They are':
      returns 'They are <word>'.
    * the sentence contains neither 'It is a' nor 'They are':
      returns 'It is <word>' (no article).
    * otherwise: returns 'It is a <word>' or 'It is an <word>', picking
      "an" when the first letter of ``word`` is a vowel.

    The article choice looks at the first letter only, so words such as
    "unicorn" still receive "an".

    Parameters
    ----------
    word : str
        The word to embed; must be non-empty when the article branch is reached.
    original_sent : str
        The sentence the word was taken from.
    head_word : bool
        Whether ``word`` may use the plural 'They are' frame.

    Raises
    ------
    IndexError
        If the article branch is reached with an empty ``word``.
    """
    if head_word and 'They are' in original_sent:
        return 'They are ' + word
    if 'It is a' not in original_sent and 'They are' not in original_sent:
        return 'It is ' + word
    # Compare case-insensitively: the original check only matched lower-case
    # vowels, so a capitalised word like "Apple" was given "a" instead of "an".
    article = 'an' if word[0].lower() in ('a', 'e', 'i', 'o', 'u') else 'a'
    return 'It is {} {}'.format(article, word)


# Only the first 300 AND sentences are used for the probe file.
full_sents = df['AND_sentence'][:300].tolist()

words_per_sent = [x.split(' ') for x in full_sents]


def _probe_lines(token_index, head_word):
    """Return one line per AND sentence: the selected word, a tab, then its probe sentence.

    ``token_index`` picks the word out of each space-split sentence (negative
    values count from the end); ``head_word`` is forwarded unchanged to
    ``get_processed_sentence``.
    """
    return [
        '{}\t{}'.format(words[token_index], get_processed_sentence(words[token_index], full, head_word))
        for words, full in zip(words_per_sent, full_sents)
    ]


# The last three tokens of every sentence are probed: third-from-last ("mod"),
# second-from-last ("head") and last ("and" word).
raw_mod_word_sents = _probe_lines(-3, False)
raw_head_word_sents = _probe_lines(-2, True)
raw_and_word_sents = _probe_lines(-1, True)


# Interleave so the three lines belonging to one sentence are adjacent.
flat_list = [item for triple in zip(raw_mod_word_sents, raw_head_word_sents, raw_and_word_sents) for item in triple]

# The context manager flushes and closes the handle; the original left the
# file open, so buffered lines could be missing from disk.
with open('/Volumes/My Passport/NOUN-NOUN-COMPOUNDS-V1/data/noun_noun_compounds/composition/probe_words_and_sentences_and_300_raw.txt', 'w') as file:
    for sent in flat_list:
        file.write(sent + '\n')

In [6]:
def any_sentences_are_different(sentences):
    """Return True when at least two entries of ``sentences`` compare unequal.

    An empty or single-element input yields False.

    Parameters
    ----------
    sentences : iterable
        Values compared with ``==`` (strings in this notebook).

    Returns
    -------
    bool
    """
    sentences = list(sentences)
    if not sentences:
        return False
    # Comparing everything against the first entry is enough: if all entries
    # equal it they all equal each other. This replaces the original all-pairs
    # comparison, which did quadratic work for the same answer.
    reference = sentences[0]
    return any(not (sent == reference) for sent in sentences)

any_sentences_are_different(['This is a test!', 'This is a test!', 'This is a test!'])

False

In [7]:
import nltk

In [8]:
# Download the "punkt_tab" NLTK resource; presumably required by the
# nltk.word_tokenize call in a later cell — TODO confirm.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/saffronkendrick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
# Sentence pool: the first 300 NN sentences followed by every gloss sentence.
sentences = np.array(df['NN sentence'][:300].tolist() + df['Gloss sentence'].tolist())

# "<mod> <head>" string for every row, stored on the frame and then arranged
# with the same "[:300] + all rows" layout as ``sentences``.
df['compound'] = ['{} {}'.format(mod, head) for mod, head in zip(df['mod'], df['head'])]
compounds = np.array(df['compound'][:300].tolist() + df['compound'].tolist())


def process_sent(x):
    """Strip and lower-case ``x``, tokenise it, and keep only alphabetic tokens."""
    tokens = nltk.word_tokenize(x.strip().lower())
    return [token for token in tokens if token.isalpha()]


# NOTE: ``words_per_sents`` (plural) is a separate variable from the
# ``words_per_sent`` list created in the probe-sentence cell.
words_per_sents = [process_sent(sentence) for sentence in sentences]
lemmatiser = WordNetLemmatizer()

# Manual overrides that take precedence over the WordNet lemmatiser.
word_dict = {'gestates': "gestate"}


def look_up(word):
    """Return the manual override for ``word`` if there is one, else its WordNet lemma."""
    if word in word_dict:
        return word_dict[word]
    return lemmatiser.lemmatize(word)


def get_vector(word):
    """Return the ``fasttext`` entry for ``word``, or for ``look_up(word)`` if ``word`` is absent.

    NOTE(review): ``fasttext`` is read from the notebook namespace; the only
    visible assignment to it is inside commented-out code.
    """
    if word in fasttext:
        return fasttext[word]
    return fasttext[look_up(word)]

def get_average_vector(words):
    """Return the element-wise mean of the vectors of ``words``.

    Each word is resolved with ``get_vector``; the resulting vectors are
    stacked row-wise and averaged over the word axis.

    Parameters
    ----------
    words : iterable of str
        Must contain at least one word; ``np.vstack`` raises ``ValueError``
        on an empty list.

    Returns
    -------
    numpy.ndarray
        The mean vector.
    """
    # Fixed: the original called ``gest_vector``, a name that is never
    # defined, so every call raised NameError.
    return np.vstack([get_vector(x) for x in words]).mean(axis=0)

# For each of the 300 items, collect its three positions in ``sentences``:
# i, i + 300 and i + 600.
paraphrase_ind_tuples = [[i + offset for offset in (0, 300, 600)] for i in range(300)]
# Flatten so the three positions belonging to one item are adjacent.
paraphrase_inds = [position for triple in paraphrase_ind_tuples for position in triple]

# Reorder the sentence array with those positions.
ordered_sentences = sentences[paraphrase_inds]

# # Keep `load = True`: the cached file ordered_fasttext_reps_and.npy (read in the else branch below) already exists.
# load = True

# if not load:
#     fasttext = gensim.models.KeyedVectors.load_word2vec_format('D:/NOUN-NOUN-COMPOUNDS-V1/data/wiki.en.vec', limit=500000)
#     mean_fasttext_reps_per_sent = np.vstack([get_average_vector(x) for x in words_per_sent])
#     ordered_fasttext_reps = mean_fasttext_reps_per_sent[paraphrase_inds]
#     np.save('/Volumes/My Passport/NOUN-NOUN-COMPOUNDS-V1/data/ordered_fasttext_reps_and.npy', ordered_fasttext_reps)
# else:
#     ordered_fasttext_reps = np.load('/Volumes/My Passport/NOUN-NOUN-COMPOUNDS-V1/data/ordered_fasttext_reps_and.npy')

In [10]:
# ``words_per_sent`` is the space-split AND-sentence list from the
# probe-sentence cell, not the tokenised ``words_per_sents`` list.
len(words_per_sent)

300