In [None]:
import pandas as pd
import numpy as np
import math
import json
from collections import Counter
import random
import math 
import warnings
warnings.filterwarnings("ignore")

In [None]:
def divide_test_dev_sentences(test_dev_sentences_df, seed):
    """Split held-out sentences evenly into test and dev sets, per word sense.

    For every (lemma, sense) group, half of the rows (``frac=0.5``, rounded by
    pandas) are sampled into the test set; the rows of the group that were not
    sampled go to the dev set.

    Parameters
    ----------
    test_dev_sentences_df : pd.DataFrame
        Must provide ``lemma`` and ``sense`` columns. The index is expected to
        be unique, because dev rows are selected as "index not in the test
        sample".
    seed : int or None
        Passed as ``random_state`` to ``DataFrame.sample`` for every group.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(test_sentences, dev_sentences)``, both re-indexed from 0. Two empty
        frames are returned when the input contains no groups.
    """
    # Collect the per-group pieces and concatenate once at the end: growing a
    # DataFrame with concat inside the loop copies all previous rows each time
    # and depends on how pandas treats the initial empty frame.
    test_parts, dev_parts = [], []
    for word in test_dev_sentences_df.lemma.unique():
        word_df = test_dev_sentences_df[test_dev_sentences_df.lemma == word]
        for sense in word_df.sense.unique():
            sense_df = word_df[word_df.sense == sense]
            test_samples = sense_df.sample(frac=0.5, random_state=seed)
            dev_samples = sense_df[~sense_df.index.isin(test_samples.index)]
            test_parts.append(test_samples)
            dev_parts.append(dev_samples)

    if not test_parts:
        # pd.concat refuses an empty list; keep the original "empty in, empty out" contract.
        return pd.DataFrame(), pd.DataFrame()

    test_sentences = pd.concat(test_parts, ignore_index=True)
    dev_sentences = pd.concat(dev_parts, ignore_index=True)
    return test_sentences, dev_sentences

def numpy_encoder(obj):
    """Fallback converter for ``json.dump(..., default=numpy_encoder)``.

    NumPy integers become ``int``, NumPy floats become ``float``, arrays become
    (nested) lists, and every other object is rendered with ``str``.
    """
    # Checked in order; the first matching NumPy family decides the conversion.
    converters = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda array: array.tolist()),
    )
    for numpy_type, convert in converters:
        if isinstance(obj, numpy_type):
            return convert(obj)

    # Anything that is not a recognised NumPy value is stringified.
    return str(obj)

# Split each listed word's sentences, per sense, into a train part and a held-out (test/dev) part
def sample_sentences(word_list, wsd_df, train_frac=0.75, random_state=42):
    """Split the sentences of the given words into train and held-out parts.

    For every word in ``word_list`` and every sense of that word, the rows are
    shuffled and the first ``int(len * train_frac)`` rows go to the train part;
    the rest go to the held-out (test/dev) part.

    Parameters
    ----------
    word_list : iterable
        Lemmas to split. Lemmas without rows in ``wsd_df`` contribute nothing.
    wsd_df : pd.DataFrame
        Must provide ``lemma`` and ``sense`` columns.
    train_frac : float, default 0.75
        Fraction of each sense's sentences assigned to the train part
        (truncated towards zero, so a sense with a single sentence ends up
        entirely in the held-out part at the default value).
    random_state : int, default 42
        ``random_state`` used when shuffling each sense, for reproducibility.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(train_sentences, test_dev_sentences)``, both re-indexed from 0.
        Two empty frames are returned when nothing was selected.
    """
    # Gather the slices and concatenate once; concat inside the loop re-copies
    # everything accumulated so far on every iteration.
    train_parts, held_out_parts = [], []

    for word in word_list:
        word_df = wsd_df[wsd_df.lemma == word]

        for label in word_df.sense.unique():
            # Shuffle the rows of this sense, then cut at the train boundary.
            shuffled = word_df[word_df.sense == label].sample(frac=1.0, random_state=random_state)
            train_split_index = int(len(shuffled) * train_frac)
            train_parts.append(shuffled.iloc[:train_split_index])
            held_out_parts.append(shuffled.iloc[train_split_index:])

    if not train_parts:
        # pd.concat refuses an empty list; mirror the original empty result.
        return pd.DataFrame(), pd.DataFrame()

    train_sentences = pd.concat(train_parts, ignore_index=True)
    test_dev_sentences = pd.concat(held_out_parts, ignore_index=True)
    return train_sentences, test_dev_sentences

In [None]:
def process_word(word_df, seed, repeat_num):
    """Pair the sentences of one word with same-sense and different-sense partners.

    The rows are shuffled (``random_state=seed``) and re-indexed from 0. Then,
    for each round ``i`` in ``range(repeat_num)``, every sentence gets at most
    one partner of the same sense (``same_sense_paired_{i}``) and at most one
    partner of a different sense (``diff_sense_paired_{i}``). Pairings are
    symmetric, and a sentence is never paired with the same partner in two
    rounds of the same kind.

    For different-sense pairing the partner sense is the one with the lowest
    ratio of "pairs already formed with the current row's sense" to "number of
    sentences in that sense". If that sense has no eligible sentence left in
    the current round, the row stays unpaired for that round.

    Parameters
    ----------
    word_df : pd.DataFrame
        Rows of a single word; must provide a ``sense`` column. The frame is
        not modified.
    seed : int or None
        Seeds both the shuffle and the random partner choice, so the result is
        reproducible for a given seed. With ``None`` NumPy's global random
        state is used.
    repeat_num : int
        Number of pairing rounds (pairing columns of each kind).

    Returns
    -------
    pd.DataFrame
        The shuffled frame with the ``same_sense_paired_{i}`` and
        ``diff_sense_paired_{i}`` columns appended (object dtype). Each cell
        holds the row label of the partner, or ``None`` when unpaired.
    """
    # Partner choice must honour `seed`; fall back to NumPy's global state when
    # no seed is given, which is what DataFrame.sample(random_state=None) uses.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # sample() + reset_index() build a new frame, so the caller's frame is left
    # untouched and row labels equal row positions from here on.
    word_df = word_df.sample(frac=1, random_state=seed).reset_index(drop=True)
    n_rows = len(word_df)
    senses = word_df['sense'].tolist()

    # Row labels grouped by sense, in order of first appearance.
    rows_by_sense = {}
    for row_label, sense in enumerate(senses):
        rows_by_sense.setdefault(sense, []).append(row_label)

    # partner[row][round] -> partner row label, or None while unpaired.
    # State lives in plain lists so "already paired?" is always read from the
    # live state rather than from a DataFrame iterator snapshot.
    same_partner = [[None] * repeat_num for _ in range(n_rows)]
    diff_partner = [[None] * repeat_num for _ in range(n_rows)]

    def pick(candidates):
        # Uniform random choice driven by the seeded generator.
        return candidates[rng.randint(len(candidates))]

    # --- Same-sense pairing -------------------------------------------------
    for members in rows_by_sense.values():
        for i in range(repeat_num):
            for row in members:
                if same_partner[row][i] is not None:
                    continue  # already chosen as someone's partner this round
                candidates = [
                    other for other in members
                    if other != row
                    and same_partner[other][i] is None          # free in this round
                    and row not in same_partner[other][:i]      # not paired together before
                ]
                if candidates:
                    chosen = pick(candidates)
                    same_partner[row][i] = chosen
                    same_partner[chosen][i] = row

    # --- Different-sense pairing --------------------------------------------
    # value_counts() ignores missing senses; rows with such a sense are skipped.
    sense_counts = word_df.sense.value_counts().to_dict()
    sense_pair_counts = {
        sense: {other_sense: 0 for other_sense in sense_counts}
        for sense in sense_counts
    }

    for i in range(repeat_num):
        for row in range(n_rows):
            row_sense = senses[row]
            if diff_partner[row][i] is not None or row_sense not in sense_counts:
                continue

            # Pairs formed so far with each other sense, relative to its size.
            sense_ratios = {
                sense: sense_pair_counts[row_sense][sense] / sense_counts[sense]
                for sense in sense_counts if sense != row_sense
            }
            if not sense_ratios:
                continue  # the word has a single sense: nothing to pair with

            # Least-served sense first, to keep label pairs proportional.
            target_sense = min(sense_ratios, key=sense_ratios.get)
            candidates = [
                other for other in rows_by_sense.get(target_sense, [])
                if diff_partner[other][i] is None               # free in this round
                and row not in diff_partner[other][:i]          # not paired together before
            ]
            if candidates:
                chosen = pick(candidates)
                diff_partner[row][i] = chosen
                diff_partner[chosen][i] = row
                sense_pair_counts[row_sense][target_sense] += 1
                sense_pair_counts[target_sense][row_sense] += 1

    # Write the pairing columns once, in the same order as before
    # (same_0, diff_0, same_1, diff_1, ...).
    for i in range(repeat_num):
        word_df[f'same_sense_paired_{i}'] = pd.Series(
            [partners[i] for partners in same_partner], index=word_df.index, dtype=object
        )
        word_df[f'diff_sense_paired_{i}'] = pd.Series(
            [partners[i] for partners in diff_partner], index=word_df.index, dtype=object
        )

    return word_df
                    
def extract_sentence_pairs(word_df, repeat_num):
    """Turn the pairing columns of a processed word frame into pair records.

    All same-sense pairs (label 1) are emitted first, round by round, followed
    by all different-sense pairs (label 0). A pair is skipped when the same two
    sentence texts, in either order, have already been emitted.

    Parameters
    ----------
    word_df : pd.DataFrame
        Frame with ``lemma``, ``sentence``, ``sent_id``, ``start``, ``end`` and
        the ``same_sense_paired_{i}`` / ``diff_sense_paired_{i}`` columns, whose
        cells hold the index label of the partner row (or a missing value).
    repeat_num : int
        Number of pairing rounds to read.

    Returns
    -------
    list of dict
        One dict per pair with the keys ``lemma``, ``sentence1``, ``sentence2``,
        ``sent_id1``, ``sent_id2``, ``start1``, ``end1``, ``start2``, ``end2``
        and ``label``.
    """
    records = []
    # Sentence-text pairs already emitted; shared by both label passes.
    seen_pairs = set()

    for column_prefix, label in (('same_sense_paired', 1), ('diff_sense_paired', 0)):
        for i in range(repeat_num):
            column = f'{column_prefix}_{i}'

            for _, row in word_df.iterrows():
                partner_label = row[column]
                if pd.isna(partner_label):
                    continue  # unpaired in this round

                partner = word_df.loc[partner_label]
                forward = (row['sentence'], partner['sentence'])
                backward = (partner['sentence'], row['sentence'])
                if forward in seen_pairs or backward in seen_pairs:
                    continue  # this pair (or its mirror image) was already emitted

                seen_pairs.add(forward)
                records.append({
                    "lemma": row['lemma'],
                    "sentence1": forward[0],
                    "sentence2": forward[1],
                    "sent_id1": row['sent_id'],
                    "sent_id2": partner['sent_id'],
                    "start1": row['start'],
                    "end1": row['end'],
                    "start2": partner['start'],
                    "end2": partner['end'],
                    "label": label
                })

    return records

# Helper function to process sentences for each word
def create_split(wsd_df, word_list, output_file, seed=None, extra_sentences_df=None, repeat_num=1):
    """Build the sentence-pair records for one data split and save them as JSON.

    Parameters
    ----------
    wsd_df : pd.DataFrame
        Full sentence table; every word in ``word_list`` is paired using all of
        its rows in this frame.
    word_list : iterable
        Lemmas whose complete sentence sets belong to this split.
    output_file : str
        Destination path of the JSON file. Missing parent directories are
        created.
    seed : int or None, default None
        Forwarded to ``process_word``.
    extra_sentences_df : pd.DataFrame or None, default None
        Additional sentences; each lemma found in it is paired separately,
        using only the rows of this frame.
    repeat_num : int, default 1
        Number of pairing rounds, forwarded to ``process_word`` and
        ``extract_sentence_pairs``.

    Returns
    -------
    None
        The records are written to ``output_file`` and the label distribution
        is printed.
    """
    import os  # only needed here, to create the output directory

    def pairs_for_words(source_df, words):
        # Pair every word using only its rows in `source_df`.
        pairs = []
        for word in words:
            # Work on an explicit copy so the filtered slice is never modified.
            word_df = source_df[source_df.lemma == word].copy()
            new_word_df = process_word(word_df, seed, repeat_num)
            pairs.extend(extract_sentence_pairs(new_word_df, repeat_num))
        return pairs

    formatted_data = pairs_for_words(wsd_df, word_list)

    # Include additional sentences if provided (an empty frame has nothing to add
    # and may not even carry a `lemma` column).
    if extra_sentences_df is not None and not extra_sentences_df.empty:
        formatted_data.extend(
            pairs_for_words(extra_sentences_df, extra_sentences_df.lemma.unique())
        )

    label_counts = Counter(pair['label'] for pair in formatted_data)
    counts_0 = label_counts[0]
    counts_1 = label_counts[1]
    total = counts_0 + counts_1
    if total:
        print("Label distribution 0:1 :", round(counts_0/total,2), round(counts_1/total,2))
    else:
        # Nothing could be paired; avoid dividing by zero.
        print("Label distribution 0:1 : no sentence pairs were generated")

    # Save the data, creating the target directory on first use.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(formatted_data, f, ensure_ascii=False, indent=4, default=numpy_encoder)



In [None]:

# Language name (also the CSV file stem) -> short code used as sentence-id prefix
short_langs = {
    'Azerbaijani': 'az',
    'Telugu': 'te',
    'Marathi': 'mr',
    'Swahili': 'sw',
    'Vietnamese': 'vi',
    'Punjabi': 'pa',
    'Polish': 'pl',
    'Urdu': 'ur',
    'Korean': 'ko',
    'Kannada': 'kn',
    'Bulgarian': 'bg',
}
languages = short_langs.keys()

seed = 0 # for random sampling
# Set the maximum number that each sentence can be repeated (across unique pairs)
repeat_num = 8

for language in languages:
    print()
    print(language)
    # Load data and initialize variables
    wsd_df = pd.read_csv(f'./formatted_wsd_files/{language}.csv')
    print("total sentences", len(wsd_df))
    # Rows with any missing value are dropped before ids are assigned
    wsd_df = wsd_df.dropna()

    # Sentence id: {language code}_{sense}_{row label in the loaded frame}
    wsd_df['sent_id'] = wsd_df.apply(lambda x: f"{short_langs[language]}_{int(x.sense)}_{x.name}", axis=1)

    words = wsd_df.lemma.unique()
    print("num of words", len(words))

    # Seeded generator so the word-level split is reproducible for every language,
    # independent of the order in which languages are processed.
    rng = np.random.RandomState(seed)

    # Split words into train (70%), dev (15%) and test (the rest)
    n_train_words = math.ceil(len(words) * 0.7)
    train_words = rng.choice(words, n_train_words, replace=False)
    train_word_set = set(train_words)

    remaining_words = [word for word in words if word not in train_word_set]
    # Rounding up twice can ask for more dev words than are left; never oversample.
    n_dev_words = min(math.ceil(len(words) * 0.15), len(remaining_words))
    if n_dev_words:
        dev_words = rng.choice(np.array(remaining_words, dtype=object), n_dev_words, replace=False)
    else:
        dev_words = []
    dev_word_set = set(dev_words)
    test_words = [word for word in remaining_words if word not in dev_word_set]

    # 30% of the train words have their sentences shared between train and test/dev
    percent_train_words = rng.choice(train_words, math.ceil(len(train_words) * 0.3), replace=False)

    # Generate and save datasets
    train_sentences_df, test_dev_sentences_df = sample_sentences(percent_train_words, wsd_df)
    # Drop every partially held-out word from the full-word train list, including
    # words whose train share came out empty; otherwise their held-out sentences
    # would also be used for training.
    partially_held_out_words = set(percent_train_words)
    train_words = [word for word in train_words if word not in partially_held_out_words]

    test_sentences_df, dev_sentences_df = divide_test_dev_sentences(test_dev_sentences_df, seed)

    print("Creating train data...")
    create_split(wsd_df, train_words, f'./formatted_wic_files_{repeat_num*2}_reps/{language}_train.data', seed, train_sentences_df, repeat_num)

    print("Creating test data...")
    create_split(wsd_df, test_words, f'./formatted_wic_files_{repeat_num*2}_reps/{language}_test.data', seed, test_sentences_df,repeat_num)

    print("Creating dev data...")
    create_split(wsd_df, dev_words, f'./formatted_wic_files_{repeat_num*2}_reps/{language}_dev.data', seed, dev_sentences_df,repeat_num)