This can generate the triplets for training the model using triplet loss


In [3]:
import pandas as pd
from tqdm.auto import tqdm
import ast
import itertools
from math import comb
from random import sample
import ast


def generate_all_true_pairs(df, virtual_text_limit=20):
    max_pairs_limit = comb(virtual_text_limit, 3) * (virtual_text_limit - 3)

    all_true_pairs = []
    all_context_embeddings_with_index = []
    context_index = 0  # Initialize context_index here

    for author_id, group in tqdm(df.groupby('author'), desc='Processing authors', dynamic_ncols=True):
        print(f"Processing author ID: {
              author_id}, Number of texts: {len(group)}")

        author_texts = group.drop(columns='author')

        if len(author_texts) > virtual_text_limit:
            sampled_indices = sample(
                range(len(author_texts)), virtual_text_limit)
            author_texts_sampled = author_texts.iloc[sampled_indices]
        else:
            author_texts_sampled = author_texts

        pairs, context_embeddings = generate_true_pairs_for_author(
            author_texts_sampled, author_id, context_index)

        for pair, embedding in zip(pairs[:max_pairs_limit], context_embeddings[:max_pairs_limit]):
            all_true_pairs.append([context_index, pair[1]])
            all_context_embeddings_with_index.append(
                [context_index] + embedding)
            context_index += 1  # Increment for each pair

        print(len(all_true_pairs))

    return all_true_pairs, all_context_embeddings_with_index


def generate_true_pairs_for_author(author_texts, author_id, start_context_index, max_context_size=3):
    true_pairs_list = []
    context_embeddings_list = []

    context_index = start_context_index  # Start from the passed context_index

    for context_combination in itertools.combinations(author_texts.index, max_context_size):
        context_embedding = author_texts.loc[list(
            context_combination)].mean().values.tolist()
        context_embeddings_list.append(context_embedding + [author_id])

        remaining_texts = list(set(author_texts.index) -
                               set(context_combination))
        for check_index in remaining_texts:
            check_embedding = author_texts.loc[check_index].values.tolist(
            ) + [author_id]
            true_pairs_list.append([context_index, check_embedding])
        context_index += 1

    return true_pairs_list, context_embeddings_list


# datasetpath = r"C:\Users\S\Desktop\VerifyMe\datasets\BAWE\Authors_20.csv"
datasetpath = r"C:\Users\S\Desktop\VerifyMe\datasets\Reuters\RAuthors_20.csv"

# Usage with your DataFrame
df = pd.read_csv(datasetpath)

true_pairs_list, context_embeddings_with_index = generate_all_true_pairs(df)

# Convert the lists to DataFrames
true_pairs_df = pd.DataFrame(true_pairs_list, columns=[
                             'context_index', 'positive_embedding'])

# true_pairs_df.to_csv(r"C:\Users\Ger\Desktop\Reuterb4TrueCheck-BAWE.csv", index=False)
true_pairs_df = true_pairs_df.drop('context_index', axis=1)

# Save the true pairs DataFrame to CSV
true_pairs_df.to_csv(r"TRUEcheck.csv", index=False)

# Now create the DataFrame
context_embeddings_df = pd.DataFrame(context_embeddings_with_index, columns=[
                                     'context_index'] + [str(i) for i in range(len(context_embeddings_with_index[0]) - 2)] + ['author'])


context_embeddings_df = context_embeddings_df.drop('context_index', axis=1)

context_embeddings_df.to_csv(r"Cleaned_Context.csv", index=False)

# Load the DataFrames
context_embeddings_df = pd.read_csv(r"Cleaned_Context.csv")
true_pairs_df = pd.read_csv(r"TRUEcheck.csv")

# Function to remove the last element from the embedding list


def remove_last_item_from_embedding(embedding_str):
    embedding_list = ast.literal_eval(embedding_str)  # Convert string to list
    modified_embedding_list = embedding_list[:-1]  # Remove the last element
    return str(modified_embedding_list)  # Convert back to string


# Apply the function to the 'positive_embedding' column
true_pairs_df['positive_embedding'] = true_pairs_df['positive_embedding'].apply(
    remove_last_item_from_embedding)

# Drop the 'author' column from the context embeddings DataFrame
context_embeddings_df = context_embeddings_df.drop('author', axis=1)


# Convert context embeddings to the desired format
context_embeddings_df['anchor_embedding'] = context_embeddings_df.apply(
    lambda row: str(row.tolist()), axis=1)

# Keep only the 'positive_embedding' column from true_pairs_df
true_pairs_df = true_pairs_df[['positive_embedding']]

# Combine the two DataFrames
combined_df = pd.concat(
    [context_embeddings_df['anchor_embedding'], true_pairs_df], axis=1)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv(r"TRUE-Pairs.csv", index=False)


context_embeddings_df = pd.read_csv(r"Cleaned_Context.csv")

data = pd.read_csv(datasetpath)

# Initialize the false pairs list of dictionaries
false_pairs_list = []

# Generate false pairs
for index, context_row in tqdm(context_embeddings_df.iterrows(), total=context_embeddings_df.shape[0], desc='Generating false pairs', dynamic_ncols=True):
    # Get the context embedding from the row
    # Assuming the last column is 'author'
    context_embed = context_row[:-1].tolist()
    author = context_row['author']

    # Filter the original dataset to exclude the current author
    other_authors_data = data[data['author'] != author]

    # Randomly select a check embedding from a different author
    negative_embed = other_authors_data.sample(
        # Assuming the last column is 'author'
        1).iloc[:, :-1].values.flatten().tolist()

    # Add the pair to the false pairs list
    false_pairs_dict = {'anchor_embedding': context_embed,
                        'negative_embedding': negative_embed}
    false_pairs_list.append(false_pairs_dict)


# Convert the list of dictionaries to a DataFrame
false_pairs_df = pd.DataFrame(false_pairs_list)

# Save the false pairs to a CSV file
false_pairs_df.to_csv(r"FALSE-Pairs.csv", index=False)

# Load the true and false pairs
true_pairs = pd.read_csv(r"True-Pairs.csv")
false_pairs = pd.read_csv(r"False-Pairs.csv")

print(true_pairs.columns)
print(false_pairs.columns)

print(true_pairs["anchor_embedding"].head())
print(false_pairs["anchor_embedding"].head())

triplets = pd.concat([true_pairs, false_pairs["negative_embedding"]], axis=1)
print("\n")
print("The triplet has the following columns: " + str(triplets.columns))
print("\n")
print(triplets)

# Shuffle the rows
triplets_shuffled = triplets.sample(frac=1).reset_index(drop=True)

# Save the final training file
triplets_shuffled.to_csv(r"30_3_RFinal-Triplets_Test.csv", index=False)

print("Triplet dataset is saved!")


# surely we can make multiples more training data now, and there wont really be the issue of class imbalances
# 50:50 balance no longer matters

Processing authors:   0%|                                                                       | 0/10 [00:00<…

Processing author ID: JimGilchrist, Number of texts: 100
4060
Processing author ID: JohnMastrini, Number of texts: 100
8120
Processing author ID: KevinDrawbaugh, Number of texts: 100
12180
Processing author ID: MarcelMichelson, Number of texts: 100
16240
Processing author ID: MarkBendeich, Number of texts: 100
20300
Processing author ID: MartinWolk, Number of texts: 100
24360
Processing author ID: MatthewBunce, Number of texts: 100
28420
Processing author ID: SarahDavison, Number of texts: 100
32480
Processing author ID: TanEeLyn, Number of texts: 100
36540
Processing author ID: ToddNissen, Number of texts: 100
40600


Generating false pairs:   0%|                                                                | 0/40600 [00:00<…

Index(['anchor_embedding', 'positive_embedding'], dtype='object')
Index(['anchor_embedding', 'negative_embedding'], dtype='object')
0    [0.6313577898690252, 0.5812030501144747, 0.469...
1    [0.6585505094633697, 0.6110594246712918, 0.582...
2    [0.6639580080082189, 0.6174643094158528, 0.582...
3    [0.6231171525545147, 0.5814809733239562, 0.480...
4    [0.6432079573471893, 0.5938726540462341, 0.516...
Name: anchor_embedding, dtype: object
0    [0.6313577898690252, 0.5812030501144747, 0.469...
1    [0.6585505094633697, 0.6110594246712918, 0.582...
2    [0.6639580080082189, 0.6174643094158528, 0.582...
3    [0.6231171525545147, 0.5814809733239562, 0.480...
4    [0.6432079573471893, 0.5938726540462341, 0.516...
Name: anchor_embedding, dtype: object


The triplet has the following columns: Index(['anchor_embedding', 'positive_embedding', 'negative_embedding'], dtype='object')


                                        anchor_embedding  \
0      [0.6313577898690252, 0.5812030501144747, 0.4

In [15]:
import pandas as pd
import ast

# Function to safely convert stringified lists back to lists and round the floats
def str_to_list_and_round(embedding_str, precision=5):
    embedding_list = ast.literal_eval(embedding_str)
    # Round each float in the list to the specified precision
    rounded_embedding = [round(float(num), precision) for num in embedding_list]
    return rounded_embedding

# Specify the precision for rounding
precision = 3

# Load datasets
train_df = pd.read_csv(r"C:\Users\S\Desktop\VerifyMe\Home\10_3_RFinal-Triplets_Test.csv")
test_df = pd.read_csv(r"C:\Users\S\Desktop\VerifyMe\Home\10_3_RFinal-Triplets_Train.csv")

# Convert embeddings from string to list and round
train_df['positive_embedding'] = train_df['positive_embedding'].apply(lambda x: str_to_list_and_round(x, precision))
train_df['negative_embedding'] = train_df['negative_embedding'].apply(lambda x: str_to_list_and_round(x, precision))

test_df['positive_embedding'] = test_df['positive_embedding'].apply(lambda x: str_to_list_and_round(x, precision))
test_df['negative_embedding'] = test_df['negative_embedding'].apply(lambda x: str_to_list_and_round(x, precision))

# Flatten the embeddings into lists
train_pos_embeddings = list(train_df['positive_embedding'])
train_neg_embeddings = list(train_df['negative_embedding'])

test_pos_embeddings = list(test_df['positive_embedding'])
test_neg_embeddings = list(test_df['negative_embedding'])

# Convert lists of embeddings to sets of tuple for comparison
train_pos_set = set(tuple(embedding) for embedding in train_pos_embeddings)
train_neg_set = set(tuple(embedding) for embedding in train_neg_embeddings)

test_pos_set = set(tuple(embedding) for embedding in test_pos_embeddings)
test_neg_set = set(tuple(embedding) for embedding in test_neg_embeddings)


# Check for overlaps
overlap_pos = train_pos_set.intersection(test_pos_set)
overlap_neg = train_neg_set.intersection(test_neg_set)

neg_ops = train_neg_set.intersection(test_pos_set)
# Print results
print(f"Found {len(overlap_pos)} overlapping positive embeddings between training and testing sets.")
print(f"Found {len(overlap_neg)} overlapping negative embeddings between training and testing sets.")

if overlap_pos:
    print("Overlapping positive embeddings detected.")
if overlap_neg:
    print("Overlapping negative embeddings detected.")
print(len(neg_ops))


Found 0 overlapping positive embeddings between training and testing sets.
Found 0 overlapping negative embeddings between training and testing sets.
0
