In [1]:
import sys 
sys.path.append("..")

In [2]:
from src.datasets import HateXplainRace

In [3]:
import faiss

In [4]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
shots = 5

In [6]:
embedding = SentenceTransformer('all-mpnet-base-v2')

In [7]:
hate = HateXplainRace("../data/HateXplain/")

In [8]:
train_df, test_df, overall_demographics = hate.create_prompts()

In [9]:
train_vectors = embedding.encode(train_df['prompts'].tolist(), batch_size = 32, show_progress_bar=True)
test_vectors = embedding.encode(test_df['prompts'].tolist(), batch_size = 32, show_progress_bar=True)

Batches: 100%|██████████| 168/168 [00:04<00:00, 35.29it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 41.73it/s]


In [10]:
faiss.normalize_L2(train_vectors)

faiss.normalize_L2(test_vectors)

In [11]:
vector_dim = train_vectors.shape[1]

index = faiss.IndexFlatIP(vector_dim)

index.add(train_vectors)

distances, neighbors = index.search(test_vectors, shots)

In [12]:
#Within
pre_computed_inclusions = dict()

for demographic in set(hate.demographics):
    pre_computed_inclusions[demographic] = train_df[train_df.filtered_demographics == demographic]

within_samples = []
for row in test_df.itertuples():
    filtered_df = pre_computed_inclusions[row.filtered_demographics]

    within_sample = filtered_df["prompts"].sample(n=shots).index.tolist()

    within_samples.append(within_sample)

In [13]:
#Excluding
pre_computed_exclusions = dict()

for demographic in set(hate.demographics):
    pre_computed_exclusions[demographic] = train_df[train_df.filtered_demographics != demographic]

excluding_samples = []
for row in test_df.itertuples():
    filtered_df = pre_computed_exclusions[row.filtered_demographics]

    excluding_sample = filtered_df["prompts"].sample(n=shots).index.tolist()

    excluding_samples.append(excluding_sample)

In [14]:
from sklearn.cluster import KMeans

In [15]:
kmeans2 = KMeans(shots)

In [16]:
kmeans2.fit(train_vectors)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
kmeans2_distances, kmeans2_neighbors = index.search(kmeans2.cluster_centers_, 1)
kmeans2_neighbors = [element for sublist in kmeans2_neighbors for element in sublist]

In [18]:
diversity_sample2 = train_df["prompts"].iloc[kmeans2_neighbors].index.tolist()

diversity_samples2 = [diversity_sample2 for x in range(len(test_df))]

In [19]:
#Diversity
kmeans = faiss.Kmeans(vector_dim, shots, verbose=True, niter=1000)
kmeans.train(train_vectors)
kmeans_distances, kmeans_neighbors = index.search(kmeans.centroids, 1)
kmeans_neighbors = [element for sublist in kmeans_neighbors for element in sublist]

diversity_sample = train_df["prompts"].iloc[kmeans_neighbors].index.tolist()

diversity_samples = [diversity_sample for x in range(len(test_df))]

Sampling a subset of 1280 / 5376 for training
Clustering 1280 points in 768D to 5 clusters, redo 1 times, 1000 iterations
  Preprocessing in 0.01 s
  Iteration 999 (0.38 s, search 0.23 s): objective=294.327 imbalance=1.043 nsplit=0       


In [20]:
train_df["prompts"].iloc[kmeans_neighbors].tolist()

['Post: america always was and always will be a white mans land all the jews and mud must leave or be forced out\nQuestion: Does this post contain offensive language?\nAnswer: yes',
 'Post: this is what happens when niggers gain power there is not one street one neighborhood one city one state or one country that is better off once the nigger infects it\nQuestion: Does this post contain offensive language?\nAnswer: yes',
 'Post: ffs people do unto others does not let foreigners and niggers come to your country and screw your daughters\nQuestion: Does this post contain offensive language?\nAnswer: yes',
 'Post: yet another life saving development coming out of what muslim terrorists and arab land thieves claim is jewish occupied palestine\nQuestion: Does this post contain offensive language?\nAnswer: yes',
 'Post: <user> <user> <user> nah boys nah im out niggers\nQuestion: Does this post contain offensive language?\nAnswer: yes']

In [21]:
#Random
random_samples = []
for row in test_df.itertuples():
    random_sample = train_df["prompts"].sample(n=shots).index.tolist()

    random_samples.append(random_sample)

In [22]:
import numpy as np

In [23]:
average_cosine_differences_between_within = []
average_cosine_differences_between_sim = []
average_cosine_difference_between_random = []
average_cosine_difference_between_diversity = []
average_cosine_difference_between_excluding = []

for i in range(len(within_samples)):

    within_vectors = train_vectors[within_samples[i]]
    sim_vectors = train_vectors[neighbors[i]]
    random_vectors = train_vectors[random_samples[i]]
    excluding_vectors = train_vectors[excluding_samples[i]]
    diversity_vectors = train_vectors[diversity_samples2[i]]


    average_cosine_differences_between_within.append(((within_vectors @ within_vectors.T)- np.identity(shots)).mean())
    average_cosine_differences_between_sim.append(((sim_vectors @ sim_vectors.T)- np.identity(shots)).mean())
    average_cosine_difference_between_random.append(((random_vectors @ random_vectors.T)- np.identity(shots)).mean())
    average_cosine_difference_between_diversity.append(((diversity_vectors @ diversity_vectors.T)- np.identity(shots)).mean())
    average_cosine_difference_between_excluding.append(((excluding_vectors @ excluding_vectors.T)- np.identity(shots)).mean())


ValueError: operands could not be broadcast together with shapes (10,10) (5,5) 

In [None]:
np.array(average_cosine_differences_between_within).mean()

In [None]:
np.array(average_cosine_differences_between_sim).mean()

In [None]:
np.array(average_cosine_difference_between_random).mean()

In [None]:
np.array(average_cosine_difference_between_diversity).mean()

In [None]:
np.array(average_cosine_difference_between_excluding).mean()