In [1]:
import json
import os, re
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as reader:
        for line in reader:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no record;
            # json.loads would raise on them, so skip instead of failing the load.
            if not line:
                continue
            records.append(json.loads(line))
    return records

def save_jsonl(name, data):
    """Write every item of `data` to the file `name`, one JSON document per line.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(name, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data)

In [3]:
def get_main_concepts(stimuli_name: str) -> dict:
    """Load the gold main-concept JSON for one stimulus.

    The file is looked up at ``../data/BATS/goldMCs/<stimuli_name>.json``,
    relative to the current working directory, and its parsed content is
    returned unchanged.
    """
    gold_path = '../data/BATS/goldMCs/' + stimuli_name + '.json'
    with open(gold_path, 'r') as handle:
        return json.load(handle)

In [4]:
# Directory whose files are read with load_jsonl further down to collect the
# generated main concepts (each record supplies 'Stimuli' and 'decomposed_mcs').
generated_concepts_dir = '../MCGenerator/Responses/zero_shot/llama3/original_narrative_input/monte-carlo-same-temp/'
# Not referenced in the cells visible here; presumably the directory where
# evaluation responses get written -- TODO confirm against the later cells.
eval_response_save_dir = './Responses/zero_shot/gpt4o/original_narrative_input/monte-carlo-same-temp-decomposed-mcs/'

In [5]:
# Two groups of stimulus names; the combined list (sd first, then vs) fixes the
# order in which stimuli are processed.
sd_stimuli = ['AuntMother', 'Ferguson', 'Sept11', 'NoHandbook']
vs_stimuli = ['MarcusYam', 'SylviaEarle', 'NaomiDeLaRosa', 'RobinSteinberg']
list_of_stimuli = [*sd_stimuli, *vs_stimuli]

# Per-stimulus integer; presumably the number of gold main concepts -- TODO confirm.
stimuli2nummc = {
    'AuntMother': 7,
    'Ferguson': 10,
    'Sept11': 12,
    'NoHandbook': 11,
    'MarcusYam': 11,
    'SylviaEarle': 8,
    'NaomiDeLaRosa': 8,
    'RobinSteinberg': 7,
}

In [6]:
# Pool the decomposed main concepts of every generated-response file, keyed by stimulus.
stimuli2mcs = {}
# os.listdir returns entries in arbitrary order; sorting keeps the order of the
# pooled concepts (and anything computed from it) stable across runs and machines.
for prompt_name in sorted(os.listdir(generated_concepts_dir)):
    if '.ipynb_checkpoints' in prompt_name:
        continue  # Jupyter autosave folder, not a response file
    data = load_jsonl(os.path.join(generated_concepts_dir, prompt_name))
    for r in data:
        # Extend a list owned by stimuli2mcs so the loaded records are never
        # aliased or mutated.
        stimuli2mcs.setdefault(r['Stimuli'], []).extend(r['decomposed_mcs'])

### Cluster main concepts (MCs)

In [7]:
# Sentence-embedding model; `model.encode` is called further down to embed the
# main concepts before they are clustered.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

Couldn't import dot_parser, loading of dot files will not be possible.


  from tqdm.autonotebook import tqdm, trange
  warn("The installed version of bitsandbytes was compiled without GPU support. "


/home/ankitagupta/anaconda3/envs/supcourt/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [8]:
from sklearn.datasets import make_blobs
from pdc_dp_means import DPMeans
import numpy as np

def cosine_similarity(v1, v2):
    """Compute the cosine similarity between two vectors.

    Args:
        v1, v2: 1-D array-likes of equal length.

    Returns:
        dot(v1, v2) / (||v1|| * ||v2||), or 0.0 when either vector has zero
        norm (the similarity is undefined there; returning 0.0 avoids the
        NaN and RuntimeWarning a 0/0 division would produce).
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom

def find_closest_examples(n_clusters, cluster_centers, embeddings, cluster_predictions, mcs):
    """Pick, for every non-empty cluster, the item closest to the cluster center.

    Closeness is cosine similarity between an item's embedding and the center
    of the cluster it was assigned to.

    Args:
        n_clusters: Number of cluster ids to consider (ids 0 .. n_clusters-1).
        cluster_centers: Array-like indexed by cluster id, one center per row.
        embeddings: Array-like with one embedding per row, aligned with `mcs`.
        cluster_predictions: Array-like with the cluster id of every row of
            `embeddings`.
        mcs: Sequence of items aligned with `embeddings`.

    Returns:
        dict mapping cluster id -> the element of `mcs` whose embedding has the
        highest cosine similarity to that cluster's center. Cluster ids with no
        assigned rows are omitted.
    """
    # Accept plain lists as well as arrays; `list == int` would otherwise be a
    # scalar False and every cluster would look empty.
    embeddings = np.asarray(embeddings)
    cluster_centers = np.asarray(cluster_centers)
    cluster_predictions = np.asarray(cluster_predictions)

    closest_examples = {}
    for cluster in range(n_clusters):
        cluster_indices = np.flatnonzero(cluster_predictions == cluster)
        if cluster_indices.size == 0:
            continue

        center = cluster_centers[cluster]
        members = embeddings[cluster_indices]

        # Cosine similarity of every member to the center in one shot.
        dots = members @ center
        denom = np.linalg.norm(members, axis=1) * np.linalg.norm(center)
        # Zero-norm vectors get similarity 0 instead of NaN; a NaN would be
        # treated as the maximum by argmax and win the selection.
        similarities = np.divide(
            dots, denom, out=np.zeros_like(dots, dtype=float), where=denom > 0
        )

        closest_index_in_cluster = int(np.argmax(similarities))
        closest_examples[cluster] = mcs[cluster_indices[closest_index_in_cluster]]

    return closest_examples

In [9]:
from sklearn.preprocessing import normalize
from tqdm import tqdm
from pdc_dp_means import DPMeans
# Values of the `delta` argument passed to DPMeans; one clustering is fitted
# per (stimulus, delta) pair.
deltas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 5, 10, 15, 20]

mcs_per_stimuli = {}
for stimuli in tqdm(list_of_stimuli):
    # Drop empty strings, then embed, cast to float64 and L2-normalise each row.
    mcs = [m for m in stimuli2mcs[stimuli] if m != '']
    mc_embeddings = normalize(model.encode(mcs).astype(np.float64), norm='l2')
    assert mc_embeddings.shape[0]==len(mcs), 'error in embedding process'

    mcs_per_delta = {}
    for delta in deltas:
        dpmeans = DPMeans(n_init=1, delta=delta)
        dpmeans.fit(mc_embeddings)

        cluster_predictions = dpmeans.predict(mc_embeddings)
        cluster_centers = dpmeans.cluster_centers_
        n_clusters = dpmeans.n_clusters
        # One representative concept per non-empty cluster.
        closest_examples = find_closest_examples(
            n_clusters, cluster_centers, mc_embeddings, cluster_predictions, mcs
        )

        mcs_per_delta[delta] = {
            'closest_examples': list(closest_examples.values()),
            'n_clusters': n_clusters,
            'cluster_predictions': cluster_predictions,
            'cluster_centers': cluster_centers,
        }

    mcs_per_stimuli[stimuli] = mcs_per_delta

  0%|                                                                                        | 0/8 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|████████████████████████████████████████████████████████████████████████████████| 8/8 [02:05<00:00, 15.67s/it]


In [None]:
import pickle

# On-disk copy of the clustering results. When the file exists it is loaded and
# replaces whatever `mcs_per_stimuli` currently holds; when it does not exist
# yet, the results computed above are written there. Delete the file to force a
# fresh save.
clustering_cache_path = './clustering_results/bats/deduplicated_mcs_clustering_run2.pickle'

if os.path.exists(clustering_cache_path):
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load pickles produced by this notebook or another trusted source.
    with open(clustering_cache_path, 'rb') as f:
        mcs_per_stimuli = pickle.load(f)
else:
    os.makedirs(os.path.dirname(clustering_cache_path), exist_ok=True)
    with open(clustering_cache_path, 'wb') as f:
        pickle.dump(mcs_per_stimuli, f)