In [1]:
import json
import os, re
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(path, 'r', encoding='utf-8') as reader:
        for line in reader:
            # Blank / whitespace-only lines (e.g. a trailing newline) carry no
            # record; skipping them avoids a spurious JSONDecodeError.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

def save_jsonl(name, data):
    """Write an iterable of JSON-serialisable records as a JSON Lines file.

    Args:
        name: Destination file path; an existing file is overwritten.
        data: Iterable of objects accepted by ``json.dumps``.

    Each record is written on its own line, followed by a newline.
    """
    # Explicit UTF-8 so the writer matches the encoding load_jsonl reads with,
    # independent of the platform's default encoding.
    with open(name, 'w', encoding='utf-8') as outfile:
        for entry in data:
            outfile.write(json.dumps(entry) + '\n')

In [3]:
# Load the sampled NarraSum records (one JSON object per line).
narrasum_dataset = load_jsonl('../data/narrasum/narrasum_sampled_data_with_decomposed_mcs_gpt4.jsonl')

# Map each record's position in the file to its 'decomposed_summary_sentences_gpt4' field.
gold_concepts = {
    idx: d['decomposed_summary_sentences_gpt4']
    for idx, d in enumerate(narrasum_dataset)
}

In [6]:
# Directory that the loading cell below lists with os.listdir and reads .jsonl files from.
generated_concepts_dir = '../MCGenerator/Responses/narrasum/zero_shot/llama3/original_narrative_input/monte-carlo-same-temp/'
# Presumably where evaluation responses are saved (inferred from the name);
# not referenced by any cell visible in this part of the notebook.
eval_response_save_dir = './Responses/narrasum/zero_shot/llama3/original_narrative_input/monte-carlo-same-temp/'

In [7]:
# Only files whose name contains one of these substrings are pooled.
target_prompts = ['help_me_understand', 'five_ws_definition', 'five_finger_definition_2']
prompt_names = os.listdir(generated_concepts_dir)
filtered_prompts = [name for name in prompt_names if any(target in name for target in target_prompts)]

# Maps each record's 'row' value to the concatenation of 'decomposed_mcs'
# from every matching file, in sorted file-name order.
stimuli2mcs = {}
for prompt_name in sorted(filtered_prompts):
    if '.ipynb_checkpoints' in prompt_name:
        continue

    print(prompt_name)

    # os.listdir yields bare entry names, so they can be joined to the
    # directory directly (no manual '/' concatenation or basename split).
    data = load_jsonl(os.path.join(generated_concepts_dir, prompt_name))

    for r in data:
        # Accumulate into a list owned by stimuli2mcs so that the list inside
        # the loaded record is never aliased and mutated by later extends.
        stimuli2mcs.setdefault(r['row'], []).extend(r['decomposed_mcs'])

five_finger_definition_2_temp_0.67_0.jsonl
five_finger_definition_2_temp_0.67_1.jsonl
five_finger_definition_2_temp_0.67_2.jsonl
five_finger_definition_2_temp_0.67_3.jsonl
five_finger_definition_2_temp_0.67_4.jsonl
five_ws_definition_temp_0.67_0.jsonl
five_ws_definition_temp_0.67_1.jsonl
five_ws_definition_temp_0.67_2.jsonl
five_ws_definition_temp_0.67_3.jsonl
five_ws_definition_temp_0.67_4.jsonl
help_me_understand_temp_0.67_0.jsonl
help_me_understand_temp_0.67_1.jsonl
help_me_understand_temp_0.67_2.jsonl
help_me_understand_temp_0.67_3.jsonl
help_me_understand_temp_0.67_4.jsonl


In [10]:
from sentence_transformers import SentenceTransformer
# Sentence encoder; its .encode() is called in the clustering cell below to embed the concept strings.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')


from sklearn.datasets import make_blobs
from pdc_dp_means import DPMeans
import numpy as np

def cosine_similarity(v1, v2):
    """Compute the cosine similarity between two vectors.

    Args:
        v1: 1-D array-like of numbers.
        v2: 1-D array-like of numbers with the same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. If either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of ``nan`` so that
        callers ranking by similarity (e.g. via ``np.argmax``) are not
        derailed by a ``nan`` entry.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator

def find_closest_examples(n_clusters, cluster_centers, embeddings, cluster_predictions, mcs):
    """Pick one representative item per non-empty cluster.

    For every label in ``range(n_clusters)`` that occurs in
    ``cluster_predictions``, the member whose embedding has the highest cosine
    similarity to that cluster's center is selected (first one on ties, per
    ``np.argmax``).

    Args:
        n_clusters: Number of cluster labels to scan, starting from 0.
        cluster_centers: Indexable by label; gives the center vector.
        embeddings: Array indexable by item position; gives the item vector.
        cluster_predictions: Array holding the label assigned to each item.
        mcs: Sequence of items, aligned by position with ``embeddings``.

    Returns:
        Dict mapping each occurring label to the selected entry of ``mcs``.
    """
    representatives = {}

    for label in range(n_clusters):
        # Labels with no assigned items have nothing to choose from.
        if label not in cluster_predictions:
            continue

        member_positions = np.where(cluster_predictions == label)[0]
        center = cluster_centers[label]

        scores = [cosine_similarity(embeddings[pos], center) for pos in member_positions]
        best_position = member_positions[np.argmax(scores)]

        representatives[label] = mcs[best_position]

    return representatives

Couldn't import dot_parser, loading of dot files will not be possible.


  from tqdm.autonotebook import tqdm, trange
  warn("The installed version of bitsandbytes was compiled without GPU support. "


/home/ankitagupta/anaconda3/envs/supcourt/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [None]:
from sklearn.preprocessing import normalize
from tqdm import tqdm
from pdc_dp_means import DPMeans

# DPMeans `delta` values swept for every stimulus.
deltas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4]

# stimulus id -> {delta -> clustering result dict}
mcs_per_stimuli = {}
for stimuli in tqdm(np.arange(100)):

    # Pooled concepts for this stimulus with empty strings removed.
    mcs = [m for m in stimuli2mcs[stimuli] if m!='']

    # Encode, cast to float64, then scale every row to unit L2 norm.
    mc_embeddings = normalize(model.encode(mcs).astype(np.float64), norm='l2')
    assert mc_embeddings.shape[0]==len(mcs), 'error in embedding process'

    mcs_per_delta = {}
    for delta in deltas:
        dpmeans = DPMeans(n_init=1, delta=delta)
        dpmeans.fit(mc_embeddings)

        cluster_predictions = dpmeans.predict(mc_embeddings)
        cluster_centers = dpmeans.cluster_centers_
        n_clusters = dpmeans.n_clusters
        closest_examples = find_closest_examples(
            n_clusters,
            cluster_centers,
            mc_embeddings,
            cluster_predictions,
            mcs,
        )

        # Keep only the representative strings (not their labels) plus the raw fit outputs.
        mcs_per_delta[delta] = dict(
            closest_examples=list(closest_examples.values()),
            n_clusters=n_clusters,
            cluster_predictions=cluster_predictions,
            cluster_centers=cluster_centers,
        )

    mcs_per_stimuli[stimuli] = mcs_per_delta

In [None]:
# Imported in this cell so it runs on a fresh kernel: no other visible cell
# imports pickle (the previous import was commented out, giving a NameError).
import os
import pickle

# Persist the per-stimulus, per-delta clustering results.
clustered_mcs_path = './clustering_results/narrasum/results/clustered_mcs_topthreeprompts_run2.pickle'

# Create the output directory up front so the clustering work is not lost to
# a FileNotFoundError when the directory is missing.
os.makedirs(os.path.dirname(clustered_mcs_path), exist_ok=True)

with open(clustered_mcs_path, 'wb') as f:
    pickle.dump(mcs_per_stimuli, f)