In [None]:
import torch
from datasets import Dataset, DatasetDict, IterableDataset, load_dataset,load_from_disk
import hashlib
import os
import tqdm
# Placeholder: path of the file handed to torch.load; fill in before running.
file_path = ""

# NOTE(review): torch.load deserializes via pickle by default, so only point
# it at files from a trusted source.
data = torch.load(file_path)

# Placeholder: dataset name or local path handed to load_dataset.
dataset_path = ""
# NOTE(review): trust_remote_code=True allows the dataset's own loading script
# to run locally — confirm the source is trusted.
train_dataset = load_dataset(dataset_path, split="train", trust_remote_code=True)


In [None]:
# Sanity check: print the freshly loaded train split before any columns are dropped.
print(train_dataset)


In [None]:

# Columns that survive the pruning step; everything else is dropped.
columns_to_keep = ['id', "image", "conversations"]

# Work out the drop list first so the removal call stays readable.
columns_to_drop = [
    name for name in train_dataset.column_names if name not in columns_to_keep
]
train_dataset = train_dataset.remove_columns(columns_to_drop)

print(train_dataset)


In [15]:

# Flatten every mapping in `data` into one list of (key, value) tuples,
# preserving the order of the mappings and of the keys inside each mapping.
data_list = []
for mapping in data:
    data_list.extend((name, mapping[name]) for name in list(mapping.keys()))


In [None]:
# Sanity check: number of flattened (key, value) pairs and the first pair.
print(len(data_list))
print(data_list[0]) 

In [17]:
def generate_text_hash(text: str) -> str:
    """
    Return the SHA-256 digest of ``text`` as a hexadecimal string.

    The text is encoded as UTF-8 before hashing, so equal strings always
    produce the same 64-character identifier.

    Args:
        text (str): Input text.

    Returns:
        str: Hex-encoded SHA-256 digest of the UTF-8 encoded text.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode('utf-8'))
    return hasher.hexdigest()

In [None]:
# Index the train split: hash of the first two conversation turns -> row indices.
# Several rows can share a hash, so every key maps to a list of indices.
dataset_dict = {}
for j, item in tqdm.tqdm(
    enumerate(train_dataset),
    total=len(train_dataset),  # enumerate() has no length, so tqdm needs it explicitly
    desc="Indexing train_dataset",
    unit="item",
):
    turns = item['conversations']
    key = generate_text_hash(turns[0]["value"] + turns[1]["value"])
    dataset_dict.setdefault(key, []).append(j)

formatted_dataset = []
index_set = set()  # row indices that have already been paired with a data_list entry

for key_datalist, value_datalist in tqdm.tqdm(data_list, desc="Processing data_list", unit="item"):
    # Keys without a matching row fall through to the empty default and are skipped.
    for j in dataset_dict.get(key_datalist, ()):
        if j in index_set:
            continue  # this row was consumed by an earlier entry with the same key
        new_item = train_dataset[j].copy()
        new_item["cooccur_score"] = value_datalist
        formatted_dataset.append(new_item)

        index_set.add(j)
        break  # at most one row per data_list entry


In [None]:
# Sanity check: how many rows were matched, and the raw value attached to the first one.
print(len(formatted_dataset))
print(formatted_dataset[0]["cooccur_score"])

In [None]:

# Placeholder: path of the comma-separated "<int key>,<float value>" file; fill in before running.
cosi_file_path = ""
osi_dict = {}
with open(cosi_file_path, "r") as file:
    for line in file:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at end of file) would otherwise
            # break the two-field unpacking below with a ValueError.
            continue
        key, value = line.split(",")
        osi_dict[int(key)] = float(value)

print(osi_dict)

In [None]:
# Keys for which osi_dict holds a value.
set_cosi_key = set(osi_dict.keys())
cosi_coocur_data = {}

# For every record, add up the osi_dict values of the entries in its
# 'cooccur_score' field that have a known key, and store the total under
# "Cooccur_score" (capital C) on the same record.
# NOTE(review): assumes the 'cooccur_score' entries are hashable and compare
# equal to the int keys of osi_dict — confirm against the loaded data.
for record in tqdm.tqdm(formatted_dataset):
    running_total = 0
    for entry in record['cooccur_score']:
        if entry not in set_cosi_key:
            continue
        running_total += osi_dict[entry]
    record["Cooccur_score"] = running_total

In [None]:
# Sanity check: aggregated score stored on the last record.
print(formatted_dataset[-1]["Cooccur_score"])

In [None]:
# Build a Hugging Face Dataset from the record dicts and write it to disk.
# NOTE(review): the target path is an empty-string placeholder — set it before running.
dataset = Dataset.from_list(formatted_dataset)
dataset.save_to_disk("")

In [39]:
# Cast the "l0" field to a plain Python float where it exists.
# No cell in this notebook adds an "l0" key (records carry 'id', "image",
# "conversations", "cooccur_score" and "Cooccur_score"), so an unconditional
# lookup raises KeyError on a fresh Restart & Run All; records without the
# key are therefore left untouched.
for item in formatted_dataset:
    if "l0" in item:
        item["l0"] = float(item["l0"])

In [25]:
# Rank the records from highest to lowest "Cooccur_score". The sort works on a
# shallow copy, so formatted_dataset keeps its original order.
formatted_dataset_sorted = list(formatted_dataset)
formatted_dataset_sorted.sort(key=lambda record: record["Cooccur_score"], reverse=True)

In [None]:
# Sanity check: the list is sorted in descending order, so these are the
# highest and the lowest aggregated scores.
print(formatted_dataset_sorted[0]["Cooccur_score"])
print(formatted_dataset_sorted[-1]["Cooccur_score"])

In [None]:


# Cut the descending-sorted records into four consecutive quarter slices.
num_samples = len(formatted_dataset_sorted)
q1 = int(num_samples * 0.25)
q2 = int(num_samples * 0.5)
q3 = int(num_samples * 0.75)

# The list is sorted from high to low score, so "q0_25" holds the highest-scoring
# quarter and "q75_100" the lowest-scoring one.
split_datasets = {
    "q0_25": formatted_dataset_sorted[:q1],
    "q25_50": formatted_dataset_sorted[q1:q2],
    "q50_75": formatted_dataset_sorted[q2:q3],
    "q75_100": formatted_dataset_sorted[q3:]
}

hf_datasets = {}
for split_name, split_data in tqdm.tqdm(split_datasets.items(), desc="Processing splits", unit="split"):
    # Dataset.from_list performs the row -> column conversion itself and accepts
    # an empty list, whereas indexing split_data[0] raises IndexError for an
    # empty slice (which happens when there are fewer than four records).
    hf_datasets[split_name] = Dataset.from_list(split_data)

    hf_datasets[split_name].save_to_disk(f"./{split_name}_dataset")

# A dedicated loop name keeps the module-level `dataset` variable from being rebound.
for split_name, split_dataset in hf_datasets.items():
    print(f"{split_name} dataset:")
    print(split_dataset)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import make_interp_spline
from collections import Counter

# Histogram of the aggregated scores, drawn as a spline-smoothed curve.
values_list = [float(record["Cooccur_score"]) for record in formatted_dataset_sorted]
num_bins = 10

frequencies, bin_edges = np.histogram(values_list, bins=num_bins)

# Midpoint of every bin, used as the x position of its count.
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# Evaluate an interpolating spline through the bin counts on a dense grid.
# NOTE(review): an interpolating spline can overshoot between points, so the
# curve may dip below zero between sparsely populated bins.
bin_centers_smooth = np.linspace(bin_centers[0], bin_centers[-1], 300)
frequencies_smooth = make_interp_spline(bin_centers, frequencies)(bin_centers_smooth)

plt.figure(figsize=(8, 6))
# The label gives plt.legend() an entry to show; without a labelled artist the
# legend call only emits a warning and draws nothing useful.
plt.plot(bin_centers_smooth, frequencies_smooth, color='orange', lw=2, label='Smoothed frequency')


plt.grid(which='both', linestyle='--', linewidth=0.5, alpha=0.7)
plt.xlabel('cosine similarity score of data ', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.title('The distribution of Compcap data based on cosine similarity score', fontsize=16)
plt.legend(fontsize=12)
plt.show()
