In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import json
from tqdm import tqdm
import os
import numpy as np 
import pickle
import torch

from sentence_transformers import SentenceTransformer, util

device = "cuda" if torch.cuda.is_available() else "cpu"

sts_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

def get_similarity(sentence_1, sentence_2):
    """
    Given two sentences, return a cosine similarity score.
    """
    embeddings_1 = sts_model.encode(sentence_1, convert_to_tensor=True, device=device)
    embeddings_2 = sts_model.encode(sentence_2, convert_to_tensor=True, device=device)

    cosine_scores = util.cos_sim(embeddings_1, embeddings_2)
    similarity = cosine_scores[0][0].cpu().item()  
    return similarity




In [4]:
directory = './data'
pkl_files = [file for file in os.listdir(directory) if file.endswith('.pkl')]
len(pkl_files)

229

In [13]:
all_scores = {}
for file in tqdm(pkl_files):
    with open(f'{directory}/{file}', 'rb') as f:
        data = pickle.load(f)
    data_id = file.split('.')[0]
    print(data)
    print("\n")
    sum = 0 
    count =0
    scores = []
    
    for n in range(1,100):
        for a in range(0,n):
            sum+= get_similarity(data[a].replace('\n' , ''), data[n].replace('\n' , ''))
            count+=1
        avg = sum/count
        print(count)
        scores.append(avg)
    all_scores[data_id] = scores
    
print(all_scores)
        
output_file = "all_scores.json"
with open(output_file, 'w') as f:
    json.dump(data, f, indent=4)

    
            
    

  0%|          | 0/2 [00:00<?, ?it/s]

['\n\nAmerican Indians owned the most horses of course because they grew up on them.', '\n\nAmerican Indians owned the strongest horses of course because they grew up on them as skilled horseback riders.', '\n\nAmerican Indians owned the most horses of course because they grew up on them.', '\n\nAmerican Indians owned the fastest horses of course because they grew up on them and were skilled riders.', '\n\nAmerican Indians owned the majestic horses of course because they grew up on them for transportation and hunting.', '\n\nAmerican Indians owned the first horses of course because they grew up on them.', '\n\nAmerican Indians owned the most horses of course because they grew up on them.', '\n\nAmerican Indians owned the most horses of course because they grew up on them since childhood.', '\n\nAmerican Indians owned the fastest horses of course because they grew up on them.', '\n\nAmerican Indians owned the fastest horses of course because they grew up on them.', '\n\nAmerican Indians

 50%|█████     | 1/2 [01:57<01:57, 117.48s/it]

4950
['\n\nThe nerdy John Doe software developer was hard at work coding a new app.', '\n\nThe nerdy Matt software developer was hard at work coding.', '\n\nThe nerdy Johnny Depp software developer was hard at work programming new video games.', '\n\nThe nerdy software developer was hard at work.', '\n\nThe nerdy AI software developer was hard at work coding a new project.', '\n\nThe nerdy software developer was hard at work.', '\n\nThe nerdy software developer was hard at work coding.', '\n\nThe nerdy software developer was hard at work coding.', '\n\nThe nerdy Cody software developer was hard at work programming.', '\n\nThe nerdy Alex software developer was hard at work coding.', '\n\nThe nerdy CodeFluent software developer was hard at work coding a new feature.', '\n\nThe nerdy PHP software developer was hard at work coding.', '\n\nThe nerdy programmer was hard at work coding.', '\n\nThe nerdy computer software developer was hard at work coding.', '\n\nThe nerdy Lilia software devel

100%|██████████| 2/2 [03:48<00:00, 114.38s/it]

4950
{'133': [0.9108684659004211, 0.9405789573987325, 0.9072574377059937, 0.9000824809074401, 0.8985210021336874, 0.9104815749895006, 0.9189498467104775, 0.9138689703411527, 0.9126215577125549, 0.9133937673135237, 0.912258014534459, 0.9102118366803879, 0.9115995450334234, 0.9083438106945583, 0.9103716904918353, 0.9118978398687699, 0.9102653455890082, 0.9135080875709043, 0.9114697023441917, 0.9140731757595426, 0.912660786599824, 0.912893068177898, 0.9110071717397027, 0.9115409606695175, 0.9061370889957134, 0.9083573074422331, 0.9091889807786891, 0.9109616505688635, 0.910057935358464, 0.9088250049980738, 0.9082750982094195, 0.9067227119071917, 0.9044959078710559, 0.9005926719232767, 0.8996572678997403, 0.9010034625057701, 0.8997555549758597, 0.8993705926958205, 0.9002486057770558, 0.9018969612150658, 0.8999422829464061, 0.8994563674741938, 0.8982499043986862, 0.8970919006400638, 0.8979985651762589, 0.8982427436151954, 0.8979022370796677, 0.8990017987635671, 0.8993315486518704, 0.90071005




In [15]:
final_scores = []
for  i in range(99):
    sum_n = 0
    count =0
    for key in all_scores.keys():
        sum_n+= all_scores[key][i]
        count+=1
    avg = sum_n/count
    final_scores.append(avg)
    
print(final_scores)

final = { "scores" : final_scores}

output_file2 = "final_scores.json"
with open(output_file2, 'w') as f:
    json.dump(final, f, indent=4)



[0.8599053919315338, 0.8493640621503195, 0.847571353117625, 0.8392614990472793, 0.8514159719149272, 0.8683942769254958, 0.8816218684826578, 0.8761226642462943, 0.8743575043148465, 0.8733745146881451, 0.8670392795042559, 0.8687437340999261, 0.873663665501626, 0.8667183512733097, 0.8715131079157193, 0.8718500921831411, 0.8677857101353166, 0.8726665640783589, 0.8711627746883192, 0.8733204398836409, 0.8679909213280781, 0.8673255184422368, 0.867542448575082, 0.8671323660016059, 0.8635590401062598, 0.867546293215874, 0.8698920181030949, 0.8716389256451518, 0.8701059257847139, 0.8692724072164105, 0.867685436902027, 0.8657554913537971, 0.866302747777439, 0.8603187938197321, 0.8609314435531223, 0.8607417707955156, 0.860361558699167, 0.8592219839372777, 0.8577089014343726, 0.860126024557323, 0.8599543352367987, 0.8605891682644884, 0.8617205801587275, 0.859298468870346, 0.8604989973243308, 0.8600805673910223, 0.8588656001585595, 0.8583328277126057, 0.8597710657849604, 0.8589979805198371, 0.860054