In [None]:
import os 
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm 

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

from sentence_transformers import SentenceTransformer, util
# Multilingual sentence-embedding model, loaded once at import time and reused
# by summarize_kmeans for every document (loading is the expensive step).
modelHindi = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

In [None]:
def summarize_kmeans(sentences,hindi=False):
    """Build an extractive summary by k-means clustering of sentence embeddings.

    The sentences are embedded with ``modelHindi``, grouped into
    ``max(1, len(sentences) // 4)`` clusters, and for every non-empty cluster
    the sentence whose embedding has the highest cosine similarity to the
    cluster centroid is selected.

    Parameters
    ----------
    sentences : sequence of str
        Sentences of a single document. Must support ``len()`` and integer
        indexing.
    hindi : bool, default False
        Currently unused: the same multilingual encoder is applied regardless
        of its value. Kept so existing callers keep working.

    Returns
    -------
    list of str
        One representative sentence per non-empty cluster, ordered by cluster
        index (not by position in the document). Empty when ``sentences`` is
        empty.
    """
    # KMeans cannot be fitted on zero samples; an empty document has an empty summary.
    if len(sentences) == 0:
        return []

    model = modelHindi
    # Encode the whole document in one batch instead of one call per sentence.
    embeddings = np.asarray(model.encode(list(sentences)))

    k = max(1, len(sentences) // 4)
    # n_init=k is preserved from the original code; note that n_init is the
    # number of centroid re-initialisations, not the number of clusters.
    kmeans = KMeans(n_clusters=k, n_init=k, random_state=0).fit(embeddings)
    cluster_centers = kmeans.cluster_centers_
    cluster_indices = kmeans.predict(embeddings)

    summary_sentences = []
    for i in range(k):
        member_positions = np.flatnonzero(cluster_indices == i)
        # Duplicate sentences can leave a cluster without members; skip it
        # rather than failing on an empty selection.
        if member_positions.size == 0:
            continue
        similarities = cosine_similarity(
            cluster_centers[i].reshape(1, -1), embeddings[member_positions]
        )[0]
        # Highest cosine similarity == closest to the centroid. The previous
        # code used min(), which selected the *least* representative sentence.
        best_position = int(member_positions[int(np.argmax(similarities))])
        summary_sentences.append(sentences[best_position])
    return summary_sentences

In [None]:
input_path="../data/"
output_path="../results/summary/"

# Recursively create the output directory if it doesn't exist. exist_ok=True
# makes this safe to re-run, so no separate os.path.exists/os.mkdir is needed.
os.makedirs(output_path, exist_ok=True)

data = pd.read_csv(os.path.join(input_path, 'data.csv'))

# Each cell of the `sentences` column is evaluated as a Python expression;
# presumably it holds the repr of a list of sentence strings (confirm against
# the code that produced data.csv).
# NOTE(review): eval executes arbitrary code found in the CSV. If the cells are
# plain list literals, ast.literal_eval is the safe drop-in replacement.
sentences = data['sentences'].apply(eval)

# Summarise every document; iterating the Series directly avoids relying on
# the index labels being 0..n-1.
summary = [summarize_kmeans(doc, hindi=True) for doc in tqdm(sentences)]

data['summary'] = summary
data.to_csv(os.path.join(output_path, 'kmeans.csv'), index=False)