In [3]:
import re
import string
import os
import numpy as np
import pandas as pd
import torch
from torch import clamp
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import glob
from natsort import natsorted
import time

class SemanticSearch:
    """Dense semantic search over a list of texts using a SentenceTransformer model.

    Typical use: construct the object, call ``load_pretrained()``, then call
    ``get_result(corpus, query)`` to obtain the corpus entries most similar to
    the query by cosine similarity of their embeddings.
    """

    def __init__(self, corpus_embeddings_path='C:/Users/Geraldus Wilsen/Documents/ProjectRekomendasiObat/data/corpus_dense_embeddings_sbert.npy', cluster_centroid_embeddings_path='cluster_centroid_data.h5'):
        """Remember the embedding file locations and choose the compute device.

        Args:
            corpus_embeddings_path: ``.npy`` file that ``rank`` loads as the
                pre-computed corpus embeddings when it exists.
            cluster_centroid_embeddings_path: stored on the instance; not read
                by any method of this class.
        """
        self.corpus_embeddings_path = corpus_embeddings_path
        self.cluster_centroid_embeddings_path = cluster_centroid_embeddings_path
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Filled in by load_pretrained(); None lets process() report a missing
        # call with a clear message instead of an AttributeError.
        self.model = None

    def load_pretrained(self, from_pretrained:str="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"):
        """Instantiate the SentenceTransformer named ``from_pretrained`` and move it to ``self.device``."""
        self.model = SentenceTransformer(from_pretrained)
        self.model.to(self.device)

    def process(self, *corpora, batch_size: int = 100, pause_seconds: float = 10):
        """Encode one or more corpora and return the embeddings of the last one.

        A corpus with more than ``batch_size`` texts is encoded slice by slice;
        each slice is checkpointed under ``temp/`` and the merged matrix is
        written to ``corpus_dense_embeddings_sbert.npy`` in the working
        directory. A smaller corpus (for example a single query wrapped in a
        list) is encoded in one call and nothing is written to disk.

        Only the checkpoint files written by this call are merged and removed,
        so files left in ``temp/`` by an interrupted earlier run can no longer
        replace the result of an unrelated call.

        Args:
            *corpora: sequences of texts. When several are given they are
                encoded in order and the embeddings of the last are returned.
            batch_size: maximum number of texts per ``encode`` call for a
                large corpus (default 100, the previously hard-coded value).
            pause_seconds: sleep between consecutive batches (default 10, the
                previously hard-coded value); pass 0 to disable.

        Returns:
            Whatever ``self.model.encode`` returns for a small corpus, or the
            ``np.vstack`` of the per-batch results for a large one.

        Raises:
            ValueError: if no corpus is given or ``batch_size`` is below 1.
            RuntimeError: if ``load_pretrained()`` has not been called.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if not corpora:
            raise ValueError("process() needs at least one corpus to encode")
        if getattr(self, "model", None) is None:
            raise RuntimeError("No model loaded; call load_pretrained() before encoding")

        print('Encoding process using', self.device)
        mean_pooled = None
        for corpus in corpora:
            if len(corpus) > batch_size:
                mean_pooled = self._encode_large_corpus(corpus, batch_size, pause_seconds)
            else:
                mean_pooled = self.model.encode(corpus)
                print("Finish embed query")

        return mean_pooled

    def _encode_large_corpus(self, corpus, batch_size, pause_seconds):
        """Encode ``corpus`` in slices of ``batch_size``, checkpoint each slice, merge and save."""
        # The checkpoint directory is not guaranteed to exist on a fresh checkout.
        os.makedirs('temp', exist_ok=True)

        smaller_batch = [corpus[i:i + batch_size] for i in range(0, len(corpus), batch_size)]
        print(len(corpus), 'corpus separated into', len(smaller_batch), 'smaller batch')

        # Track exactly which files belong to this run instead of globbing temp/.
        written_paths = []
        for i, batch in enumerate(smaller_batch, start=1):
            checkpoint = f'temp/temp_{i}.npy'
            np.save(checkpoint, self.model.encode(batch))
            written_paths.append(checkpoint)
            print(f"Finish embed corpus, batch {i}")
            # Pause only between batches; sleeping after the last one is wasted time.
            if pause_seconds > 0 and i < len(smaller_batch):
                time.sleep(pause_seconds)

        parts = []
        for checkpoint in written_paths:
            print(checkpoint)
            parts.append(np.load(checkpoint))
            print('Success corpus append')
        # Stack once, after all parts are loaded (stacking inside the loop is quadratic).
        merged = np.vstack(parts)

        # NOTE(review): the merged matrix is written to the working directory,
        # not to self.corpus_embeddings_path, so rank() does not pick it up as
        # its cache unless the two locations coincide - confirm this is intended.
        np.save('corpus_dense_embeddings_sbert.npy', merged)

        # Delete checkpoints only after the merged file is safely on disk.
        for checkpoint in written_paths:
            os.remove(checkpoint)
        print(f"Finish embed corpus")
        return merged

    def rank(self, corpus, query):
        """Score every corpus entry against ``query`` by cosine similarity.

        Corpus embeddings are loaded from ``self.corpus_embeddings_path`` when
        that file exists; otherwise they are computed with ``process(corpus)``.

        Returns:
            dict mapping corpus index -> similarity score, ordered from the
            highest score to the lowest.

        Raises:
            ValueError: if the cached embeddings do not have one row per
                corpus entry (the indices would not line up with ``corpus``).
        """
        if os.path.exists(self.corpus_embeddings_path):
            corpus_embeddings = np.load(self.corpus_embeddings_path)
            if len(corpus_embeddings) != len(corpus):
                raise ValueError(
                    f"Cached embeddings at {self.corpus_embeddings_path!r} have "
                    f"{len(corpus_embeddings)} rows but the corpus has {len(corpus)} entries"
                )
        else:
            corpus_embeddings = self.process(corpus)
        query_embeddings = self.process([query])

        # Single query -> take row 0 of the (1, n_corpus) similarity matrix.
        scores = cosine_similarity(query_embeddings, corpus_embeddings)[0]
        ordered = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

        return dict(ordered)

    def get_result(self, corpus, query, n:int=10):
        """Return the ``n`` corpus entries ranked most similar to ``query``, best first."""
        dense_rank = self.rank(corpus, query)
        top_ids = list(dense_rank)[:n]
        return [corpus[idx] for idx in top_ids]

In [None]:
# Build the search helper and load its default SentenceTransformer checkpoint.
model = SemanticSearch()
model.load_pretrained()

# Read the CSV; its 'summary' column supplies the texts to embed.
df = pd.read_csv('data_obat_fix_ordered.csv')
sentences = df['summary'].to_list()

print(len(sentences))
# Embed the whole corpus. With more than 100 texts, process() splits the work
# into batches of 100 and pauses between them, so this cell takes a long time.
final_embedding = model.process(sentences)
print(final_embedding)

10058
Encoding process using cuda:0
10058 corpus separated into 101 smaller batch
Finish embed corpus, batch 1
Finish embed corpus, batch 2
Finish embed corpus, batch 3
Finish embed corpus, batch 4
Finish embed corpus, batch 5
Finish embed corpus, batch 6
Finish embed corpus, batch 7
Finish embed corpus, batch 8
Finish embed corpus, batch 9
Finish embed corpus, batch 10
Finish embed corpus, batch 11
Finish embed corpus, batch 12
Finish embed corpus, batch 13
Finish embed corpus, batch 14
Finish embed corpus, batch 15
Finish embed corpus, batch 16
Finish embed corpus, batch 17
Finish embed corpus, batch 18
Finish embed corpus, batch 19
Finish embed corpus, batch 20
Finish embed corpus, batch 21
Finish embed corpus, batch 22
Finish embed corpus, batch 23
Finish embed corpus, batch 24
Finish embed corpus, batch 25
Finish embed corpus, batch 26
Finish embed corpus, batch 27
Finish embed corpus, batch 28
Finish embed corpus, batch 29
Finish embed corpus, batch 30
Finish embed corpus, batch 

In [8]:
# Load the corpus texts from the 'summary' column of the CSV.
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
df = pd.read_csv('C:/Users/Geraldus Wilsen/Documents/ProjectRekomendasiObat/data/data_obat_fix_ordered.csv')
corpus = df['summary'].to_list()
# Free-text search query (roughly: "pill to reduce itching on the skin").
query = "pil untuk mengurangi gatal di kulit"
model = SemanticSearch()
model.load_pretrained()
# Last expression of the cell: the 10 (default n) corpus entries ranked most similar to the query.
model.get_result(corpus, query)

Encoding process using cpu
Finish embed query


['borobudur kamal pil kamal membantu mengurangi gatal  gatal kulit',
 'kapsida kembang bulan kapsul membantu meringankan gatal  gatal  bisul  koreng jerawat',
 'borobudur kamal kapsul kamal membantu mengurangi gatal  gatal kulit',
 'borobudur darsi pil darsi membantu mengurangi jerawat  bisul  gatal  gatal',
 'sido muncul aluss kapsul aluss mencegah mengobati jerawat  menghilangkan bercak  bercak hitam kulit  menjaga kulit halus bersih',
 'triamcorta krim triamcinolone acetonide mengurangi peradangan gatal disebabkan kelai kulit responsif kortikosteroid efek anti inflamasi  anti alergi  anti pruritus gatal',
 'apolar cream krim desonide mengatasi kondisi kulit mengalami inflamasi akibat penyakit dermatitis kontak atopik',
 'apolar cream krim desonide mengatasi kondisi kulit mengalami inflamasi akibat penyakit dermatitis kontak atopik',
 'rodeca lotion  asam salisilat meringankan gatal disebabkan biang keringat',
 'borobudur darsi kapsul darsi membantu mengurangi jerawat  bisul  gatal  