Sebelum menjalankan kode ini:
1. Buat folder `temp` dan `small_batch`.
2. Masukkan file `data_obat_fix.csv`.

In [2]:
import re
import string
import os
import numpy as np
import pandas as pd
import torch
from torch import clamp
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import glob
from natsort import natsorted
import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class SemanticSearch:
    """Dense semantic search built on mean-pooled transformer embeddings.

    Call ``load_pretrained()`` once, then either ``process(...)`` to embed one
    or more corpora, or ``get_result(corpus, query)`` to fetch the corpus
    entries whose embeddings are most cosine-similar to the query embedding.
    """

    def __init__(self, corpus_embeddings_path='C:/Users/Geraldus Wilsen/Documents/ProjectRekomendasiObat/data/corpus_dense_embeddings_indobert.npy'):
        """Remember where cached corpus embeddings live and pick the device.

        Args:
            corpus_embeddings_path: ``.npy`` file that ``rank`` loads corpus
                embeddings from when it exists.
        """
        # NOTE(review): the default is an absolute, machine-specific path;
        # pass an explicit path when running on another machine.
        self.corpus_embeddings_path = corpus_embeddings_path
        # Use the first CUDA device when available, otherwise the CPU.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def load_pretrained(self, from_pretrained:str="indobenchmark/indobert-base-p1"):
        """Load the tokenizer and model and move the model to ``self.device``.

        Args:
            from_pretrained: model name or path handed to
                ``AutoTokenizer.from_pretrained`` / ``AutoModel.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained)
        self.model = AutoModel.from_pretrained(from_pretrained)
        self.model.to(self.device)

    def encode(self, corpus, max_length):
        """Embed sentences as the attention-masked mean of their token vectors.

        Args:
            corpus: non-empty sequence of sentences.
            max_length: token length every sentence is truncated/padded to.

        Returns:
            numpy array with one row per sentence.
        """
        # One batched tokenizer call replaces the per-sentence encode_plus
        # loop; padding='max_length' makes every row the same width either way.
        encoded = self.tokenizer(list(corpus), max_length=max_length, truncation=True,
                                 padding='max_length', return_tensors='pt')
        input_ids = encoded['input_ids'].to(self.device)
        attention_mask = encoded['attention_mask'].to(self.device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        embeddings = outputs.last_hidden_state

        # Broadcast the mask over the hidden dimension so padded positions
        # contribute nothing to the sum.
        mask = attention_mask.unsqueeze(-1).expand(embeddings.shape).float()
        summed = torch.sum(embeddings * mask, 1)
        # Clamp guards against division by zero for an all-padding row.
        counts = torch.clamp(mask.sum(1), min=1e-9)

        return (summed / counts).detach().cpu().numpy()

    def _encode_in_batches(self, corpus, max_length, batch_size, batch_pause):
        """Encode ``corpus`` chunk by chunk, checkpointing each chunk in ``temp/``."""
        os.makedirs('temp', exist_ok=True)
        batches = [corpus[i:i + batch_size] for i in range(0, len(corpus), batch_size)]
        print(len(corpus), 'in corpus with', max_length, ' max length word separated into', len(batches),
              'smaller batch')

        # Track exactly the files written in this call, so leftovers from an
        # earlier or crashed run sitting in temp/ are never merged in.
        chunk_paths = []
        for i, batch in enumerate(batches, start=1):
            path = f'temp/temp_{i}_{max_length}.npy'
            np.save(path, self.encode(batch, max_length))
            chunk_paths.append(path)
            print(f"Finish embed corpus with {max_length} max length word, batch {i}")
            time.sleep(batch_pause)

        chunks = []
        for path in chunk_paths:
            chunks.append(np.load(path))
            os.remove(path)
        # Stack once after the loop instead of re-stacking on every iteration.
        return np.vstack(chunks)

    def process(self, *corpora, batch_size=50, batch_pause=30, corpus_pause=60):
        """Embed up to four corpora and return their stacked embeddings.

        The corpora are paired positionally with max token lengths 16, 32, 64
        and 128. A corpus longer than ``batch_size`` is encoded in chunks and
        its result is saved to ``small_batch/corpus_dense_embeddings_<len>.npy``;
        a shorter one is encoded directly. When more than one corpus is given,
        all embeddings are stacked in argument order and also saved to
        ``corpus_dense_embeddings_indobert.npy`` in the working directory.

        Args:
            *corpora: one to four sequences of sentences; empty ones are skipped.
            batch_size: largest number of sentences encoded in one model call.
            batch_pause: seconds to sleep after each chunk.
            corpus_pause: seconds to sleep after each chunked corpus.

        Returns:
            numpy array with one row per sentence, in argument order.

        Raises:
            ValueError: if ``batch_size`` is below 1, more than four corpora
                are passed, or there is no sentence to encode.
        """
        max_lengths = [16, 32, 64, 128]
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if len(corpora) > len(max_lengths):
            # zip() would silently drop the extra corpora.
            raise ValueError(f"process() accepts at most {len(max_lengths)} corpora, got {len(corpora)}")

        print('Encoding process using', self.device)
        per_corpus = []
        for max_length, corpus in zip(max_lengths, corpora):
            print(f"Starting to embed corpus with {max_length} max length word")
            if len(corpus) == 0:
                continue

            if len(corpus) > batch_size:
                mean_pooled = self._encode_in_batches(corpus, max_length, batch_size, batch_pause)
                os.makedirs('small_batch', exist_ok=True)
                np.save(f'small_batch/corpus_dense_embeddings_{max_length}.npy', mean_pooled)
                print(f"Finish embed corpus with {max_length} length of word")
                time.sleep(corpus_pause)
            else:
                # Short inputs (e.g. a single query) are not written to
                # small_batch/, so they cannot overwrite a saved corpus file.
                mean_pooled = self.encode(corpus, max_length)
                print("Finish embed query")

            # Keep every corpus in memory so that short corpora are part of
            # the merged result too, and stale small_batch/ files are not.
            per_corpus.append(mean_pooled)

        if not per_corpus:
            raise ValueError("process() needs at least one non-empty corpus")

        if len(corpora) == 1:
            return per_corpus[0]

        merged = np.vstack(per_corpus)
        # NOTE(review): this is written to the working directory, while rank()
        # reads from self.corpus_embeddings_path; move the file if they differ.
        np.save('corpus_dense_embeddings_indobert.npy', merged)
        return merged

    def rank(self, corpus, query):
        """Score every corpus row against ``query`` by cosine similarity.

        Args:
            corpus: sequence of sentences; only encoded when the cached
                embeddings file does not exist.
            query: the query sentence.

        Returns:
            dict mapping corpus row index to similarity score, ordered from
            the highest score to the lowest.
        """
        if os.path.exists(self.corpus_embeddings_path):
            corpus_embeddings = np.load(self.corpus_embeddings_path)
        else:
            # The whole corpus is passed as the first positional corpus, so
            # process() encodes it with a max length of 16 tokens.
            corpus_embeddings = self.process(corpus)
        query_embeddings = self.process([query])

        scores = cosine_similarity(query_embeddings, corpus_embeddings)[0]
        ranked = sorted(enumerate(scores), key=lambda item: item[1], reverse=True)

        return dict(ranked)

    def get_result(self, corpus, query, n:int=10):
        """Return the ``n`` corpus entries ranked most similar to ``query``.

        Args:
            corpus: sequence of sentences, indexed by the row ids from ``rank``.
            query: the query sentence.
            n: number of entries to return.

        Returns:
            list of corpus entries, best match first.
        """
        dense_rank = self.rank(corpus, query)
        return [corpus[row] for row in list(dense_rank)[:n]]

In [None]:
# Load the dataset, then collect the `summary` texts of each `max_length`
# bucket (16, 32, 64, 128) into its own list.
df = pd.read_csv('C:/Users/Geraldus Wilsen/Documents/ProjectRekomendasiObat/data/data_obat_fix_ordered.csv')
max_length_16, max_length_32, max_length_64, max_length_128 = (
    df.loc[df['max_length'] == bucket, 'summary'].tolist()
    for bucket in (16, 32, 64, 128)
)

In [None]:
# Build the searcher and load the default pretrained tokenizer and model.
model = SemanticSearch()
model.load_pretrained()

# Embed the four summary lists. process() pairs its positional arguments with
# max lengths 16, 32, 64 and 128, in that order, and the returned array is
# shown as the cell output.
model.process(max_length_16,max_length_32,max_length_64, max_length_128)

In [19]:
# Reload the dataset and use every `summary` as the search corpus.
df = pd.read_csv('C:/Users/Geraldus Wilsen/Documents/ProjectRekomendasiObat/data/data_obat_fix_ordered.csv')
corpus = df['summary'].to_list()
# Free-text query (Indonesian).
query = "pil untuk mengurangi gatal di kulit"
# A fresh searcher; the pretrained tokenizer and model must be loaded before ranking.
model = SemanticSearch()
model.load_pretrained()
# Show the 10 summaries (the default n) ranked most similar to the query.
model.get_result(corpus, query)

Encoding process using cpu
Starting to embed corpus with 16 max length word
Finish embed query


['borobudur kamal pil kamal membantu mengurangi gatal  gatal kulit',
 'borobudur kamal kapsul kamal membantu mengurangi gatal  gatal kulit',
 'kapsida kembang bulan kapsul membantu meringankan gatal  gatal  bisul  koreng jerawat',
 'borobudur darsi pil darsi membantu mengurangi jerawat  bisul  gatal  gatal',
 'borobudur darsi kapsul darsi membantu mengurangi jerawat  bisul  gatal  gatal',
 'borobudur darsi kapsul darsi membantu mengurangi jerawat  bisul  gatal  gatal',
 'borobudur darsi kapsul darsi membantu mengurangi jerawat  bisul  gatal  gatal',
 'sang putih kapsul secara tradisional membantu lendir  mengurangi gatal bau tidak sedap daerah kewanitaan',
 'borobudur mastin neem kapsul mastin neem membantu meringankan jerawat gatal  gatal',
 'migranal pil digunakan membantu meredakan sakit kepala']