In [46]:
# create embeddings for the data (pdf and html files) and store them in local

In [47]:
# import necessary packages
import os
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_loaders import BSHTMLLoader
import sys
sys.path.append('src/')

In [48]:
#import config an utils as the configuration file of different resources in the processing
import config

In [49]:
# load the embeddings 
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5",
                                   model_kwargs = {'device':'cuda:2'},)

In [50]:
# create the text splitter that you want to use
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, 
                                               chunk_overlap= 100, 
                                               separators=["\n\n","\n"," ",""]) 

In [54]:
# Example usage:
directory = "data/"
pdf_files_list = config.find_files(directory,'.pdf')
html_files_list = config.find_files(directory,'.html')

In [55]:
html_files_list

['data/LILRB2 - an overview _ ScienceDirect Topics.html',
 'data/NASH Overview_ Causes, Symptoms, Diagnosis, and Treatment _ Pfizer.html']

In [56]:
# pdf splittings

TEXT_pdf = config.pdf_chunks(pdf_files_list,text_splitter)

In [57]:
# html splittings
TEXT_html = config.html_chunks(html_files_list)

In [58]:
TEXT_html[1]

Document(page_content="\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nNASH Overview: Causes, Symptoms, Diagnosis, and Treatment | Pfizer\n\n\n\n\n\n\n\n\n\n\n\n\n\n          Sorry, you need to enable JavaScript to visit this website.\n        \n\n\n\n      Skip to main content\n    \nScienceClinical TrialsGuide to Clinical Trials\n              Your participation makes a difference\n\n          Clinical Trials in Children\n              Designed to improve kids' health\n\n          Data and Results\n              Sharing our Results\n\n          Integrity and Transparency\n              Building Trust\n\n          Diversity\n              Equity and Representation\n\n          Plain Language Study Results \n              Trial Result Summaries\n\n          Expanded Access & Compassionate Use\n              Possible Treatment Options\n\n          Find a TrialAreas of FocusRare Disease\n              Smaller populations but big impact\n\n          Internal Medicine\n          

In [59]:
# all Splittings

ALL_TEXT  = TEXT_pdf + TEXT_html

vectorstore = FAISS.from_documents(ALL_TEXT, embeddings)
vectorstore.save_local("vector_db")
print("Embeddings successfully saved in vector Database and saved locally")


Embeddings successfully saved in vector Database and saved locally
