In [35]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import torch
import faiss
import os
from tqdm import tqdm
from glob import glob

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device},
    cache_folder="cache",
)


folder_path = "/Users/ha/Downloads/mirrulations/specific/"
folders = os.listdir(folder_path)

In [37]:
from uuid import uuid4

from langchain_core.documents import Document

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

for fold in folders:
    for com_file in tqdm(glob(folder_path + fold + "/**/raw-data/documents/*htm", recursive=True), desc=f"Processing files in {fold}"):
        with open(com_file, "r") as file:
            text = file.read().lower()
            print(text)
            vector_store.add_documents(
                documents=[Document(page_content=text, metadata={"source": com_file})],
                ids=[str(uuid4())],
            )
            print(f"Added {com_file} to vector store.")
            
            
vector_store.save_local("vector_store")

Processing files in CMS-2019-0039:   0%|          | 0/4 [00:00<?, ?it/s]

<html>
<head>
<title>federal register, volume 84 issue 42 (monday, march 4, 2019)</title>
</head>
<body><pre>
[federal register volume 84, number 42 (monday, march 4, 2019)]
[proposed rules]
[pages 7610-7680]
from the federal register online via the government publishing office [<a href="http://www.gpo.gov">www.gpo.gov</a>]
[fr doc no: 2019-02200]



  federal register / vol. 84 , no. 42 / monday, march 4, 2019 / 
proposed rules  
-----------------------------------------------------------------------

department of health and human services

centers for medicare & medicaid services

42 cfr parts 406, 407, 422, 423, 431, 438, 457, 482, and 485

office of the secretary

45 cfr part 156

[cms-9115-p]
rin 0938-at79


medicare and medicaid programs; patient protection and affordable 
care act; interoperability and patient access for medicare advantage 
organization and medicaid managed care plans, state medicaid agencies, 
chip agencies and chip managed care entities, issuers of qualified 

Processing files in CMS-2019-0039:  25%|██▌       | 1/4 [00:00<00:00,  3.57it/s]

Added /Users/ha/Downloads/mirrulations/specific/CMS-2019-0039/raw-data/documents/CMS-2019-0039-0001_content.htm to vector store.
<html>
<head>
<title>federal register, volume 85 issue 85 (friday, may 1, 2020)</title>
</head>
<body><pre>[federal register volume 85, number 85 (friday, may 1, 2020)]
[rules and regulations]
[pages 25510-25640]
from the federal register online via the government publishing office [<a href="http://www.gpo.gov">www.gpo.gov</a>]
[fr doc no: 2020-05050]



[[page 25509]]

vol. 85

friday,

no. 85

may 1, 2020

part ii





department of health and human services





-----------------------------------------------------------------------





centers for medicare & medicaid services





-----------------------------------------------------------------------





42 cfr parts 406, 407, 422, et al.

45 cfr part 156





medicare and medicaid programs; patient protection and affordable care 
act; interoperability and patient access for medicare advantage 
organiz

Processing files in CMS-2019-0039: 100%|██████████| 4/4 [00:00<00:00,  5.01it/s]


Added /Users/ha/Downloads/mirrulations/specific/CMS-2019-0039/raw-data/documents/CMS-2019-0039-1625_content.htm to vector store.
<html>
<head>
<title>federal register, volume 86 issue 235 (friday, december 10, 2021)</title>
</head>
<body><pre>[federal register volume 86, number 235 (friday, december 10, 2021)]
[rules and regulations]
[pages 70412-70413]
from the federal register online via the government publishing office [<a href="http://www.gpo.gov">www.gpo.gov</a>]
[fr doc no: 2021-26764]


-----------------------------------------------------------------------

department of health and human services

centers for medicare & medicaid services

42 cfr parts 422, 431, 435, 438, 440, and 457

[cms-9115-n2]


medicare and medicaid programs; patient protection and affordable 
care act; interoperability and patient access for medicare advantage 
organizations and medicaid managed care plans, state medicaid agencies, 
chip agencies and chip managed care entities, issuers of qualified 
heal

Processing files in .DS_Store: 0it [00:00, ?it/s]
Processing files in DEA-2016-0015: 100%|██████████| 2/2 [00:00<00:00, 32.51it/s]


<html>
<head>
<title>federal register, volume 81 issue 169 (wednesday, august 31, 2016)</title>
</head>
<body><pre>
[federal register volume 81, number 169 (wednesday, august 31, 2016)]
[proposed rules]
[pages 59929-59934]
from the federal register online via the government publishing office [<a href="http://www.gpo.gov">www.gpo.gov</a>]
[fr doc no: 2016-20803]


-----------------------------------------------------------------------

department of justice

drug enforcement administration

21 cfr part 1308

[docket no. dea-442]


schedules of controlled substances: temporary placement of 
mitragynine and 7-hydroxymitragynine into schedule i

agency: drug enforcement administration, department of justice.

action: notice of intent.

-----------------------------------------------------------------------

summary: the administrator of the drug enforcement administration is 
issuing this notice of intent to temporarily schedule the opioids 
mitragynine and 7-hydroxymitragynine, which are 

Processing files in DEA-2024-0059:   0%|          | 0/2 [00:00<?, ?it/s]

<html>
<head>
<title>federal register, volume 89 issue 99 (tuesday, may 21, 2024)</title>
</head>
<body><pre>
[federal register volume 89, number 99 (tuesday, may 21, 2024)]
[proposed rules]
[pages 44597-44622]
from the federal register online via the government publishing office [<a href="http://www.gpo.gov">www.gpo.gov</a>]
[fr doc no: 2024-11137]


-----------------------------------------------------------------------

department of justice

drug enforcement administration

21 cfr part 1308

[docket no. dea-1362; a.g. order no. 5931-2024]


schedules of controlled substances: rescheduling of marijuana

agency: drug enforcement administration, department of justice.

action: notice of proposed rulemaking.

-----------------------------------------------------------------------

summary: the department of justice (``doj'') proposes to transfer 
marijuana from schedule i of the controlled substances act (``csa'') to 
schedule iii of the csa, consistent with the view of the department 

Processing files in DEA-2024-0059:  50%|█████     | 1/2 [00:00<00:00,  8.77it/s]

Added /Users/ha/Downloads/mirrulations/specific/DEA-2024-0059/raw-data/documents/DEA-2024-0059-0001_content.htm to vector store.
<html>
<head>
<title>federal register, volume 89 issue 168 (thursday, august 29, 2024)</title>
</head>
<body><pre>
[federal register volume 89, number 168 (thursday, august 29, 2024)]
[proposed rules]
[pages 70148-70149]
from the federal register online via the government publishing office [<a href="http://www.gpo.gov">www.gpo.gov</a>]
[fr doc no: 2024-19370]


-----------------------------------------------------------------------

department of justice

drug enforcement administration

21 cfr part 1301

[docket no. dea-1362]
rin 1117-ab77


schedules of controlled substances: rescheduling of marijuana

agency: drug enforcement administration, department of justice.

action: notice of hearing on proposed rulemaking.

-----------------------------------------------------------------------

summary: this is notice that the drug enforcement administration will 

Processing files in DEA-2024-0059: 100%|██████████| 2/2 [00:00<00:00, 14.79it/s]


Added /Users/ha/Downloads/mirrulations/specific/DEA-2024-0059/raw-data/documents/DEA-2024-0059-42928_content.htm to vector store.


Processing files in HHS-ONC-2019-0002:   0%|          | 0/2 [00:00<?, ?it/s]

<html>
<head>
<title>federal register, volume 84 issue 78 (tuesday, april 23, 2019)</title>
</head>
<body><pre>[federal register volume 84, number 78 (tuesday, april 23, 2019)]
[proposed rules]
[pages 16834-16835]
from the federal register online via the government publishing office [<a href="http://www.gpo.gov">www.gpo.gov</a>]
[fr doc no: 2019-08178]


-----------------------------------------------------------------------

department of health and human services

office of the secretary

45 cfr parts 170 and 171

rin 0955-aa01


21st century cures act: interoperability, information blocking, 
and the onc health it certification program

agency: office of the national coordinator for health information 
technology (onc), department of health and human services (hhs).

action: proposed rule; extension of comment period.

-----------------------------------------------------------------------

summary: on march 4, 2019, the department of health and human services 
(hhs) published a pro

Processing files in HHS-ONC-2019-0002: 100%|██████████| 2/2 [00:00<00:00,  2.85it/s]


Added /Users/ha/Downloads/mirrulations/specific/HHS-ONC-2019-0002/raw-data/documents/HHS-ONC-2019-0002-0001_content.htm to vector store.


Processing files in CMS-2025-0050: 0it [00:00, ?it/s]
Processing files in CMS-2022-0163: 100%|██████████| 1/1 [00:00<00:00, 14.07it/s]

<html>
<head>
<title>federal register, volume 87 issue 194 (friday, october 7, 2022)</title>
</head>
<body><pre>[federal register volume 87, number 194 (friday, october 7, 2022)]
[notices]
[pages 61018-61029]
from the federal register online via the government publishing office [<a href="http://www.gpo.gov">www.gpo.gov</a>]
[fr doc no: 2022-21904]


-----------------------------------------------------------------------

department of health and human services

centers for medicare & medicaid services

[cms-0058-nc]
rin 0938-zb72


request for information; national directory of healthcare 
providers & services

agency: centers for medicare & medicaid services (cms), health and 
human services (hhs).

action: request for information.

-----------------------------------------------------------------------

summary: this request for information solicits public comments on 
establishing a national directory of healthcare providers & services 
(ndh) that could serve as a ``centralized data




In [3]:
results = vector_store.similarity_search(
    "chocolate chip pancakes",
    k=2,
    filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")