In [2]:
!pip install faiss-cpu
!pip install transformers
!pip install torch
!pip install sentence-transformers
!pip install pandas
!pip install numpy




In [3]:

with open('HIPAA.txt', 'r') as file:
    hipaa_text = file.read()

with open('INTERNAL_POLICY.txt', 'r') as file:
    internal_policy_text = file.read()

with open('ISO_27001.txt', 'r') as file:
    iso_27001_text = file.read()

print("HIPAA Text:\n", hipaa_text[:500])
print("\nINTERNAL_POLICY Text:\n", internal_policy_text[:500])
print("\nISO 27001 Text:\n", iso_27001_text[:500])


HIPAA Text:
 Privacy Rule: Protect individuals’ health information and provide rights regarding their data.
Security Rule: Ensure confidentiality, integrity, and availability of electronic protected health information.
Breach Notification Rule: Notify affected individuals and HHS of data breaches.
Administrative Safeguards: Implement policies to manage security measures.
Physical Safeguards: Restrict physical access to protect data.
Technical Safeguards: Use technology to control access and ensure data secur

INTERNAL_POLICY Text:
 Employees must use strong passwords and change them regularly.
All external storage devices must be encrypted.
Access to confidential information requires manager approval.
Only authorized personnel may modify system configurations.
All software must be up-to-date to prevent vulnerabilities.
Data must be backed up daily and stored securely.
Regular audits must be conducted to ensure compliance with company policies.


ISO 27001 Text:
 Clause 4: Understand th

# Embedding the documents

In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np


model = SentenceTransformer('all-MiniLM-L6-v2')


hipaa_chunks = hipaa_text.split("\n")
internal_policy_chunks = internal_policy_text.split("\n")
iso_27001_chunks = iso_27001_text.split("\n")

# Combine all chunks into one list
all_chunks = hipaa_chunks + internal_policy_chunks + iso_27001_chunks

# Encode the chunks using the pre-trained model
all_embeddings = model.encode(all_chunks)

# Verify the size of the embeddings
print(f"Number of chunks: {len(all_chunks)}")
print(f"Embedding shape: {all_embeddings.shape}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Number of chunks: 23
Embedding shape: (23, 384)


# Creating a FAISS index


In [5]:
import faiss


index = faiss.IndexFlatL2(all_embeddings.shape[1])


index.add(all_embeddings)


print(f"Number of items in the index: {index.ntotal}")


Number of items in the index: 23


# Building the retriever


In [9]:
def retrieve_relevant_documents(query, top_k=3):

    query_embedding = model.encode([query])


    distances, indices = index.search(query_embedding, top_k)


    relevant_chunks = [all_chunks[i] for i in indices[0]]
    return relevant_chunks


query = "What are the data privacy requirements in compliance?"
relevant_docs = retrieve_relevant_documents(query)


for i, doc in enumerate(relevant_docs):
    print(f"Document {i + 1}: {doc}")


Document 1: Privacy Rule: Protect individuals’ health information and provide rights regarding their data.
Document 2: Data must be backed up daily and stored securely.
Document 3: Access to confidential information requires manager approval.


# Building the generator

In [10]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
generator_model = T5ForConditionalGeneration.from_pretrained(model_name)

def generate_answer(relevant_docs):
    input_text = " ".join(relevant_docs)
    input_ids = tokenizer.encode("summarize: " + input_text, return_tensors="pt")


    output = generator_model.generate(input_ids, max_length=150, num_beams=2, early_stopping=True)
    return tokenizer.decode(output[0], skip_special_tokens=True)


answer = generate_answer(relevant_docs)
print(f"Generated Answer: {answer}")


Generated Answer: data must be backed up daily and stored securely. access to confidential information requires manager approval.


# Example Usage

In [11]:
query = "What are the regulations for data protection?"
relevant_docs = retrieve_relevant_documents(query)
answer = generate_answer(relevant_docs)

print(f"Query: {query}")
print(f"Generated Answer: {answer}")


Query: What are the regulations for data protection?
Generated Answer: technical Safeguards: Use technology to control access and ensure data security. privacy rule: protect individuals’ health information and provide rights regarding their data.
