# In [17]:
import os

import faiss
import numpy as np
import openai
import psycopg2

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into this file; "xxxxx" is kept only as the placeholder fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "xxxxx")

# Define the function to generate embeddings
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` produced by ``openai.Embedding.create``.

    Newlines in ``text`` are replaced with spaces before the request is made.
    The returned value is the ``"embedding"`` entry of the first item in the
    response's ``"data"`` list.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

# Define the texts for which we want to generate embeddings
texts = ["Ashley Lee, a 25-year-old woman from Florida, went to the doctor for a medical issue. She provided her name, address, and medical history to the healthcare provider. Later, she discovered that her medical records had been compromised and her information was being sold on the dark web.",
         "Samantha Jones from New York, received a phone call from someone claiming to be from her bank. They asked for her name, address, and social security number to verify her account. She became suspicious and hung up, then reported the call to her bank.",
         "Mark Davis, a 42-year-old man from Texas, applied for a job online. He provided his name, address, phone number, and resume to the company. Unfortunately, the company turned out to be a scam and stole his identity."]

# Generate embeddings for the texts. Faiss operates on float32 vectors, so
# request that dtype explicitly instead of NumPy's float64 default.
embeddings = np.array([get_embedding(text) for text in texts], dtype="float32")

# Define the path to the index file
index_path = "vector.index"

# Build the index from the current `texts` on every run. Loading a previously
# saved index and appending the same embeddings again duplicated the vectors
# each time this cell was re-run and produced row ids >= len(texts), which
# breaks the `texts[i]` lookup done by search(). Rebuilding keeps row i of the
# index aligned with texts[i].
index = faiss.IndexFlatIP(embeddings.shape[1])

# Add the embeddings to the index
index.add(embeddings)

# Save the index to the file
faiss.write_index(index, index_path)

# Define a function to perform search based on similarity score
def search(query, k=1):
    """Return up to ``k`` entries of ``texts`` that best match ``query``.

    The Faiss index is re-read from ``index_path`` on every call, ``query`` is
    embedded with ``get_embedding`` and the index is searched for ``k`` hits.

    Args:
        query: Text to look up.
        k: Maximum number of hits to request from the index.

    Returns:
        A list of ``(text, score)`` tuples in the order returned by the index
        search, where ``score`` is the value Faiss reported for that hit. The
        list has fewer than ``k`` items (possibly none) when the index could
        not supply ``k`` hits.
    """
    # Retrieve the Faiss index from the .index file
    index_retrieved = faiss.read_index(index_path)

    # Faiss expects a 2-D float32 array of query vectors.
    query_embedding = np.array([get_embedding(query)], dtype="float32")
    distances, indices = index_retrieved.search(query_embedding, k)

    # Faiss fills missing hits with id -1; without the filter, texts[-1] would
    # silently return the last text as if it were a real match.
    return [
        (texts[i], distances[0][j])
        for j, i in enumerate(indices[0])
        if i >= 0
    ]


# Perform a search based on similarity score
query = "Jessica, is a business women while travelling. she lost her personal belongings on a crowded bus."
results = search(query)

# Only the first hit is inspected; a score above 0.85 is reported as a match.
# The `results` check guards against search() returning no hits at all.
if results and results[0][1] > 0.85:
    print(f"The query '{query}' matches the following text with similarity score {results[0][1]}: {results[0][0]}")
else:
    print("Non-PII")

# Output:
# [[-0.01583775  0.01085611  0.02609865 ... -0.00462905 -0.00321543
#   -0.0313132 ]
#  [-0.02218292 -0.00251366  0.00988183 ... -0.0215685  -0.01310751
#   -0.01391392]
#  [-0.0192842  -0.00636998  0.00838359 ... -0.03022998 -0.02252405
#   -0.00965501]]
# Non-PII
