In [4]:
import numpy as np
import faiss
import psycopg2
import openai

import os

# Read the OpenAI key from the environment instead of embedding it in the
# notebook; a KeyError here means OPENAI_API_KEY has not been exported.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Embedding helper shared by indexing and querying
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding that ``openai.Embedding.create`` yields for ``text``.

    Newline characters in ``text`` are replaced by single spaces before the
    request is made. The text is sent as a one-element ``input`` list and the
    ``"embedding"`` field of the first entry in the response's ``"data"`` is
    returned unchanged.

    Args:
        text: String to embed.
        model: Name of the embedding model passed through to the API.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

# Define the texts for which we want to generate embeddings
texts = ["This is the first text.", "This is the second text.", "And this is the third text."]

# Generate one embedding row per text.
# The matrix is built as float32 on purpose: Faiss works on float32 vectors,
# and search() decodes the persisted bytes with dtype=np.float32. NumPy's
# default for Python floats would be float64.
embeddings = np.array([get_embedding(text) for text in texts], dtype=np.float32)
print(embeddings)

# Store the embeddings in a flat inner-product Faiss index; row i of the
# index corresponds to texts[i].
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

import os

# Connect to the local PostgreSQL server. The password comes from the
# PGPASSWORD environment variable so that no credential lives in the notebook.
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password=os.environ["PGPASSWORD"],
)

# Create the table that holds one embedding blob per row
cur = conn.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS piidetection (id SERIAL PRIMARY KEY, embedding bytea);")
conn.commit()

# Persist the raw embedding matrix (not the Faiss index object itself).
# The bytes are written as float32 because search() decodes them with
# dtype=np.float32; writing float64 bytes would be misread on the way back.
# NOTE: every run appends a new row, while search() always reads id = 1.
embedding_bytes = np.ascontiguousarray(embeddings, dtype=np.float32).tobytes()
cur.execute("INSERT INTO piidetection (embedding) VALUES (%s);", (psycopg2.Binary(embedding_bytes),))
conn.commit()

# Define a function to perform search based on similarity score
def search(query, k=1):
    """Return up to ``k`` ``(text, score)`` pairs for the texts closest to ``query``.

    The query is embedded with ``get_embedding`` and looked up in the
    module-level Faiss ``index``; each hit is mapped back to ``texts`` by
    position. Scores are the values reported by ``index.search``.

    Args:
        query: Text to compare against the indexed texts.
        k: Maximum number of neighbours to return.

    Returns:
        A list of ``(text, score)`` tuples in the order Faiss returned them.
        Fewer than ``k`` tuples are returned when Faiss cannot supply ``k``
        neighbours.

    Raises:
        LookupError: If ``piidetection`` has no row with ``id = 1``.
    """
    # Read back the embedding blob persisted in PostgreSQL.
    cur.execute("SELECT embedding FROM piidetection WHERE id = 1;")
    row = cur.fetchone()
    if row is None:
        raise LookupError("piidetection has no row with id = 1")
    # Decoding checks that the blob is a whole number of rows of the expected
    # width (frombuffer/reshape raise ValueError otherwise).
    # NOTE(review): the decoded matrix is not consulted below; the lookup runs
    # against the in-memory `index`. Confirm whether that is intended.
    retrieved_embeddings = np.frombuffer(row[0], dtype=np.float32).reshape((-1, embeddings.shape[1]))

    # Faiss searches take a 2-D float32 array, one query per row.
    query_embedding = np.asarray([get_embedding(query)], dtype=np.float32)
    distances, indices = index.search(query_embedding, k)

    # Faiss pads missing neighbours with index -1; skip them so that
    # texts[-1] is never reported as a match.
    return [
        (texts[i], distances[0][j])
        for j, i in enumerate(indices[0])
        if i >= 0
    ]

# Similarity score above which the query is reported as PII
PII_THRESHOLD = 0.85

try:
    # Perform a search based on similarity score
    results = search("This is a text.")
    print(results[0][1])

    if results[0][1] > PII_THRESHOLD:
        print("PII")
    else:
        print("Non-PII")
finally:
    # Close the cursor and the connection even if the search above raises
    cur.close()
    conn.close()

[[-0.00928653  0.00186054 -0.02263714 ... -0.01674812 -0.00685327
  -0.01055494]
 [-0.01629999  0.00237545 -0.00908385 ... -0.01844198 -0.01731874
  -0.00446682]
 [-0.01924564  0.00535853 -0.0083033  ... -0.01169543 -0.0124807
  -0.02541197]]
0.94135267
PII


In [46]:
import numpy as np
import faiss
import psycopg2
import openai

import os

# Read the OpenAI key from the environment instead of embedding it in the
# notebook; a KeyError here means OPENAI_API_KEY has not been exported.
# NOTE(review): a literal secret key was previously committed on this line;
# treat it as compromised and revoke it.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Embedding helper shared by indexing and querying
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding that ``openai.Embedding.create`` yields for ``text``.

    Newline characters in ``text`` are replaced by single spaces before the
    request is made. The text is sent as a one-element ``input`` list and the
    ``"embedding"`` field of the first entry in the response's ``"data"`` is
    returned unchanged.

    Args:
        text: String to embed.
        model: Name of the embedding model passed through to the API.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

# Define the texts for which we want to generate embeddings
texts = ["Ashley Lee, a 25-year-old woman from Florida, went to the doctor for a medical issue. She provided her name, address, and medical history to the healthcare provider. Later, she discovered that her medical records had been compromised and her information was being sold on the dark web.",
         "Samantha Jones from New York, received a phone call from someone claiming to be from her bank. They asked for her name, address, and social security number to verify her account. She became suspicious and hung up, then reported the call to her bank.",
         "Mark Davis, a 42-year-old man from Texas, applied for a job online. He provided his name, address, phone number, and resume to the company. Unfortunately, the company turned out to be a scam and stole his identity."]

# Generate one embedding row per text.
# The matrix is built as float32 on purpose: Faiss works on float32 vectors,
# and search() decodes the persisted bytes with dtype=np.float32. NumPy's
# default for Python floats would be float64.
embeddings = np.array([get_embedding(text) for text in texts], dtype=np.float32)
print(embeddings)

# Store the embeddings in a flat inner-product Faiss index; row i of the
# index corresponds to texts[i].
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

import os

# Connect to the local PostgreSQL server. The password comes from the
# PGPASSWORD environment variable so that no credential lives in the notebook.
# NOTE(review): a literal password was previously committed here; it should
# be changed on the server.
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password=os.environ["PGPASSWORD"],
)

# Create the table that holds one embedding blob per row
cur = conn.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS piidetection (id SERIAL PRIMARY KEY, embedding bytea);")
conn.commit()

# Persist the raw embedding matrix (not the Faiss index object itself).
# The bytes are written as float32 because search() decodes them with
# dtype=np.float32; writing float64 bytes would be misread on the way back.
# NOTE: every run appends a new row, while search() always reads id = 1.
embedding_bytes = np.ascontiguousarray(embeddings, dtype=np.float32).tobytes()
cur.execute("INSERT INTO piidetection (embedding) VALUES (%s);", (psycopg2.Binary(embedding_bytes),))
conn.commit()

# Define a function to perform search based on similarity score
def search(query, k=1):
    """Return up to ``k`` ``(text, score)`` pairs for the texts closest to ``query``.

    The query is embedded with ``get_embedding`` and looked up in the
    module-level Faiss ``index``; each hit is mapped back to ``texts`` by
    position. Scores are the values reported by ``index.search``.

    Args:
        query: Text to compare against the indexed texts.
        k: Maximum number of neighbours to return.

    Returns:
        A list of ``(text, score)`` tuples in the order Faiss returned them.
        Fewer than ``k`` tuples are returned when Faiss cannot supply ``k``
        neighbours.

    Raises:
        LookupError: If ``piidetection`` has no row with ``id = 1``.
    """
    # Read back the embedding blob persisted in PostgreSQL.
    cur.execute("SELECT embedding FROM piidetection WHERE id = 1;")
    row = cur.fetchone()
    if row is None:
        raise LookupError("piidetection has no row with id = 1")
    # Decoding checks that the blob is a whole number of rows of the expected
    # width (frombuffer/reshape raise ValueError otherwise).
    # NOTE(review): the decoded matrix is not consulted below; the lookup runs
    # against the in-memory `index`. Confirm whether that is intended.
    retrieved_embeddings = np.frombuffer(row[0], dtype=np.float32).reshape((-1, embeddings.shape[1]))

    # Faiss searches take a 2-D float32 array, one query per row.
    query_embedding = np.asarray([get_embedding(query)], dtype=np.float32)
    distances, indices = index.search(query_embedding, k)

    # Faiss pads missing neighbours with index -1; skip them so that
    # texts[-1] is never reported as a match.
    return [
        (texts[i], distances[0][j])
        for j, i in enumerate(indices[0])
        if i >= 0
    ]

# Similarity score above which the query is reported as matching a stored text
PII_THRESHOLD = 0.85

query = "Jessica, is a business women while travelling. she lost her personal belongings on a crowded bus."

try:
    # Perform a search based on similarity score
    results = search(query)
    print(results)
    print(results[0][1])

    if results[0][1] > PII_THRESHOLD:
        print(f"The query '{query}' matches the following text with similarity score {results[0][1]}: {results[0][0]}")
    else:
        print("Non-PII")
finally:
    # Close the cursor and the connection even if the search above raises
    cur.close()
    conn.close()

[[-0.01583662  0.01090708  0.02610972 ... -0.00462872 -0.00310845
  -0.03136272]
 [-0.02214365 -0.00254063  0.00981672 ... -0.02156733 -0.01314019
  -0.01393423]
 [-0.0192842  -0.00636998  0.00838359 ... -0.03022998 -0.02252405
  -0.00965501]]
[('Samantha Jones from New York, received a phone call from someone claiming to be from her bank. They asked for her name, address, and social security number to verify her account. She became suspicious and hung up, then reported the call to her bank.', 0.80227387)]
0.80227387
Non-PII


In [48]:
import numpy as np
import faiss
import psycopg2
import openai

import os

# Read the OpenAI key from the environment instead of embedding it in the
# notebook; a KeyError here means OPENAI_API_KEY has not been exported.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Embedding helper used when building the index
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding that ``openai.Embedding.create`` yields for ``text``.

    Newline characters in ``text`` are replaced by single spaces before the
    request is made. The text is sent as a one-element ``input`` list and the
    ``"embedding"`` field of the first entry in the response's ``"data"`` is
    returned unchanged.

    Args:
        text: String to embed.
        model: Name of the embedding model passed through to the API.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

# Define the texts for which we want to generate embeddings
# texts = ["Ashley Lee, a 25-year-old woman from Florida, went to the doctor for a medical issue. She provided her name, address, and medical history to the healthcare provider. Later, she discovered that her medical records had been compromised and her information was being sold on the dark web."]
#          "Samantha Jones, a 35-year-old woman from New York, received a phone call from someone claiming to be from her bank. They asked for her name, address, and social security number to verify her account. She became suspicious and hung up, then reported the call to her bank.",
#          "Mark Davis, a 42-year-old man from Texas, applied for a job online. He provided his name, address, phone number, and resume to the company. Unfortunately, the company turned out to be a scam and stole his identity."]

texts = ["Ashley Lee, a 25-year-old woman from Florida, went to the doctor for a medical issue.", "Samantha Jones, a 35-year-old woman from New York, received a phone call from someone claiming to be from her bank.", "Mark Davis, a 42-year-old man from Texas, applied for a job online."]

# Generate one embedding row per text.
# The matrix is built as float32 on purpose: Faiss works on float32 vectors,
# and search() decodes the persisted bytes with dtype=np.float32. NumPy's
# default for Python floats would be float64.
embeddings = np.array([get_embedding(text) for text in texts], dtype=np.float32)
print(embeddings)

# Store the embeddings in a flat inner-product Faiss index; row i of the
# index corresponds to texts[i].
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

import os

# Connect to the local PostgreSQL server. The password comes from the
# PGPASSWORD environment variable so that no credential lives in the notebook.
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password=os.environ["PGPASSWORD"],
)

# Create the table that holds one embedding blob per row
cur = conn.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS piidetection (id SERIAL PRIMARY KEY, embedding bytea);")
conn.commit()

# Persist the raw embedding matrix (not the Faiss index object itself).
# The bytes are written as float32 because search() decodes them with
# dtype=np.float32; writing float64 bytes would be misread on the way back.
# NOTE: every run appends a new row, while search() always reads id = 1.
embedding_bytes = np.ascontiguousarray(embeddings, dtype=np.float32).tobytes()
cur.execute("INSERT INTO piidetection (embedding) VALUES (%s);", (psycopg2.Binary(embedding_bytes),))
conn.commit()

# Define a function that asks the completion model for the closest stored text
def search(query, k=1):
    """Ask the completion model which entry of ``texts`` is most similar to ``query``.

    A prompt listing every entry of ``texts`` followed by the query is sent to
    ``openai.Completion.create``; the first line of the returned completion is
    the answer.

    Args:
        query: Text to compare against ``texts``.
        k: Accepted for call compatibility; not used by this implementation.

    Returns:
        The first line of the completion with surrounding whitespace removed,
        or ``None`` when the completion is empty so that callers can detect
        "no answer" with an ``is not None`` check.

    Raises:
        LookupError: If ``piidetection`` has no row with ``id = 1``.
    """
    # Read back the embedding blob persisted in PostgreSQL.
    cur.execute("SELECT embedding FROM piidetection WHERE id = 1;")
    row = cur.fetchone()
    if row is None:
        raise LookupError("piidetection has no row with id = 1")
    # Decoding checks that the blob is a whole number of rows of the expected
    # width (frombuffer/reshape raise ValueError otherwise).
    # NOTE(review): the decoded matrix is not used when building the prompt
    # below. Confirm whether that is intended.
    retrieved_embeddings = np.frombuffer(row[0], dtype=np.float32).reshape((-1, embeddings.shape[1]))

    # No query embedding is requested here: the completion prompt works on the
    # raw texts, so an embedding call would only spend an API request.

    # Generate the prompt
    prompt = f"Consider only the given list of texts and find the most similar text in the list:\n{', '.join(texts)}\n\nQuery:\n{query}\n\nSimilarity:"

    # Generate the response from the GPT-3 model
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=1027,
        n=1,
        stop=None,
        temperature=0.7,
    )

    # Keep only the first line of the completion text; map an empty answer to
    # None rather than returning an empty string.
    first_line = response.choices[0].text.strip().split("\n")[0]
    return first_line or None

# query = "This is a text"
query = "Jessica, is a business women while travelling she lost her personal belongings on a crowded bus."

try:
    # Ask the completion model for the stored text closest to the query
    results = search(query)
    print(results)

    # search() returns the model's answer as a string, so an `is not None`
    # test alone would always pass; treat an empty answer as "no match" too.
    if results:
        print(f"The query '{query}' matches the following text with {results}")
    else:
        print("The query doesn't match with any document and it doesn't contain PII")
finally:
    # Close the cursor and the connection even if the search above raises
    cur.close()
    conn.close()

[[-0.01957625  0.01859302  0.03037911 ...  0.00110928  0.0047491
  -0.03209345]
 [-0.02320451 -0.00083089  0.00584883 ... -0.01732705 -0.00373384
  -0.01167858]
 [-0.02528366  0.0005833   0.00137179 ... -0.02430127 -0.01324936
   0.00736148]]
Samantha Jones, a 35-year-old woman from New York, received a phone call from someone claiming to be from her bank.
The query 'Jessica, is a business women while travelling she lost her personal belongings on a crowded bus.' matches the following text with Samantha Jones, a 35-year-old woman from New York, received a phone call from someone claiming to be from her bank.
