In [5]:
import ast
import hashlib
import os

import pandas as pd
import torch
from pinecone import Pinecone
from transformers import AutoTokenizer, AutoModel

# Step 1: Connect to the existing Pinecone index
def connect_to_pinecone(api_key, index_name="patient-symptoms"):
    """Open a handle to a Pinecone index by name.

    Args:
        api_key: Pinecone API key used to construct the client.
        index_name: Name of the index to open; defaults to
            "patient-symptoms".

    Returns:
        The object returned by ``Pinecone(api_key=...).Index(index_name)``.
    """
    client = Pinecone(api_key=api_key)
    index_handle = client.Index(index_name)
    return index_handle

# Pinecone API key and index configuration.
# Prefer the PINECONE_API_KEY environment variable so the credential does not
# have to live in the source file.
# NOTE(review): the literal fallback below is a key committed to this file. It
# is kept only so existing runs behave the same; rotate it and remove the
# fallback once the environment variable is set wherever this script runs.
api_key = (
    os.environ.get("PINECONE_API_KEY")
    or "00dfadae-35e0-4fcd-92b7-f88e21899500"  # Replace with your Pinecone API key
)
index_name = "patient-symptoms"

# Connect to Pinecone index
index = connect_to_pinecone(api_key, index_name)

# Load embedding model
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
# Use the GPU when one is available; otherwise load on the CPU instead of
# failing at load time on machines without CUDA.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = AutoModel.from_pretrained(embedding_model_name).to(embedding_device)

# Step 2: Function to generate embeddings
def generate_embeddings(text):
    """Embed text with the module-level tokenizer and model.

    Args:
        text: A string, or a list of strings to embed together as one
            padded batch.

    Returns:
        A NumPy array with one row per input text. Each row is the mean of
        the model's last hidden states over that text's non-padding tokens.
    """
    # Follow whatever device the model currently lives on rather than
    # assuming CUDA.
    device = embedding_model.device
    inputs = embedding_tokenizer(
        text, return_tensors="pt", padding=True, truncation=True
    ).to(device)

    # Inference only: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        hidden_states = embedding_model(**inputs).last_hidden_state

    # Mean-pool over real tokens only. For a single text nothing is padded,
    # so this matches a plain mean (up to float rounding); for a batch it
    # keeps padding positions from diluting the shorter texts.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = summed / token_counts
    return embeddings.cpu().numpy()

# Step 3: Function to upsert data
def store_in_vector_database(index, patient_name, symptoms, batch_size=100):
    """Upsert one vector per (patient, symptom) pair into ``index``.

    Args:
        index: Object exposing ``upsert(vectors)``, such as the handle
            returned by ``connect_to_pinecone``.
        patient_name: Stored in each record's metadata and combined with the
            symptom to derive the record ID.
        symptoms: Iterable of symptom strings to embed and store.
        batch_size: Maximum number of records sent per ``upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Keyed by record ID so a symptom repeated for this patient is embedded
    # and sent only once; a repeat would produce the same ID and payload.
    records = {}
    for symptom in symptoms:
        # MD5 is used only to derive a deterministic ID for the
        # patient-symptom pair, so re-running overwrites instead of duplicating.
        unique_id = hashlib.md5(f"{patient_name}-{symptom}".encode()).hexdigest()
        if unique_id in records:
            continue

        embedding_vector = generate_embeddings(symptom)
        records[unique_id] = (
            unique_id,
            # Send plain Python floats rather than a NumPy array.
            embedding_vector.flatten().tolist(),
            {"patient_name": patient_name, "symptom": symptom},
        )

    # Send the records in bounded batches instead of one request per symptom.
    # Nothing is sent when there are no symptoms.
    vectors = list(records.values())
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start:start + batch_size])

# Step 4: Load and process the dataset
def _strip_numbering(symptom):
    """Return ``symptom`` without a leading list number such as "1. ".

    Text is returned unchanged when the part before the first ". " is not a
    number, so a symptom that merely contains a sentence break is kept whole.
    """
    prefix, separator, remainder = symptom.partition(". ")
    if separator and prefix.strip().isdigit():
        return remainder
    return symptom


file_path = "synthetic_patient_data_with_random_names.csv"  # Path to your dataset
data = pd.read_csv(file_path)

for _, row in data.iterrows():
    patient_name = row['name']
    # Parse symptoms column from string representation of a list
    symptoms = ast.literal_eval(row['extracted_symptoms'])
    # Remove numbering from symptoms for a clean input to Pinecone
    cleaned_symptoms = [_strip_numbering(symptom) for symptom in symptoms]
    # Upsert data into Pinecone
    store_in_vector_database(index, patient_name, cleaned_symptoms)

print("Data successfully upserted into Pinecone.")


Data successfully upserted into Pinecone.
