**Function to extract symptoms**

In [1]:
# Step 1: Install the necessary libraries
!pip install transformers torch

# Step 2: Import the required libraries
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Step 3: Load the LLaMA2 model and tokenizer
# These module-level objects are reused by `extract_symptoms` further down,
# so this cell must be executed before that function is called.
model_name = "meta-llama/Llama-2-7b-chat-hf"  # Hugging Face model id; change to use another checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
# torch_dtype='auto' lets the library choose the dtype when loading the weights;
# `.to('cuda')` requires a CUDA device to be present.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype='auto').to('cuda')  # Move model to GPU

# Step 4: Set up a text generation pipeline
# The pipeline wraps the already-loaded model and tokenizer; device=0 selects the first GPU.
symptom_extraction = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)  # Use GPU device

# Step 5: Define a function to extract symptoms
def extract_symptoms(statement):
    """Extract the symptoms mentioned in a free-text patient statement.

    The statement is embedded in an instruction prompt and sent to the
    module-level `symptom_extraction` text-generation pipeline. Every line of
    the model's answer that starts with a list number ("1.", "2.", "10.", ...)
    is treated as one symptom.

    Args:
        statement: The patient's statement as plain text.

    Returns:
        A list of strings, one per numbered line of the model's answer. The
        numbering is kept (e.g. '1. Constant headaches'). The list is empty
        when the model produced no numbered lines.
    """
    import re  # local import so the function does not rely on another cell

    # Refined prompt for better clarity
    prompt = f"Please extract and list any symptoms from the following patient statement:\n\n\"{statement}\"\n\nSymptoms (list them one per line):"

    # return_full_text=False keeps the prompt (and therefore the patient's own
    # words) out of the text that is parsed below.
    response = symptom_extraction(
        prompt,
        max_new_tokens=50,
        num_return_sequences=1,
        temperature=0.2,
        return_full_text=False,
    )

    generated_text = response[0]['generated_text'].strip()

    # Accept any list number, not just 1-3, so longer symptom lists are not
    # silently truncated.
    numbered_item = re.compile(r'\d+\.')
    stripped_lines = (line.strip() for line in generated_text.splitlines())
    return [line for line in stripped_lines if numbered_item.match(line)]

# Step 6: Test the function with a sample statement
# Smoke test: runs the full prompt -> generation -> parsing path once.
patient_statement = "I have been experiencing constant headaches, occasional nausea, and sometimes I feel dizzy."
extracted_symptoms = extract_symptoms(patient_statement)

# Display the extracted symptoms
# NOTE: `extracted_symptoms` is a module-level name that a later cell reassigns.
print("Extracted Symptoms:", extracted_symptoms)




tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Extracted Symptoms: ['1. Constant headaches', '2. Occasional nausea', '3. Dizziness']


**Function to Transcribe the audio**

In [3]:
# Step 1: Install the necessary libraries
!pip install transformers torchaudio

# Step 2: Import the required libraries
import torchaudio
from transformers import pipeline

# Step 3: Load the Whisper model for transcription
# No `device` argument is passed here; the cell output below reports that the
# model therefore runs on CPU even though a GPU is available.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large")

# Step 4: Define a function to transcribe audio
def transcribe_audio(file_path):
    """Transcribe an audio file with the module-level `transcriber` pipeline.

    Args:
        file_path: Path of the audio file handed to the pipeline.

    Returns:
        The value stored under the 'text' key of the pipeline result.
    """
    return transcriber(file_path)['text']

# Step 5: Test the transcription function with an audio file
# The path below is an absolute path to a file in this runtime's /content
# directory; change it to point at your own audio file before running.
audio_file_path = '/content/WhatsApp Audio 2024-11-05 at 6.02.09 PM (1).aac'  # Update with your audio file path
transcribed_text = transcribe_audio(audio_file_path)

# Display the transcribed text
print("Transcribed Text:", transcribed_text)




Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcribed Text:  I am having fever, constipation, headache and pain in my abdomen.


**Function to store the data in excel**

In [None]:
!pip install pandas openpyxl




In [None]:
# Step 1: Install necessary libraries (if not already installed)
# !pip install pandas openpyxl  # Uncomment this if you haven't installed these libraries yet

# Step 2: Import the required libraries
import pandas as pd
import os

# Step 3: Define a function to store symptoms in an Excel file
def store_symptoms_in_excel(extracted_symptoms, file_name='patient_symptoms.xlsx'):
    """Append one patient's symptoms as a new row of an Excel workbook.

    The workbook is created when it does not exist yet. Each call adds a row
    with an auto-incremented 'Patient ID' and the symptoms joined by ', '.

    Args:
        extracted_symptoms: Iterable of symptoms, or a single symptom string.
        file_name: Path of the Excel file to create or extend.

    Returns:
        None. The workbook is rewritten and a confirmation line is printed.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(extracted_symptoms, str):
        symptom_list = [extracted_symptoms]
    else:
        symptom_list = [str(symptom) for symptom in extracted_symptoms]

    if os.path.exists(file_name):
        # Load existing data
        df = pd.read_excel(file_name)
    else:
        df = pd.DataFrame(columns=['Patient ID', 'Symptoms'])

    # Derive the next ID from the largest stored ID rather than the row count,
    # so deleted rows or non-contiguous IDs cannot produce a duplicate ID.
    if 'Patient ID' in df.columns:
        known_ids = pd.to_numeric(df['Patient ID'], errors='coerce').dropna()
    else:
        known_ids = pd.Series(dtype='float64')

    if known_ids.empty:
        patient_id = df.shape[0] + 1  # no usable IDs yet: fall back to the row count
    else:
        patient_id = int(known_ids.max()) + 1

    # Create a new entry
    new_entry = pd.DataFrame({'Patient ID': [patient_id], 'Symptoms': [', '.join(symptom_list)]})

    # Concatenate the new entry to the existing DataFrame
    df = pd.concat([df, new_entry], ignore_index=True)

    # Save the DataFrame to an Excel file
    df.to_excel(file_name, index=False)

    print(f"Stored symptoms for Patient ID {patient_id}: {extracted_symptoms}")

# Example usage
# NOTE: this reassigns the module-level `extracted_symptoms` produced by the
# symptom-extraction cell with a hand-written sample list.
extracted_symptoms = ['Fever', 'Constipation', 'Headache']  # Replace with actual symptoms
store_symptoms_in_excel(extracted_symptoms)


Stored symptoms for Patient ID 1: ['Fever', 'Constipation', 'Headache']


**Function to initialize the Pinecone vector database**

In [8]:
!pip install pinecone

Collecting pinecone
  Downloading pinecone-5.3.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.1.0 (from pinecone)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone-5.3.1-py3-none-any.whl (419 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m419.8/419.8 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone
Successfully installed pinecone-5.3.1 pinecone-plugin-inference-1.1.0 pinecone-plugi

In [15]:
from pinecone import Pinecone, ServerlessSpec

def initialize_pinecone_database(api_key, cloud='aws', region='us-west-2', index_name="patient-symptoms", dimension=384):
    """Return a handle to a Pinecone index, creating the index first if needed.

    Args:
        api_key: Pinecone API key used to create the client.
        cloud: Cloud provider passed to `ServerlessSpec` when creating the index.
        region: Cloud region passed to `ServerlessSpec` when creating the index.
        index_name: Name of the index to look up or create.
        dimension: Vector dimension used when the index has to be created.

    Returns:
        The object returned by `pc.Index(index_name)`.
    """
    # Create a Pinecone instance
    pc = Pinecone(api_key=api_key)

    # list_indexes() yields index descriptions, not plain strings, so a
    # membership test against it never matches a name; `.names()` returns the
    # index names themselves.
    existing_names = pc.list_indexes().names()

    if index_name not in existing_names:
        # Create the index if it doesn't exist
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric='cosine',  # Choose an appropriate distance metric (e.g., 'cosine' or 'euclidean')
            spec=ServerlessSpec(cloud=cloud, region=region)
        )
        print(f"Index '{index_name}' created successfully.")
    else:
        print(f"Index '{index_name}' already exists.")

    # Return the index object for future use
    return pc.Index(index_name)


In [16]:
# Set your Pinecone API key and environment details
import os
from getpass import getpass

# Never hard-code the API key in the notebook: it ends up in version control
# and in every shared copy. Read it from the PINECONE_API_KEY environment
# variable, or prompt for it without echoing when the variable is not set.
# Any key that was previously committed in this cell should be revoked.
api_key = os.environ.get("PINECONE_API_KEY") or getpass("Enter your Pinecone API key: ")
cloud = "aws"               # Adjust based on your desired cloud provider
region = "us-east-1"         # Replace with the region associated with your Pinecone account
dimension = 384              # Adjust based on the embedding model's output size

# Initialize the Pinecone database and store the index
index = initialize_pinecone_database(api_key, cloud=cloud, region=region, dimension=dimension)


Index 'patient-symptoms' created successfully.


In [17]:
import hashlib
from transformers import AutoTokenizer, AutoModel
import torch

# Load embedding model
# The tokenizer and model are module-level objects used by `generate_embeddings`.
# NOTE(review): the Pinecone index is created with dimension=384; presumably that
# matches this model's hidden size — confirm against the model configuration.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
# `.to('cuda')` requires a CUDA device to be present.
embedding_model = AutoModel.from_pretrained(embedding_model_name).to('cuda')

# Function to generate embeddings
def generate_embeddings(text):
    """Embed text with the module-level embedding model using masked mean pooling.

    Args:
        text: A string, or a list of strings, to embed.

    Returns:
        A numpy array with one row per input text, holding the mean of the
        model's last hidden states over the real (non-padding) tokens.
    """
    # Follow the model's own device instead of assuming 'cuda'.
    device = embedding_model.device
    inputs = embedding_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        token_states = embedding_model(**inputs).last_hidden_state

    # Weight every token by its attention mask so that padding added for
    # batched inputs does not dilute the average. For a single text the mask
    # is all ones and this equals a plain mean over the tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
    embeddings = summed / token_counts

    return embeddings.detach().cpu().numpy()


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [18]:
# Function to store patient symptoms in Pinecone
def store_in_vector_database(index, patient_name, symptoms):
    """Embed each symptom and upsert the vectors into the given Pinecone index.

    Each vector's ID is the MD5 hex digest of "<patient_name>-<symptom>", so
    storing the same patient/symptom pair again overwrites the earlier vector.
    The patient name and symptom text are attached as metadata.

    Args:
        index: Index object exposing an `upsert(vectors)` method.
        patient_name: Name stored in the metadata and used in the vector ID.
        symptoms: Iterable of symptom strings.

    Returns:
        None. A status line is printed.
    """
    batch_size = 100  # vectors per upsert request

    # Keyed by ID so a symptom repeated in `symptoms` is embedded and sent once.
    vectors_by_id = {}
    for symptom in symptoms:
        # MD5 is used only as a stable identifier here, not for security.
        unique_id = hashlib.md5(f"{patient_name}-{symptom}".encode()).hexdigest()
        if unique_id in vectors_by_id:
            continue

        embedding_vector = generate_embeddings(symptom)
        vectors_by_id[unique_id] = (
            unique_id,
            embedding_vector.flatten().tolist(),  # plain Python floats for the client
            {"patient_name": patient_name, "symptom": symptom},
        )

    if not vectors_by_id:
        # Nothing was sent, so do not claim that data was stored.
        print(f"No symptoms to store for {patient_name}.")
        return

    # One request per batch instead of one network round trip per symptom.
    vectors = list(vectors_by_id.values())
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start:start + batch_size])

    print(f"Data for {patient_name} stored in Pinecone.")


In [19]:
def process_audio_file(audio_file_path, patient_name, index):
    """Run the full pipeline for one recording: transcribe, extract, store.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        patient_name: Name under which the symptoms are stored.
        index: Vector index handed on to `store_in_vector_database`.

    Returns:
        None. The transcription and the extracted symptoms are printed.
    """
    # Speech -> text
    text = transcribe_audio(audio_file_path)
    print("Transcribed Text:", text)

    # Text -> list of symptoms
    symptoms = extract_symptoms(text)
    print("Extracted Symptoms:", symptoms)

    # Symptoms -> vector database
    store_in_vector_database(index, patient_name, symptoms)


In [20]:
# Get user input for audio file path and patient name
# Both values are read interactively, so this cell blocks until the user answers
# and its result depends on what is typed.
audio_file_path = input("Please enter the path to the audio file: ")  # User enters the audio file path
patient_name = input("Please enter the patient's name: ")  # User enters the patient's name

# Requires `index` from the Pinecone initialization cell to exist already.
process_audio_file(audio_file_path, patient_name, index)


Please enter the path to the audio file: /content/WhatsApp Audio 2024-11-05 at 6.02.09 PM (1).aac
Please enter the patient's name: rupankar




Transcribed Text:  I am having fever, constipation, headache and pain in my abdomen.
Extracted Symptoms: ['1. Fever', '2. Constipation', '3. Headache']
Data for rupankar stored in Pinecone.
