<a href="https://colab.research.google.com/github/KarthikeyanBaskaran/voice_to_form/blob/main/Healthcare_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Libraries

import pandas as pd
import numpy as np
import base64
import os
import time
import pandas as pd
from google import genai
from google.genai import types
from google.colab import userdata  # Assuming you are in Colab
import requests
from io import StringIO
import numpy as np

In [2]:
# Raw GitHub URL of the merged healthcare dataset (CSV).
DATASET_URL = 'https://raw.githubusercontent.com/KarthikeyanBaskaran/voice_to_form/refs/heads/main/Dataset/Healthcare%20Merged%20Dataset.csv'
original = pd.read_csv(DATASET_URL)

In [3]:
# Remove the stray 'Unnamed: 3' column from the raw dataset.
# errors='ignore' keeps this cell re-runnable: `original` is rebound here, so a
# second execution would otherwise raise KeyError because the column is already gone.
original = original.drop(columns='Unnamed: 3', errors='ignore')
original.head()

Unnamed: 0,ID,Patient Information,Symptoms & History
0,1,"Name: Robert M., Age: 42, Gender: Male","Doctor, I’ve been feeling dizzy whenever I exe..."
1,2,"Name: Sarah T., Age: 25, Gender: Female","My face keeps breaking out, especially around ..."
2,3,"Name: John D., Age: 50, Gender: Male",I’ve been feeling weak and fatigued throughout...
3,4,"Name: Emily S., Age: 28, Gender: Female",There’s a painful knot in my shoulder blade th...
4,5,"Name: Michael B., Age: 34, Gender: Male","When I wake up in the morning, my body feels e..."


In [4]:
# Number of rows in the source dataset.
original.shape[0]

544

## Using the Gemini API

In [5]:
# --- API Keys ---
# Names of the Colab secrets that hold the keys, listed in rotation order.
_SECRET_NAMES = ('Healthcareapi', 'Sachin', 'Lakshmi', 'Akash')
API_KEYS = [userdata.get(secret_name) for secret_name in _SECRET_NAMES]
NUM_API_KEYS = len(API_KEYS)
current_api_key_index = 0  # Position in API_KEYS of the key currently in use
api_key_cycle_count = 0  # How many times rotation has wrapped back to the first key

# --- Rate Limits ---
MAX_RETRIES = 3  # Attempts allowed per request before giving up
WAIT_ON_ERROR_SECONDS = 60  # Pause (seconds) used after errors and after a full key cycle
DAILY_LIMIT_REACHED = False  # Stop flag read by the processing loop

# --- File to store processed data ---
OUTPUT_CSV_FILE = "extracted_data_progress.csv"

# --- Model Name ---
MODEL_NAME = "gemini-2.0-flash-lite"

# --- Initialize Client (will be configured with the current API key) ---
client = None

def configure_api(api_key):
    """Point the module-level Gemini client at ``api_key``.

    Rebinds the global ``client`` to a new ``genai.Client`` and prints which
    slot of ``API_KEYS`` the key occupies (1-based).

    Args:
        api_key: The API key to use for subsequent requests.

    Note:
        The slot is found with ``list.index``, so if the same key value appears
        more than once in ``API_KEYS`` the first matching slot is reported.
    """
    global client
    client = genai.Client(api_key=api_key)
    try:
        key_label = API_KEYS.index(api_key) + 1
    except ValueError:
        # The client has already been replaced at this point; a key that is not
        # part of the rotation list must not turn a log line into a crash.
        key_label = "(not in API_KEYS)"
    print(f"Using API Key {key_label}")

def switch_api_key():
    """Rotate to the following entry of API_KEYS, wrapping around at the end.

    Updates ``current_api_key_index``, reconfigures the client with the newly
    selected key, and bumps ``api_key_cycle_count`` whenever the rotation lands
    back on the first key.
    """
    global current_api_key_index, api_key_cycle_count
    next_index = (current_api_key_index + 1) % NUM_API_KEYS
    current_api_key_index = next_index
    configure_api(API_KEYS[next_index])
    wrapped_around = next_index == 0
    if wrapped_around:
        api_key_cycle_count += 1

def extract_patient_info(text):
    """
    Extracts patient information from a text transcript using Gemini.

    The model reply is parsed line by line: every line that contains a colon is
    split on the FIRST colon into a key and a value, both stripped of
    surrounding whitespace. Lines without a colon are ignored. No other
    clean-up is applied here, so any quotes or trailing commas the model emits
    remain part of the keys/values.

    Any exception raised while calling the model or parsing its reply (including
    a reply that carries no text) triggers a switch to the next API key and a
    retry, for at most MAX_RETRIES attempts. The pause of WAIT_ON_ERROR_SECONDS
    is taken only between attempts, not after the last one.

    Args:
        text: The patient transcript inserted into the prompt.

    Returns:
        dict of parsed key/value strings on success, or None when every attempt
        failed or DAILY_LIMIT_REACHED was set before an attempt could be made.
    """
    prompt = f"""
    Given the following patient transcript, extract the following information if available.
    If not available, return " ".

    Transcript:
    {text}

    Requested Information:
    Primary Symptoms:
    Duration of Symptoms:
    Severity (Mild, Moderate, Severe):
    Past Medical Conditions:
    Hospitalizations (reason and year):
    Allergies:
    Current Medications (name, dosage, frequency):
    Smoking (Yes/No, quantity per day):
    Alcohol Consumption (Yes/No, frequency):
    Exercise Routine (Yes/No, frequency):
    Dietary Habits (Vegetarian, Non-Vegetarian, Vegan):
    Sleep Pattern (Hours per day, quality of sleep):
    Triggering Factors:

    Output the information in a JSON-like format, where each requested information is a key and the extracted value is the corresponding value.
    """
    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt)],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        response_mime_type="text/plain",
    )

    retries = 0
    # NOTE(review): nothing in the visible code ever sets DAILY_LIMIT_REACHED to
    # True, so in practice only the retry counter ends this loop — confirm.
    while retries < MAX_RETRIES and not DAILY_LIMIT_REACHED:
        try:
            response = client.models.generate_content(
                model=MODEL_NAME,
                contents=contents,
                config=generate_content_config,
            )
            result = response.text
            if result is None:
                # Fail with a readable message instead of the AttributeError
                # that calling .split() on None would produce.
                raise ValueError("model returned no text content")
            info = {}
            for line in result.split('\n'):
                if ":" in line:
                    key, value = line.split(":", 1)
                    info[key.strip()] = value.strip()
            return info
        except Exception as e:
            print(f"An error occurred: {e}. Switching API key...")
            switch_api_key()
            retries += 1
            if retries == MAX_RETRIES:
                print("Max retries reached for this request.")
                # No attempt follows, so waiting here would only delay the caller.
                break
            time.sleep(WAIT_ON_ERROR_SECONDS)  # Back off before the next attempt

    return None

def fetch_processed_data_from_repo(repo_url, timeout=30):
    """Fetches the extracted data CSV from the Git repo as a pandas DataFrame.

    Args:
        repo_url: URL of the raw CSV file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The parsed DataFrame, or an EMPTY DataFrame when the download fails
        (including timeouts and non-2xx responses), the file is empty, or
        parsing raises. Callers therefore cannot tell "nothing processed yet"
        apart from "fetch failed" by the return value alone; the reason is only
        printed.
    """
    try:
        response = requests.get(repo_url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
        return pd.read_csv(StringIO(response.text))
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they are reported here too.
        print(f"Error fetching CSV from repo: {e}")
        return pd.DataFrame()
    except pd.errors.EmptyDataError:
        print("The CSV file in the repository is empty.")
        return pd.DataFrame()
    except Exception as e:
        print(f"An unexpected error occurred while processing repo CSV: {e}")
        return pd.DataFrame()

def process_dataset_with_resume(df, repo_csv_url):
    """
    Processes a DataFrame, fetches existing data from a Git repo,
    processes new data, and saves a merged DataFrame.

    Resume logic: rows whose "ID" already appears in the fetched CSV are
    skipped. Matching on ID (rather than on the number of saved rows) matters
    because rows whose extraction fails are not written out; a row-count offset
    would then re-process an already saved row and never retry the failed one.
    If the fetched CSV has no "ID" column, the row-count offset is used as a
    fallback.

    Progress is written in a ``finally`` block, so rows extracted before an
    interruption or an unexpected error are still saved to OUTPUT_CSV_FILE.

    Args:
        df: Source DataFrame; must provide "ID" and "Symptoms & History" columns.
        repo_csv_url: URL of the previously saved progress CSV.

    Returns:
        None. Results are written to OUTPUT_CSV_FILE.
    """
    global DAILY_LIMIT_REACHED, api_key_cycle_count
    recently_processed_data = []
    existing_data_from_repo_df = fetch_processed_data_from_repo(repo_csv_url)
    fetched_row_count = len(existing_data_from_repo_df)

    print(f"Fetched {fetched_row_count} rows from the Git repository.")

    if "ID" in existing_data_from_repo_df.columns:
        already_done = df["ID"].isin(existing_data_from_repo_df["ID"])
        remaining_df = df[~already_done]
    else:
        # No ID column to match on (e.g. the fetch returned an empty frame).
        remaining_df = df.iloc[fetched_row_count:]

    if remaining_df.empty:
        print("All rows have been processed previously (based on Git repo).")
        if not existing_data_from_repo_df.empty:
            existing_data_from_repo_df.to_csv(OUTPUT_CSV_FILE, index=False)
            print(f"Saved fetched data to {OUTPUT_CSV_FILE}")
        return

    configure_api(API_KEYS[current_api_key_index])

    try:
        for _, row in remaining_df.iterrows():
            if DAILY_LIMIT_REACHED:
                print("Daily limit reached. Stopping processing.")
                break

            extracted_info = extract_patient_info(row["Symptoms & History"])

            if extracted_info is not None:
                extracted_info["ID"] = row["ID"]
                recently_processed_data.append(extracted_info)
                print(f"Processed row {row['ID']}, recently processed: {len(recently_processed_data)}")
            else:
                print(f"Failed to extract information for ID: {row['ID']}")

            # After the rotation has wrapped back to the first key, pause
            # before continuing with the next row.
            if api_key_cycle_count > 0 and current_api_key_index == 0:
                print(f"End of API key list reached for the {api_key_cycle_count} time. Waiting for {WAIT_ON_ERROR_SECONDS} seconds...")
                time.sleep(WAIT_ON_ERROR_SECONDS)
                api_key_cycle_count = 0  # Reset cycle count after waiting
    finally:
        # Runs on normal completion, on `break`, and when an exception or
        # KeyboardInterrupt is propagating.
        _merge_and_save_progress(existing_data_from_repo_df, recently_processed_data)


def _merge_and_save_progress(existing_df, new_records):
    """Combine previously saved rows with newly extracted records and write OUTPUT_CSV_FILE."""
    newly_processed_df = pd.DataFrame(new_records)
    if not existing_df.empty and not newly_processed_df.empty:
        merged_df = pd.concat([existing_df, newly_processed_df], ignore_index=True)
    elif not newly_processed_df.empty:
        merged_df = newly_processed_df
    else:
        merged_df = existing_df

    if "ID" in merged_df.columns:
        # Rows retried on a later run are appended at the end; a stable sort
        # puts them back in ID order and is a no-op when already ordered.
        try:
            merged_df = merged_df.sort_values("ID", kind="stable", ignore_index=True)
        except TypeError:
            # IDs of mixed, non-comparable types: keep the existing order rather
            # than lose the save.
            pass

    if not merged_df.empty:
        merged_df.to_csv(OUTPUT_CSV_FILE, index=False)
        print(f"Successfully merged and saved data to {OUTPUT_CSV_FILE}")
    else:
        print(f"No data to save to {OUTPUT_CSV_FILE}")

# --- Git Repo URL for the progress CSV ---
REPO_CSV_URL = "https://raw.githubusercontent.com/KarthikeyanBaskaran/voice_to_form/refs/heads/main/Dataset/extracted_data_progress.csv"

# --- Process the dataset ---
# Report the dataset size, then run the resumable extraction unless there is nothing to process.
print(f"Total number of records in the dataset: {len(original)}")
if original.empty:
    print("The 'original' DataFrame is empty.")
else:
    process_dataset_with_resume(original, REPO_CSV_URL)

Total number of records in the dataset: 544
Fetched 544 rows from the Git repository.
All rows have been processed previously (based on Git repo).
Saved fetched data to extracted_data_progress.csv


In [6]:
# Raw GitHub URL of the extraction-progress CSV. The same value is assigned in the
# processing cell; repeating it here lets the cleaning cells run without that cell.
REPO_CSV_URL = "https://raw.githubusercontent.com/KarthikeyanBaskaran/voice_to_form/refs/heads/main/Dataset/extracted_data_progress.csv"

In [7]:
# Load the extraction results stored in the repository.
df = pd.read_csv(REPO_CSV_URL)

# Delete every double-quote and comma character found inside string cells.
# Whitespace is NOT stripped at this step.
df_cleaned = df.replace(regex={'"': '', ',': ''})

In [8]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544 entries, 0 to 543
Data columns (total 14 columns):
 #   Column                                                Non-Null Count  Dtype 
---  ------                                                --------------  ----- 
 0   "Primary Symptoms"                                    544 non-null    object
 1   "Duration of Symptoms"                                544 non-null    object
 2   "Severity (Mild, Moderate, Severe)"                   544 non-null    object
 3   "Past Medical Conditions"                             544 non-null    object
 4   "Hospitalizations (reason and year)"                  544 non-null    object
 5   "Allergies"                                           544 non-null    object
 6   "Current Medications (name, dosage, frequency)"       544 non-null    object
 7   "Smoking (Yes/No, quantity per day)"                  544 non-null    object
 8   "Alcohol Consumption (Yes/No, frequency)"             544 non-null    

In [9]:
# Strip the literal double quotes that surround the column names.
df_cleaned.columns = [column_name.replace('"', '') for column_name in df_cleaned.columns]

In [11]:
import pandas as pd

# Normalise every object-dtype column of df_cleaned: trim surrounding
# whitespace, then turn cells that are empty after trimming into NaN.
for column in df_cleaned.select_dtypes(include='object').columns:
    stripped = df_cleaned[column].str.strip()
    # After strip() a whitespace-only cell is exactly '', so an equality mask
    # selects the same cells as the regex ^\s*$ did. mask() is used instead of
    # replace() because the replace() call is what emitted the FutureWarning
    # seen in this cell's output.
    df_cleaned[column] = stripped.mask(stripped == '')

  df_cleaned[column] = df_cleaned[column].replace(r'^\s*$', np.nan, regex=True)


In [12]:
# Remove the ID column, then discard rows in which every remaining column is NaN.
# NOTE(review): presumably ID is dropped first because a populated ID would stop
# dropna(how='all') from ever treating a row as entirely empty — confirm.
# Both calls mutate df_cleaned in place, so re-running this cell on its own
# raises KeyError once 'ID' is gone (unless a later cell has re-created it).
df_cleaned.drop('ID', axis=1, inplace=True)
df_cleaned.dropna(how='all', inplace=True)

In [14]:
# Rebuild ID as (index label + 1). The index is not reset after rows are
# dropped, so each label still reflects the row's position in the loaded CSV.
df_cleaned = df_cleaned.assign(ID=df_cleaned.index + 1)
df_cleaned.head()

Unnamed: 0,Primary Symptoms,Duration of Symptoms,"Severity (Mild, Moderate, Severe)",Past Medical Conditions,Hospitalizations (reason and year),Allergies,"Current Medications (name, dosage, frequency)","Smoking (Yes/No, quantity per day)","Alcohol Consumption (Yes/No, frequency)","Exercise Routine (Yes/No, frequency)","Dietary Habits (Vegetarian, Non-Vegetarian, Vegan)","Sleep Pattern (Hours per day, quality of sleep)",Triggering Factors,ID
0,Dizziness,Two months,,Father suffered from heart issues,,,,,,,,,Exertion (lifting heavy objects climbing stairs),1
1,Breaking out on cheeks and forehead oily skin ...,,,,,,,,,,,,,2
2,Weakness fatigue shaky legs lightheadedness,A few weeks,,,,,,,,,,Enough sleep,Standing up too quickly,3
3,Painful knot in shoulder blade pain shooting t...,,,,,,,,,,,,Long hours at a desk job posture,4
4,Weakness exhaustion upon waking,,,,,,,,,,,Patient reports poor sleep quality,,5


In [15]:
# Split "Name: ..., Age: ..., Gender: ..." into three columns. The captured
# values are kept as strings; rows that do not match the pattern yield NaN.
patient_info_pattern = r'Name:\s*(.*?),\s*Age:\s*(.*?),\s*Gender:\s*(.*)'
patient_fields = original['Patient Information'].str.extract(patient_info_pattern)
original[['Name', 'Age', 'Gender']] = patient_fields

In [16]:
# Attach the source columns to each cleaned row (left join on ID), then move
# the identifying columns to the front and keep the rest in their merged order.
leading_columns = ['ID', 'Name', 'Age', 'Gender']
final_df = df_cleaned.merge(original, how='left', on='ID')
desired_order = leading_columns + [
    column_name for column_name in final_df.columns if column_name not in leading_columns
]
final_df = final_df[desired_order]
final_df.head()

Unnamed: 0,ID,Name,Age,Gender,Primary Symptoms,Duration of Symptoms,"Severity (Mild, Moderate, Severe)",Past Medical Conditions,Hospitalizations (reason and year),Allergies,"Current Medications (name, dosage, frequency)","Smoking (Yes/No, quantity per day)","Alcohol Consumption (Yes/No, frequency)","Exercise Routine (Yes/No, frequency)","Dietary Habits (Vegetarian, Non-Vegetarian, Vegan)","Sleep Pattern (Hours per day, quality of sleep)",Triggering Factors,Patient Information,Symptoms & History
0,1,Robert M.,42,Male,Dizziness,Two months,,Father suffered from heart issues,,,,,,,,,Exertion (lifting heavy objects climbing stairs),"Name: Robert M., Age: 42, Gender: Male","Doctor, I’ve been feeling dizzy whenever I exe..."
1,2,Sarah T.,25,Female,Breaking out on cheeks and forehead oily skin ...,,,,,,,,,,,,,"Name: Sarah T., Age: 25, Gender: Female","My face keeps breaking out, especially around ..."
2,3,John D.,50,Male,Weakness fatigue shaky legs lightheadedness,A few weeks,,,,,,,,,,Enough sleep,Standing up too quickly,"Name: John D., Age: 50, Gender: Male",I’ve been feeling weak and fatigued throughout...
3,4,Emily S.,28,Female,Painful knot in shoulder blade pain shooting t...,,,,,,,,,,,,Long hours at a desk job posture,"Name: Emily S., Age: 28, Gender: Female",There’s a painful knot in my shoulder blade th...
4,5,Michael B.,34,Male,Weakness exhaustion upon waking,,,,,,,,,,,Patient reports poor sleep quality,,"Name: Michael B., Age: 34, Gender: Male","When I wake up in the morning, my body feels e..."


In [17]:
# Write the merged, reordered frame to disk without the index column.
CLEANED_DATASET_PATH = 'CleanedDataset.csv'
final_df.to_csv(CLEANED_DATASET_PATH, index=False)