In [1]:
!pip install pytesseract pdf2image fitz spacy scispacy rapidfuzz
!python -m spacy download en_core_web_sm
!pip install mistralai
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev
!apt-get install poppler-utils

Collecting spacy
  Using cached spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Using cached thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Using cached spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
Using cached thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (920 kB)
Installing collected packages: thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.1.12
    Uninstalling thinc-8.1.12:
      Successfully uninstalled thinc-8.1.12
  Attempting uninstall: spacy
    Found existing installation: spacy 3.4.4
    Uninstalling spacy-3.4.4:
      Successfully uninstalled spacy-3.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-ner-bc5cdr-md 0.5.1 r

In [2]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz (120.2 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spacy<3.5.0,>=3.4.1 (from en_ner_bc5cdr_md==0.5.1)
  Using cached spacy-3.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting thinc<8.2.0,>=8.1.0 (from spacy<3.5.0,>=3.4.1->en_ner_bc5cdr_md==0.5.1)
  Using cached thinc-8.1.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 (from spacy<3.5.0,>=3.4.1->en_ner_bc5cdr_md==0.5.1)
  Using cached pydantic-1.10.21-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (153 kB)
Using cached spacy-3.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
Using cached pydantic-1.10.21-cp311-cp311-manylinux_2_17_x86_

In [3]:
import os
from google.colab import userdata

# Retrieve API key from Colab's secret storage
MISTRAL_API_TOKEN = userdata.get('Mistral_API')

# Store it in environment variables for secure access
os.environ["MISTRAL_API_KEY"] = MISTRAL_API_TOKEN


In [5]:
import os
import cv2
import numpy as np
import pytesseract
import pdf2image
import spacy
import json
import requests
import re
from PIL import Image



#Retrieve API key securely
MISTRAL_API_TOKEN = os.getenv("MISTRAL_API_KEY")


  # Biomedical NER model

# Standard Medical Ranges for Comparison
STANDARD_RANGES = {
    "Hemoglobin": (13.0, 18.0, "g/dL"),
    "RBC": (4.5, 6.5, "x10^12/L"),
    "Hematocrit": (38.0, 52.0, "%"),
    "MCV": (76.0, 96.0, "fL"),
    "MCH": (27.0, 32.0, "pg"),
    "MCHC": (30.0, 37.0, "g/dL"),
    "Platelet Count": (150.0, 400.0, "x10^9/L"),
    "TLC": (4.0, 11.0, "x10^9/L"),
    "Neutrophils": (40.0, 75.0, "%"),
    "Lymphocytes": (20.0, 45.0, "%"),
    "Monocytes": (2.0, 10.0, "%"),
    "Eosinophils": (1.0, 5.0, "%"),
    "ESR": (0.0, 10.0, "mm/hr")
}

# ✅ Map alternative test names to standardized names

# ✅ Step 1: OCR - Extract Text from Image or PDF
def extract_text(file_path):
    text = ""

    if file_path.lower().endswith(".pdf"):
        images = pdf2image.convert_from_path(file_path)
        for img in images:
            text += pytesseract.image_to_string(img)
    elif file_path.lower().endswith((".png", ".jpg", ".jpeg")):
        """Extract text from an image and structure it into a clean format."""
        img = Image.open(file_path)
        raw_text = pytesseract.image_to_string(img)

        # Clean newlines and extra spaces
        text = re.sub(r"\n\s*\n", "\n", raw_text.strip())  # Remove extra blank lines
        text = re.sub(r"\s{2,}", " ", text)  # Normalize multiple spaces

    return text

# ✅ Step 2: Text Cleaning & Named Entity Recognition (NER)
# Specialized SciSpaCy extraction for conditions (accurate)
def process_text(text):
    text = text.replace("\n", " ").strip()  # Remove newlines
    text = " ".join(text.split())  # Normalize spaces


    nlp = spacy.load("en_ner_bc5cdr_md")
    doc = nlp(text)
    conditions = set(ent.text for ent in doc.ents if ent.label_ == 'DISEASE')
    return text,list(conditions)



# ✅ **Step 3: Extract JSON Using Mistral API**
def format_json_using_mistral(text):
    headers = {
        "Authorization": f"Bearer {MISTRAL_API_TOKEN}",
        "Content-Type": "application/json"
    }

    prompt = f"""
    You are an advanced **medical text processor**. Your job is to **extract medical test names** and their **corresponding values** from medical reports.

    **Instructions:**
    - The input text **can be a paragraph or a table**.
    - **For paragraph format:** Identify medical test names and extract only numerical values.
    - **For table format:** Use headers as test names and extract column values.
    - **Strictly return JSON format** (without Markdown formatting).
    - **Exclude units** (e.g., mg/dL, g/L) and extract only **pure numerical values**.
    - **Ignore irrelevant text** (such as history, symptoms, or non-numerical data).
    - If values are missing, **omit that test**.

    ---
    **Example Input (Paragraph Format):**
    ```
    The patient had a WBC of 5.2 x10^9/L and an RBC of 4.6 x10^12/L. Hemoglobin levels were at 13.9 g/dL, while the hematocrit percentage was 41.5%. His platelet count stood at 210 x10^9/L.
    ```

    ---
    **Example Input (Table Format):**
    ```
    | Test Name      | Result | Reference Range |
    |---------------|--------|----------------|
    | WBC           | 5.9    | 4.0 - 11.0     |
    | RBC           | 4.8    | 4.1 - 5.3      |
    | Hemoglobin    | 13.8   | 12.0 - 17.0    |
    | Hematocrit    | 40.5   | 36.0 - 50.0    |
    | MCV           | 86.2   | 80 - 95        |
    ```

    ---
    **Expected JSON Output:**
    ```json
    {{
        "WBC": "5.2",
        "RBC": "4.6",
        "Hemoglobin": "13.9",
        "Hematocrit": "41.5",
        "Platelet Count": "210"
    }}
    ```

    ---
    **Now process the following text and return only JSON format:**

    {text}
    """

    payload = {
        "model": "open-mixtral-8x7b",  # Use Mistral-7B (or open-mixtral-8x7b for better results)
        "messages": [
            {"role": "system", "content": "Extract medical test names and their values in JSON format."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.5  # Slightly increase for better formatting
    }

    response = requests.post("https://api.mistral.ai/v1/chat/completions", headers=headers, json=payload)

    if response.status_code != 200:
        return {"error": f"Mistral API failed: {response.text}"}

    try:
        mistral_response = response.json()["choices"][0]["message"]["content"]
        # Ensure strict JSON extraction (removing markdown-like formatting)
        mistral_response = mistral_response.replace("```json", "").replace("```", "").strip()
        extracted_json = json.loads(mistral_response)
    except (json.JSONDecodeError, KeyError):
        extracted_json = {"error": "Failed to parse Mistral JSON response"}

    return extracted_json


# ✅ Step 4: Compare Extracted Values with Standard Ranges
def compare_with_standard_ranges(extracted_values):
    comparison_results = {}

    for param, value in extracted_values.items():
        if param in ["Age", "Gender"]:  # Directly store these without comparison
            comparison_results[param] = value
            continue  # Skip further checking

        if param not in STANDARD_RANGES:
            comparison_results[param] = "No Reference"
            continue

        # Change is here: unpack all 3 values from the tuple
        low, high, unit = STANDARD_RANGES[param]
        ref_range = (low, high)  # Create a ref_range tuple

        try:
            if param == "Blood Pressure":
                systolic, diastolic = map(int, value.split('/'))
                ref_sys_low, ref_dia_low = map(int, ref_range[0].split('/'))
                ref_sys_high, ref_dia_high = map(int, ref_range[1].split('/'))

                if systolic < ref_sys_low or diastolic < ref_dia_low:
                    comparison_results[param] = f"Low ({value} {unit})"
                elif systolic > ref_sys_high or diastolic > ref_dia_high:
                    comparison_results[param] = f"High ({value} {unit})"
                else:
                    comparison_results[param] = f"Normal ({value} {unit})"

            else:
                numeric_val = float(value)
                low, high = ref_range  # Unpack low, high from ref_range
                if numeric_val < low:
                    comparison_results[param] = f"Low ({value} {unit})"
                elif numeric_val > high:
                    comparison_results[param] = f"High ({value} {unit})"
                else:
                    comparison_results[param] = f"Normal ({value} {unit})"
        except ValueError:
            comparison_results[param] = f"Invalid data ({value})"

    return comparison_results

def flatten_extracted_values(extracted_values):
    flat_dict = {}

    for category, tests in extracted_values.items():
        if isinstance(tests, dict):  # If it's a nested dictionary
            for test_name, test_value in tests.items():
                flat_dict[test_name] = test_value
        else:
            flat_dict[category] = tests  # If it's already a key-value pair

    return flat_dict



# ✅ Main Processing Pipeline
def process_medical_report(file_path):
    extracted_text = extract_text(file_path)
    cleaned_text, named_entities = process_text(extracted_text)
    extracted_values = format_json_using_mistral(cleaned_text)
    extracted_values = flatten_extracted_values(extracted_values)
    comparison_results = compare_with_standard_ranges(extracted_values)

    return cleaned_text, extracted_values, named_entities, comparison_results

# ✅ Run on Your File
file_path = "/content/CBC - Test - 4.jpg"  # Replace with actual file path
cleaned_text, extracted_values, named_entities, comparison_results= process_medical_report(file_path)

# ✅ Display Results
print("comparison results")
for parameter, result in comparison_results.items():
    print(f"{parameter}: {result}")
print("\nextracted values")
for parameter, result in extracted_values.items():
    print(f"{parameter}: {result}")


print("named_entities:", named_entities)

# print("comparison_results",comparison_results)


comparison results
Red blood cells: No Reference
Hemoglobin level: No Reference
Hematocrit: Low (0.48 %)
MCV: High (106 fL)
MCH: High (42 pg)
MCHC: High (400 g/dL)
Leukocytes: No Reference
Neutrophils polynuclear: No Reference

extracted values
Red blood cells: 1.69
Hemoglobin level: 151
Hematocrit: 0.48
MCV: 106
MCH: 42
MCHC: 400
Leukocytes: 25
Neutrophils polynuclear: 1.7
named_entities: []


In [6]:
!pip install pandas scikit-learn numpy




In [7]:
import zipfile
import os
import pandas as pd

# Define the path where the ZIP file is stored
zip_path = "/content/archive (11).zip"  # Update with your file path

# Define the extraction directory
extract_path = "/content/extracted_dataset"

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"✅ Dataset extracted successfully to: {extract_path}")

# List extracted files
os.listdir(extract_path)


✅ Dataset extracted successfully to: /content/extracted_dataset


['nasariancad.csv']

In [8]:
import os

# List all files inside the extracted dataset folder
files = os.listdir(extract_path)
print("Extracted Files:", files)


Extracted Files: ['nasariancad.csv']


In [9]:
# Identify CSV file
csv_file = [f for f in files if f.endswith(".csv")]
if not csv_file:
    raise FileNotFoundError("No CSV file found in the ZIP archive!")
csv_file_path = os.path.join(extract_path, csv_file[0])
print(f"✅ CSV File Found: {csv_file[0]}")

# Load dataset
df = pd.read_csv(csv_file_path)

# Display first few rows
print("✅ Dataset Loaded Successfully!")
df.head()

✅ CSV File Found: nasariancad.csv
✅ Dataset Loaded Successfully!


Unnamed: 0,heartattack,Age,Weight,Length,BMI,DM,HTN,FAMILYHTN,CurrentSmoker,EXSmoker,...,BUN,RBC,HB,POLY,WBC,Lymph,eo,PLT,HTC,angiographyCAD
0,1,59,75,177,23.93,2,1,2,1,2,...,13.2,4.12,11.9,51,7700,42,0,287,36.0,2
1,1,48,82,185,27.39,1,1,1,1,2,...,14.2,3.88,16.1,59,7500,40,1,244,41.0,2
2,1,51,95,174,31.02,1,1,1,1,1,...,13.4,5.04,12.5,60,6500,44,0,325,40.0,1
3,1,55,70,172,24.8,1,1,1,1,1,...,11.5,4.16,13.0,57,6000,45,1,203,39.0,2
4,1,51,104,167,37.29,2,2,1,1,1,...,16.4,4.4,13.3,50,6000,52,1,237,40.0,1


In [10]:
# Check missing values
print("Missing Values Before Handling:\n", df.isnull().sum())

# Fill numerical columns with mean
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col].fillna(df[col].mean(), inplace=True)

# Fill categorical columns with mode
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

print("✅ Missing Values Handled Successfully!")
df.isnull().sum()


Missing Values Before Handling:
 heartattack            0
Age                    0
Weight                 0
Length                 0
BMI                    0
DM                     0
HTN                    0
FAMILYHTN              0
CurrentSmoker          0
EXSmoker               0
FH                     0
Obesity                0
CHAGHISHEKAMI          0
CRF                    0
CVA                    0
Airwaydisease          0
ThyroidDisease         0
HLP                    0
STRESS                 0
noise                  0
shiftwork              0
BP                     0
PR                     0
Edema                  0
WeakPeripheralPulse    0
Lungrales              0
SystolicMurmur         0
DiastolicMurmur        0
 ChestPain             0
Dyspnea                0
heartbeat              0
syanoz                 0
Function\n             0
exercisetest           0
arytmi                 0
FBS                    0
CR                     0
TG                     0
LDL              

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


Unnamed: 0,0
heartattack,0
Age,0
Weight,0
Length,0
BMI,0
DM,0
HTN,0
FAMILYHTN,0
CurrentSmoker,0
EXSmoker,0


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define Features (X) and Target (y)
X = df.drop(columns=["heartattack"])  # Drop target column
y = df["heartattack"]  # Target variable

# Convert categorical values to numerical (if any)
X = pd.get_dummies(X)  # Convert categorical columns into one-hot encoding

# Normalize numerical features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split dataset into Training (80%) and Testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("\n✅ Data Splitting Completed!")
print(f"📊 Training Set: {X_train.shape}, Testing Set: {X_test.shape}")



✅ Data Splitting Completed!
📊 Training Set: (120, 49), Testing Set: (30, 49)


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Model
print("\n🚀 Training the Model...")
model.fit(X_train, y_train)

print("✅ Model Training Completed!")



🚀 Training the Model...
✅ Model Training Completed!


In [14]:
from sklearn.metrics import accuracy_score, classification_report

# Make Predictions
y_pred = model.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\n🎯 Model Accuracy: {accuracy:.2f}")

# Display Classification Report
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))



🎯 Model Accuracy: 0.90

📊 Classification Report:
              precision    recall  f1-score   support

           1       0.90      1.00      0.95        27
           2       0.00      0.00      0.00         3

    accuracy                           0.90        30
   macro avg       0.45      0.50      0.47        30
weighted avg       0.81      0.90      0.85        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
import pickle

# Save the trained model
model_filename = "/content/trained_model.pkl"
with open(model_filename, "wb") as file:
    pickle.dump(model, file)

print(f"\n✅ Model Saved Successfully as: {model_filename}")



✅ Model Saved Successfully as: /content/trained_model.pkl


In [17]:
import numpy as np
import pandas as pd

# Define the required feature columns from the dataset
required_features = [
    "Age", "Weight", "Length", "BMI", "DM", "HTN", "FAMILYHTN", "CurrentSmoker", "EXSmoker", "FH",
    "Obesity", "CHAGHISHEKAMI", "CRF", "CVA", "Airwaydisease", "ThyroidDisease", "HLP", "STRESS", "noise",
    "shiftwork", "BP_Systolic", "BP_Diastolic", "Heart Rate", "Edema", "WeakPeripheralPulse", "Lungrales",
    "SystolicMurmur", "DiastolicMurmur", "ChestPain", "Dyspnea", "heartbeat", "syanoz", "Function",
    "exercisetest", "arytmi", "FBS", "CR", "TG", "LDL", "HDL", "BUN", "RBC", "Hemoglobin", "POLY",
    "WBC", "Lymph", "eo", "Platelets", "Hematocrit"
]

# Extracted values from PDF (Example: Dynamically obtained)
pdf_values = extracted_values

# Convert BP to Systolic & Diastolic if available
if "Blood Pressure" in pdf_values:
    systolic_bp, diastolic_bp = map(int, pdf_values["Blood Pressure"].split('/'))
    pdf_values["BP_Systolic"] = systolic_bp
    pdf_values["BP_Diastolic"] = diastolic_bp

# ✅ Dynamically Create Input Array with Missing Values as 0
new_input= {feature: pdf_values.get(feature, 0) for feature in required_features}

print(new_input)

# Convert new input to DataFrame with feature names
columns = df.drop(columns=["heartattack"]).columns  # Assuming 'heartattack' is the target column
new_input_df = pd.DataFrame([new_input], columns=columns)

# Scale the new input
new_input_scaled = scaler.transform(new_input_df)

# Load the trained model
with open("/content/trained_model.pkl", "rb") as file:
    trained_model = pickle.load(file)

# Make prediction
prediction = trained_model.predict(new_input_scaled)

# Print Prediction Result
print("\n🔍 Prediction Result:")
print("✅ Heart Attack Risk: Yes" if prediction[0] == 1 else "❌ No Heart Attack Risk")


{'Age': 0, 'Weight': 0, 'Length': 0, 'BMI': 0, 'DM': 0, 'HTN': 0, 'FAMILYHTN': 0, 'CurrentSmoker': 0, 'EXSmoker': 0, 'FH': 0, 'Obesity': 0, 'CHAGHISHEKAMI': 0, 'CRF': 0, 'CVA': 0, 'Airwaydisease': 0, 'ThyroidDisease': 0, 'HLP': 0, 'STRESS': 0, 'noise': 0, 'shiftwork': 0, 'BP_Systolic': 0, 'BP_Diastolic': 0, 'Heart Rate': 0, 'Edema': 0, 'WeakPeripheralPulse': 0, 'Lungrales': 0, 'SystolicMurmur': 0, 'DiastolicMurmur': 0, 'ChestPain': 0, 'Dyspnea': 0, 'heartbeat': 0, 'syanoz': 0, 'Function': 0, 'exercisetest': 0, 'arytmi': 0, 'FBS': 0, 'CR': 0, 'TG': 0, 'LDL': 0, 'HDL': 0, 'BUN': 0, 'RBC': 0, 'Hemoglobin': 0, 'POLY': 0, 'WBC': 0, 'Lymph': 0, 'eo': 0, 'Platelets': 0, 'Hematocrit': '0.48'}

🔍 Prediction Result:
✅ Heart Attack Risk: Yes


In [22]:
import numpy as np
import pandas as pd
import pickle

# ✅ Define Standard Mapping with Multiple Variants
standard_mapping = {
    # RBC Variants
    "Red blood cell count (RBC)": "RBC",
    "RBC Count": "RBC",
    "Erythrocyte Count": "RBC",
    "Red Cells": "RBC",
    "Red blood cells":"RBC",

    # Hemoglobin Variants
    "Hemoglobin": "Hemoglobin",
    "Hb": "Hemoglobin",
    "Hb(HEMOGLOBIN)": "Hemoglobin",
    "Hemoglobin level": "Hemoglobin",

    # Hematocrit Variants
    "Hematocrit": "Hematocrit",
    "Het": "Hematocrit",
    "HCT": "Hematocrit",
    "PCV (Packed Cell Volume)": "Hematocrit",

    # WBC Variants
    "White blood cell count (WBC)": "WBC",
    "WBC Count": "WBC",
    "Leukocytes": "WBC",
    "Total Leukocyte Count (TLC)": "WBC",

    # Platelets Variants
    "Platelet Count": "Platelets",
    "Platelets": "Platelets",
    "PLT": "Platelets",
    "Thrombocytes": "Platelets",

    # Blood Pressure Variants (Systolic & Diastolic)
    "Blood Pressure": ["BP_Systolic", "BP_Diastolic"],
    "BP": ["BP_Systolic", "BP_Diastolic"],
    "Systolic BP": "BP_Systolic",
    "Diastolic BP": "BP_Diastolic",

    # Other Test Variants
    "Mean Corpuscular Volume (MCV)": "MCV",
    "MCV": "MCV",
    "Mean Cell Volume": "MCV",

    "Mean Corpuscular Hemoglobin (MCH)": "MCH",
    "MCH": "MCH",
    "Mean Cell Hemoglobin": "MCH",

    "Mean Corpuscular Hemoglobin Concentration (MCHC)": "MCHC",
    "MCHC": "MCHC",
    "Mean Cell Hemoglobin Concentration": "MCHC",

    "Neutrophils": "DC Neutrophils",
    "Neutrophils polynuclear": "DC Neutrophils",
    "DC Neutrophils": "DC Neutrophils",
    "DC Neutrophil Count": "DC Neutrophils",
    "Lymphocytes": "Lymphocytes",
    "Monocytes": "Monocytes",
    "Eosinophils": "Eosinophils",
    "Eosinophil Count": "Eosinophils",

    "ESR": "ESR",
    "Erythrocyte Sedimentation Rate": "ESR",

    "FBS": "FBS",
    "Fasting Blood Sugar": "FBS",
    "Fasting Glucose": "FBS",

    "LDL Cholesterol": "LDL",
    "HDL Cholesterol": "HDL",
    "Triglycerides (TG)": "TG",
    "Triglycerides": "TG",
}

# ✅ Define required model feature columns
required_features = [
    "Age", "Weight", "Length", "BMI", "DM", "HTN", "FAMILYHTN", "CurrentSmoker", "EXSmoker", "FH",
    "Obesity", "CHAGHISHEKAMI", "CRF", "CVA", "Airwaydisease", "ThyroidDisease", "HLP", "STRESS", "noise",
    "shiftwork", "BP_Systolic", "BP_Diastolic", "Heart Rate", "Edema", "WeakPeripheralPulse", "Lungrales",
    "SystolicMurmur", "DiastolicMurmur", "ChestPain", "Dyspnea", "heartbeat", "syanoz", "Function",
    "exercisetest", "arytmi", "FBS", "CR", "TG", "LDL", "HDL", "BUN", "RBC", "Hemoglobin", "POLY",
    "WBC", "Lymph", "eo", "Platelets", "Hematocrit"
]

# ✅ Example Extracted Values from PDF
pdf_values = extracted_values

# ✅ Standardize the extracted keys using multiple variant mappings
mapped_values = {}

for key, value in pdf_values.items():
    if key in standard_mapping:
        mapped_key = standard_mapping[key]
        if isinstance(mapped_key, list) and key in ["Blood Pressure", "BP"]:
            # Special Case: Split BP into Systolic & Diastolic
            systolic_bp, diastolic_bp = map(int, value.split('/'))
            mapped_values["BP_Systolic"] = systolic_bp
            mapped_values["BP_Diastolic"] = diastolic_bp
        else:
            mapped_values[mapped_key] = value
    else:
        mapped_values[key] = value  # Use original key if no mapping exists

# ✅ Dynamically Create Input Array with Missing Values as 0
new_input = {feature: float(mapped_values.get(feature, 0)) for feature in required_features}

print("\n✅ **Mapped Input for Model:**")
print(new_input)

#✅ Convert new input to DataFrame with feature names, matching training set columns
# Use the original DataFrame's columns to guarantee the correct order and names
columns = df.drop(columns=["heartattack"]).columns  # Get columns from the original training DataFrame
new_input_df = pd.DataFrame([new_input], columns=columns)  # Use these columns in your new DataFrame


# ✅ Load the trained model
with open("/content/trained_model.pkl", "rb") as file:
    trained_model = pickle.load(file)

# ✅ Scale the new input
new_input_scaled = scaler.transform(new_input_df)

# ✅ Make Prediction
prediction = trained_model.predict(new_input_scaled)

# ✅ Print Prediction Result
print("\n🔍 Prediction Result:")
print("✅ Heart Attack Risk: Yes" if prediction[0] == 1 else "❌ No Heart Attack Risk")



✅ **Mapped Input for Model:**
{'Age': 0.0, 'Weight': 0.0, 'Length': 0.0, 'BMI': 0.0, 'DM': 0.0, 'HTN': 0.0, 'FAMILYHTN': 0.0, 'CurrentSmoker': 0.0, 'EXSmoker': 0.0, 'FH': 0.0, 'Obesity': 0.0, 'CHAGHISHEKAMI': 0.0, 'CRF': 0.0, 'CVA': 0.0, 'Airwaydisease': 0.0, 'ThyroidDisease': 0.0, 'HLP': 0.0, 'STRESS': 0.0, 'noise': 0.0, 'shiftwork': 0.0, 'BP_Systolic': 0.0, 'BP_Diastolic': 0.0, 'Heart Rate': 0.0, 'Edema': 0.0, 'WeakPeripheralPulse': 0.0, 'Lungrales': 0.0, 'SystolicMurmur': 0.0, 'DiastolicMurmur': 0.0, 'ChestPain': 0.0, 'Dyspnea': 0.0, 'heartbeat': 0.0, 'syanoz': 0.0, 'Function': 0.0, 'exercisetest': 0.0, 'arytmi': 0.0, 'FBS': 0.0, 'CR': 0.0, 'TG': 0.0, 'LDL': 0.0, 'HDL': 0.0, 'BUN': 0.0, 'RBC': 1.69, 'Hemoglobin': 151.0, 'POLY': 0.0, 'WBC': 25.0, 'Lymph': 0.0, 'eo': 0.0, 'Platelets': 0.0, 'Hematocrit': 0.48}

🔍 Prediction Result:
✅ Heart Attack Risk: Yes


In [23]:
!pip install google-generativeai




In [25]:
import os
import json
import requests

# ✅ Retrieve Mistral API Key Securely
MISTRAL_API_TOKEN = os.getenv("MISTRAL_API_KEY")

def generate_medical_explanation(numeric_values):
    """Dynamically generates an AI-powered medical explanation using Mistral."""

    if not numeric_values:
        return "Error: No extracted medical values found."

    # Format extracted values into a readable format
    values_summary = "\n".join([f"{k}: {v}" for k, v in numeric_values.items()])

    # AI Prompt using extracted medical values
    prompt = f"""
    You are a medical expert. Based on the following patient's lab test results:

    {values_summary}

    Generate a structured **concise** medical report including:
    - Explanation of any abnormal conditions.
    - Causes and associated risks.
    - Recommended lifestyle changes.
    - Urgent precautions if necessary.
    - Keep it **brief but informative**.
    """

    # ✅ API Call to Mistral
    headers = {
        "Authorization": f"Bearer {MISTRAL_API_TOKEN}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "open-mixtral-8x7b",  # Best for structured output
        "messages": [
            {"role": "system", "content": "Generate a structured medical report based on the patient's lab test results."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.5
    }

    response = requests.post("https://api.mistral.ai/v1/chat/completions", headers=headers, json=payload)

    if response.status_code != 200:
        return f"Error: Mistral API failed - {response.text}"

    try:
        # Extract the AI response text
        mistral_response = response.json()["choices"][0]["message"]["content"]
        return mistral_response.strip()
    except (json.JSONDecodeError, KeyError):
        return "Error: Failed to parse Mistral response."

# ✅ Example usage with extracted values
numeric_values = extracted_values  # Replace with your actual extracted values

report = generate_medical_explanation(numeric_values)

# ✅ Print AI-generated report
print("\n📝 AI-Generated Medical Report:\n")
print(report)



📝 AI-Generated Medical Report:

Medical Report:

1. Red Blood Cells (RBC): The RBC count is 1.69 million cells/mcL, which is within the normal range (4.2-5.4 million cells/mcL for adult males and 3.6-4.8 million cells/mcL for adult females). Therefore, this result is normal.

2. Hemoglobin Level: The hemoglobin level is 15.1 g/dL, which is higher than the typical range for both men (13.5-17.5 g/dL) and women (12.0-15.5 g/dL). This may indicate a condition called polycythemia, where there is an overproduction of RBCs. This can increase blood viscosity, leading to potential complications such as clot formation and impaired blood flow.

3. Hematocrit: The hematocrit is 0.48 L/L, also higher than the normal range for both sexes (0.41-0.53 L/L). This further supports the possibility of polycythemia.

4. MCV, MCH, MCHC: These values are all within normal ranges, suggesting no abnormalities in red cell size or hemoglobin content.

5. Leukocytes: The white blood cell count is 25,000 cells/mcL

In [28]:
!pip install googletrans==3.1.0a0

Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==3.1.0a0)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googletrans==3.1.0a0

In [29]:
from googletrans import Translator

def translate_report(report_text, target_language):
    """Translates the AI-generated report into the specified language."""
    translator = Translator()
    translated_text = translator.translate(report_text, dest=target_language).text
    return translated_text

# ✅ Translate into Telugu & Tamil
translated_telugu = translate_report(report, "te")
translated_tamil = translate_report(report, "ta")

# ✅ Print Translated Reports
print("\n📄 AI-Generated Report (Telugu):\n", translated_telugu)
print("\n📄 AI-Generated Report (Tamil):\n", translated_tamil)



📄 AI-Generated Report (Telugu):
 వైద్య నివేదిక:

1. ఎర్ర రక్త కణాలు (ఆర్‌బిసి): ఆర్‌బిసి లెక్కింపు 1.69 మిలియన్ కణాలు/ఎంసిఎల్, ఇది సాధారణ పరిధిలో ఉంది (వయోజన పురుషులకు 4.2-5.4 మిలియన్ కణాలు/ఎంసిఎల్ మరియు వయోజన ఆడవారికి 3.6-4.8 మిలియన్ కణాలు/ఎంసిఎల్). కాబట్టి, ఈ ఫలితం సాధారణం.

2. హిమోగ్లోబిన్ స్థాయి: హిమోగ్లోబిన్ స్థాయి 15.1 గ్రా/డిఎల్, ఇది ఇద్దరికీ (13.5-17.5 గ్రా/డిఎల్) మరియు మహిళలు (12.0-15.5 గ్రా/డిఎల్) ఇద్దరికీ సాధారణ పరిధి కంటే ఎక్కువ. ఇది పాలిసిథెమియా అని పిలువబడే పరిస్థితిని సూచిస్తుంది, ఇక్కడ RBC ల యొక్క అధిక ఉత్పత్తి ఉంది. ఇది రక్త స్నిగ్ధతను పెంచుతుంది, ఇది గడ్డకట్టడం మరియు బలహీనమైన రక్త ప్రవాహం వంటి సమస్యలకు దారితీస్తుంది.

3. హేమాటోక్రిట్: హేమాటోక్రిట్ 0.48 ఎల్/ఎల్, ఇది రెండు లింగాలకు (0.41-0.53 ఎల్/ఎల్) సాధారణ పరిధి కంటే ఎక్కువ. ఇది పాలిసిథెమియా యొక్క అవకాశానికి మరింత మద్దతు ఇస్తుంది.

4.

5. ల్యూకోసైట్లు: తెల్ల రక్త కణాల సంఖ్య 25,000 కణాలు/mcl, ఇది కొద్దిగా పెరిగింది (సాధారణ పరిధి: 4,500-11,000 కణాలు/mcl). ఇది చిన్న ఇన్ఫెక్షన్లు, మంట లేదా ఒత్తిడి వల్ల కావచ్చు.

6. న్యూట

In [30]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=3c6c705f461de66e1ee3f35551aec44b71d6e84d7a5175a1c20b691038a2f23d
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [31]:
from fpdf import FPDF

def save_report_as_pdf(report_text, filename="Medical_Report.pdf"):
    """Saves the AI-generated medical report as a PDF file."""
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Add title
    pdf.set_font("Arial", style='B', size=16)
    pdf.cell(200, 10, "Medical Report", ln=True, align='C')
    pdf.ln(10)

    # Add content, encoding the text with 'utf-8'
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, report_text.encode('utf-8').decode('latin-1','ignore')) # Encode with utf-8 and decode ignoring errors

    # Save the PDF
    pdf.output(filename)
    print(f"✅ Report saved as {filename}")

# ✅ Save reports as PDF
save_report_as_pdf(report, "Medical_Report_English.pdf")
save_report_as_pdf(translated_telugu, "Medical_Report_Telugu.pdf")
save_report_as_pdf(translated_tamil, "Medical_Report_Tamil.pdf")

✅ Report saved as Medical_Report_English.pdf
✅ Report saved as Medical_Report_Telugu.pdf
✅ Report saved as Medical_Report_Tamil.pdf
