In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib


In [2]:
def generate_synthetic_data_with_diseases(num_samples=1000):
    data = pd.DataFrame({
        'age': np.random.randint(20, 65, num_samples),
        'height': np.random.randint(150, 200, num_samples),
        'weight': np.random.randint(50, 120, num_samples),
        'BMI': np.random.uniform(18, 35, num_samples),
        'red_blood_cells': np.random.uniform(4.5, 6.1, num_samples),
        'white_blood_cells': np.random.uniform(4.0, 10.8, num_samples),
        'platelets': np.random.uniform(150, 400, num_samples),
        'hemoglobin': np.random.uniform(13.0, 17.0, num_samples),
        'hematocrit': np.random.uniform(40, 52, num_samples),
        'BUN': np.random.uniform(6, 20, num_samples),
        'creatinine': np.random.uniform(0.6, 1.3, num_samples),
        'glucose': np.random.uniform(70, 200, num_samples),
        'CO2': np.random.uniform(23, 29, num_samples),
        'calcium': np.random.uniform(8.5, 10.2, num_samples),
        'sodium': np.random.uniform(135, 150, num_samples),
        'potassium': np.random.uniform(3.0, 5.2, num_samples),
        'chloride': np.random.uniform(96, 106, num_samples),
        'HDL': np.random.uniform(20, 100, num_samples),
        'LDL': np.random.uniform(0, 200, num_samples),
        'T3': np.random.uniform(80, 180, num_samples),
        'T4': np.random.uniform(0.8, 1.8, num_samples),
        'TSH': np.random.uniform(0.5, 4, num_samples),
        'hs_cTn': np.random.uniform(0, 1, num_samples),
        'BNP': np.random.uniform(0, 300, num_samples),
        'NT_proBNP': np.random.uniform(0, 300, num_samples),
        'CK': np.random.uniform(30, 200, num_samples),
        'CK_MB': np.random.uniform(0, 12, num_samples),
        'RR_interval': np.random.uniform(0.6, 1.2, num_samples),
        'P_wave': np.random.uniform(80, 80, num_samples),
        'PR_interval': np.random.uniform(120, 200, num_samples),
        'PR_segment': np.random.uniform(50, 120, num_samples),
        'QRS_complex': np.random.uniform(80, 100, num_samples),
        'ST_segment': np.random.uniform(80, 120, num_samples),
        'T_wave': np.random.uniform(160, 160, num_samples),
        'QT_interval': np.random.uniform(0, 420, num_samples),
        'Diabetes': np.random.choice([0, 1], num_samples),
        'Hypertension': np.random.choice([0, 1], num_samples),
        'Heart_Disease': np.random.choice([0, 1], num_samples),
        'Kidney_Disease': np.random.choice([0, 1], num_samples),
    })
    
    # Apply some logic to correlate diseases with risk factors
    data['Diabetes'] = ((data['glucose'] > 126) | (data['BMI'] > 30)).astype(int)
    data['Hypertension'] = ((data['sodium'] > 145) | (data['potassium'] < 3.5) | (data['BMI'] > 30)).astype(int)
    data['Heart_Disease'] = ((data['LDL'] > 100) | (data['HDL'] < 40) | (data['hs_cTn'] > 0.4) | (data['BNP'] > 100)).astype(int)
    data['Kidney_Disease'] = ((data['BUN'] > 20) | (data['creatinine'] > 1.3)).astype(int)
    
    return data
    

In [3]:
# Generate synthetic data with diseases
data = generate_synthetic_data_with_diseases()

In [4]:
# Define feature and target variables
X = data.drop(columns=['Diabetes', 'Hypertension', 'Heart_Disease', 'Kidney_Disease'])
y = data[['Diabetes', 'Hypertension', 'Heart_Disease', 'Kidney_Disease']]

In [5]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [7]:
# Train the model
rf_classifier.fit(X_train, y_train)

In [8]:
# Predict on the testing set
y_pred = rf_classifier.predict(X_test)

In [9]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.94

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       130
           1       1.00      1.00      1.00       126
           2       0.95      1.00      0.97       190
           3       0.00      0.00      0.00         0

   micro avg       0.97      1.00      0.99       446
   macro avg       0.73      0.75      0.74       446
weighted avg       0.97      1.00      0.99       446
 samples avg       0.97      1.00      0.98       446



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# Save the model
joblib.dump(rf_classifier, 'disease_prediction_model.pkl')


['disease_prediction_model.pkl']

In [11]:
# Example of specific test case
specific_test_case = [[40, 180, 80, 25, 5.0, 8.0, 190, 15.0, 45, 15, 1.0, 100, 30, 9.5, 150, 5.0, 100, 60, 130, 120, 1.5, 3.0, 0.6, 70, 250, 120, 15, 1.5, 90, 180, 100, 95, 100, 180, 420]]


In [12]:
# Convert specific test case to DataFrame
specific_test_case_df = pd.DataFrame(specific_test_case, columns=X.columns)


In [13]:
# Predict using the trained Random Forest model
predicted_diseases = rf_classifier.predict(specific_test_case_df)[0]

In [14]:
# Interpret the predictions
disease_names = ['Diabetes', 'Hypertension', 'Heart_Disease', 'Kidney_Disease']
predicted_diseases_dict = {disease: bool(predicted) for disease, predicted in zip(disease_names, predicted_diseases)}


In [15]:
print("\nPredicted diseases for the specific test case:")
for disease, present in predicted_diseases_dict.items():
    status = "Present" if present else "Not Present"
    print(f"{disease}: {status}")


Predicted diseases for the specific test case:
Diabetes: Not Present
Hypertension: Present
Heart_Disease: Present
Kidney_Disease: Not Present
