In [16]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Mount Google Drive (optional, if files are stored there)
# The google.colab package only exists inside a Colab runtime. Guard the
# import so the rest of the script can still run elsewhere; datasets that
# cannot be found are skipped with a warning further down.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

# Define file paths (update paths if necessary)
# Maps a short dataset key to the CSV file it is read from.
file_paths = dict(
    diabetes="/content/diabetes_data.csv",
    heart_disease="/content/heart_disease_data.csv",
    parkinsons="/content/parkinson_data.csv",
    hypothyroid="/content/prepocessed_hypothyroid.csv",
    lung_cancer="/content/prepocessed_lungs_data.csv",
)

# Load datasets safely
# Only datasets whose file is present end up in this dict; a missing file
# produces a warning instead of an exception.
datasets = {}

for name, path in file_paths.items():
    if not os.path.exists(path):
        print(f"Warning: {name} dataset not found at {path}. Skipping.")
        continue
    datasets[name] = pd.read_csv(path)

# Function to preprocess data
def preprocess_data(df, target_column):
    """Build scaled train/test splits from a labelled DataFrame.

    Every column except ``target_column`` is treated as a feature and passed
    through ``pd.get_dummies``. The rows are then split 80/20 with a fixed
    random seed, and a ``StandardScaler`` fitted on the training portion only
    is applied to both portions.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_column: Name of the column in ``df`` that holds the labels.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, scaler)`` where the
        ``X_*`` entries are the scaled feature matrices, the ``y_*`` entries
        are the matching label subsets, and ``scaler`` is the fitted
        ``StandardScaler``.
    """
    features = pd.get_dummies(df.drop(columns=[target_column]))
    labels = df[target_column]

    train_features, test_features, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
    )

    # Fit on the training rows only so test-set statistics do not leak
    # into the scaling parameters.
    scaler = StandardScaler().fit(train_features)
    scaled_train = scaler.transform(train_features)
    scaled_test = scaler.transform(test_features)

    return scaled_train, scaled_test, y_train, y_test, scaler

# Train and save models
def train_and_save_model(data, target_column, model_name, output_dir="/content"):
    """Train a random-forest classifier on ``data`` and persist it with joblib.

    The function prints the DataFrame's columns, the test-set accuracy and a
    classification report, then writes two files into ``output_dir``:
    ``<model_name>.sav`` (the fitted model) and ``<model_name>_scaler.sav``
    (the fitted scaler returned by ``preprocess_data``).

    Args:
        data: DataFrame with features and the target column, or ``None``.
        target_column: Name of the label column in ``data``.
        model_name: Base name used in log messages and output file names.
        output_dir: Directory the ``.sav`` files are written to. Defaults to
            ``"/content"``, the Colab workspace. It is created if missing.

    Returns:
        None. Training is skipped (with a printed message) when ``data`` is
        ``None`` or does not contain ``target_column``.
    """
    if data is None:
        print(f"Skipping {model_name} training due to missing data.")
        return

    # Print the columns of the DataFrame before dropping
    print(f"Columns in DataFrame: {data.columns.tolist()}")

    # Check if the target column exists in the DataFrame
    if target_column not in data.columns:
        print(f"Error: Target column '{target_column}' not found in the DataFrame.")
        return

    X_train, X_test, y_train, y_test, scaler = preprocess_data(data, target_column)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Model Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))

    # Save the model and its scaler side by side; the scaler is needed to
    # transform inputs the same way at prediction time.
    os.makedirs(output_dir, exist_ok=True)
    joblib.dump(model, os.path.join(output_dir, f"{model_name}.sav"))
    joblib.dump(scaler, os.path.join(output_dir, f"{model_name}_scaler.sav"))

# Train models if data is available
# Each entry is (dataset key, target column, saved-model name); entries are
# processed in the order listed.
TRAINING_JOBS = (
    ("diabetes", 'Outcome', 'diabetes_model'),
    ("heart_disease", 'target', 'heart_disease_model'),
    ("parkinsons", 'status', 'parkinsons_model'),
    ("hypothyroid", 'binaryClass', 'thyroid_model'),
    ("lung_cancer", 'LUNG_CANCER', 'lungs_disease_model'),
)

for dataset_key, label_column, artifact_name in TRAINING_JOBS:
    # Datasets whose file was not found were never loaded; skip them.
    if dataset_key in datasets:
        train_and_save_model(datasets[dataset_key], label_column, artifact_name)

print("Training process completed!")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Columns in DataFrame: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
diabetes_model Model Accuracy: 0.72
              precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154

Columns in DataFrame: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
heart_disease_model Model Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        29
           1       0.84      0.84      0.84        32

    accurac