In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
import pickle

# Load dataset
file_path = "C:\\Users\\jagin\\general_disease_diagnosis.csv"
df = pd.read_csv(file_path)

# Data preprocessing
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
target_column = next((col for col in df.columns if 'condition' in col.lower() or 
                      'disease' in col.lower() or 'diagnosis' in col.lower()), None)

# Fill missing values
df.fillna(df.mean(numeric_only=True), inplace=True)
df.fillna(df.mode().iloc[0], inplace=True)

# Prepare features and target
features = [col for col in numeric_columns if col != target_column]
X = df[features]
y = df[target_column]

# Label encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Address class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define Random Forest hyperparameters for tuning
rf_params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 3, 5]
}

# Train the Random Forest model using GridSearchCV
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
grid_search_rf = GridSearchCV(rf_model, rf_params, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search_rf.fit(X_train_scaled, y_train)

# Get the best Random Forest model and evaluate it
best_rf_model = grid_search_rf.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
f1 = f1_score(y_test, y_pred, average="weighted")
print(f"Random Forest Best F1 Score: {f1:.4f} with parameters: {grid_search_rf.best_params_}")

# Save the trained Random Forest model, scaler, and label encoder to a pickle file
model_filename = "best_rf_model_with_scaler_encoder.pkl"
with open(model_filename, 'wb') as file:
    pickle.dump({
        'model': best_rf_model,
        'scaler': scaler,
        'label_encoder': label_encoder
    }, file)

print(f"Random Forest model, scaler, and label encoder saved to {model_filename}")


Random Forest Best F1 Score: 0.3216 with parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 500}
Random Forest model, scaler, and label encoder saved to best_rf_model_with_scaler_encoder.pkl


In [23]:
import pandas as pd
import pickle

# Load the pre-trained RandomForest model, scaler, and label encoder
with open("C:\\Users\\jagin\\best_rf_model_with_scaler_encoder.pkl", 'rb') as file:
    saved_data = pickle.load(file)

model = saved_data['model']
scaler = saved_data['scaler']
label_encoder = saved_data['label_encoder']

# Load the CSV file with missing 'Disease' column
data = pd.read_csv("C:\\Users\\jagin\\disease.csv")  # Replace with your actual CSV file path

# Check if the required features are present in the data
required_features = ['Age', 'Weight_kg', 'Height_cm', 'Blood_Pressure_mmHg']  # Replace with your actual feature names
for feature in required_features:
    if feature not in data.columns:
        raise ValueError(f"Missing required feature: {feature}")

# Handle any potential missing values in the dataset
data[required_features] = data[required_features].fillna(data[required_features].mean())

# Scale the features using the loaded scaler
scaled_features = scaler.transform(data[required_features])

# Make predictions using the loaded model
predicted_labels = model.predict(scaled_features)

# Convert numeric predictions back to original labels using the label encoder
data['Disease'] = label_encoder.inverse_transform(predicted_labels)

# Save the result to a new CSV file
output_path = 'data_with_predicted_disease.csv'
data.to_csv(output_path, index=False)
print(f"Predictions saved to '{output_path}'")


Predictions saved to 'data_with_predicted_disease.csv'
