In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.impute import SimpleImputer


In [2]:
df_heart = pd.read_csv(r"C:\Users\rajes\Datascience_jp\Disease-al-datasets\Heart_Disease_Prediction.csv")
df_diabetes = pd.read_csv(r"C:\Users\rajes\Datascience_jp\Disease-al-datasets\diabetes_new.csv")
df_stroke = pd.read_csv(r"C:\Users\rajes\Datascience_jp\Disease-al-datasets\stroke.csv")
df_fatty_liver = pd.read_csv(r"C:\Users\rajes\Datascience_jp\Disease-al-datasets\fatty_liver.csv")
df_metabolic_syndrome = pd.read_csv(r"C:\Users\rajes\Datascience_jp\Disease-al-datasets\Metabolic Syndrome.csv")
df_hypertension = pd.read_csv(r"C:\Users\rajes\Datascience_jp\Disease-al-datasets\Hypertension_data.csv")


In [3]:
import pandas as pd

# Load your datasets with only the required columns
def load_data(file_path, use_cols=None):
    return pd.read_csv(file_path, usecols=use_cols, dtype={
        'Age': 'int8',
        'BMI': 'float32',
        'Glucose': 'float32',
        'BP': 'float32',
        'Cholesterol': 'float32',
        'Insulin': 'float32',
        'Gender': 'category'  # Using category to save memory
    })

# Function to sample data with a smaller sample size
def sample_data(df, n=500):
    if len(df) < n:
        print(f"Warning: Dataset has only {len(df)} rows, sampling all.")
        return df  
    return df.sample(n=n, random_state=42)

# Sample rows from each DataFrame
df_heart = sample_data(df_heart)
df_diabetes = sample_data(df_diabetes)
df_stroke = sample_data(df_stroke)
df_fatty_liver = sample_data(df_fatty_liver)
df_metabolic_syndrome = sample_data(df_metabolic_syndrome)
df_hypertension = sample_data(df_hypertension)

# Combine DataFrames carefully
df_combined = df_heart.copy()  # Start with heart data

# Merge DataFrames step by step, checking memory
for df_to_merge in [df_diabetes, df_stroke, df_fatty_liver, df_metabolic_syndrome, df_hypertension]:
    # Check memory usage before merge
    try:
        df_combined = pd.merge(df_combined, df_to_merge, how='outer', on='Age', suffixes=('', f'_{df_to_merge.columns[-1]}'))
        print(f"Merged with {df_to_merge.columns[-1]} data.")
    except MemoryError:
        print("MemoryError: Unable to merge, skipping this DataFrame.")
        continue

# Display the final DataFrame shape
print("Final combined DataFrame shape:", df_combined.shape)

# Save the combined DataFrame to a compressed pickle file for future use
df_combined.to_pickle('combined_health_data.pkl', compression='bz2')
print("Combined DataFrame saved as 'combined_health_data.pkl'.")


Merged with Diabetes data.
Merged with Stroke data.
Merged with Fatty_Liver data.


  df_combined = pd.merge(df_combined, df_to_merge, how='outer', on='Age', suffixes=('', f'_{df_to_merge.columns[-1]}'))
  df_combined = pd.merge(df_combined, df_to_merge, how='outer', on='Age', suffixes=('', f'_{df_to_merge.columns[-1]}'))
  df_combined = pd.merge(df_combined, df_to_merge, how='outer', on='Age', suffixes=('', f'_{df_to_merge.columns[-1]}'))


Merged with Metabolic_Syndrome data.


  df_combined = pd.merge(df_combined, df_to_merge, how='outer', on='Age', suffixes=('', f'_{df_to_merge.columns[-1]}'))


Merged with Hypertension data.
Final combined DataFrame shape: (12781321, 26)
Combined DataFrame saved as 'combined_health_data.pkl'.


In [4]:
df_combined.shape

(12781321, 26)

In [12]:
import pandas as pd
from sklearn.impute import SimpleImputer
import joblib
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.sparse import csr_matrix

# Load the combined DataFrame from the pickle file
print("Loaded DataFrame shape:", df_combined.shape)

# Define features and target variables
features = ['Age', 'Gender', 'BP', 'Cholesterol', 'Heart Rate', 'Glucose', 'Insulin', 'BMI']
target_columns = ['Heart_Disease', 'Diabetes', 'Stroke', 'Fatty_Liver', 'Metabolic_Syndrome', 'Hypertension']

# Fill missing values for the features using SimpleImputer
imputer = SimpleImputer(strategy='mean')
df_combined[features] = imputer.fit_transform(df_combined[features])

# Ensure target columns are present; fill missing target values with 0
for target in target_columns:
    df_combined[target] = df_combined.get(target, 0).fillna(0)

# Prepare your features and target variables
y = df_combined[target_columns]
X = df_combined[features]

# Convert features DataFrame to a sparse matrix
X_sparse = csr_matrix(X.values)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_sparse, y, test_size=0.2, random_state=42)

# Initialize the XGBoost model
xgb_model = XGBRegressor(n_jobs=-1, verbose=True)

# Define a smaller parameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200],  # Reduced from 100, 300, 500 to 100, 200
    'learning_rate': [0.01, 0.05],  # Reduced from 0.01, 0.05, 0.1 to 0.01, 0.05
    'max_depth': [3, 5],  # Reduced from [3, 5, 7] to [3, 5]
}

# Set up RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=10, cv=3,
                                   scoring='neg_mean_squared_error', verbose=1, random_state=42)

# Fit the model with training data and enable verbosity
random_search.fit(X_train, y_train)

# Get the best model after tuning
best_model = random_search.best_estimator_

# Print the best parameters
print(f"Best parameters: {random_search.best_params_}")

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"MSE: {mse}, MAE: {mae}, R-squared: {r2}")

# Save the best model to a file
model_filename = '2nd_xgboost_best_model.pkl'
joblib.dump(best_model, model_filename)
print(f"Best model saved as {model_filename}.")


Loaded DataFrame shape: (12781321, 26)




Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "v

Best parameters: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05}
MSE: 0.046503002424643176, MAE: 0.10308372242778291, R-squared: 0.5468174815177917
Best model saved as 2nd_xgboost_best_model.pkl.


In [4]:
imputer = SimpleImputer(strategy='mean')
df_combined.fillna(0, inplace=True)


In [5]:
features = ['Age', 'Gender', 'BP', 'Cholesterol', 'Heart Rate', 'Glucose', 'Insulin', 'BMI']
df_combined['Heart_Disease'] = df_combined.get('Heart_Disease', 0)
df_combined['Diabetes'] = df_combined.get('Diabetes', 0)
df_combined['Stroke'] = df_combined.get('Stroke', 0)
df_combined['Fatty_Liver'] = df_combined.get('Fatty_Liver', 0)
df_combined['Metabolic_Syndrome'] = df_combined.get('Metabolic_Syndrome', 0)
df_combined['Hypertension'] = df_combined.get('Hypertension', 0)


In [8]:
import joblib
from sklearn.ensemble import RandomForestRegressor
y = df_combined[['Heart_Disease', 'Diabetes', 'Stroke', 'Fatty_Liver', 'Metabolic_Syndrome', 'Hypertension']]

# Handling categorical features (if any) and encoding
X = df_combined[features]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model for multi-label classification
model = RandomForestRegressor()
model.fit(X_train, y_train)


# Save the trained model to a file
model_filename = 'random_forest_model.pkl'
joblib.dump(model, model_filename)
print(f"Model saved as {model_filename}.")


Model saved as random_forest_model.pkl.


In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model using regression metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Mean Absolute Error (MAE): {mae:.4f}')
print(f'R² Score: {r2:.4f}')


Mean Squared Error (MSE): 0.0450
Mean Absolute Error (MAE): 0.0899
R² Score: 0.5540


In [13]:
# Function to predict diseases based on user input using regression
def predict_diseases(user_input, threshold=0.5):
    # Convert user input to DataFrame
    user_df = pd.DataFrame([user_input])
    
    # Handle missing values if any
    user_df.fillna(0, inplace=True)
    
    # Make predictions (continuous values)
    prediction = model.predict(user_df[features])
    
    # Map prediction results to disease names with a threshold
    diseases = ['Heart Disease', 'Diabetes', 'Stroke', 'Fatty Liver', 'Metabolic Syndrome', 'Hypertension']
    predicted_diseases = {disease: bool(pred > threshold) for disease, pred in zip(diseases, prediction[0])}
    
    return predicted_diseases

# Example user input
user_input = {
    'Age': 65,
    'Gender': 1,       # 1 for Male, 0 for Female (or as per your dataset encoding)
    'BP': 130,         # Blood Pressure
    'Cholesterol': 250,
    'Heart Rate': 80,
    'Glucose': 100,
    'Insulin': 30,
    'BMI': 25
}

# Predict diseases based on user input
result = predict_diseases(user_input)
print(result)


{'Heart Disease': True, 'Diabetes': False, 'Stroke': False, 'Fatty Liver': False, 'Metabolic Syndrome': False, 'Hypertension': True}


In [14]:
# Function to predict diseases based on user input using regression
def predict_diseases(user_input):
    # Convert user input to DataFrame
    user_df = pd.DataFrame([user_input])
    
    # Handle missing values if any
    user_df.fillna(0, inplace=True)
    
    # Make predictions (continuous values)
    prediction = model.predict(user_df[features])
    
    # Map prediction results to disease names
    diseases = ['Heart Disease', 'Diabetes', 'Stroke', 'Fatty Liver', 'Metabolic Syndrome', 'Hypertension']
    predicted_diseases = {disease: pred for disease, pred in zip(diseases, prediction[0])}
    
    # predicted_diseases = {disease: bool(pred > threshold) for disease, pred in zip(diseases, prediction[0])}

    
    return predicted_diseases

# Example user input
user_input = {
    'Age': 65,
    'Gender': 1,       # 1 for Male, 0 for Female (or as per your dataset encoding)
    'BP': 130,         # Blood Pressure
    'Cholesterol': 250,
    'Heart Rate': 80,
    'Glucose': 100,
    'Insulin': 30,
    'BMI': 25
}

# Predict diseases based on user input
result = predict_diseases(user_input)
print(result)


{'Heart Disease': 1.0, 'Diabetes': 0.0, 'Stroke': 0.0, 'Fatty Liver': 0.0, 'Metabolic Syndrome': 0.48417557455081783, 'Hypertension': 1.0}
