Heart_Disease_Deployment_Final

In [1]:
# 1. Install necessary packages:
!pip install Flask joblib
!pip install imbalanced-learn

# 2. Model Training and Saving:
import pandas as pd
import requests
import zipfile
import io
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import joblib

# Download and extract the dataset
# The archive is unpacked into the current working directory; the
# processed.*.data files read further down are expected to come from it.
f_zip = 'https://archive.ics.uci.edu/static/public/45/heart+disease.zip'
# A timeout keeps the run from hanging indefinitely on a stalled connection.
# (stream=True was dropped: the whole body is read via .content right away.)
r = requests.get(f_zip, timeout=60)
# Fail here with a clear HTTP error instead of a confusing BadZipFile below
# when the server answers with an error page rather than the archive.
r.raise_for_status()
# The context manager closes the archive once extraction has finished.
with zipfile.ZipFile(io.BytesIO(r.content)) as heart_disease_zip:
    heart_disease_zip.extractall()

# Column schema applied to every data file (the files carry no header row);
# the final column, 'class', is used as the prediction target further down.
col_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'class',
]
# Despite the variable name these are local file names, resolved against the
# current working directory.
urls = [
    'processed.cleveland.data',
    'processed.hungarian.data',
    'processed.switzerland.data',
    'processed.va.data',
]

# Read each file into its own frame, treating '?' as a missing value
dataFrames = [
    pd.read_csv(path, header=None, names=col_names, na_values='?')
    for path in urls
]

# Stack the per-file frames under a fresh 0..n-1 index, then discard every
# row that has at least one missing field
heart_disease_df = pd.concat(dataFrames, ignore_index=True).dropna()

# Separate the target column from the predictor columns
y = heart_disease_df['class']
X = heart_disease_df.drop(columns='class')

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions comparable between the two partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Oversample the training partition with SMOTE to address class imbalance;
# the test partition is left untouched
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Base estimator plus the hyper-parameter values to explore
# (4 * 4 * 3 * 3 = 144 combinations)
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced', 'balanced_subsample', None],
}

# Exhaustive search scored by accuracy with 5-fold cross-validation;
# n_jobs=-1 lets the search use every available core.
# NOTE(review): the search is fitted on the already-resampled training data,
# so the validation folds contain SMOTE-generated rows and the
# cross-validated scores may be optimistic - consider resampling inside the
# folds instead.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_res, y_train_res)

# Take the winning estimator from the search and score it on the held-out
# test partition
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# zero_division=0 reports 0 for a metric whose denominator is zero (e.g. a
# class that is never predicted)
report = classification_report(y_test, y_pred, zero_division=0)
accuracy = accuracy_score(y_test, y_pred)

# Show the chosen hyper-parameters followed by the test metrics
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:", report, sep="\n")

# Persist the fitted scaler and the model so the prediction step can reload
# them from disk
joblib.dump(best_model, 'heart_disease_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# 3. Performing Predictions:
import joblib
import pandas as pd

# Load the model and the scaler persisted by the training step
clf = joblib.load('heart_disease_model.pkl')
scaler = joblib.load('scaler.pkl')

# Load the test dataset (for demonstration purposes, using the same dataset).
# NOTE: processed.cleveland.data is one of the files the model was trained on,
# so predictions on these rows overlap the training data and any metric
# computed from them overstates real-world performance.
test = pd.read_csv('processed.cleveland.data', header=None, names=col_names, na_values='?')
test = test.dropna()

# Separate features and target for test data
# (this rebinds X_test / y_test from the training step above)
X_test = test.drop(columns='class')
y_test = test['class']

# Standardize the features with the statistics learned during training
X_test = scaler.transform(X_test)

# Make predictions
y_pred = clf.predict(X_test)

# Create a DataFrame pairing each true label with its prediction
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# A bare expression is only rendered when it is the last statement of a
# notebook cell (and never in a plain script), so print the preview explicitly.
print(predictions.head(10))

# 4. Evaluating Predictions:
from sklearn.metrics import accuracy_score, classification_report

# Score the demonstration predictions against their true labels.
# NOTE: y_test / y_pred at this point come from the Cleveland file loaded in
# section 3, which is also part of the training data.
report = classification_report(y_test, y_pred, zero_division=0)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:", report, sep="\n")


Best Parameters: {'class_weight': 'balanced_subsample', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Accuracy: 0.62
Classification Report:
              precision    recall  f1-score   support

           0       0.84      1.00      0.91        32
           1       0.25      0.18      0.21        11
           2       0.00      0.00      0.00         7
           3       0.33      0.43      0.38         7
           4       0.00      0.00      0.00         3

    accuracy                           0.62        60
   macro avg       0.29      0.32      0.30        60
weighted avg       0.53      0.62      0.57        60

Accuracy: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       160
           1       0.88      0.83      0.86        54
           2       0.88      0.80      0.84        35
           3       0.84      0.89      0.86        35
           4       0.91      0.77      0.83 