In [None]:
import json
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# NOTE(review): this silences every warning for the rest of the notebook,
# including any emitted by scikit-learn during fitting. Consider narrowing it
# to specific warning categories once the noisy ones are identified.
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv('heart_failure_clinical_records_dataset.csv')

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

print("\nTarget Variable Distribution:")
print(df['DEATH_EVENT'].value_counts())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Basic statistics
print("\nBasic Statistics:")
print(df.describe())

# Feature Engineering and Data Preprocessing
# The target is the DEATH_EVENT column; every remaining column is a feature.
y = df['DEATH_EVENT']
X = df.drop(columns='DEATH_EVENT')

print(f"\nFeature columns: {list(X.columns)}")

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class ratio the same in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions (used by the scale-sensitive models).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Training and Evaluation
# Candidate classifiers, all seeded so repeated runs give the same results.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True)
}

# Models trained on standardized features; the tree ensembles use raw values.
scale_sensitive = ['Logistic Regression', 'SVM']

# Train and evaluate models
# results maps model name -> test accuracy, CV mean/std and the fitted model.
results = {}

print("\n" + "="*60)
print("MODEL EVALUATION RESULTS")
print("="*60)

for name, model in models.items():
    if name in scale_sensitive:
        train_features, test_features = X_train_scaled, X_test_scaled
        # Cross-validate on the raw training data with the scaler inside a
        # pipeline, so every fold fits its own scaler on that fold's training
        # part. Scoring on X_train_scaled instead would let each validation
        # fold influence the scaling statistics (preprocessing leakage).
        cv_estimator = make_pipeline(StandardScaler(), model)
    else:
        # Use original data for tree-based models
        train_features, test_features = X_train, X_test
        cv_estimator = model

    # cross_val_score works on clones of the estimator, so `model` itself is
    # only fitted by the explicit fit() call below.
    cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=5)

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    accuracy = accuracy_score(y_test, y_pred)
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    results[name] = {
        'accuracy': accuracy,
        'cv_mean': cv_mean,
        'cv_std': cv_std,
        'model': model
    }

    print(f"\n{name}:")
    print(f"Test Accuracy: {accuracy:.4f}")
    # Spread is reported as two standard deviations around the CV mean.
    print(f"CV Score: {cv_mean:.4f} (+/- {cv_std*2:.4f})")

# Find the best model
# Rank candidates by their cross-validated score on the training data.
# Choosing the winner by test accuracy would let the held-out set take part in
# model selection and make the reported test accuracy optimistically biased.
best_model_name = max(results, key=lambda candidate: results[candidate]['cv_mean'])
best_model = results[best_model_name]['model']
# Held-out accuracy of the selected model: reported, not used for selection.
best_accuracy = results[best_model_name]['accuracy']

print("\n" + "="*60)
print(f"BEST MODEL: {best_model_name}")
print(f"CV Score: {results[best_model_name]['cv_mean']:.4f}")
print(f"Test Accuracy: {best_accuracy:.4f}")
print("="*60)

# Hyperparameter tuning for the best performing model
# Search spaces exist only for the two tree ensembles; any other winner is
# kept exactly as trained above (grid_search stays None).
tuning_candidates = {
    'Random Forest': (
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        },
    ),
    'Gradient Boosting': (
        GradientBoostingClassifier(random_state=42),
        {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'min_samples_split': [2, 5, 10]
        },
    ),
}

if best_model_name in tuning_candidates:
    base_estimator, param_grid = tuning_candidates[best_model_name]
    grid_search = GridSearchCV(base_estimator, param_grid,
                               cv=5, scoring='accuracy', n_jobs=-1)
    # Both tunable models are trained on the unscaled features.
    grid_search.fit(X_train, y_train)
else:
    grid_search = None

# Use tuned model if hyperparameter tuning was performed
if grid_search is not None:
    final_model = grid_search.best_estimator_
    print(f"\nBest Parameters: {grid_search.best_params_}")

    # Evaluate tuned model (tree ensembles only, so always on unscaled data)
    final_predictions = final_model.predict(X_test)
    final_accuracy = accuracy_score(y_test, final_predictions)
    print(f"Tuned Model Accuracy: {final_accuracy:.4f}")

    # Decide between tuned and untuned on cross-validated accuracy rather than
    # on the test set, so the test accuracy stays an unbiased final estimate.
    baseline_cv = results[best_model_name]['cv_mean']
    tuned_cv = grid_search.best_score_
    print(f"Tuned CV Score: {tuned_cv:.4f} (untuned CV Score: {baseline_cv:.4f})")

    if tuned_cv > baseline_cv:
        best_model = final_model
        best_accuracy = final_accuracy
        print("Tuned model is better!")
    else:
        print("Original model is better!")

# Feature importance (if available)
# Estimators without a feature_importances_ attribute skip this report.
if hasattr(best_model, 'feature_importances_'):
    importance_table = pd.DataFrame(
        {'feature': X.columns, 'importance': best_model.feature_importances_}
    )
    # Most influential features first.
    feature_importance = importance_table.sort_values(by='importance', ascending=False)

    print(f"\nFeature Importance ({best_model_name}):")
    print(feature_importance)

# Final evaluation
banner = "="*60
print(f"\n" + banner)
print("FINAL MODEL PERFORMANCE")
print(banner)

# Logistic Regression and SVM were trained on standardized features, so they
# must be scored on the standardized test set; every other model uses raw data.
uses_scaled_inputs = best_model_name in ['Logistic Regression', 'SVM']
final_test_features = X_test_scaled if uses_scaled_inputs else X_test
final_predictions = best_model.predict(final_test_features)

print(f"Final Model: {best_model_name}")
print(f"Final Accuracy: {best_accuracy:.4f}")
print(f"Accuracy > 80%: {'YES' if best_accuracy > 0.8 else 'NO'}")

# Per-class precision/recall/F1 on the held-out rows.
print(f"\nClassification Report:")
print(classification_report(y_test, final_predictions))

# Counts of true vs. predicted labels on the held-out rows.
print(f"\nConfusion Matrix:")
print(confusion_matrix(y_test, final_predictions))

# Save the model and scaler
# The scaler is written even when the selected model does not need it; the
# 'use_scaling' flag below tells the consumer whether to apply it.
joblib.dump(best_model, 'heart_failure_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

print("\nModel saved as 'heart_failure_model.pkl'")
print("Scaler saved as 'scaler.pkl'")

# Save model info for Flask app
model_info = {
    'model_name': best_model_name,
    'accuracy': best_accuracy,
    # Column order the model was trained with.
    'features': list(X.columns),
    'use_scaling': best_model_name in ['Logistic Regression', 'SVM']
}

# json is imported with the other modules at the top of the notebook.
with open('model_info.json', 'w') as f:
    json.dump(model_info, f)

print("Model info saved as 'model_info.json'")

Dataset Shape: (299, 13)

First few rows:
    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           116    0   

   smoking  time  DEATH_EVENT  
0        0     4            1 

In [None]:
import joblib


In [None]:
# Persist the selected estimator. `model` was only the leftover variable from
# the training loop (the last entry of `models`, i.e. the SVM), not the model
# that was chosen as best, so dumping it saved the wrong estimator.
joblib.dump(best_model, 'model.pkl')


['model.pkl']

In [None]:
# Send the pickled model from the Colab runtime to the browser as a download.
from google.colab import files as colab_files

colab_files.download('model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Send the fitted scaler from the Colab runtime to the browser as a download.
from google.colab import files as colab_files

colab_files.download('scaler.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Send the model metadata file from the Colab runtime to the browser as a download.
from google.colab import files as colab_files

colab_files.download('model_info.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Send the selected model's pickle from the Colab runtime to the browser as a download.
from google.colab import files as colab_files

colab_files.download('heart_failure_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>