In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import joblib

# --- UCI Statlog (Heart) -----------------------------------------------------
# The file is whitespace-delimited and is read without a header row, so the
# column names are attached explicitly afterwards.
statlog_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat"
# Raw string: '\s' inside a regular string literal is an invalid escape sequence
# and makes recent Python versions emit a SyntaxWarning for this line.
statlog_df = pd.read_csv(statlog_url, sep=r'\s+', header=None)
statlog_columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                   'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
statlog_df.columns = statlog_columns
# Recode the target to 0/1: a raw value of 2 becomes 1, anything else becomes 0.
# NOTE(review): presumably 2 denotes "disease present" in Statlog - confirm against the dataset docs.
statlog_df['target'] = statlog_df['target'].apply(lambda x: 1 if x == 2 else 0)


# --- All source frames, keyed by the names used in `column_mappings` -----------
datasets = {
    # 'Heart_Disease_Prediction (1).csv': pd.read_csv('Heart_Disease_Prediction (1).csv'), # This file was not found
    'Cardiovascular_Disease_Dataset.csv': pd.read_csv('Cardiovascular_Disease_Dataset.csv'),
    'hear_LAPPt.csv': pd.read_csv('hear_LAPPt.csv'),
    'heart nandal.csv': pd.read_csv('heart nandal.csv'),
    'statlog': statlog_df
}


# Canonical schema every source is projected onto ('thal' is deliberately absent).
standard_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'target',
]

# Per-source translation tables: original column name -> canonical column name.
column_mappings = {
    'Heart_Disease_Prediction (1).csv': {
        'Age': 'age',
        'Sex': 'sex',
        'Chest pain type': 'cp',
        'BP': 'trestbps',
        'Cholesterol': 'chol',
        'FBS over 120': 'fbs',
        'EKG results': 'restecg',
        'Max HR': 'thalach',
        'Exercise angina': 'exang',
        'ST depression': 'oldpeak',
        'Slope of ST': 'slope',
        'Number of vessels fluro': 'ca',
        'Heart Disease': 'target',
    },
    'Cardiovascular_Disease_Dataset.csv': {
        'age': 'age',
        'gender': 'sex',
        'chestpain': 'cp',
        'restingBP': 'trestbps',
        'serumcholestrol': 'chol',
        'fastingbloodsugar': 'fbs',
        'restingrelectro': 'restecg',
        'maxheartrate': 'thalach',
        'exerciseangia': 'exang',
        'oldpeak': 'oldpeak',
        'slope': 'slope',
        'noofmajorvessels': 'ca',
        'target': 'target',
    },
    # Only the columns whose names differ from the canonical ones are listed here.
    'heart nandal.csv': {
        'trtbps': 'trestbps',
        'thalachh': 'thalach',
        'exng': 'exang',
        'slp': 'slope',
        'caa': 'ca',
        'output': 'target',
    },
    # This source already uses the canonical names: every column maps to itself.
    'hear_LAPPt.csv': {col: col for col in standard_columns},
}


# Bring each source onto the canonical column names, then keep only the
# canonical columns that the source actually provides.
for source_name in list(datasets):
    renamed = datasets[source_name].rename(columns=column_mappings.get(source_name, {}))
    present = [col for col in standard_columns if col in renamed.columns]
    datasets[source_name] = renamed[present]

# Stack all sources on a fresh 0..n-1 index and drop exact duplicate rows.
stacked = pd.concat(datasets.values(), ignore_index=True)
initial_rows = len(stacked)
combined_df = stacked.drop_duplicates()
print(f"Rows before removing duplicates: {initial_rows}, after: {combined_df.shape[0]}")


numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']
categorical_cols = ['cp', 'restecg', 'slope']
other_cols = ['sex', 'fbs', 'exang']

# Impute column by column: the column mean for numerical features, the most
# frequent value for every other feature.
for col in numerical_cols + categorical_cols + other_cols:
    column = combined_df[col]
    fill_value = column.mean() if col in numerical_cols else column.mode()[0]
    combined_df[col] = column.fillna(fill_value)


# Collapse the different label spellings into 1 / 0; any other value becomes NaN
# and the row is dropped.
target_codes = {'Presence': 1, 1: 1, '1': 1, 0: 0, '0': 0, 'Absence': 0}
combined_df['target'] = combined_df['target'].apply(lambda raw: target_codes.get(raw, np.nan))
combined_df = combined_df.dropna(subset=['target'])


# One-hot encode the categorical features (first level dropped) and rescale the
# numerical ones with a min-max scaler fitted on the same rows.
combined_df = pd.get_dummies(combined_df, columns=categorical_cols, drop_first=True)
scaler = MinMaxScaler()
scaler.fit(combined_df[numerical_cols])
combined_df[numerical_cols] = scaler.transform(combined_df[numerical_cols])
combined_df['target'] = combined_df['target'].astype(int)


X = combined_df.drop('target', axis=1)
y = combined_df['target']

# Split BEFORE ranking features. Ranking on all rows lets the held-out rows
# influence which features are kept, which makes the test metrics optimistic.
# The split arguments (test_size, random_state) are the same as before.
X_train_all, X_test_all, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8412)

# Throw-away forest used only to rank features; fitted on the training rows only.
rf_temp = RandomForestClassifier(n_estimators=300, max_depth=30, random_state=8412)
rf_temp.fit(X_train_all, y_train)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_temp.feature_importances_
}).sort_values(by='Importance', ascending=False)
top_features = feature_importance['Feature'].head(12).tolist()  # Top 12 features
print("Top 12 Features:", top_features)


# Restrict every matrix to the selected features, in ranking order.
X_top = X[top_features]
X_train = X_train_all[top_features]
X_test = X_test_all[top_features]


# Final model: class_weight='balanced' reweights classes by their frequency.
rf_model = RandomForestClassifier(n_estimators=300, max_depth=30, min_samples_split=2, class_weight='balanced', random_state=8412)
rf_model.fit(X_train, y_train)


# Predict and report split by split: confusion matrix first, then accuracy,
# precision, recall and F1 as percentages.
predictions = {}
for split_name, features, labels in (("Train", X_train, y_train), ("Test", X_test, y_test)):
    predicted = rf_model.predict(features)
    predictions[split_name] = predicted
    print(f"\nRandom Forest {split_name} Metrics (12 features):")
    print("Confusion Matrix:")
    print(confusion_matrix(labels, predicted))
    print(f"{split_name} Accuracy: {accuracy_score(labels, predicted) * 100:.2f}%")
    for metric_name, scorer in (("Precision", precision_score), ("Recall", recall_score), ("F1-Score", f1_score)):
        print(f"{metric_name}: {scorer(labels, predicted) * 100:.2f}%")

# Later steps read the predictions under these names.
y_pred_train = predictions["Train"]
y_pred_test = predictions["Test"]


# 5-fold cross-validated recall on the training split.
cv_recall_rf = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='recall')
print(f"CV Recall: {cv_recall_rf.mean():.4f} ± {cv_recall_rf.std():.4f}")


# Never ask for more rows than the test split holds: np.random.choice raises
# ValueError when replace=False and the sample is larger than the population.
sample_size = min(500, len(X_test))

# Accuracy on several random sub-samples of the test split (drawn without replacement).
seeds = [44, 57, 98, 76]
for seed in seeds:
    np.random.seed(seed)
    sample_indices = np.random.choice(X_test.index, sample_size, replace=False)
    X_sample = X_test.loc[sample_indices]
    y_sample_actual = y_test.loc[sample_indices]
    y_sample_pred = rf_model.predict(X_sample)
    print(f"Accuracy on {sample_size} samples (seed {seed}): {accuracy_score(y_sample_actual, y_sample_pred) * 100:.2f}%")


# Re-draw the seed-44 sample and store its actual vs. predicted labels.
np.random.seed(44)
sample_indices = np.random.choice(X_test.index, sample_size, replace=False)
X_sample = X_test.loc[sample_indices]
y_sample_actual = y_test.loc[sample_indices]
y_sample_pred = rf_model.predict(X_sample)
# Only preview columns that are among the selected features can be shown;
# indexing a column that was not selected would raise KeyError.
preview_cols = [col for col in ['thalach', 'oldpeak', 'ca'] if col in X_sample.columns]
comparison_df = X_sample[preview_cols].copy()
comparison_df['Actual'] = y_sample_actual  # aligned on the row index
comparison_df['Predicted'] = y_sample_pred  # positional; same row order as X_sample
comparison_df.to_csv('sample_predictions_500.csv', index=False)
print(f"\nAccuracy on {sample_size} samples: {accuracy_score(y_sample_actual, y_sample_pred) * 100:.2f}%")
print("\nSample Predictions (first 10):")
print(comparison_df.head(10))


# Persist the fitted model.
joblib.dump(rf_model, 'rf_model.pkl')

# Assemble the one-row metrics table: four scores (as percentages) per split,
# followed by the cross-validation recall and the test false-negative count.
scorers = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
]
splits = [('Train', y_train, y_pred_train), ('Test', y_test, y_pred_test)]
metrics = {
    f'{split_name} {metric_name}': scorer(actual, predicted) * 100
    for split_name, actual, predicted in splits
    for metric_name, scorer in scorers
}
metrics['CV Recall'] = cv_recall_rf.mean()
metrics['CV Recall Std'] = cv_recall_rf.std()
# Entry [1, 0] of the confusion matrix: actual class 1 predicted as class 0.
metrics['False Negatives'] = confusion_matrix(y_test, y_pred_test)[1, 0]
pd.DataFrame([metrics]).to_csv('model_results.csv', index=False)

  statlog_df = pd.read_csv(statlog_url, sep='\s+', header=None)


FileNotFoundError: [Errno 2] No such file or directory: 'Cardiovascular_Disease_Dataset.csv'

# Follow-up experiments: retraining, feature selection, and evaluation

In [None]:
# Rebuild the 12-feature matrices from the stored importance ranking and refit.
top_features = feature_importance['Feature'].iloc[:12].tolist()
X_top = X.loc[:, top_features]
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.3, random_state=8412)

rf_params = dict(
    n_estimators=300,
    max_depth=30,
    min_samples_split=2,
    class_weight='balanced',
    random_state=8412,
)
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# 5-fold cross-validated recall on the training split.
cv_recall_rf = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='recall')
print(f"CV Recall with 12 features: {cv_recall_rf.mean():.4f} ± {cv_recall_rf.std():.4f}")

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# X_train only holds the (at most 12) pre-selected columns, so asking RFE for 12
# features from it would eliminate nothing. Run the elimination over every
# feature in X instead, restricted to the training rows.
X_train_full = X.loc[X_train.index]

rf = RandomForestClassifier(n_estimators=300, random_state=8412)
rfe = RFE(estimator=rf, n_features_to_select=12)
rfe.fit(X_train_full, y_train)
# rfe.support_ is a boolean mask over the columns that were passed to fit().
selected_features = X_train_full.columns[rfe.support_].tolist()
print(selected_features)

In [None]:
# Accuracy recomputed by hand from a confusion matrix: the diagonal holds the
# correctly classified counts, the grand total is the number of samples.
cm = [[228, 29], [22, 283]]
correct = sum(cm[i][i] for i in range(len(cm)))
total = sum(sum(row) for row in cm)
accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")  # Output: 0.9093

In [None]:
np.random.seed(77)
# Cap the draw at the size of the test split: np.random.choice raises ValueError
# when replace=False and the sample is larger than the population.
sample_size = min(500, len(X_test))
sample_indices = np.random.choice(X_test.index, sample_size, replace=False)
X_sample = X_test.loc[sample_indices]
y_sample_actual = y_test.loc[sample_indices]
y_sample_pred = rf_model.predict(X_sample)

# Show only the preview columns that are actually present in the sample;
# indexing a missing column would raise KeyError.
available_features = [col for col in ['thalach', 'oldpeak', 'ca'] if col in X_sample.columns]
comparison_df = X_sample[available_features].copy()
comparison_df['Actual'] = y_sample_actual  # aligned on the row index
comparison_df['Predicted'] = y_sample_pred  # positional; same row order as X_sample

accuracy_sample = accuracy_score(y_sample_actual, y_sample_pred)
print(f"Accuracy on {sample_size} samples: {accuracy_sample * 100:.2f}%")
print("\nSample Predictions (first 10):")
print(comparison_df.head(10))
comparison_df.to_csv('sample_predictions_500.csv', index=False)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Test-split report: confusion matrix, then the four scores as percentages.
y_pred_rf = rf_model.predict(X_test)
print("\nRandom Forest Metrics (12 features):")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_rf) * 100:.2f}%")
print(f"Precision: {precision_score(y_test, y_pred_rf) * 100:.2f}%")
print(f"Recall: {recall_score(y_test, y_pred_rf) * 100:.2f}%")
print(f"F1-Score: {f1_score(y_test, y_pred_rf) * 100:.2f}%")

# Accuracy on several random sub-samples of the test split. The sample size is
# capped at the split size because np.random.choice raises ValueError when
# replace=False and the sample is larger than the population.
sample_size = min(500, len(X_test))
seeds = [44, 57, 98, 76]
for seed in seeds:
    np.random.seed(seed)
    sample_indices = np.random.choice(X_test.index, sample_size, replace=False)
    X_sample = X_test.loc[sample_indices]
    y_sample_actual = y_test.loc[sample_indices]
    y_sample_pred = rf_model.predict(X_sample)
    print(f"Accuracy on {sample_size} samples (seed {seed}): {accuracy_score(y_sample_actual, y_sample_pred) * 100:.2f}%")

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import joblib

# Add UCI Statlog dataset
# The file is whitespace-delimited and is read without a header row, so the
# column names are attached explicitly afterwards.
statlog_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat"
# Raw string: '\s' inside a regular string literal is an invalid escape sequence.
statlog_df = pd.read_csv(statlog_url, sep=r'\s+', header=None)
statlog_columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                   'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
statlog_df.columns = statlog_columns
# Recode the target to 0/1: a raw value of 2 becomes 1, anything else becomes 0.
# NOTE(review): presumably 2 denotes "disease present" in Statlog - confirm against the dataset docs.
statlog_df['target'] = statlog_df['target'].apply(lambda x: 1 if x == 2 else 0)

# Current datasets, keyed by the names used in `column_mappings`
datasets = {
    'Heart_Disease_Prediction (1).csv': pd.read_csv('Heart_Disease_Prediction (1).csv'),
    'Cardiovascular_Disease_Dataset.csv': pd.read_csv('Cardiovascular_Disease_Dataset.csv'),
    'hear_LAPPt.csv': pd.read_csv('hear_LAPPt.csv'),
    'heart nandal.csv': pd.read_csv('heart nandal.csv'),
    'statlog': statlog_df
}

# Standard columns (exclude thal): the canonical schema every source is projected onto.
standard_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'target',
]

# Per-source translation tables: original column name -> canonical column name.
column_mappings = {
    'Heart_Disease_Prediction (1).csv': {
        'Age': 'age',
        'Sex': 'sex',
        'Chest pain type': 'cp',
        'BP': 'trestbps',
        'Cholesterol': 'chol',
        'FBS over 120': 'fbs',
        'EKG results': 'restecg',
        'Max HR': 'thalach',
        'Exercise angina': 'exang',
        'ST depression': 'oldpeak',
        'Slope of ST': 'slope',
        'Number of vessels fluro': 'ca',
        'Heart Disease': 'target',
    },
    'Cardiovascular_Disease_Dataset.csv': {
        'age': 'age',
        'gender': 'sex',
        'chestpain': 'cp',
        'restingBP': 'trestbps',
        'serumcholestrol': 'chol',
        'fastingbloodsugar': 'fbs',
        'restingrelectro': 'restecg',
        'maxheartrate': 'thalach',
        'exerciseangia': 'exang',
        'oldpeak': 'oldpeak',
        'slope': 'slope',
        'noofmajorvessels': 'ca',
        'target': 'target',
    },
    # Only the columns whose names differ from the canonical ones are listed here.
    'heart nandal.csv': {
        'trtbps': 'trestbps',
        'thalachh': 'thalach',
        'exng': 'exang',
        'slp': 'slope',
        'caa': 'ca',
        'output': 'target',
    },
    # This source already uses the canonical names: every column maps to itself.
    'hear_LAPPt.csv': {col: col for col in standard_columns},
}

# Preprocess: bring each source onto the canonical column names, then keep only
# the canonical columns that the source actually provides.
for source_name in list(datasets):
    renamed = datasets[source_name].rename(columns=column_mappings.get(source_name, {}))
    present = [col for col in standard_columns if col in renamed.columns]
    datasets[source_name] = renamed[present]

# Stack all sources on a fresh 0..n-1 index and drop exact duplicate rows.
stacked = pd.concat(datasets.values(), ignore_index=True)
initial_rows = len(stacked)
combined_df = stacked.drop_duplicates()
print(f"Rows before removing duplicates: {initial_rows}, after: {combined_df.shape[0]}")

# Impute missing values column by column: the column mean for numerical
# features, the most frequent value for every other feature.
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']
categorical_cols = ['cp', 'restecg', 'slope']
other_cols = ['sex', 'fbs', 'exang']
for col in numerical_cols + categorical_cols + other_cols:
    column = combined_df[col]
    fill_value = column.mean() if col in numerical_cols else column.mode()[0]
    combined_df[col] = column.fillna(fill_value)

# Standardize target: collapse the different label spellings into 1 / 0; any
# other value becomes NaN and the row is dropped.
target_codes = {'Presence': 1, 1: 1, '1': 1, 0: 0, '0': 0, 'Absence': 0}
combined_df['target'] = combined_df['target'].apply(lambda raw: target_codes.get(raw, np.nan))
combined_df = combined_df.dropna(subset=['target'])

# Preprocessing: one-hot encode the categorical features (first level dropped)
# and rescale the numerical ones with a min-max scaler fitted on the same rows.
combined_df = pd.get_dummies(combined_df, columns=categorical_cols, drop_first=True)
scaler = MinMaxScaler()
scaler.fit(combined_df[numerical_cols])
combined_df[numerical_cols] = scaler.transform(combined_df[numerical_cols])
combined_df['target'] = combined_df['target'].astype(int)

# Feature selection
X = combined_df.drop('target', axis=1)
y = combined_df['target']

# Train-test split, done BEFORE ranking features. Ranking on all rows lets the
# held-out rows influence which features are kept, which makes the test metrics
# optimistic. The split arguments (test_size, random_state) are the same as before.
X_train_all, X_test_all, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8412)

# Throw-away forest used only to rank features; fitted on the training rows only.
rf_temp = RandomForestClassifier(n_estimators=200, max_depth=30, random_state=8412)
rf_temp.fit(X_train_all, y_train)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_temp.feature_importances_
}).sort_values(by='Importance', ascending=False)
top_features = feature_importance['Feature'].head(12).tolist()  # Top 12 features
print("Top 12 Features:", top_features)

# Restrict every matrix to the selected features, in ranking order.
X_top = X[top_features]
X_train = X_train_all[top_features]
X_test = X_test_all[top_features]

# Random Forest with best params
rf_model = RandomForestClassifier(n_estimators=200, max_depth=30, min_samples_split=2, class_weight='balanced', random_state=8412)
rf_model.fit(X_train, y_train)

# Train metrics, then test metrics: confusion matrix first, followed by
# accuracy, precision, recall and F1 as percentages.
predictions = {}
for split_name, features, labels in (("Train", X_train, y_train), ("Test", X_test, y_test)):
    predicted = rf_model.predict(features)
    predictions[split_name] = predicted
    print(f"\nRandom Forest {split_name} Metrics (12 features):")
    print("Confusion Matrix:")
    print(confusion_matrix(labels, predicted))
    print(f"{split_name} Accuracy: {accuracy_score(labels, predicted) * 100:.2f}%")
    for metric_name, scorer in (("Precision", precision_score), ("Recall", recall_score), ("F1-Score", f1_score)):
        print(f"{metric_name}: {scorer(labels, predicted) * 100:.2f}%")

# Later steps read the predictions under these names.
y_pred_train = predictions["Train"]
y_pred_test = predictions["Test"]

# CV Recall: 5-fold cross-validated recall on the training split.
cv_recall_rf = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='recall')
print(f"CV Recall: {cv_recall_rf.mean():.4f} ± {cv_recall_rf.std():.4f}")

# Sample accuracy (up to 500 samples)
# Never ask for more rows than the test split holds: np.random.choice raises
# ValueError when replace=False and the sample is larger than the population.
sample_size = min(500, len(X_test))

seeds = [44, 57, 98, 76]
for seed in seeds:
    np.random.seed(seed)
    sample_indices = np.random.choice(X_test.index, sample_size, replace=False)
    X_sample = X_test.loc[sample_indices]
    y_sample_actual = y_test.loc[sample_indices]
    y_sample_pred = rf_model.predict(X_sample)
    print(f"Accuracy on {sample_size} samples (seed {seed}): {accuracy_score(y_sample_actual, y_sample_pred) * 100:.2f}%")

# Save sample predictions: re-draw the seed-44 sample and store actual vs. predicted labels.
np.random.seed(44)
sample_indices = np.random.choice(X_test.index, sample_size, replace=False)
X_sample = X_test.loc[sample_indices]
y_sample_actual = y_test.loc[sample_indices]
y_sample_pred = rf_model.predict(X_sample)
# Only preview columns that are among the selected features can be shown;
# indexing a column that was not selected would raise KeyError.
preview_cols = [col for col in ['thalach', 'oldpeak', 'ca'] if col in X_sample.columns]
comparison_df = X_sample[preview_cols].copy()
comparison_df['Actual'] = y_sample_actual  # aligned on the row index
comparison_df['Predicted'] = y_sample_pred  # positional; same row order as X_sample
comparison_df.to_csv('sample_predictions_500.csv', index=False)
print(f"\nAccuracy on {sample_size} samples: {accuracy_score(y_sample_actual, y_sample_pred) * 100:.2f}%")
print("\nSample Predictions (first 10):")
print(comparison_df.head(10))

# Save model and metrics
joblib.dump(rf_model, 'rf_model.pkl')

# One-row metrics table: four scores (as percentages) per split, followed by
# the cross-validation recall and the test false-negative count.
scorers = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
]
splits = [('Train', y_train, y_pred_train), ('Test', y_test, y_pred_test)]
metrics = {
    f'{split_name} {metric_name}': scorer(actual, predicted) * 100
    for split_name, actual, predicted in splits
    for metric_name, scorer in scorers
}
metrics['CV Recall'] = cv_recall_rf.mean()
metrics['CV Recall Std'] = cv_recall_rf.std()
# Entry [1, 0] of the confusion matrix: actual class 1 predicted as class 0.
metrics['False Negatives'] = confusion_matrix(y_test, y_pred_test)[1, 0]
pd.DataFrame([metrics]).to_csv('model_results.csv', index=False)

In [None]:
# SHAP plot
import shap
import matplotlib.pyplot as plt

explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test)

# For a classifier the explainer returns one set of attributions per class:
# a list of 2-D arrays in older SHAP releases, a single 3-D array
# (rows, features, classes) in newer ones. Passing that straight to
# summary_plot does not give a per-feature plot for the positive class, so
# pick the second class explicitly.
# NOTE(review): assumes index 1 is the "disease" class, i.e. rf_model.classes_ == [0, 1].
if isinstance(shap_values, list):
    shap_values_positive = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values_positive = shap_values[:, :, 1]
else:
    shap_values_positive = shap_values

plt.figure()
shap.summary_plot(shap_values_positive, X_test, feature_names=top_features, show=False)
plt.show()


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# shap is included because the SHAP plot cell in this notebook imports it.
%pip install pandas numpy scikit-learn matplotlib seaborn shap

In [None]:
# Confusion matrix heatmap
import seaborn as sns

y_pred = rf_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
class_names = ['No Disease', 'Disease']

# Rows are the actual classes, columns the predicted ones; cells show raw counts.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')


In [None]:
# Feature importance bar plot
# Names and importances are paired by position, which relies on rf_model having
# been fitted on columns in `top_features` order.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features, rf_model.feature_importances_, color='skyblue')
ax.set_xlabel('Feature Importance')
ax.set_title('Top 12 Feature Importance')
ax.invert_yaxis()  # first feature at the top
plt.show()

In [None]:
# ROC curve
from sklearn.metrics import roc_curve, roc_auc_score

# Score each test row with the probability of the second class (column 1).
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
plt.show()