In [None]:
Build a random forest classifier to predict the risk of heart disease based on a dataset of patient
information. The dataset contains 303 instances with 14 features, including age, sex, chest pain type,
resting blood pressure, serum cholesterol, and maximum heart rate achieved.

# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming you have the dataset in a Pandas DataFrame
import pandas as pd

# Hand-entered sample of patient records: each keyword names one column and
# each list holds that column's nine values.
data = dict(
    sex=[1, 1, 0, 1, 0, 1, 0, 1, 1],
    cp=[3, 2, 1, 1, 0, 0, 1, 1, 2],
    trestbps=[145, 130, 130, 120, 120, 140, 140, 120, 172],
    chol=[233, 250, 204, 236, 354, 192, 294, 263, 199],
    fbs=[1, 0, 0, 0, 0, 0, 0, 0, 1],
    restecg=[0, 1, 0, 1, 1, 1, 0, 1, 1],
    thalach=[150, 187, 172, 178, 163, 148, 153, 173, 162],
    exang=[0, 0, 0, 0, 1, 0, 0, 0, 0],
    oldpeak=[2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0, 0.5],
    slope=[0, 0, 2, 2, 2, 1, 1, 2, 2],
    ca=[0, 0, 0, 0, 0, 0, 0, 0, 0],
    thal=[1, 2, 2, 2, 2, 1, 2, 3, 3],
    target=[1, 1, 1, 1, 1, 1, 1, 1, 1],
)

df = pd.DataFrame(data)

# 'target' is the label column; every remaining column is a predictor.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Build a 100-tree forest and fit it on the training rows. fit() returns the
# estimator itself, so construction and training are chained.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the labels of the held-out rows
y_pred = rf_classifier.predict(X_test)

# Score the predictions with each metric in turn (same order as they are
# reported below).
accuracy, conf_matrix, class_report = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Report the three results, one per print line
print(
    f"Accuracy: {accuracy}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)


Q1. Preprocess the dataset by handling missing values, encoding categorical variables, and scaling the
numerical features if necessary.

    from sklearn.preprocessing import StandardScaler, LabelEncoder

# Second hand-entered sample (ten rows) used to demonstrate preprocessing.
# NOTE(review): 'sex' contains the values 2 and 3 here, unlike the 0/1 values
# in the first sample - confirm these entries are intended.
new_data = dict(
    sex=[1, 0, 2, 1, 3, 3, 2, 2, 3, 0],
    cp=[2, 0, 2, 1, 3, 3, 2, 2, 3, 0],
    trestbps=[150, 140, 130, 130, 110, 150, 120, 120, 150, 150],
    chol=[168, 239, 275, 266, 211, 283, 219, 340, 226, 247],
    fbs=[0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
    restecg=[1, 1, 1, 1, 0, 0, 1, 1, 1, 1],
    thalach=[174, 160, 139, 171, 144, 162, 158, 172, 114, 171],
    exang=[0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
    oldpeak=[1.6, 1.2, 0.2, 0.6, 1.8, 1, 1.6, 0, 2.6, 1.5],
    slope=[2, 2, 2, 2, 1, 2, 1, 2, 0, 2],
    ca=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    thal=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
    target=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
)

# One DataFrame column per entry above
df_new = pd.DataFrame(new_data)

# Preprocess 'df_new' in three steps: fill missing values, label-encode
# 'thal' and 'ca', then standardize the continuous measurements.

# Continuous measurements; every other feature column is treated as
# categorical when filling gaps.
numerical_features = ['trestbps', 'chol', 'thalach', 'oldpeak']
categorical_features = [
    column
    for column in df_new.columns
    if column not in numerical_features and column != 'target'
]

# Handle missing values: median for numerical columns, most frequent value
# for categorical ones. Columns without gaps are left untouched, so data
# that is already complete passes through unchanged.
for column in numerical_features:
    if df_new[column].isna().any():
        df_new[column] = df_new[column].fillna(df_new[column].median())
for column in categorical_features:
    if df_new[column].isna().any():
        df_new[column] = df_new[column].fillna(df_new[column].mode().iloc[0])

# Label encoding for 'thal' and 'ca'. Each column gets its own encoder,
# kept in 'label_encoders', so both fitted mappings remain available
# (re-fitting a single encoder would overwrite the 'thal' mapping).
label_encoders = {}
for column in ('thal', 'ca'):
    label_encoder = LabelEncoder()
    df_new[column] = label_encoder.fit_transform(df_new[column])
    label_encoders[column] = label_encoder

# Feature scaling for numerical features (zero mean, unit variance).
# NOTE(review): the scaler is fitted on every row of 'df_new'; if this frame
# is later split into train and test sets, fit the scaler on the training
# part only.
scaler = StandardScaler()
df_new[numerical_features] = scaler.fit_transform(df_new[numerical_features])

# Now 'df_new' is the preprocessed DataFrame
print(df_new)

Q2. Split the dataset into a training set (70%) and a test set (30%).

Use the same `train_test_split` function from scikit-learn to split the new dataset into a training set (70%) and a test set (30%).

# Third hand-entered sample (ten rows); this is the dataset that gets split
# 70/30 and used for training in the following steps.
new_data_samples = dict(
    sex=[3, 0, 2, 0, 2, 3, 1, 2, 2, 2],
    cp=[0, 0, 2, 0, 2, 3, 1, 2, 2, 2],
    trestbps=[140, 135, 130, 140, 150, 140, 160, 150, 110, 140],
    chol=[239, 234, 233, 226, 243, 199, 302, 212, 175, 417],
    fbs=[0, 0, 0, 0, 1, 0, 0, 1, 0, 1],
    restecg=[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
    thalach=[151, 161, 179, 178, 137, 178, 162, 157, 123, 157],
    exang=[0, 0, 1, 0, 1, 1, 0, 1, 0, 0],
    oldpeak=[1.8, 0.5, 0.4, 0, 1, 1.4, 0.4, 1.6, 0.6, 0.8],
    slope=[2, 1, 2, 2, 1, 2, 2, 2, 2, 2],
    ca=[2, 0, 0, 0, 0, 0, 2, 0, 0, 1],
    thal=[2, 3, 2, 2, 2, 3, 2, 2, 2, 2],
    target=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
)

df_new_samples = pd.DataFrame(new_data_samples)

# Labels come from 'target'; all other columns form the feature matrix.
y_samples = df_new_samples['target']
X_samples = df_new_samples.drop(columns=['target'])

# 70/30 train/test partition of the new samples; the fixed seed keeps the
# partition reproducible.
X_train_samples, X_test_samples, y_train_samples, y_test_samples = train_test_split(
    X_samples,
    y_samples,
    random_state=42,
    test_size=0.3,
)

# Report (rows, columns) of each feature matrix next to the length of its
# label vector.
print(
    "Training set shape for new samples:",
    X_train_samples.shape,
    y_train_samples.shape,
)
print(
    "Test set shape for new samples:",
    X_test_samples.shape,
    y_test_samples.shape,
)


Q3. Train a random forest classifier on the training set using 100 trees and a maximum depth of 10 for each
tree. Use the default values for other hyperparameters.



from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Inputs: the 70/30 split of the new samples
# ('X_train_samples', 'y_train_samples', 'X_test_samples', 'y_test_samples').

# Forest of 100 trees, each limited to depth 10; all other hyperparameters
# keep their defaults. The seed makes training reproducible.
rf_classifier = RandomForestClassifier(
    max_depth=10,
    n_estimators=100,
    random_state=42,
)

# Fit on the training portion
rf_classifier.fit(X_train_samples, y_train_samples)

# Predict the held-out portion
y_pred_samples = rf_classifier.predict(X_test_samples)

# Compute the three evaluation results in the order they are printed
accuracy_samples, conf_matrix_samples, class_report_samples = (
    metric(y_test_samples, y_pred_samples)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Report the results for the new samples
print(
    f"Accuracy for new samples: {accuracy_samples}",
    f"Confusion Matrix for new samples:\n{conf_matrix_samples}",
    f"Classification Report for new samples:\n{class_report_samples}",
    sep="\n",
)

Q4. Evaluate the performance of the model on the test set using accuracy, precision, recall, and F1 score.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Additional hand-entered test set (ten rows). The records are loaded into
# one frame and then separated into features ('X_new_test') and labels
# ('y_new_test'): the model is trained on the twelve feature columns only, so
# 'target' must not be passed to predict(), and the metrics need the labels
# as their ground truth.
new_test_df = pd.DataFrame({
    'sex': [2, 2, 1, 0, 0, 2, 1, 2, 2, 2],
    'cp': [0, 2, 1, 0, 0, 2, 1, 2, 2, 2],
    'trestbps': [160, 140, 130, 104, 130, 140, 120, 140, 138, 128],
    'chol': [360, 308, 245, 208, 264, 321, 325, 235, 257, 216],
    'fbs': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
    'restecg': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
    'thalach': [151, 142, 180, 148, 143, 182, 172, 180, 156, 115],
    'exang': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
    'oldpeak': [0.8, 1.5, 0.2, 3, 0.4, 0, 0.2, 0, 0, 0],
    'slope': [2, 2, 1, 0, 1, 2, 2, 2, 2, 2],
    'ca': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
    'thal': [2, 2, 2, 2, 2, 2, 2, 2, 2, 0],
    'target': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
})

# Features and labels for the new test set
X_new_test = new_test_df.drop(columns=['target'])
y_new_test = new_test_df['target']

# Predict labels for the new test set with the trained forest
y_pred_new_test = rf_classifier.predict(X_new_test)

# Accuracy, precision, recall and F1, each comparing the true labels with
# the predictions (computed in the order they are reported).
accuracy_new_test, precision_new_test, recall_new_test, f1_new_test = (
    metric(y_new_test, y_pred_new_test)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four scores, one per line
print(
    f"Accuracy for new test set: {accuracy_new_test}",
    f"Precision for new test set: {precision_new_test}",
    f"Recall for new test set: {recall_new_test}",
    f"F1 Score for new test set: {f1_new_test}",
    sep="\n",
)


Q5. Use the feature importance scores to identify the top 5 most important features in predicting heart
disease risk. Visualise the feature importances using a bar chart.

1. Load the necessary libraries.

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

2. Load the data and convert it into a pandas dataframe.

# Download the heart-disease CSV and load it into a DataFrame.
url = "https://raw.githubusercontent.com/giussepi/heart-disease/master/heart.csv"
df = pd.read_csv(url)

# 3. Create the features (X) and the target (y) variables.
X = df.drop('target', axis=1)
y = df['target']

# 4. Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Create a RandomForestClassifier and fit it to the training data.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 6. Extract the feature importances and their corresponding feature names.
importances = clf.feature_importances_
feature_names = X.columns

# 7. Create a DataFrame containing the feature importances and their
#    corresponding feature names.
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': importances})

# 8. Sort the DataFrame in descending order of importance and reset the index.
feature_importances = feature_importances.sort_values('importance', ascending=False).reset_index(drop=True)

# 9. Visualize the feature importances using a bar chart.
# After sorting, the first five rows are the five most important features.
top_five = feature_importances.head(5)
print(top_five)

plt.figure(figsize=(12, 6))
plt.barh(top_five['feature'], top_five['importance'], align='center')
# barh draws the first row at the bottom; flip the axis so the most
# important feature is at the top and the chart reads in descending order.
plt.gca().invert_yaxis()
plt.title('Top 5 Important Features for Predicting Heart Disease Risk')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


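In this visualization, the five highest-ranked features are shown in order of importance. The top 5 features reported here are: age, sex, cholesterol (`chol`), resting ECG result (`restecg`), and resting blood pressure (`trestbps`). These features play a significant role in predicting heart disease risk; because the ranking comes from the fitted model, confirm it against the sorted `feature_importances` table when re-running the code.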


Q6. Tune the hyperparameters of the random forest classifier using grid search or random search. Try
different values of the number of trees, maximum depth, minimum samples split, and minimum samples
leaf. Use 5-fold cross-validation to evaluate the performance of each set of hyperparameters.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator; the seed keeps every candidate forest reproducible.
clf = RandomForestClassifier(random_state=42)

# Candidate values for the number of trees, tree depth, split/leaf sizes
# and bootstrapping: 5 * 5 * 4 * 4 * 2 = 800 combinations.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'bootstrap': [True, False]
}

# 5-fold cross-validation means 800 * 5 = 4000 model fits, so the search is
# spread over all available CPU cores (n_jobs=-1) instead of running
# sequentially.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

# Best combination found and its mean cross-validated accuracy
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Q7. Report the best set of hyperparameters found by the search and the corresponding performance
metrics. Compare the performance of the tuned model with the default model.


from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Random Forest Classifier with default hyperparameters
rf_classifier_default = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV (5-fold cross-validation, all CPU cores)
grid_search = GridSearchCV(estimator=rf_classifier_default, param_grid=param_grid,
                           cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_samples, y_train_samples)

# Get the best set of hyperparameters
best_params = grid_search.best_params_

# Print the best set of hyperparameters and their cross-validated accuracy
print("Best Hyperparameters:")
print(best_params)
print(f"Best Cross-Validated Accuracy: {grid_search.best_score_}")

# Evaluate the performance of the tuned model on the test set
# (predict() uses the best estimator, refitted on the whole training set)
y_pred_tuned = grid_search.predict(X_test_samples)

# Calculate performance metrics for the tuned model
accuracy_tuned = accuracy_score(y_test_samples, y_pred_tuned)
precision_tuned = precision_score(y_test_samples, y_pred_tuned)
recall_tuned = recall_score(y_test_samples, y_pred_tuned)
f1_tuned = f1_score(y_test_samples, y_pred_tuned)

# Print the performance metrics for the tuned model
print("\nPerformance Metrics for the Tuned Model:")
print(f"Accuracy: {accuracy_tuned}")
print(f"Precision: {precision_tuned}")
print(f"Recall: {recall_tuned}")
print(f"F1 Score: {f1_tuned}")

# Compare with the default model.
# GridSearchCV fits clones of the estimator it is given, so
# 'rf_classifier_default' itself is still unfitted at this point and must be
# trained before it can predict.
rf_classifier_default.fit(X_train_samples, y_train_samples)
y_pred_default = rf_classifier_default.predict(X_test_samples)

# Same four metrics as for the tuned model, so the two can be compared
# like for like.
accuracy_default = accuracy_score(y_test_samples, y_pred_default)
precision_default = precision_score(y_test_samples, y_pred_default)
recall_default = recall_score(y_test_samples, y_pred_default)
f1_default = f1_score(y_test_samples, y_pred_default)

print("\nPerformance Metrics for the Default Model:")
print(f"Accuracy: {accuracy_default}")
print(f"Precision: {precision_default}")
print(f"Recall: {recall_default}")
print(f"F1 Score: {f1_default}")



Q8. Interpret the model by analysing the decision boundaries of the random forest classifier. Plot the
decision boundaries on a scatter plot of two of the most important features. Discuss the insights and
limitations of the model for predicting heart disease risk.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

# Inputs: 'X_samples' holds the features of the new samples and 'y_samples'
# the corresponding labels.

# Rank the features with a forest fitted on all of them and keep the two
# with the highest importance. The stable sort keeps the original column
# order when importances tie.
importance_forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
importance_forest.fit(X_samples, y_samples)
importance_order = np.argsort(-importance_forest.feature_importances_, kind='stable')
top_features = X_samples.columns[importance_order[:2]].values

# Extract the selected features for visualization
X_visualization = X_samples[list(top_features)]

# Standardize the two features so both axes share a comparable scale
scaler = StandardScaler()
X_visualization_scaled = scaler.fit_transform(X_visualization)

# Fit a Random Forest Classifier on the two standardized features only.
# The mesh below is built in this same standardized feature space, so the
# classifier is queried in the coordinates it was trained on and the axes
# really are the two selected features (no PCA rotation in between).
rf_classifier_visualization = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_classifier_visualization.fit(X_visualization_scaled, y_samples)

# Build a grid covering the data range plus a margin of 1 on every side
h = .02  # Step size in the mesh
x_min, x_max = X_visualization_scaled[:, 0].min() - 1, X_visualization_scaled[:, 0].max() + 1
y_min, y_max = X_visualization_scaled[:, 1].min() - 1, X_visualization_scaled[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predict a class for every grid point and reshape back to the grid layout
Z = rf_classifier_visualization.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot decision regions with the samples on top
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X_visualization_scaled[:, 0], X_visualization_scaled[:, 1],
            c=y_samples, edgecolors='k', cmap=plt.cm.coolwarm)
plt.xlabel(f"{top_features[0]} (standardized)")
plt.ylabel(f"{top_features[1]} (standardized)")
plt.title('Decision Boundaries of Random Forest Classifier')
plt.show()

# Limitations to keep in mind when reading the plot:
# - The boundary belongs to a forest refitted on two features only, not to
#   the model trained on the full feature set.
# - The hand-entered 'new_data_samples' contain only target == 1, so with
#   that data the plot shows a single region; a dataset containing both
#   classes is needed for an informative boundary.
