In [7]:
import pandas as pd

# Load the dataset
url = "https://drive.google.com/uc?export=download&id=1bGoIE4Z2kG5nyh-fGZAJ7LH0ki3UfmSJ"
df = pd.read_csv(url)


In [9]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [11]:
df.keys()

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Handle missing values
imputer = SimpleImputer(strategy="mean")
df_filled = imputer.fit_transform(df)

# Convert the filled array back to a DataFrame
df_filled = pd.DataFrame(df_filled, columns=df.columns)

# Encode categorical variables
categorical_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
label_encoder = LabelEncoder()

for col in categorical_cols:
    df_filled[col] = label_encoder.fit_transform(df_filled[col])

# Scale numerical features
numerical_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]
scaler = StandardScaler()

df_filled[numerical_cols] = scaler.fit_transform(df_filled[numerical_cols])


In [4]:
from sklearn.model_selection import train_test_split

# Split the dataset into features (X) and target (y)
X = df_filled.drop("target", axis=1)
y = df_filled["target"]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [20]:
y

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
298    0.0
299    0.0
300    0.0
301    0.0
302    0.0
Name: target, Length: 303, dtype: float64

In [21]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,0.952197,1,3,0.763956,-0.256334,1,0,0.015443,0,1.087338,0,0.0,1
1,-1.915313,1,2,-0.092738,0.072199,0,1,1.633471,0,2.122573,0,0.0,2
2,-1.474158,0,1,-0.092738,-0.816773,0,0,0.977514,0,0.310912,2,0.0,2
3,0.180175,1,1,-0.663867,-0.198357,0,1,1.239897,0,-0.206705,2,0.0,2
4,0.290464,0,0,-0.663867,2.082050,0,1,0.583939,1,-0.379244,2,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.290464,0,0,0.478391,-0.101730,0,1,-1.165281,1,-0.724323,1,0.0,3
299,-1.033002,1,3,-1.234996,0.342756,0,1,-0.771706,0,0.138373,1,0.0,3
300,1.503641,1,0,0.706843,-1.029353,1,1,-0.378132,0,2.036303,1,2.0,3
301,0.290464,1,0,-0.092738,-2.227533,0,1,-1.515125,1,0.138373,1,1.0,3


In [5]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_classifier.fit(X_train, y_train)


In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.8131868131868132
Precision: 0.8367346938775511
Recall: 0.82
F1 Score: 0.8282828282828283


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the grid search object
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy'
)

# Perform grid search to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


In [16]:
# Train a random forest classifier with the best hyperparameters
best_rf_classifier = RandomForestClassifier(**best_params)
best_rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_best = best_rf_classifier.predict(X_test)

# Calculate performance metrics
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best)
recall_best = recall_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)

print("Best Hyperparameters:", best_params)
print("Accuracy (Best):", accuracy_best)
print("Precision (Best):", precision_best)
print("Recall (Best):", recall_best)
print("F1 Score (Best):", f1_best)


Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy (Best): 0.8461538461538461
Precision (Best): 0.8461538461538461
Recall (Best): 0.88
F1 Score (Best): 0.8627450980392156


In [17]:
print("Accuracy (Default):", accuracy)
print("Precision (Default):", precision)
print("Recall (Default):", recall)
print("F1 Score (Default):", f1)

print("Accuracy (Best):", accuracy_best)
print("Precision (Best):", precision_best)
print("Recall (Best):", recall_best)
print("F1 Score (Best):", f1_best)


Accuracy (Default): 0.8131868131868132
Precision (Default): 0.8367346938775511
Recall (Default): 0.82
F1 Score (Default): 0.8282828282828283
Accuracy (Best): 0.8461538461538461
Precision (Best): 0.8461538461538461
Recall (Best): 0.88
F1 Score (Best): 0.8627450980392156


In [19]:
import numpy as np
import matplotlib.pyplot as plt

# Get feature importances
feature_importances = best_rf_classifier.feature_importances_
feature_names = X.columns

# Get indices of the top 2 most important features
top_features_indices = np.argsort(feature_importances)[-2:]

# Select the top 2 features
top_features = X.columns[top_features_indices]

# Create a scatter plot of the top 2 features
plt.figure(figsize=(8, 6))
plt.scatter(X_test[top_features[0]], X_test[top_features[1]], c=y_pred_best, cmap='coolwarm')
plt.xlabel(top_features[0])
plt.ylabel(top_features


SyntaxError: incomplete input (879179912.py, line 18)