In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef
from sklearn.inspection import permutation_importance

In [2]:
df = pd.read_csv("heart_disease.csv")
df.head()

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,Diseased
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [3]:
df.replace('?', np.nan, inplace=True)
df = df.apply(pd.to_numeric, errors='coerce')

In [4]:
df.corr()

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,Diseased
age,1.0,-0.097542,0.104139,0.284946,0.20895,0.11853,0.148868,-0.393806,0.091661,0.203805,0.16177,0.362605,0.127389,0.22312
sex,-0.097542,1.0,0.010084,-0.064456,-0.199915,0.047862,0.021647,-0.048663,0.146201,0.102173,0.037533,0.093185,0.380936,0.276816
cp,0.104139,0.010084,1.0,-0.036077,0.072319,-0.039975,0.067505,-0.334422,0.38406,0.202277,0.15205,0.233214,0.265246,0.414446
restbp,0.284946,-0.064456,-0.036077,1.0,0.13012,0.17534,0.14656,-0.045351,0.064762,0.189171,0.117382,0.098773,0.133554,0.150825
chol,0.20895,-0.199915,0.072319,0.13012,1.0,0.009841,0.171043,-0.003432,0.06131,0.046564,-0.004062,0.119,0.014214,0.085164
fbs,0.11853,0.047862,-0.039975,0.17534,0.009841,1.0,0.069564,-0.007854,0.025665,0.005747,0.059894,0.145478,0.071358,0.025264
restecg,0.148868,0.021647,0.067505,0.14656,0.171043,0.069564,1.0,-0.083389,0.084867,0.114133,0.133946,0.128343,0.024531,0.169202
thalach,-0.393806,-0.048663,-0.334422,-0.045351,-0.003432,-0.007854,-0.083389,1.0,-0.378103,-0.343085,-0.385601,-0.264246,-0.279631,-0.417167
exang,0.091661,0.146201,0.38406,0.064762,0.06131,0.025665,0.084867,-0.378103,1.0,0.288223,0.257748,0.14557,0.32968,0.431894
oldpeak,0.203805,0.102173,0.202277,0.189171,0.046564,0.005747,0.114133,-0.343085,0.288223,1.0,0.577537,0.295832,0.341004,0.42451


In [5]:
df.isnull().sum()

age         0
sex         0
cp          0
restbp      0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
Diseased    0
dtype: int64

In [6]:
df['ca'].unique()

array([ 0.,  3.,  2.,  1., nan])

In [7]:
df['thal'].unique()

array([ 6.,  3.,  7., nan])

In [8]:
mode_thal = df['thal'].mode()[0]
mode_ca = df['ca'].mode()[0]

In [9]:
df['thal'].fillna(mode_thal,inplace=True)
df['ca'].fillna(mode_ca,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['thal'].fillna(mode_thal,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ca'].fillna(mode_ca,inplace=True)


In [10]:
df['ca'].unique()

array([0., 3., 2., 1.])

In [11]:
df['thal'].unique()

array([6., 3., 7.])

In [12]:
X = df[['cp', 'thalach', 'exang', 'oldpeak', 'ca', 'thal']]
y = df['Diseased']

In [13]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X)

In [14]:
X

Unnamed: 0,0,1,2,3,4,5
0,-2.251775,0.017197,-0.696631,1.087338,-0.711131,0.660004
1,0.877985,-1.821905,1.435481,0.397182,2.504881,-0.890238
2,0.877985,-0.902354,1.435481,1.346147,1.432877,1.176752
3,-0.165268,1.637359,-0.696631,2.122573,-0.711131,-0.890238
4,-1.208521,0.980537,-0.696631,0.310912,-0.711131,-0.890238
...,...,...,...,...,...,...
298,-2.251775,-0.770990,-0.696631,0.138373,-0.711131,1.176752
299,0.877985,-0.376896,-0.696631,2.036303,1.432877,1.176752
300,0.877985,-1.515388,1.435481,0.138373,0.360873,1.176752
301,-1.208521,1.068113,-0.696631,-0.896862,0.360873,-0.890238


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
rf = RandomForestClassifier(random_state=42, oob_score=True)
rf.fit(X_train, y_train)

In [17]:
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)[:, 1]

In [18]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
mcc = matthews_corrcoef(y_test, y_pred)

print("Random Forest Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")
print(f"MCC: {mcc:.4f}")

Random Forest Performance:
Accuracy: 0.8361
Precision: 0.8667
Recall: 0.8125
F1-Score: 0.8387
AUC: 0.9359
MCC: 0.6738


In [19]:
param_dist = {'n_estimators': np.arange(50, 500, 50)}
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=5, cv=3, random_state=42)
random_search.fit(X_train, y_train)
print(f"Best n_estimators: {random_search.best_params_['n_estimators']}")

Best n_estimators: 50


In [20]:
importances = rf.feature_importances_
feature_names = ['cp', 'thalach', 'exang', 'oldpeak', 'ca', 'thal']
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(feature_importance_df)

Feature Importance:
   Feature  Importance
1  thalach    0.236615
3  oldpeak    0.218052
0       cp    0.171048
4       ca    0.157815
5     thal    0.149897
2    exang    0.066574


In [21]:
oob_score = rf.oob_score_
print(f"OOB Score: {oob_score:.4f}")

OOB Score: 0.7645


In [22]:
n_estimators_list = [50, 100, 200, 300, 400]
for n in n_estimators_list:
    rf = RandomForestClassifier(n_estimators=n, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"n_estimators={n}, Accuracy: {accuracy:.4f}")

n_estimators=50, Accuracy: 0.8525
n_estimators=100, Accuracy: 0.8361
n_estimators=200, Accuracy: 0.8361
n_estimators=300, Accuracy: 0.8525
n_estimators=400, Accuracy: 0.8525
