In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("data/heart.csv")

In [3]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [5]:
df['ChestPainType'].value_counts()

ChestPainType
ASY    496
NAP    203
ATA    173
TA      46
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder

def convert_to_num(dfname):
    le = LabelEncoder()
    dfname['Sex'] = le.fit_transform(dfname['Sex'])
    dfname['ChestPainType'] = le.fit_transform(dfname['ChestPainType'])
    dfname['RestingECG'] = le.fit_transform(dfname['RestingECG'])
    dfname['ExerciseAngina'] = le.fit_transform(dfname['ExerciseAngina'])
    dfname['ST_Slope'] = le.fit_transform(dfname['ST_Slope'])

convert_to_num(df)
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,110,264,0,1,132,0,1.2,1,1
914,68,1,0,144,193,1,1,141,0,3.4,1,1
915,57,1,0,130,131,0,1,115,1,1.2,1,1
916,57,0,1,130,236,0,0,174,0,0.0,1,1


In [7]:
X = df.drop('HeartDisease',axis=1)

In [8]:
y = df['HeartDisease']

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
model = RandomForestClassifier()

In [13]:
model.fit(X_train,y_train)

In [14]:
model.score(X_test,y_test) * 100

90.21739130434783

In [15]:
from sklearn.model_selection import cross_val_score

In [16]:
cross_val_score(model,X,y,cv=10).mean()

0.8582417582417582

In [17]:
from sklearn.metrics import classification_report

In [18]:
y_preds = model.predict(X_test)

In [19]:
pd.DataFrame(classification_report(y_test,y_preds,output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.901235,0.902913,0.902174,0.902074,0.902156
recall,0.879518,0.920792,0.902174,0.900155,0.902174
f1-score,0.890244,0.911765,0.902174,0.901004,0.902057
support,83.0,101.0,0.902174,184.0,184.0


In [20]:
columns = ['Age','Sex','ChestPainType','RestingBP','Cholesterol','FastingBS','RestingECG','MaxHR','ExerciseAngina','Oldpeak','ST_Slope']

new_data = pd.DataFrame([[54,'F','NAP',130,250,0,'ST',160,'Y',2.5,'Flat']], columns=columns)


In [21]:
convert_to_num(new_data)

In [22]:
new_data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,54,0,0,130,250,0,0,160,0,2.5,0


In [23]:
model.predict(new_data)

array([1], dtype=int64)

In [24]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [25]:
def all_score(y_test,y_preds):
    from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
    acc = accuracy_score(y_test,y_preds)
    recall = recall_score(y_test,y_preds)
    pres = precision_score(y_test,y_preds)
    f1 = f1_score(y_test,y_preds)
    met_dict = {
        "accuracy score": acc*100,
        "Recall":recall,
        "Precision":pres,
        "f1 score":f1
    }
    return met_dict

In [26]:
all_score(y_test,y_preds)

{'accuracy score': 90.21739130434783,
 'Recall': 0.9207920792079208,
 'Precision': 0.9029126213592233,
 'f1 score': 0.9117647058823529}

In [27]:
from sklearn.model_selection import RandomizedSearchCV

In [28]:
grid = {
   'n_estimators': [10, 50, 100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30, 40],  # Tree depth
    'min_samples_split': [2, 5, 10],  # Minimum samples to split
    'min_samples_leaf': [1, 2, 4],  # Minimum samples per leaf
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider
    'bootstrap': [True, False]  # Use bootstrapping or not
}

In [29]:
clf_2 = RandomizedSearchCV(estimator=model,param_distributions=grid,n_iter=15,cv=5,verbose=2)

In [30]:
clf_2.fit(X_train,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.0s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators

30 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterErro

In [31]:
clf_2.best_params_

{'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 40,
 'bootstrap': True}

In [32]:
from sklearn.metrics import accuracy_score
# Get the best model
best_model = clf_2.best_estimator_

# Predict on test data
y_pred = best_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy*100:.4f}")

Test Accuracy: 89.1304


In [None]:
import joblib

In [None]:
joblib.dump(model,"heart-preds.pkl")