## import library

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

##load data

In [None]:
#if the pachage in not install in the colab, uncomment the following line


!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo

# fetch dataset
heart_disease = fetch_ucirepo(id=45)

# data (as pandas dataframes)
X = heart_disease.data.features
y = heart_disease.data.targets
# all data (features and target)
d= pd.concat([X, y], axis=1)

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


##explore data

In [None]:
# names of feature (column)
d.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [None]:
#num of rows
d.shape[0]

303

In [None]:
##num of columns
d.shape[1]

14

In [None]:
# display first 10 line
d.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0
5,56,1,2,120,236,0,0,178,0,0.8,1,0.0,3.0,0
6,62,0,4,140,268,0,2,160,0,3.6,3,2.0,3.0,3
7,57,0,4,120,354,0,0,163,1,0.6,1,0.0,3.0,0
8,63,1,4,130,254,0,2,147,0,1.4,2,1.0,7.0,2
9,53,1,4,140,203,1,2,155,1,3.1,3,0.0,7.0,1


In [None]:
# display last 10 line
d.tail(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
293,63,1,4,140,187,0,2,144,1,4.0,1,2.0,7.0,2
294,63,0,4,124,197,0,0,136,1,0.0,2,0.0,3.0,1
295,41,1,2,120,157,0,0,182,0,0.0,1,0.0,3.0,0
296,59,1,4,164,176,1,2,90,0,1.0,2,2.0,6.0,3
297,57,0,4,140,241,0,0,123,1,0.2,2,0.0,7.0,1
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0,1
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0,2
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0,3
301,57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0,1
302,38,1,3,138,175,0,0,173,0,0.0,1,,3.0,0


In [None]:
#shape for feature only
X.shape

(303, 13)

In [None]:
#shape for data
d.shape

(303, 14)

## describe data & find missing value

In [None]:
#description of our data
d.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,4.734219,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,1.939706,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,4.0


In [None]:
#information of our data
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  num       303 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 33.3 KB


In [None]:
#number of misssing values for all data
d.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64

In [None]:
#number of misssing values for feature only
X.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
dtype: int64


## handel missing values

In [None]:
# Instantiate imputer into variable imp
imp = SimpleImputer(strategy='mean')
# Perform mean imputation
x_imp = imp.fit_transform(X)
# Convert imputed data to DataFrame with original column names in x
new_x_df = pd.DataFrame(x_imp, columns=X.columns)

In [None]:

from sklearn.preprocessing import OneHotEncoder

# Assuming x_df contains the dataset

# Selecting categorical features
categorical_features = new_x_df.select_dtypes(include=['object']).columns

# Check if there are any categorical features
if not categorical_features.empty:
    # Encoding using one-hot encoding
    encoder = OneHotEncoder()
    # Perform one-hot encoding on categorical features
    x_encoded = encoder.fit_transform(new_x_df[categorical_features])
else:
    # No categorical features found, so no encoding is necessary
    x_encoded = new_x_df

In [None]:
# check on missing value before imputation
X.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
dtype: int64

In [None]:
# check on missing value after imputation
new_x_df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
dtype: int64


## split data into train and test

In [None]:
# split data using train_test_split with train 80 % and test 20%
x_train,x_test,y_train,y_test=train_test_split(new_x_df,y,train_size=0.8,random_state=0)


## fit model

In [None]:
# using RandomForestClassifier to train and fit model
from sklearn.ensemble import RandomForestClassifier
RFC_model = RandomForestClassifier()
RFC_model.fit(x_train, y_train)

  RFC_model.fit(x_train, y_train)


### predictions

In [None]:
Ypredcit = RFC_model.predict(x_test)

# Evaluate the model's performance

In [None]:
# display accuracy of model
print("ACCURACY :", accuracy_score(y_test, Ypredcit))


ACCURACY : 0.5737704918032787


In [None]:
# display results ofevaluation preformance by mae and mse
print("MEAN ABSOLUTE ERROR :", mean_absolute_error(y_test, Ypredcit))
print("MEAN SQUARED ERROR :", mean_squared_error(y_test, Ypredcit))


MEAN ABSOLUTE ERROR : 0.6557377049180327
MEAN SQUARED ERROR : 1.2131147540983607


explain evaluation results:
accuracy of this mode it is around 57.38% which is relatively low

Mean Absolute Error (mae): it measures average of difference between predicted values (Ypredcit) and actual target values (y_test) which is around 0.656 , Lower MAE values indicate better model performance, as they signify predictions that are closer to the actual values on average
Mean Squared Error (mse):it measures average squared difference between predicted values and actual target values ) which is around 1.21, Lower MSE values indicate better model performance, as they signify predictions that are closer to the actual values on average, with stronger emphasis on minimizing large errors


## Fine-tune the model

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier

# Define parameter in variable param
Hy_param= {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create instance of RandomForestClassifier
rf_model = RandomForestClassifier()

# perform grid search with cross-validation
gridsearch = GridSearchCV(rf_model, Hy_param, cv=5)
# Fit the GridSearchCV object to train data (x_train, y_train)
gridsearch.fit(x_train, y_train)

#getting best hyperparameters
best_hyparams = gridsearch.best_params_
#getting  best model
best_rf_model = gridsearch.best_estimator_

# Evaluate best model on the test set
Y_predcit = best_rf_model.predict(x_test)
print("predict for new data:", Y_predcit)
print("best accuracy:", gridsearch.best_score_)
print("best parameter:", gridsearch.best_params_)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

predict for new data: [0 0 1 1 0 0 3 1 3 0 0 0 2 0 0 0 2 0 0 0 0 0 0 2 0 0 0 2 1 3 0 0 0 2 3 0 0
 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 2 0]
best accuracy: 0.6199829931972789
best parameter: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
