In [212]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [213]:
dataset = pd.read_csv('heart.csv')

In [214]:
dataset

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [215]:
dataset.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [216]:
dataset.duplicated().sum()

np.int64(0)

In [217]:
dataset.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [218]:
cat_cols = dataset.select_dtypes(include='object').columns

for i in cat_cols:
    print(f"Column: {i}")
    print(dataset[i].unique())
    print('-' * 50)

Column: Sex
['M' 'F']
--------------------------------------------------
Column: ChestPainType
['ATA' 'NAP' 'ASY' 'TA']
--------------------------------------------------
Column: RestingECG
['Normal' 'ST' 'LVH']
--------------------------------------------------
Column: ExerciseAngina
['N' 'Y']
--------------------------------------------------
Column: ST_Slope
['Up' 'Flat' 'Down']
--------------------------------------------------


In [219]:
from sklearn.preprocessing import LabelEncoder

In [None]:
for i in cat_cols:
    label = LabelEncoder()
    dataset[i] = label.fit_transform(dataset[i])

In [221]:
for i in cat_cols:
    print(f"Column: {i}")
    print(dataset[i].unique())
    print('-' * 50)

Column: Sex
[1 0]
--------------------------------------------------
Column: ChestPainType
[1 2 0 3]
--------------------------------------------------
Column: RestingECG
[1 2 0]
--------------------------------------------------
Column: ExerciseAngina
[0 1]
--------------------------------------------------
Column: ST_Slope
[2 1 0]
--------------------------------------------------


In [222]:
dataset.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,0.78976,0.781046,132.396514,198.799564,0.233115,0.989107,136.809368,0.404139,0.887364,1.361656,0.553377
std,9.432617,0.407701,0.956519,18.514154,109.384145,0.423046,0.631671,25.460334,0.490992,1.06657,0.607056,0.497414
min,28.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,0.0,120.0,173.25,0.0,1.0,120.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,0.0,130.0,223.0,0.0,1.0,138.0,0.0,0.6,1.0,1.0
75%,60.0,1.0,2.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,2.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


In [None]:
outlier_cols = dataset[['Age', 'ChestPainType', 'RestingBP', 'Cholesterol', 'RestingECG', 'MaxHR', 'Oldpeak', 'ST_Slope']]

for col in outlier_cols.columns:
    q1 = dataset[col].quantile(0.25)
    q3 = dataset[col].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    dataset[col] = dataset[col].clip(lower=lower_bound, upper=upper_bound)

In [224]:
X = dataset.drop('HeartDisease', axis=1)
Y = dataset['HeartDisease']

In [225]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [227]:
dataset.HeartDisease.value_counts()

HeartDisease
1    508
0    410
Name: count, dtype: int64

---

In [228]:
from imblearn.over_sampling import SMOTE

In [229]:
smote = SMOTE()

In [230]:
X_train, Y_train = smote.fit_resample(X_train, Y_train)

In [231]:
Y_train.value_counts()

HeartDisease
0    401
1    401
Name: count, dtype: int64

---

In [232]:
from sklearn.preprocessing import StandardScaler

scalor = StandardScaler()

X_train = scalor.fit_transform(X_train)
X_test = scalor.transform(X_test)

---

In [233]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [234]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "KNN Neighbours": KNeighborsClassifier()
}

In [235]:
for name, model in models.items():
    model.fit(X_train, Y_train)

    Y_train_prediction = model.predict(X_train)
    Y_test_prediction = model.predict(X_test)
    
    print(f'{name} Training Accuracy: ', accuracy_score(Y_train, Y_train_prediction)* 100)
    print(f'{name} Testing Accuracy: ', accuracy_score(Y_test, Y_test_prediction)* 100)
    print('-'* 60)

Random Forest Training Accuracy:  100.0
Random Forest Testing Accuracy:  87.5
------------------------------------------------------------
Logistic Regression Training Accuracy:  85.91022443890274
Logistic Regression Testing Accuracy:  84.78260869565217
------------------------------------------------------------
KNN Neighbours Training Accuracy:  89.15211970074813
KNN Neighbours Testing Accuracy:  82.6086956521739
------------------------------------------------------------


In [242]:
from sklearn.model_selection import GridSearchCV

In [None]:
lr_param = {
    'C': [1,2,3,4],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['liblinear', 'saga', 'lbfgs'],
    'max_iter': [100, 200, 500, 1000]
}

In [244]:
lr = LogisticRegression(random_state=42)

random_search = GridSearchCV(estimator=lr, param_distributions=lr_param, cv=5, verbose=2, n_jobs=-1)

TypeError: GridSearchCV.__init__() got an unexpected keyword argument 'param_distributions'

In [245]:
random_search.fit(X_train, Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [246]:
from sklearn.model_selection import cross_val_score

In [241]:
result = pd.DataFrame(random_search.cv_results_)
result[['param_solver','param_penalty','param_max_iter','param_C','mean_test_score',]]

Unnamed: 0,param_solver,param_penalty,param_max_iter,param_C,mean_test_score
0,liblinear,none,500,0.012743,
1,liblinear,l2,1000,0.233572,0.850365
2,lbfgs,l1,100,1438.449888,
3,saga,elasticnet,200,29.763514,
4,lbfgs,elasticnet,200,29.763514,
5,liblinear,l1,100,1438.449888,0.849107
6,liblinear,none,1000,545.559478,
7,liblinear,elasticnet,200,545.559478,
8,lbfgs,elasticnet,1000,0.000695,
9,saga,none,500,11.288379,
