In [1]:
# Importing Basic Packages
import pandas as pd
import numpy as np

# Importing ML Packages
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Feature selection, metrics, and train/test splitting
from sklearn.feature_selection import SelectKBest, chi2, RFE # For Selecting the top features
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Importing Data

In [2]:
# Load the dataset from "heart.csv" (path is relative to the working directory).
DATA_PATH = "heart.csv"
df = pd.read_csv(DATA_PATH)

## Basic overview of data

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(303, 14)

In [5]:
# Print per-column non-null counts and dtypes, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


# Looking at some top features






In [6]:
# List the column labels so the feature/target split below can be written out explicitly.
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [7]:
# Predictor columns used for feature scoring and model training.
feature_columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal",
]
x = df[feature_columns]

# Label column the classifiers are trained to predict.
y = df["target"]

In [8]:
# Score each predictor against the target with the chi-squared statistic.
# chi2 requires non-negative feature values.
# NOTE(review): scoring uses the full dataset (before the train/test split), and
# the fitted selector is only used for ranking in the visible cells, not to
# transform the model inputs -- confirm this is intended.
skb = SelectKBest(chi2, k=10)
bf = skb.fit(X=x, y=y)

In [9]:
# One-column frame holding the chi-squared score of every predictor.
feature_scores = pd.DataFrame({"Feature Scores": bf.scores_})

In [10]:
# One-column frame holding the predictor names, in the same order as the scores.
feature_cn = pd.DataFrame({"Feature Names": x.columns})

In [11]:
# Put names and scores side by side, then rank from highest to lowest score.
best_features = (
    pd.concat([feature_cn, feature_scores], axis="columns")
    .sort_values("Feature Scores", ascending=False)
)

In [12]:
# Display the ranked table (highest chi-squared score first).
best_features

Unnamed: 0,Feature Names,Feature Scores
7,thalach,188.320472
9,oldpeak,72.644253
11,ca,66.440765
2,cp,62.598098
8,exang,38.914377
4,chol,23.936394
0,age,23.286624
3,trestbps,14.823925
10,slope,9.804095
1,sex,7.576835


### Splitting our data

In [13]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=101,
)

## Logistic Regression

In [14]:
# Fit a logistic-regression classifier on the training split. fit() returns the
# estimator itself, so `lr` is the fitted model.
lr = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Mean accuracy on the held-out split (shown as the cell output).
lr.score(x_test, y_test)

0.84

In [15]:
# Predict labels for the held-out rows and print per-class precision/recall/F1.
y_pred = lr.predict(x_test)
lr_report = classification_report(y_true=y_test, y_pred=y_pred)
print(lr_report)

              precision    recall  f1-score   support

           0       0.88      0.77      0.82        48
           1       0.81      0.90      0.85        52

    accuracy                           0.84       100
   macro avg       0.85      0.84      0.84       100
weighted avg       0.84      0.84      0.84       100



## Random Forest

In [16]:
# Fit a 200-tree random forest on the training split.
# random_state is pinned to an arbitrary fixed seed because the forest is
# randomised: without it every Restart & Run All produces a different model
# and a different score.
rt = RandomForestClassifier(n_estimators=200, random_state=101)
rt.fit(x_train, y_train)

# Mean accuracy on the held-out split (shown as the cell output).
rt.score(x_test, y_test)

0.83

In [17]:
# Predict labels for the held-out rows and print per-class precision/recall/F1.
y_pre = rt.predict(x_test)
rf_report = classification_report(y_true=y_test, y_pred=y_pre)
print(rf_report)

              precision    recall  f1-score   support

           0       0.86      0.77      0.81        48
           1       0.81      0.88      0.84        52

    accuracy                           0.83       100
   macro avg       0.83      0.83      0.83       100
weighted avg       0.83      0.83      0.83       100



# Saving our Models

In [18]:
import joblib

In [19]:
# Persist the fitted logistic-regression model to the file "lr_model".
# The context manager closes the handle even if joblib.dump raises, which the
# previous manual open()/close() pair did not guarantee.
with open("lr_model", "wb") as file:
    joblib.dump(lr, file)

In [20]:
# Persist the fitted random-forest model to the file "rf_model".
# The context manager closes the handle even if joblib.dump raises, which the
# previous manual open()/close() pair did not guarantee.
with open("rf_model", "wb") as file1:
    joblib.dump(rt, file1)