In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

df=pd.read_csv('heart.csv')

display(df)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [2]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [26]:
# a. Data cleaning(Remove NA, ?, Negative values etc.)  

df.replace('?',pd.NA,inplace=True)
df.dropna(inplace=True)
df = df[(df >= 0).all(axis=1)]  # added by me (drop negative values)


In [27]:
# b. Error correcting(Outlier detection and removal)  

for col in df:
    low=df[col].quantile(0.01)
    high=df[col].quantile(0.99)
    df=df[(df[col]>=low) & df[col]<=high]

In [28]:
# c. Data Transformation

scaler=StandardScaler()
feature=df.drop(['target'],axis=1)
feature_scaled=scaler.fit_transform(feature)

In [29]:
# d. Build Data model using regression and kNN methods and compare accuracy of heart disease prediction.
X= feature_scaled
y=df['target']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

lr=LogisticRegression()
lr.fit(X_train,y_train)
lr_pred=lr.predict(X_test)


knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)
knn_pred=knn.predict(X_test)

print('Accuracy of Logistic Regression:', accuracy_score(y_test,lr_pred))
print('Accuracy of KNN:',accuracy_score(y_test,knn_pred))

Accuracy of Logistic Regression: 0.865979381443299
Accuracy of KNN: 0.788659793814433


In [10]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("heart.csv")  # replace with your filename if different


In [11]:
# Replace '?' with NaN and convert all columns to numeric where possible
df.replace("?", np.nan, inplace=True)
df = df.apply(pd.to_numeric, errors='coerce')

# Remove rows with any missing or negative values
df.dropna(inplace=True)
df = df[(df >= 0).all(axis=1)]

display(df)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [12]:
from scipy.stats import zscore

# Remove outliers using Z-score method (threshold 3)
z_scores = np.abs(zscore(df))
df = df[(z_scores < 3).all(axis=1)]


In [13]:
from sklearn.preprocessing import MinMaxScaler

# Separate features and target
X = df.drop("target", axis=1)
y = df["target"]

# One-hot encode categorical columns
X = pd.get_dummies(X, columns=["cp", "thal", "slope"], drop_first=True)

# Normalize
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)

# k-NN (k=5)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)

# Accuracy comparison
print(f"Logistic Regression Accuracy: {acc_lr:.2f}")
print(f"k-NN Accuracy: {acc_knn:.2f}")


Logistic Regression Accuracy: 0.86
k-NN Accuracy: 0.88


In [17]:
from sklearn.metrics import classification_report, confusion_matrix

print("\nLogistic Regression Report:\n", classification_report(y_test, y_pred_lr))
print("\nKNN Report:\n", classification_report(y_test, y_pred_knn))



Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.84      0.87      0.85        89
           1       0.88      0.86      0.87       105

    accuracy                           0.86       194
   macro avg       0.86      0.86      0.86       194
weighted avg       0.86      0.86      0.86       194


KNN Report:
               precision    recall  f1-score   support

           0       0.82      0.93      0.87        89
           1       0.94      0.83      0.88       105

    accuracy                           0.88       194
   macro avg       0.88      0.88      0.88       194
weighted avg       0.88      0.88      0.88       194

