In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv('heart.csv')

# a. Data cleaning: Remove NA, ?, and negative values
# Replace '?' with NaN
df.replace('?', np.nan, inplace=True)

# Drop rows with NaN values
df.dropna(inplace=True)

# Remove negative values (e.g., in numeric columns)
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']
df = df[(df[numeric_cols] >= 0).all(axis=1)]

# b. Error correcting: Outlier detection and removal using IQR
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

# c. Data transformation: Normalize numerical features
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Prepare features (X) and target (y)
X = df.drop('target', axis=1)
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# d. Build models and compare accuracy
# Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)

# kNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)

# Print results
print("Logistic Regression Accuracy:", logreg_accuracy)
print("kNN Accuracy:", knn_accuracy)
print("Difference (Logistic Regression - kNN):", logreg_accuracy - knn_accuracy)

Logistic Regression Accuracy: 0.8426966292134831
kNN Accuracy: 0.848314606741573
Difference (Logistic Regression - kNN): -0.005617977528089901
