In [59]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv('heart.csv')

# CLEANING
df.replace('?', np.nan, inplace=True)
df = df.apply(pd.to_numeric, errors='coerce')
df.dropna(inplace=True)
df = df[(df >= 0).all(axis=1)]

# OUTLIER REMOVAL
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# DATA TRANSFORMATION
from sklearn.preprocessing import LabelEncoder, StandardScaler

le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = le.fit_transform(df[col])

features = df.drop('target', axis=1)
scaler = StandardScaler()
df[features.columns] = scaler.fit_transform(features)


# Split into X and y
from sklearn.model_selection import train_test_split
X = features
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity Check - ensure no NaNs remain
print("NaNs in training set:", X_train.isna().sum().sum())
print("NaNs in test set:", X_test.isna().sum().sum())

# MODEL - Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(max_iter = 1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))

# MODEL - KNN
from sklearn.neighbors import KNeighborsClassifier

# Initialize and train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)  # You can tune this number
knn.fit(X_train, y_train)

# Predict and evaluate
y_pred_knn = knn.predict(X_test)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))



NaNs in training set: 0
NaNs in test set: 0
Logistic Regression Accuracy: 0.8571428571428571
KNN Accuracy: 0.6363636363636364
