<a href="https://colab.research.google.com/github/pmalu9211/DS/blob/main/13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# Cell 1: Import core libraries
import pandas as pd                                 # Data handling :contentReference[oaicite:0]{index=0}
import numpy as np                                  # Numeric operations :contentReference[oaicite:1]{index=1}

# Read the CSV file into DataFrame
# sep=',' because the file uses commas
df = pd.read_csv('/content/Cleavland.csv', header=None)
# Assign column names (14 predictors + target)
df.columns = [
    'age','sex','cp','trestbps','chol','fbs','restecg','thalach',
    'exang','oldpeak','slope','ca','thal','target'
]
# Peek at the data
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [32]:
# Cell 2a: Identify and drop missing or invalid entries

# Sometimes files use '?' for missing. Convert '?' → NaN
df.replace('?', np.nan, inplace=True)


# Drop any rows containing NaN now
df.dropna(inplace=True)

# Cell 2b: Remove negative values (not valid for these features)
num_cols = df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    df = df[df[col] >= 0]
# df.reset_index(drop=True, inplace=True)
# df.info()

In [21]:
# Cell 3: Remove outliers via Z‑score method

from scipy.stats import zscore                         # :contentReference[oaicite:6]{index=6}

# Compute Z‑scores for numeric columns
z_scores = np.abs(zscore(df[num_cols]))
# Keep only rows where all Z‑scores < 3
mask = (z_scores < 3).all(axis=1)                      # :contentReference[oaicite:7]{index=7}
df_clean = df[mask].reset_index(drop=True)

print("Before:", len(df), "After:", len(df_clean))


Before: 297 After: 288


In [47]:
# Cell 4: Scale features & binarize target

from sklearn.preprocessing import StandardScaler        # :contentReference[oaicite:8]{index=8}

# Separate features (X) and target (y);
X= df_clean.drop(columns='target').values
# Convert target to binary: 0=no disease, 1=present (any nonzero)
y = (df_clean['target'] > 0).astype(int).values

# Scale features to mean=0, std=1 (important for kNN)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [60]:
# Cell 5: Train/test split and model comparison

from sklearn.model_selection import train_test_split     # :contentReference[oaicite:9]{index=9}
from sklearn.linear_model import LogisticRegression      # :contentReference[oaicite:10]{index=10}
from sklearn.neighbors import KNeighborsClassifier       # :contentReference[oaicite:11]{index=11}
from sklearn.metrics import accuracy_score               # :contentReference[oaicite:12]{index=12}

# Split 80% train / 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2
)

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)

# k-Nearest Neighbors (k=5)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)

print(f"Logistic Regression Accuracy: {acc_lr:.3f}")
print(f"kNN Accuracy:                  {acc_knn:.3f}")


Logistic Regression Accuracy: 0.776
kNN Accuracy:                  0.707
