# EDA & Logistic Regression (Scratch vs scikit-learn)
This notebook runs in CI via papermill and produces an HTML artifact.

In [None]:

import sys
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

# Make the in-repo package importable without installing it: resolve ./src
# against the current working directory and put it on the module search path.
_src_dir = os.path.abspath('src')
sys.path.append(_src_dir)
from ai_starter.logistic import LogisticRegressionScratch

# Load the breast-cancer dataset as a pandas object; later cells read the
# 'target' column from this frame and treat the remaining columns as features.
data = load_breast_cancer(as_frame=True)
df = data.frame
df.head(5)


In [None]:

# Separate the feature matrix from the label vector.
X = df.drop(columns=['target']).to_numpy()
y = df['target'].to_numpy()

# Hold out 25% of the rows for testing; the split is stratified on the labels
# and seeded so that repeated runs produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training split only, then apply the
# same transform to the test split.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Train the from-scratch model on the standardized features and score it on
# the held-out split.
scratch_params = {'lr': 0.1, 'epochs': 800, 'l2': 1e-3, 'random_state': 0}
model = LogisticRegressionScratch(**scratch_params)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
acc_scratch = accuracy_score(y_test, y_pred)
acc_scratch


In [None]:

# Baseline: scikit-learn's LogisticRegression trained and evaluated on the
# same standardized splits as the from-scratch model.
from sklearn.linear_model import LogisticRegression

sk = LogisticRegression(max_iter=500).fit(X_train_s, y_train)
# Score through the same accuracy_score path used for the scratch model.
acc_sklearn = accuracy_score(y_test, sk.predict(X_test_s))
(acc_scratch, acc_sklearn)


In [None]:

# Confusion matrix of the from-scratch model's predictions on the test split
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred)
class_ids = np.arange(cm.shape[0])

fig, ax = plt.subplots(figsize=(4, 4))
ax.imshow(cm, interpolation='nearest')
ax.set_title('Confusion Matrix (Scratch)')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

# Pin the ticks to the integer class indices; the default locator would put
# fractional ticks (-0.5, 0.5, ...) on such a small image.
ax.set_xticks(class_ids)
ax.set_yticks(class_ids)

# Annotate every cell with its count. Low counts land on the dark end of the
# default colormap, so switch the label to white there to keep it readable.
threshold = cm.max() / 2
for (i, j), v in np.ndenumerate(cm):
    ax.text(
        j,
        i,
        str(v),
        ha='center',
        va='center',
        color='white' if v < threshold else 'black',
    )

fig.tight_layout()
# Render explicitly instead of ending the cell with `fig`: with the inline
# backend a trailing figure expression is emitted in addition to the automatic
# end-of-cell display, so the plot would appear twice in the HTML artifact.
plt.show()
