# Logistic Regression from Scratch for Stroke Prediction
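This notebook implements the Logistic Regression classifier itself using only NumPy to predict stroke probability; scikit-learn is used only for the train/test split, feature scaling, and evaluation metrics. To handle the class imbalance and improve the Recall and F1 metrics, we use SMOTE (Synthetic Minority Over-sampling Technique) from `imbalanced-learn`.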
**Note:** This notebook was created to meet two requirements: do not use scikit-learn's `LogisticRegression` implementation, and improve the Recall and F1 scores.

In [None]:
!pip install imbalanced-learn

In [None]:
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Dataset location: remote source and the local copy it is saved to
DATA_URL = "https://raw.githubusercontent.com/aydanbabayeva/dataset/refs/heads/main/healthcare-dataset-stroke-data.csv"
DATA_PATH = Path("healthcare-dataset-stroke-data.csv")

# Download dataset only when no usable local copy exists. This avoids
# re-downloading on every "Run All" and does not require a `wget` binary,
# so the cell also works on systems where wget is not installed.
if not DATA_PATH.exists() or DATA_PATH.stat().st_size == 0:
    urlretrieve(DATA_URL, DATA_PATH)

# Load data
data = pd.read_csv(DATA_PATH)
data.head()

## Data Preprocessing
Clean the data, impute missing BMI values, one-hot encode the categorical variables, split into stratified train/test sets, and scale the features.

In [None]:
# Drop ID column. errors="ignore" keeps this cell re-runnable: on a second
# run `data` no longer has the column and a plain drop would raise KeyError.
data = data.drop(columns="id", errors="ignore")

# One-hot encoding for categorical variables
data = pd.get_dummies(data, drop_first=True)

# Separate Features and Target
X = data.drop(columns="stroke")
y = data["stroke"]

# Split data (Stratified split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle missing BMI values with the mean of the TRAINING split only.
# Computing the mean on the full dataset before splitting would let
# test-set values influence the training features (data leakage).
bmi_train_mean = X_train["bmi"].mean()
X_train = X_train.fillna({"bmi": bmi_train_mean})
X_test = X_test.fillna({"bmi": bmi_train_mean})

# Feature Scaling: statistics are fitted on the training split and then
# reused unchanged for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Handling Class Imbalance with SMOTE
The target variable `stroke` is highly imbalanced. We use SMOTE to oversample the minority class in the training set only, so the test set keeps its original class distribution.

In [None]:
from imblearn.over_sampling import SMOTE

# Class counts of the training labels prior to resampling
print("Distribution before SMOTE:", y_train.value_counts(), sep="\n")

# Resample the training split; random_state is fixed to 42
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts of the training labels after resampling
print("\nDistribution after SMOTE:", y_train_resampled.value_counts(), sep="\n")

## Logistic Regression from Scratch
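Implementation using NumPy: a linear model passed through a sigmoid, whose weights and bias are fitted by batch gradient descent.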

In [None]:
class LogisticRegressionScratch:
    """Binary logistic regression fitted with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to each gradient update.
    n_iterations : int, default 1000
        Number of gradient-descent passes over the whole training set.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` is called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` is called.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        The value is computed from ``exp(-|z|)`` so that ``exp`` is never
        evaluated on a positive argument; the naive form overflows (and emits
        a RuntimeWarning) for large negative ``z``.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))  # always in (0, 1], cannot overflow
        # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z))
        return np.where(z >= 0, 1.0, decay) / (1.0 + decay)

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like with n_samples entries
            Binary targets (0 or 1). A column vector is flattened, because
            subtracting an (n, 1) array from the (n,) predictions would
            otherwise broadcast to an (n, n) matrix.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` disagree on the sample count.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty training set.")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels."
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # Gradient Descent
        for _ in range(self.n_iterations):
            # Linear model followed by the sigmoid activation
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)

            # Gradients, averaged over the samples
            error = y_predicted - y
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict_proba(self, X):
        """Return the sigmoid of the linear model for each row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("Model is not fitted yet; call fit() first.")
        linear_model = np.dot(np.asarray(X, dtype=float), self.weights) + self.bias
        return self.sigmoid(linear_model)

    def predict(self, X, threshold=0.5):
        """Return a list of 0/1 labels: 1 where the probability exceeds ``threshold``."""
        probabilities = np.atleast_1d(self.predict_proba(X))
        return (probabilities > threshold).astype(int).tolist()

In [None]:
# Instantiate and Train
# We use a higher number of iterations and a reasonable learning rate
# Hyperparameters for this run, named so they are easy to find and tune
LEARNING_RATE = 0.1
N_ITERATIONS = 3000

model = LogisticRegressionScratch(
    learning_rate=LEARNING_RATE,
    n_iterations=N_ITERATIONS,
)
model.fit(X_train_resampled, y_train_resampled)

In [None]:
# Predicted labels for the test features
y_pred = model.predict(X_test)

# Each metric is called as scorer(y_test, y_pred) and printed next to its label
metric_table = (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
)
for label, scorer in metric_table:
    print(label, scorer(y_test, y_pred))