In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [6]:
%ls -al

total 12
drwxr-xr-x 3 root root 4096 Aug  3 14:26 [0m[01;34m.[0m/
drwxr-xr-x 5 root root 4096 Aug  3 14:26 [01;34m..[0m/
drwxr-xr-x 2 root root 4096 Aug  3 14:26 [01;34m.virtual_documents[0m/


In [7]:
# Load and preprocess data
# Taken from https://www.kaggle.com/datasets/seifdavio/healthcare-dataset-stroke-data-csv
df = pd.read_csv("/kaggle/input/healthcare-dataset-stroke-data-csv/healthcare-dataset-stroke-data.csv")
df.dropna(subset=["bmi"], inplace=True)
df["bmi"].fillna(df["bmi"].mean(), inplace=True)
df.set_index("id", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["bmi"].fillna(df["bmi"].mean(), inplace=True)


In [8]:
categorical = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
for col in categorical:
    df[col] = pd.factorize(df[col])[0]

features = ["age", "hypertension", "heart_disease", "avg_glucose_level", "bmi"] + categorical
X = df[features]
y = df["stroke"]

In [18]:
# Balance dataset
stroke_df = df[df["stroke"] == 1]
no_stroke_df = df[df["stroke"] == 0].sample(n=len(stroke_df), random_state=42)
balanced_df = pd.concat([stroke_df, no_stroke_df])
X = balanced_df[features]
y = balanced_df["stroke"]

In [20]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [21]:
# Save scaler
import joblib
joblib.dump(scaler, "scaler.save")

['scaler.save']

In [22]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [23]:
# PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(np.array(y_train), dtype=torch.float32).unsqueeze(1)

train_ds = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)

In [25]:
# Neural network
class CardioNet(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

model = CardioNet(X.shape[1])
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [27]:

# Training loop
for epoch in range(100):
    for xb, yb in train_loader:
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss = {loss.item():.4f}")

Epoch 0: Loss = 0.4083
Epoch 10: Loss = 0.2593
Epoch 20: Loss = 0.6678
Epoch 30: Loss = 0.2941
Epoch 40: Loss = 0.4449
Epoch 50: Loss = 0.2696
Epoch 60: Loss = 0.3804
Epoch 70: Loss = 0.3744
Epoch 80: Loss = 0.1950
Epoch 90: Loss = 0.2300


In [28]:
# Save model
torch.save(model.state_dict(), "cardio_model.pt")
print("Model saved as cardio_model.pt")

Model saved as cardio_model.pt
