In [None]:
# ===========================
# Step 1: Load Full Dataset
# ===========================
# Mounts Google Drive inside the Colab VM and reads the stroke-prediction
# CSV stored there into the DataFrame `data`.

import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Location of the CSV on the mounted Drive.
file_path = '/content/drive/My Drive/datasets/stroke_prediction_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Report (rows, columns) as a quick sanity check on the load.
print("Full dataset shape:", data.shape)

Mounted at /content/drive
Full dataset shape: (15000, 22)


In [None]:
# 2️⃣ Install TabPFN
!pip install tabpfn --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/160.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.8/160.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# 3️⃣ Load Dataset
# Reads the stroke-prediction CSV from the mounted Drive into `df`,
# the frame used by the preview / feature-definition cell that follows.
import pandas as pd

data_path = "/content/drive/My Drive/datasets/stroke_prediction_dataset.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Preview dataset: first rows plus the (rows, columns) shape.
print(df.head())
print("Shape:", df.shape)

# 4️⃣ Define Features & Target
# `Diagnosis` is assumed to be the label column; every other column is
# treated as a feature.
target_col = "Diagnosis"
y = df[target_col]
X = df.drop(columns=[target_col])

# Encode categorical features: each object-dtype column is cast to str and
# replaced by integer codes from its own, freshly fitted LabelEncoder.
from sklearn.preprocessing import LabelEncoder

object_columns = X.select_dtypes(include=['object']).columns
for col in object_columns:
    encoder = LabelEncoder()
    as_text = X[col].astype(str)
    X[col] = encoder.fit_transform(as_text)

   Patient ID       Patient Name  Age Gender  Hypertension  Heart Disease  \
0       18153    Mamooty Khurana   56   Male             0              1   
1       62749  Kaira Subramaniam   80   Male             0              0   
2       32145      Dhanush Balan   26   Male             1              1   
3        6154        Ivana Baral   73   Male             0              0   
4       48973  Darshit Jayaraman   51   Male             1              1   

  Marital Status      Work Type Residence Type  Average Glucose Level  ...  \
0        Married  Self-employed          Rural                 130.91  ...   
1         Single  Self-employed          Urban                 183.73  ...   
2        Married   Never Worked          Rural                 189.00  ...   
3        Married   Never Worked          Urban                 185.29  ...   
4       Divorced  Self-employed          Urban                 177.34  ...   

     Alcohol Intake Physical Activity Stroke History Family History 

In [None]:
# Step 3: Initial Cleanup
# 'Patient ID' and 'Patient Name' are row identifiers, not measurements,
# so they are removed before modelling.
cols_to_drop = ['Patient ID', 'Patient Name']

# errors='ignore' skips names that are already absent, which replaces the
# manual membership loop and makes the cell safe to re-run. Rebinding `data`
# (rather than inplace=True) avoids mutating the loaded frame behind the
# back of anything else that references it.
data = data.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Steps 4-9: Leakage-free preprocessing.
#
# The train/test split happens FIRST. Every fitted transformer (KNN imputer,
# mode fill values, target encoder, SMOTENC, scaler) is then fitted on the
# training portion only and merely applied to the test portion. Fitting any of
# them on the full dataset lets test rows (and, for the target encoder, test
# labels) influence the features the model is evaluated on, and oversampling
# before the split places synthetic neighbours of training rows in the test set.
#
# `data` is only read, never mutated, so the cell can be re-run safely.
#
# Names produced for later cells: X, y (raw features / labels),
# X_resampled, y_resampled, X_resampled_scaled (training portion),
# X_train, X_test, y_train, y_test, plus the fitted num_imputer, te,
# smote_nc and scaler.
import numpy as np
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTENC
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

target_col = 'Diagnosis'

# Step 4: Separate Features and Target
# Rows without a label cannot be used for supervised learning; dropping them
# also keeps the target out of the imputation step entirely.
labelled = data.dropna(subset=[target_col])
X = labelled.drop(columns=[target_col])
y = labelled[target_col]

# Column groups are derived from the features only, so the target can never
# be imputed or target-encoded against itself.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Step 5: Train/Test Split (stratified, before anything is fitted)
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 6: Fill missing values
# Numerical: KNN imputer fitted on the training rows, applied to both parts.
num_imputer = KNNImputer(n_neighbors=5)
X_train_prep = X_train_raw.copy()
X_test_prep = X_test_raw.copy()
if num_cols:
    X_train_prep[num_cols] = num_imputer.fit_transform(X_train_raw[num_cols])
    X_test_prep[num_cols] = num_imputer.transform(X_test_raw[num_cols])

# Categorical: fill with the mode observed in the training rows.
cat_fill_values = {col: X_train_raw[col].mode()[0] for col in cat_cols}
if cat_fill_values:
    X_train_prep = X_train_prep.fillna(cat_fill_values)
    X_test_prep = X_test_prep.fillna(cat_fill_values)


# Step 7: Feature Engineering (row-wise, so it cannot leak across the split)
def _add_engineered_features(frame):
    """Return a copy of `frame` with the age-derived features added.

    Adds `Age_group` (Age binned at 30/50/70/100 and labelled 1-4) when an
    `Age` column exists, and `Hypertension_x_Age` (product of the two columns)
    when `Hypertension` exists as well. Must be called after imputation:
    a missing or out-of-range Age cannot be converted to an integer group.
    """
    out = frame.copy()
    if 'Age' in out.columns:
        # include_lowest=True puts Age == 0 in group 1 instead of producing
        # NaN, which would make the int conversion raise.
        out['Age_group'] = pd.cut(
            out['Age'],
            bins=[0, 30, 50, 70, 100],
            labels=[1, 2, 3, 4],
            include_lowest=True,
        ).astype(int)
        if 'Hypertension' in out.columns:
            # Numerical interaction feature; deliberately not in cat_cols.
            out['Hypertension_x_Age'] = out['Hypertension'] * out['Age']
    return out


X_train_prep = _add_engineered_features(X_train_prep)
X_test_prep = _add_engineered_features(X_test_prep)
if 'Age_group' in X_train_prep.columns:
    cat_cols = cat_cols + ['Age_group']  # Add Age_group to categorical columns

# Step 8: Encode Categorical Features (Target Encoding)
# Fitted with training labels only; the test part is transformed without
# ever seeing y_test.
# NOTE(review): Age_group is an integer column; with the encoder's default
# column selection it is presumably passed through unchanged -- confirm
# whether it should be target-encoded via an explicit `cols=` argument.
te = TargetEncoder()
X_train_prep[cat_cols] = te.fit_transform(X_train_prep[cat_cols], y_train_raw)
X_test_prep[cat_cols] = te.transform(X_test_prep[cat_cols])

# Step 9: Handle Class Imbalance with SMOTENC (training rows only)
# The test set keeps its real, un-synthesised rows.
categorical_indices = [X_train_prep.columns.get_loc(col) for col in cat_cols]
smote_nc = SMOTENC(categorical_features=categorical_indices, random_state=42)
X_resampled, y_resampled = smote_nc.fit_resample(X_train_prep, y_train_raw)

print("Original shape:", X.shape, y.value_counts())
# value_counts works for any label dtype; np.bincount only accepts
# non-negative integers.
print("Resampled shape:", X_resampled.shape,
      pd.Series(y_resampled).value_counts().to_dict())
print("Test shape:", X_test_prep.shape)

# Step 10: Scale Features (statistics taken from the training rows only)
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)

X_train, y_train = X_resampled_scaled, y_resampled
X_test = scaler.transform(X_test_prep)

Original shape: (15000, 21) Diagnosis
0.0    7532
1.0    7468
Name: count, dtype: int64
Resampled shape: (15064, 21) [7532 7532]


In [None]:
from tabpfn import TabPFNClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create the TabPFN classifier.
# The library fetches its pretrained weights automatically, but the estimator
# still has to be fitted on the training data before it can predict: calling
# predict() on a fresh instance raises NotFittedError.
pretrained_model = TabPFNClassifier(device='cpu') # Using 'cpu' for broader compatibility

# Fit on the training split produced by the preprocessing cell.
# NOTE(review): TabPFN documents limits on training-set size and is slow on
# CPU -- if fit() rejects this many rows, subsample X_train / y_train or
# switch to a GPU runtime; confirm against the installed version's docs.
pretrained_model.fit(X_train, y_train)

# Predict on the held-out test set (already scaled like the training data).
predictions = pretrained_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

NotFittedError: This TabPFNClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
!pip install category_encoders --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h