In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv(r"C:\Users\Sujith Reddy\Desktop\zip2ext\healthcare-dataset-stroke-data.csv")
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
df = df.drop('id', axis=1)

In [4]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [5]:
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [6]:
df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [7]:
df = df[df['gender'] != 'Other']

In [8]:
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})
df['ever_married'] = df['ever_married'].map({'No': 0, 'Yes': 1})
df['Residence_type'] = df['Residence_type'].map({'Rural': 0, 'Urban': 1})

In [9]:
df = pd.get_dummies(df, columns=['work_type', 'smoking_status'])

In [10]:
X = df.drop('stroke', axis=1)
y = df['stroke']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [13]:
print(f"Original Stroke Count: {sum(y_train)}")
print(f"Balanced Stroke Count (SMOTE): {sum(y_train_balanced)}")

Original Stroke Count: 187
Balanced Stroke Count (SMOTE): 3900


In [14]:
custom_weight = 25

In [15]:
base_model = xgb.XGBClassifier(
    scale_pos_weight=25, 
    gamma=0.1, 
    subsample=0.8, 
    colsample_bytree=0.8,
    eval_metric='aucpr',
    use_label_encoder=False
)

In [16]:
param_grid = {
    'learning_rate': [0.01, 0.03, 0.05], # We test values near your 0.03
    'max_depth': [4, 6, 8],              # We test values near your 6
    'n_estimators': [200, 300, 400]      # We test values near your 300
}

# 3. Set up the Search (Focusing purely on RECALL)
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='recall', # This tells the tuner to prioritize catching strokes
    cv=3,             # Cross-validation for stability
    verbose=1
)

In [17]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_grid,"{'learning_rate': [0.01, 0.03, ...], 'max_depth': [4, 6, ...], 'n_estimators': [200, 300, ...]}"
,scoring,'recall'
,n_jobs,
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [18]:
# Get probabilities
y_probs = grid_search.predict_proba(X_test)[:, 1]

# Apply a lower clinical threshold (0.3 instead of 0.5) to be safer
y_pred_clinical = (y_probs > 0.3).astype(int)

print("\n--- CLINICAL EVALUATION REPORT ---")
print(classification_report(y_test, y_pred_clinical))

# Plot Confusion Matrix



--- CLINICAL EVALUATION REPORT ---
              precision    recall  f1-score   support

           0       0.99      0.53      0.69       960
           1       0.11      0.94      0.20        62

    accuracy                           0.55      1022
   macro avg       0.55      0.73      0.44      1022
weighted avg       0.94      0.55      0.66      1022



In [19]:
import numpy as np
from sklearn.metrics import precision_score, recall_score

# 1. We check thresholds between 0.2 and 0.5
for thresh in np.arange(0.2, 0.5, 0.05):
    temp_preds = (y_probs > thresh).astype(int)
    p = precision_score(y_test, temp_preds)
    r = recall_score(y_test, temp_preds)
    
    print(f"Threshold: {thresh:.2f} | Precision: {p:.2f} | Recall: {r:.2f}")

# 2. Pick the one where Precision crosses 0.25 while Recall stays above 0.80

Threshold: 0.20 | Precision: 0.10 | Recall: 0.98
Threshold: 0.25 | Precision: 0.11 | Recall: 0.97
Threshold: 0.30 | Precision: 0.11 | Recall: 0.94
Threshold: 0.35 | Precision: 0.12 | Recall: 0.90
Threshold: 0.40 | Precision: 0.13 | Recall: 0.89
Threshold: 0.45 | Precision: 0.14 | Recall: 0.85


In [20]:
import pandas as pd
import numpy as np

# 1. Define the Human-Readable Input
# Change these values to test different "Weather" scenarios for health
test_patient = {
    'gender': 0,                   # 0 = Male, 1 = Female
    'age': 72.0,                   # High Age = High Risk
    'hypertension': 1,             # 1 = Yes
    'heart_disease': 1,            # 1 = Yes (AFib)
    'ever_married': 1,             # 1 = Yes
    'Residence_type': 1,           # 1 = Urban
    'avg_glucose_level': 215.5,    # High Glucose
    'bmi': 33.8,                   # High BMI
    
    # Work Type (Only set ONE to 1, others to 0)
    'work_type_Govt_job': 0,
    'work_type_Never_worked': 0,
    'work_type_Private': 1,
    'work_type_Self-employed': 0,
    'work_type_children': 0,
    
    # Smoking Status (Only set ONE to 1, others to 0)
    'smoking_status_Unknown': 0,
    'smoking_status_formerly smoked': 0,
    'smoking_status_never smoked': 0,
    'smoking_status_smokes': 1
}

# 2. Convert to DataFrame
test_df = pd.DataFrame([test_patient])

# 3. CRITICAL: Match the training column order exactly
# This ensures the model reads the data in the right sequence
test_df = test_df[X_train.columns]

# 4. Predict using the live model variable
# We use the 'best_threshold' from your previous tuning (e.g., 0.40)
prob = grid_search.predict_proba(test_df)[:, 1][0]
prediction = 1 if prob >= 0.40 else 0 

print("--- ONE-HOT ENCODING PREDICTION TEST ---")
print(f"Risk Probability: {prob:.4f} ({prob*100:.2f}%)")
print(f"Final Prediction: {'⚠️ STROKE RISK ALERT' if prediction == 1 else '✅ NORMAL'}")

--- ONE-HOT ENCODING PREDICTION TEST ---
Risk Probability: 0.8119 (81.19%)
Final Prediction: ⚠️ STROKE RISK ALERT


In [21]:
import joblib
import os

# Go ONE LEVEL UP from notebook to ASSIGNMENT folder
BASE_DIR = os.path.dirname(os.getcwd())  

MODEL_DIR = os.path.join(BASE_DIR, "backend", "app", "ml_assets")

# Don't create new backend — only create ml_assets if missing
os.makedirs(MODEL_DIR, exist_ok=True)

# Save model (use your trained model variable)
model_path = os.path.join(MODEL_DIR, "xgboost_stroke_model.pkl")
joblib.dump(grid_search, model_path)

print(f"Model saved at: {model_path}")


Model saved at: c:\Users\Sujith Reddy\Desktop\assignment\backend\app\ml_assets\xgboost_stroke_model.pkl
