<a href="https://colab.research.google.com/github/shivamsinghtomar78/ML-Projects-/blob/main/stroke.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [16]:
df = pd.read_csv('/content/stroke.csv')

In [17]:
# Handle missing values
df['bmi'].fillna(df['bmi'].median(), inplace=True)
df['smoking_status'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(df['bmi'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['smoking_status'].fillna('Unknown', inplace=True)


In [18]:
# Encode categorical variables
label_encoders = {}
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [19]:
# Define features and target
X = df.drop(columns=['id', 'stroke'])  # Drop ID as it's not useful
Y = df['stroke']

In [20]:
# Handle class imbalance
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X, Y = smote.fit_resample(X, Y)

In [21]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [22]:
# Scale numerical features
scaler = StandardScaler()
X_train[['age', 'avg_glucose_level', 'bmi']] = scaler.fit_transform(X_train[['age', 'avg_glucose_level', 'bmi']])
X_test[['age', 'avg_glucose_level', 'bmi']] = scaler.transform(X_test[['age', 'avg_glucose_level', 'bmi']])


In [23]:
# Train Random Forest Classifier
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.8992287917737789
              precision    recall  f1-score   support

           0       0.94      0.86      0.89       975
           1       0.87      0.94      0.90       970

    accuracy                           0.90      1945
   macro avg       0.90      0.90      0.90      1945
weighted avg       0.90      0.90      0.90      1945



In [25]:
# Train XGBoost Classifier
xgb = XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


XGBoost Accuracy: 0.9316195372750643
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       975
           1       0.92      0.95      0.93       970

    accuracy                           0.93      1945
   macro avg       0.93      0.93      0.93      1945
weighted avg       0.93      0.93      0.93      1945



In [26]:
# Hyperparameter tuning for XGBoost
params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}
grid_search = GridSearchCV(XGBClassifier(random_state=42), params, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_xgb = grid_search.best_estimator_

y_pred_best_xgb = best_xgb.predict(X_test)
print("Best XGBoost Accuracy:", accuracy_score(y_test, y_pred_best_xgb))
print(classification_report(y_test, y_pred_best_xgb))


Best XGBoost Accuracy: 0.9573264781491002
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       975
           1       0.94      0.97      0.96       970

    accuracy                           0.96      1945
   macro avg       0.96      0.96      0.96      1945
weighted avg       0.96      0.96      0.96      1945



In [27]:
import pickle

In [28]:
# Saving the model using pickle
with open("stroke_model.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)