In [7]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings 
warnings.filterwarnings('ignore')

In [8]:
d1=pd.read_csv('C:CSV/healthcare-dataset-stroke-data.csv')

In [9]:
d1

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [10]:
d1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [11]:
# Separating features (X) and target variable (y)
X = d1.drop('stroke', axis=1)
y = d1['stroke']

# Defining numerical and categorical features
numeric_features = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Creating transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Creating a column transformer to apply transformers to the correct features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fitting and transforming the data using the preprocessor
X_preprocessed = preprocessor.fit_transform(X)


In [12]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initializing the Logistic Regression model
model = LogisticRegression()

# Training the model on the training data
model.fit(X_train, y_train)

# Making predictions on the test set
predictions = model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

# Printing a classification report
print(classification_report(y_test, predictions))


Accuracy: 0.9393346379647749
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022



    ***Feature Engineering***

In [15]:
from sklearn.preprocessing import StandardScaler

# Creating a scaler for numerical features
scaler = StandardScaler()

# Fitting and transforming the scaler on training data, then transforming test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model.fit(X_train_scaled, y_train)

predictions_scaled = model.predict(X_test_scaled)

# Evaluating the model on scaled data
accuracy_scaled = accuracy_score(y_test, predictions_scaled)
print(f'Accuracy (scaled): {accuracy_scaled}')


Accuracy (scaled): 0.9393346379647749


In [16]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid 
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

model = LogisticRegression()

# Best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_

# Make predictions on the test set using the above model
predictions_best = best_model.predict(X_test_scaled)

# Evaluating
accuracy_best = accuracy_score(y_test, predictions_best)
print(f'Best Accuracy: {accuracy_best}')


Best Accuracy: 0.9393346379647749


In [17]:
#intercept
coefficients = best_model.coef_
intercept = best_model.intercept_

feature_names = numeric_features + list(preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features))
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients.flatten()})
print(coefficients_df)


                           Feature  Coefficient
0                              age     1.415259
1                     hypertension     0.115340
2                    heart_disease     0.080343
3                avg_glucose_level     0.176436
4                              bmi     0.009971
5                    gender_Female    -0.000462
6                      gender_Male     0.000462
7                     gender_Other     0.000000
8                  ever_married_No     0.049298
9                 ever_married_Yes    -0.049298
10              work_type_Govt_job    -0.039489
11          work_type_Never_worked    -0.052791
12               work_type_Private     0.033380
13         work_type_Self-employed    -0.088340
14              work_type_children     0.097410
15            Residence_type_Rural    -0.012859
16            Residence_type_Urban     0.012859
17          smoking_status_Unknown     0.005312
18  smoking_status_formerly smoked    -0.006318
19     smoking_status_never smoked    -0