In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

Problem Statement: Given health care attributes and other personal details, classify whether or not a person will have a stroke based on their health attributes.

# Data Loading and Analysis


In [None]:
# Read the raw healthcare records from CSV, then display summary statistics
# for the frame as the cell output.
data = pd.read_csv(filepath_or_buffer='healthcare-dataset.csv')
data.describe()

In [None]:
# Print a per-column overview (non-null counts and dtypes) to spot missing
# values and non-numeric columns.
data.info()

**A Few Early Observations**

1. There are 3 binary features - {`hypertension`, `heart_disease`, `stroke`}.
2. There are 3 numerical features which need normalization (`bmi`, `avg_glucose_level` and `age`).
3. We need to convert categorical variables to numerical values.
4. There are few features, so we may need to create custom features.

In [None]:
# Basic sanity checks on the raw data: number of missing values per column,
# followed by the frame's (rows, columns) shape.
display(data.isna().sum(), data.shape)

In [None]:
plt.style.use('ggplot')

# Continuous features whose distributions are inspected side by side.
plots = ['age', 'avg_glucose_level', 'bmi']

# One wide figure so the three panels fit on a single row.
plt.figure(figsize=(15, 5))

# Subplot positions are 1-based, so start the enumeration at 1.
for position, column in enumerate(plots, start=1):
    plt.subplot(1, len(plots), position)
    # Histogram with automatic binning plus a KDE curve overlaid.
    sns.histplot(data[column], color='black', bins='auto', kde=True)
    plt.title(column)
    plt.grid(True)

plt.tight_layout()
plt.show()


**The distribution of `avg_glucose_level` values is not a normal distribution**

**`avg_glucose_level` seems to be skewed and has a multimodal distribution, so MinMaxScaler and StandardScaler might not work well here**

**No duplicate values in the data**

**The missing BMI values will be imputed with the mean, as the distribution of the BMI data appears to be approximately normal**

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Step 1 - imputation: 'bmi' is treated as the only column with missing
# values; its gaps are filled with the column mean. This overwrites
# data['bmi'] in place.
imputer = SimpleImputer(strategy='mean')
data['bmi'] = imputer.fit(data[['bmi']]).transform(data[['bmi']])

# Step 2 - scaling: min-max scale the selected numeric features on a copy,
# leaving `data` itself unscaled.
features_to_scale = ['age', 'bmi']
encoded_data = data.copy()
scaler = MinMaxScaler()
scaler.fit(encoded_data[features_to_scale])
encoded_data[features_to_scale] = scaler.transform(encoded_data[features_to_scale])

In [None]:
# 'avg_glucose_level' does not follow a normal distribution, so it is mapped
# onto a uniform distribution through its quantiles instead of being
# min-max scaled.
from sklearn.preprocessing import QuantileTransformer

# NOTE: this rebinds `scaler`; the MinMaxScaler fitted for age/bmi is no
# longer reachable under that name after this cell runs.
scaler = QuantileTransformer(output_distribution='uniform')

# Fit on the single column (kept 2-D via the double brackets), then overwrite
# the column with its transformed values.
scaler.fit(encoded_data[['avg_glucose_level']])
encoded_data['avg_glucose_level'] = scaler.transform(encoded_data[['avg_glucose_level']])


In [None]:
# Continue on a separate copy so later changes to `df` do not alter
# `encoded_data`; the trailing expression displays the column names.
df = encoded_data.copy()
df.columns

In [None]:
# Categorical columns to one-hot encode
columns_to_encode = ['Residence_type', 'work_type', 'smoking_status','ever_married','gender']

# Replace every listed column with indicator columns named
# '<column>_<category>', appended after the remaining columns in the order
# given above.
#
# dtype=int makes the indicators 0/1 integers directly. Do NOT cast the whole
# frame with df.astype(int): the scaled features (age, bmi, avg_glucose_level)
# are floats in [0, 1], and an integer cast truncates them to 0 or 1, wiping
# out their information before the model ever sees them.
df = pd.get_dummies(df, columns=columns_to_encode, dtype=int)


In [None]:
# Remove the 'id' column from the frame (presumably just a record
# identifier with no predictive meaning - confirm). In-place: re-running this
# cell without re-creating `df` raises KeyError.
df.drop(columns=['id'], inplace=True)

**The data now has around 23 features**

**The dataframe 'df' now has clean data**

In [None]:
# Drop selected indicator columns in place: one per encoded feature, and two
# for smoking_status. Presumably these serve as the reference categories so
# the remaining indicators are not redundant - confirm the choice.
df.drop(
    columns=[
        'Residence_type_Rural',
        'work_type_Private',
        'smoking_status_Unknown',
        'smoking_status_formerly smoked',
        'ever_married_Yes',
        'gender_Male',
    ],
    inplace=True,
)

In [None]:
# Display the remaining column names to verify the result of the drops.
df.columns

# Data Split

In [None]:
# Target is the 'stroke' column; every other column is a model input.
y = df['stroke']
X = df.drop(columns='stroke')

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Model Selection

**Train Logistic Regression Classifier**

In [None]:
# Build a logistic regression classifier with default settings and fit it on
# the training split (fit returns the estimator itself).
logreg = LogisticRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
logreg


Predict the model output

In [None]:

# Class predictions for the held-out validation rows.
y_pred = logreg.predict(X_valid)

# Fraction of validation rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred)
print(f'Accuracy: {accuracy}')