In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**IMPORTING LIBRARIES**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**IMPORTING DATASET**

In [None]:
# Location of the stroke dataset CSV inside the Kaggle input directory.
STROKE_CSV = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"

df = pd.read_csv(STROKE_CSV)
df.head()  # preview the first rows of the raw frame

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the raw frame.
df.describe()

**FINDING CORRELATIONS BETWEEN THE FEATURES (IF ANY)**

In [None]:
# Correlation heatmap of the numeric columns only.
# At this point `df` still holds string-valued categorical columns (they are
# only label-encoded later, on a copy), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0. Selecting numeric dtypes first works on
# every pandas version.
plt.figure(figsize=(10,10))
numeric_corr = df.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr,annot=True)

**CHECKING FOR MISSING VALUES**

In [None]:
# Number of missing values in each column of the raw frame.
df.isnull().sum()

In [None]:
# Histogram of the bmi column using 20 bins.
sns.histplot(data=df, x="bmi", bins=20)

**TAKING CARE OF MISSING DATA**

In [None]:
# Median of the bmi column, used below as the fill value for missing bmi entries.
median=df["bmi"].median()

In [None]:
# Work on a copy so the raw frame `df` keeps its original values
# (later cells still read the unencoded `df`).
df_m=df.copy()

In [None]:
# Replace missing bmi values in the working copy with the median computed from `df`.
df_m["bmi"]=df_m["bmi"].fillna(median)

In [None]:
# Re-count missing values per column after filling bmi.
df_m.isnull().sum()

**ENCODING CATEGORICAL FEATURES**

Our dataset has the following categorical features:

1. gender

2. ever_married

3. work_type

4. Residence_type

5. smoking_status

We will encode each of them as integers with scikit-learn's `LabelEncoder`.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns that are converted to integer codes.
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Fit a separate encoder per column and keep each one, so the integer codes can
# still be mapped back to their labels (e.g. encoders['gender'].classes_).
# Re-fitting a single shared encoder would discard every mapping but the last.
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_m[col] = le.fit_transform(df_m[col])
    encoders[col] = le
# After the loop `le` is the encoder fitted on 'smoking_status', as before.

In [None]:
# Preview the working copy after label encoding.
df_m.head()

In [None]:
# KDE of the label-encoded work_type values, one panel per value of `stroke`.
g = sns.FacetGrid(df_m, col='stroke').map(sns.kdeplot, 'work_type')

In [None]:
# KDE of the label-encoded smoking_status values, one panel per value of `stroke`.
g = sns.FacetGrid(df_m, col='stroke').map(sns.kdeplot, 'smoking_status')

Drop the `id` column, as it is of no use to us.

In [None]:
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result must be assigned back; otherwise "id" stays in df_m and ends up among
# the model features. errors="ignore" keeps the cell safe to re-run once the
# column is already gone.
df_m = df_m.drop(columns="id", errors="ignore")
df_m.head()

In [None]:
# Features are every column except the last one; the target is the last column.
X, y = df_m.iloc[:, :-1], df_m.iloc[:, -1]

In [None]:
# Preview the feature frame with the notebook's rich table display instead of
# printing the whole frame as plain text.
X.head()

In [None]:
# Preview the target column instead of printing the whole Series as plain text.
y.head()

**SPLITTING DATASET INTO TRAIN AND TEST SET**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

**TRAINING LOGISTIC REGRESSION MODEL ON THE DATASET**

In [None]:
from sklearn.linear_model import LogisticRegression

# The features are fed in unscaled, so the solver may hit the default limit of
# 100 iterations before converging (scikit-learn then emits a
# ConvergenceWarning). Allow more iterations so the fit can finish.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted classifier on the held-out test split.
lr.score(X_test,y_test)

**MAKING CONFUSION MATRIX**

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions (rows: true class, columns:
# predicted class, following scikit-learn's convention).
cm = confusion_matrix(y_test, lr.predict(X_test))

# fmt="d" prints the cell counts as plain integers; the default annotation
# format (".2g") would render counts of 100 or more in scientific notation.
ax_cm = sns.heatmap(cm, annot=True, fmt="d")
ax_cm.set(xlabel="Predicted label", ylabel="True label");

**EXPLORING BMI WITH RESPECT TO OTHER COLUMNS**

In [None]:
# bmi against age from the raw frame, one line per gender.
sns.lineplot(data=df, x='age', y='bmi', hue='gender')

In [None]:
import matplotlib as mpl

sns.set_theme(style='ticks')
f, ax = plt.subplots(figsize=(10, 7))
sns.despine(fig=f)

# Stacked histogram of bmi per gender on a log-scaled x axis, drawn on `ax`.
hist_style = dict(
    multiple="stack",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
)
sns.histplot(data=df, x="bmi", hue="gender", log_scale=True, ax=ax, **hist_style)

# Label the log axis with plain numbers at fixed bmi values.
ax.xaxis.set_major_formatter(mpl.ticker.ScalarFormatter())
ax.set_xticks([10,20,30,40,50])

Now I will look at BMI as a function of two factors, age and gender: I will compute the median BMI for three age groups, separately for the Male, Female and Other gender categories.

Three age groups :

1.0-30

2.30-50

3.>50

In [None]:
# Rows of the raw frame whose gender is 'Male'.
data_male = df.loc[df['gender'].eq('Male')]

In [None]:
# Preview the male subset.
data_male.head()

In [None]:
# Median bmi of the male subset per age group.
# The groups have to cover every age: strict bounds on both sides
# (<30, >30 and <50, >50) silently leave out everyone aged exactly 30 or 50.
# The groups used here are contiguous: up to 30, above 30 up to 50, above 50.
male_age = data_male['age']
male_age_groups = [
    ('age 30 or less', male_age <= 30),
    ('age more than 30 and up to 50', (male_age > 30) & (male_age <= 50)),
    ('age greater than 50', male_age > 50),
]
for group_label, in_group in male_age_groups:
    print('Median BMI of male with ' + group_label + ' : ', data_male.loc[in_group, 'bmi'].median())

In [None]:
# Median bmi of the female subset per age group.
# The groups have to cover every age: strict bounds on both sides
# (<30, >30 and <50, >50) silently leave out everyone aged exactly 30 or 50.
# The groups used here are contiguous: up to 30, above 30 up to 50, above 50.
data_female = df[df["gender"] == "Female"]
female_age = data_female['age']
female_age_groups = [
    ('age 30 or less', female_age <= 30),
    ('age more than 30 and up to 50', (female_age > 30) & (female_age <= 50)),
    ('age greater than 50', female_age > 50),
]
for group_label, in_group in female_age_groups:
    print('Median BMI of Female with ' + group_label + ' : ', data_female.loc[in_group, 'bmi'].median())

In [None]:
# Median bmi of the 'Other' gender subset per age group.
# The groups have to cover every age: strict bounds on both sides
# (<30, >30 and <50, >50) silently leave out everyone aged exactly 30 or 50.
# The groups used here are contiguous: up to 30, above 30 up to 50, above 50.
# A group with no rows (or no recorded bmi) yields NaN.
data_other = df[df['gender'] == 'Other']
other_age = data_other['age']
other_age_groups = [
    ('age 30 or less', other_age <= 30),
    ('age more than 30 and up to 50', (other_age > 30) & (other_age <= 50)),
    ('age greater than 50', other_age > 50),
]
for group_label, in_group in other_age_groups:
    print('Median BMI of Other with ' + group_label + ' : ', data_other.loc[in_group, 'bmi'].median())