In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# For Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from ipywidgets import Dropdown,Accordion,Label,link
%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw stroke-prediction CSV into the working DataFrame.
DATA_PATH = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Remove the "id" column (presumably a row identifier with no predictive
# value -- confirm against the dataset description).
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError because "id" has already been removed.
df = df.drop(columns=["id"], errors="ignore")

# EDA

In [None]:
# Complete-case copy used for the EDA: rows containing any missing value are
# removed and the index is renumbered from 0 without keeping the old labels.
clean = df.dropna().reset_index(drop=True)

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# At this point `clean` still holds the raw string-valued categorical columns
# (they are one-hot encoded only later), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so restrict to numeric dtypes explicitly.
numeric_corr = clean.select_dtypes(include="number").corr()
px.imshow(numeric_corr)

The numeric features are only weakly correlated with one another.

In [None]:
# Bar chart of how many rows fall into each stroke class (0 vs 1).
sns.countplot(data=clean, x="stroke")

In [None]:
# Draw one histogram per feature on a 5x2 grid, with the bars split by
# stroke outcome via the hue.
feature_columns = [
    'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
    'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status',
]

plt.figure(figsize=(30, 35))
for position, feature in enumerate(feature_columns, start=1):
    plt.subplot(5, 2, position)  # subplot positions are 1-based
    sns.histplot(clean, x=feature, hue="stroke")
plt.show()

OBSERVATIONS

*  Stroke cases appear more frequently among:

        self-employed people
        people with a BMI of around 30
        people older than 70
        former smokers, and almost as often current smokers
        males

In [None]:
# Scatter BMI against average glucose level, coloured by stroke outcome.
plt.figure(figsize=(10, 10))
ax = sns.scatterplot(data=clean, x="bmi", y="avg_glucose_level", hue="stroke")
ax.set_xlabel("BMI")
ax.set_ylabel("AVG-GLUCOSE-LEVEL")
ax.set_title("BMI vs AVG-GLUCOSE-LEVEL")

An average glucose level above 150 combined with a BMI between 20 and 40 appears to be associated with a higher incidence of stroke.

In [None]:
# Subset of the cleaned data containing only the rows where a stroke occurred.
# Boolean indexing keeps each column's original dtype; the previous
# where(...).dropna() round-trip first turned every non-matching row into NaN,
# which upcast the integer columns to float.
yes = clean[clean["stroke"] == 1]

## Effects of Categorical Variables

In [None]:
# For every categorical feature, plot the stroke cases as glucose level (log
# scale) against BMI, with marker size following age and colour following the
# category being inspected.
categorical_columns = [
    'gender', 'hypertension', 'heart_disease', 'ever_married',
    'work_type', 'Residence_type', 'smoking_status',
]
for category in categorical_columns:
    fig = px.scatter(
        yes,
        x="avg_glucose_level",
        y="bmi",
        size="age",
        color=category,
        hover_name=category,
        title=f"Effects of {category}",
        log_x=True,
        size_max=10,
    )
    fig.show()

# Analysis and Model

## Preprocessing

In [None]:
# Impute missing BMI values with the column mean (author's rationale: most
# people fall in the 30 to 40 range). Series.mean() skips NaN by default.
# The result is assigned back to the column: calling an inplace method on
# df["bmi"] is chained assignment, which emits a FutureWarning in pandas 2.x
# and leaves `df` unchanged once Copy-on-Write is enabled.
df["bmi"] = df["bmi"].fillna(df["bmi"].mean())
df.isnull().sum()

In [None]:
# One-hot encode the categorical columns, then move the "stroke" target to the
# last position so later cells can slice features and target by position.
df = pd.get_dummies(df)
feature_cols = [col for col in df.columns if col != "stroke"]
df = df[feature_cols + ["stroke"]]
df.head()

In [None]:
# Handling class imbalance: oversample the minority (stroke == 1) class with
# replacement until it has as many rows as the majority class.
from sklearn.utils import resample

df_majority = df[df.stroke == 0]
df_minority = df[df.stroke == 1]

# The target size is derived from the data instead of a hard-coded row count,
# so the cell stays correct if rows are added or filtered upstream.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=0,
)

# NOTE(review): this frame contains duplicated minority rows, so splitting it
# into train/validation/test afterwards would leak duplicates across the
# partitions. Use it for inspection only and balance the training partition
# after splitting.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

df_upsampled.stroke.value_counts()

In [None]:
# Splitting the data into 60% train / 20% validation / 20% test, then
# balancing the classes in the training partition only.
from sklearn.model_selection import train_test_split as tts
from sklearn.utils import resample

# Features are every column except the last; the target ("stroke") is the last
# column. Casting to float avoids an object-dtype array when the frame mixes
# boolean dummy columns with numeric ones.
X = df.iloc[:, :-1].to_numpy(dtype=float)
y = df.iloc[:, -1].to_numpy()

# Stratify both splits so the rare positive class is represented in every
# partition in the same proportion as in the full data.
x_train, x_hold, y_train, y_hold = tts(X, y, test_size=0.4, random_state=0, stratify=y)
x_val, x_test, y_val, y_test = tts(x_hold, y_hold, test_size=0.5, random_state=0, stratify=y_hold)

# Oversample the minority class *after* splitting: doing it before would put
# copies of the same row in both training and evaluation data and inflate the
# scores. Validation and test keep the original class distribution.
is_minority = y_train == 1
n_majority = int((~is_minority).sum())
if is_minority.any() and is_minority.sum() < n_majority:
    x_extra, y_extra = resample(
        x_train[is_minority],
        y_train[is_minority],
        replace=True,
        n_samples=n_majority,
        random_state=0,
    )
    x_train = np.concatenate([x_train[~is_minority], x_extra])
    y_train = np.concatenate([y_train[~is_minority], y_extra])

print([s.shape for s in [x_train, x_val, x_test, y_train, y_val, y_test]])

In [None]:
# Classification using the Gradient Boosting algorithm
from sklearn.ensemble import GradientBoostingClassifier

# 100 depth-1 trees (decision stumps) with a small learning rate; the seed is
# fixed so repeated runs give the same model.
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=1,
    random_state=0,
)
model.fit(x_train, y_train)

# Mean accuracy on the held-out test partition.
model.score(x_test, y_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test partition.
# zero_division=1 sets the value reported when a metric's denominator is zero
# (for example, when a class is never predicted).
y_pred = model.predict(x_test)
target_names = ["NO", "YES"]
report = classification_report(
    y_test,
    y_pred,
    target_names=target_names,
    zero_division=1,
)
print(report)