In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly_express as pe
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import missingno as msno

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)

In [None]:
# Preview only the first rows instead of rendering the whole frame;
# the shape is reported by the next cell.
df.head()

In [None]:
# Column dtypes and non-null counts, followed by the (rows, columns) shape.
df.info()
print(f"Shape of Data {df.shape}")

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
summary_stats = df.describe()
summary_stats.T

# NA Values

In [None]:
# Draw missingno's matrix plot for the frame to see which columns contain missing values.
msno.matrix(df)

# Categorical Values

In [None]:
# One pie chart per categorical column, showing how often each category occurs.
cols = ["gender","hypertension","heart_disease","ever_married","work_type","Residence_type","smoking_status","stroke"]
for col in cols:
    # Count the rows per category once and reuse the result for both values and labels.
    category_counts = df[col].groupby(df[col]).count()
    fig = pe.pie(
        values=category_counts,
        names=category_counts.index,
        title=f"Distribution of {col}",
    )
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.show()

In [None]:
# Keep only the rows whose gender is "Male" or "Female"; every other gender
# value is dropped (the original note describes the "Other" value as imbalanced).
is_male = df["gender"] == "Male"
is_female = df["gender"] == "Female"
df = df[is_male | is_female]

# Numerical Values

### Distribution of BMI

In [None]:
# Kolmogorov-Smirnov normality check for BMI (missing values dropped).
# kstest(x, "norm") with no extra arguments compares against the standard normal
# N(0, 1), which raw BMI values cannot match regardless of their shape. Passing the
# sample mean and standard deviation as loc/scale makes the test about the shape.
# NOTE: loc/scale are estimated from the same sample, so the p-value is only approximate.
bmi_observed = df["bmi"].dropna()
st.kstest(bmi_observed, "norm", args=(bmi_observed.mean(), bmi_observed.std()))

* BMI is not normally distributed.

In [None]:
# Kolmogorov-Smirnov normality check for age.
# kstest(x, "norm") with no extra arguments compares against the standard normal
# N(0, 1), which raw ages cannot match regardless of their shape. Passing the
# sample mean and standard deviation as loc/scale makes the test about the shape.
# NOTE: loc/scale are estimated from the same sample, so the p-value is only approximate.
age_observed = df["age"]
st.kstest(age_observed, "norm", args=(age_observed.mean(), age_observed.std()))

Age is not normally distributed.

In [None]:
# Kolmogorov-Smirnov normality check for the average glucose level.
# kstest(x, "norm") with no extra arguments compares against the standard normal
# N(0, 1), which raw glucose values cannot match regardless of their shape. Passing
# the sample mean and standard deviation as loc/scale makes the test about the shape.
# NOTE: loc/scale are estimated from the same sample, so the p-value is only approximate.
glucose_observed = df["avg_glucose_level"]
st.kstest(glucose_observed, "norm", args=(glucose_observed.mean(), glucose_observed.std()))

avg_glucose_level is not normally distributed.

In [None]:
# Replace missing BMI values with the column median.
# The filled column is assigned back rather than calling fillna(inplace=True) on
# df["bmi"]: that form is chained in-place assignment, which pandas warns about and
# which leaves df unchanged when copy-on-write is active.
# NOTE(review): the median is taken over all rows, before the train/test split further down.
df["bmi"] = df["bmi"].fillna(df["bmi"].median())

In [None]:
# Histogram with a KDE overlay for each numeric feature, one figure per column.
cols = ["age","bmi","avg_glucose_level"]
for col in cols:
    # sns.distplot is deprecated; histplot on a density scale with kde=True
    # draws the same histogram-plus-density view.
    sns.histplot(df[col], kde=True, stat="density")
    plt.show()

# Data Visualization

In [None]:
# Box plots of the three numeric features in one figure, with every observation drawn.
cols = ["bmi","age","avg_glucose_level"]
numeric_features = df[cols]
fig = pe.box(numeric_features, points="all")
fig.show()

In [None]:
# BMI distribution per gender.
# The frame and column names are passed (instead of bare Series) so the axes and
# legend are labelled with the column names rather than the generic "x"/"y"/"color".
fig = pe.box(df, x="gender", y="bmi", color="gender")
fig.show()

In [None]:
# Average glucose level distribution per gender.
# The frame and column names are passed (instead of bare Series) so the axes and
# legend are labelled with the column names rather than the generic "x"/"y"/"color".
fig = pe.box(df, x="gender", y="avg_glucose_level", color="gender")
fig.show()

In [None]:
# Average glucose level against the stroke label, coloured by gender.
# The frame and column names are passed (instead of bare Series) so the axes and
# legend are labelled with the column names rather than the generic "x"/"y"/"color".
fig = pe.strip(df, x="stroke", y="avg_glucose_level", color="gender")
fig.show()

In [None]:
# BMI against the stroke label, coloured by gender.
# The frame and column names are passed (instead of bare Series) so the axes and
# legend are labelled with the column names rather than the generic "x"/"y"/"color".
fig = pe.strip(df, x="stroke", y="bmi", color="gender")
fig.show()

In [None]:
# Age against the stroke label, coloured by gender.
# The frame and column names are passed (instead of bare Series) so the axes and
# legend are labelled with the column names rather than the generic "x"/"y"/"color".
fig = pe.strip(df, x="stroke", y="age", color="gender")
fig.show()

In [None]:
# Age against the stroke label, coloured by the hypertension flag.
# The frame and column names are passed (instead of bare Series) so the axes and
# legend are labelled with the column names rather than the generic "x"/"y"/"color".
fig = pe.strip(df, x="stroke", y="age", color="hypertension")
fig.show()

In [None]:
# Average glucose level per smoking status, coloured by gender.
# The frame and column names are passed (instead of bare Series) so the axes and
# legend are labelled with the column names rather than the generic "x"/"y"/"color".
fig = pe.strip(df, x="smoking_status", y="avg_glucose_level", color="gender")
fig.show()

# Outliers

In [None]:
# Drop rows outside the 1.5 * IQR fences, first for bmi and then for avg_glucose_level.
# The columns are handled one after the other, so the glucose quartiles are computed
# on the rows that survived the bmi filter.
for feature in ("bmi", "avg_glucose_level"):
    q1 = df[feature].quantile(0.25)
    q3 = df[feature].quantile(0.75)
    iqr = q3 - q1
    mini = q1 - 1.5 * iqr
    maxi = q3 + 1.5 * iqr
    within_fences = (df[feature] >= mini) & (df[feature] <= maxi)
    df = df[within_fences]

# Normalization (standardization)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the three continuous features with a StandardScaler and write the
# scaled values back into the same columns.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test split further down.
cols = ["age","bmi","avg_glucose_level"]
scale = StandardScaler()
scale.fit(df[cols])
df[cols] = scale.transform(df[cols])

# encoding

In [None]:
# Preview only the first rows of the frame instead of rendering the whole table.
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column with integer codes. A single encoder is re-fitted
# per column, so after the loop it holds only the mapping of the last column.
cols = ["gender","ever_married","work_type","Residence_type","smoking_status"]
encoder = LabelEncoder()
for i in cols:
    encoded_values = encoder.fit_transform(df[i])
    df[i] = encoded_values

In [None]:
# Remove the "id" column from the frame in place.
df.drop(columns=["id"], inplace=True)

# model

## lightgbm

In [None]:
# Separate the target column ("stroke") from the predictor columns.
y = df["stroke"]
X = df.drop(columns=["stroke"])

In [None]:
from sklearn.model_selection import train_test_split,KFold
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the stroke / no-stroke proportions the same in both parts. The
# notebook balances class weights for this target, so an unstratified split could
# leave the test set with very few positive cases.
X_train,X_test,y_train,y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:
from sklearn.utils import class_weight
# Map each class label in y_train to its "balanced" weight.
# classes and y are passed by keyword: they are keyword-only in current scikit-learn,
# where the positional form raises a TypeError. Keywords also work on older releases.
classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train,
)
class_weights = dict(zip(classes, balanced_weights))


In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,classification_report,f1_score
# Fit a LightGBM classifier with the class weights computed above, predict the
# held-out rows and report plain accuracy.
lgbm_model = LGBMClassifier(class_weight=class_weights)
lgbm_model.fit(X_train, y_train)
y_pred = lgbm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy : ", test_accuracy)

In [None]:
# Per-class precision / recall / F1 for the current test-set predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Confusion matrix of the true test labels against the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# ROC AUC of the LightGBM model on the test set, scored on the second column
# (index 1) of the predicted probabilities.
y_proba = lgbm_model.predict_proba(X_test)
positive_class_scores = y_proba[:, 1]
roc_auc_score(y_test, positive_class_scores)

In [None]:
# ROC curve for the probabilities in y_proba, drawn against the dashed chance diagonal.
positive_class_scores = y_proba[:, 1]
fpr, tpr, _ = roc_curve(y_test, positive_class_scores)
plt.plot(fpr, tpr)
chance_line = [0, 1]
plt.plot(chance_line, chance_line, color='red', linestyle='--', lw=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate');

* Our model is not very good at detecting Stroke = 1 values, but it is good at detecting values of Stroke = 0.

## catboost

In [None]:
from catboost import CatBoostClassifier
# Fit a CatBoost classifier on the training split, predict the held-out rows and
# show plain accuracy as the cell output.
# NOTE(review): no eval_set is passed to fit; confirm that the overfitting detector
# selected by od_type="Iter" has any effect in this setup.
cat = CatBoostClassifier(od_type="Iter")
cat_model = cat.fit(X_train, y_train)
y_pred = cat_model.predict(X_test)
accuracy_score(y_test, y_pred)


In [None]:
# Confusion matrix of the true test labels against the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# ROC AUC of the CatBoost model on the test set, scored on the second column
# (index 1) of the predicted probabilities.
y_proba = cat_model.predict_proba(X_test)
positive_class_scores = y_proba[:, 1]
roc_auc_score(y_test, positive_class_scores)

In [None]:
# ROC curve for the probabilities in y_proba, drawn against the dashed chance diagonal.
positive_class_scores = y_proba[:, 1]
fpr, tpr, _ = roc_curve(y_test, positive_class_scores)
plt.plot(fpr, tpr)
chance_line = [0, 1]
plt.plot(chance_line, chance_line, color='red', linestyle='--', lw=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate');