![](https://storage.googleapis.com/kaggle-datasets-images/1120859/1882037/04da2fb7763e553bdf251d5adf6f88d9/data-original.jpg?t=2021-01-26-19-57-05)

## Stroke Prediction
According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about the patient.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the stroke-prediction CSV from the Kaggle input directory.
csv_path = r'../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the frame.
data.head(n=5)

### About the Data:
1. id: unique identifier
2. gender: "Male", "Female" or "Other"
3. age: age of the patient
4. hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
5. heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
6. ever_married: "No" or "Yes"
7. work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
8. Residence_type: "Rural" or "Urban"
9. avg_glucose_level: average glucose level in blood
10. bmi: body mass index
11. smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
12. stroke: 1 if the patient had a stroke or 0 if not
* Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
data.shape

In [None]:
# Data type of each column.
data.dtypes

In [None]:
# Number of distinct values in each column.
data.nunique()

* let me drop **id** column

In [None]:
# Drop the identifier column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run is a no-op rather
# than a KeyError for the already-removed column.
data = data.drop(columns='id', errors='ignore')

* **gender** has 3 unique values, let's see each value counts:

In [None]:
# Frequency of each gender label.
data['gender'].value_counts()

* It looks like **Other** occurs only once, so let's relabel it as **Male**.

In [None]:
# Fold the 'Other' gender label into 'Male'.
# Assign the result back to the column: calling replace(..., inplace=True) on
# data['gender'] operates on an intermediate Series and does not update `data`
# when pandas Copy-on-Write is active.
data['gender'] = data['gender'].replace('Other', 'Male')

* **work_type** has 5 unique values, let's see each value's count:

In [None]:
# Frequency of each work_type label.
data['work_type'].value_counts()

* let's put **children** in **Never_worked** category

In [None]:
# Merge the 'children' category into 'Never_worked'.
# Assign the result back to the column: calling replace(..., inplace=True) on
# data['work_type'] operates on an intermediate Series and does not update
# `data` when pandas Copy-on-Write is active.
data['work_type'] = data['work_type'].replace('children', 'Never_worked')

* **smoking_status** has 4 unique values, let's see each value's count:

In [None]:
# Frequency of each smoking_status label.
data['smoking_status'].value_counts()

* here, "Unknown" in smoking_status means that the information is unavailable for this patient.

* let's check if there are any missing values

In [None]:
# Count missing entries per column.
data.isna().sum()

* Looks like the **bmi** column has some missing values. Let's replace the null values with the **bmi** mean.

In [None]:
# Impute missing bmi values with the column mean.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# data['bmi'] operates on an intermediate Series and does not update `data`
# when pandas Copy-on-Write is active.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

In [None]:
# Re-count missing entries per column after the bmi fill.
data.isna().sum()

* looks like there is no missing value.

### data visualization
* let's visualize the data

In [None]:
# Side-by-side count plots of gender and ever_married, split by stroke outcome.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, col in zip(axes, ['gender', 'ever_married']):
    sns.countplot(x=col, hue='stroke', data=data,
                  palette=["#8000ff", "#da8829"], alpha=.5, linewidth=0, ax=ax)
    ax.set_title(f'Distribution of {col}\naccording to target variable')
plt.show()

In [None]:
# Side-by-side count plots of work_type and Residence_type, split by stroke outcome.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, col in zip(axes, ['work_type', 'Residence_type']):
    sns.countplot(x=col, hue='stroke', data=data,
                  palette=["#8000ff", "#da8829"], alpha=.5, linewidth=0, ax=ax)
    ax.set_title(f'Distribution of {col}\naccording to target variable')
plt.show()

In [None]:
# Count plot of smoking_status, split by stroke outcome.
ax = sns.countplot(
    x='smoking_status',
    hue='stroke',
    data=data,
    palette=["#8000ff", "#da8829"],
    alpha=.5,
    linewidth=0,
)
ax.set_title('Distribution of smoking_status\naccording to target variable')
plt.show()

In [None]:
# Filled KDE plots of the two continuous features, split by stroke outcome.
plt.figure(figsize=(14,6))
plt.subplot(121)
sns.kdeplot(x='avg_glucose_level',hue='stroke',data=data,fill=True, palette=["#8000ff","#da8829"], alpha=.5, linewidth=0)
# Title fixed: the original string started with a stray double quote.
plt.title('Distribution of avg_glucose_level\naccording to target variable')

plt.subplot(122)
sns.kdeplot(x='bmi',hue='stroke',data=data,fill=True,palette=["#8000ff","#da8829"], alpha=.5, linewidth=0)
# Title fixed: the original string started with a stray double quote.
plt.title('Distribution of bmi\naccording to target variable')
plt.show()

### Data Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
var_mod = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
# Integer-encode each categorical column; the shared encoder is re-fitted per
# column, so afterwards `le` only holds the mapping of the last column.
data[var_mod] = data[var_mod].apply(le.fit_transform)

### Model building

In [None]:
# Separate the target column from the feature matrix.
y = data['stroke']
X = data.drop(columns='stroke')

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score

def cross_val(X, y, model, params, folds=5):
    """Train and score ``model`` with stratified K-fold cross-validation.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels; also used to stratify the folds.
    model : callable
        Estimator class (or factory) invoked as ``model(**params)`` once per
        fold. Its ``fit`` must accept ``eval_set``.
    params : dict
        Keyword arguments forwarded to ``model``.
    folds : int, default 5
        Number of stratified splits.

    Returns
    -------
    The estimator fitted on the last fold.

    Notes
    -----
    The held-out fold is also passed as the early-stopping ``eval_set``, so the
    printed accuracies are measured on data that influenced the stopping point.
    """
    import inspect

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)
    scores = []
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"Fold: {fold}")
        x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        x_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        alg = model(**params)

        # Older estimator APIs take early stopping / logging as fit() keywords.
        # LightGBM >= 4.0 removed them in favour of callbacks, so fall back to
        # the (LightGBM-specific) callback form when the keyword is gone.
        if 'early_stopping_rounds' in inspect.signature(alg.fit).parameters:
            fit_kwargs = {'early_stopping_rounds': 100, 'verbose': 400}
        else:
            from lightgbm import early_stopping, log_evaluation
            fit_kwargs = {'callbacks': [early_stopping(stopping_rounds=100),
                                        log_evaluation(period=400)]}

        alg.fit(x_train, y_train,
                eval_set=[(x_test, y_test)],
                **fit_kwargs)

        pred = alg.predict(x_test)
        accuracy = accuracy_score(y_test, pred)
        scores.append(accuracy)
        print(f" accuracy_score: {accuracy}")
        print("-"*50)

    # Report the aggregate so the run yields one cross-validated estimate.
    print(f"Mean accuracy over {len(scores)} folds: {sum(scores) / len(scores)}")

    return alg

In [None]:
# Hyper-parameters passed to the LightGBM classifier constructor.
lgb_params = dict(
    learning_rate=0.01,
    max_depth=8,
    n_estimators=900,
    num_leaves=8,
)

In [None]:
from lightgbm import LGBMClassifier
# Cross-validate LightGBM with the parameters above; keeps the estimator returned by cross_val.
lgb_model = cross_val(X=X, y=y, model=LGBMClassifier, params=lgb_params)

#### if you like this notebook please upvote it.
#### thank you!