In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
%matplotlib inline
# Print the full path of every file available under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

### Importing the Dataset

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory and preview it.
df = pd.read_csv(
    '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
df.head(n=5)

### Exploratory Data Analysis

In [None]:
# Summary statistics of the freshly loaded frame (the 'id' column is still present here).
df.describe()

In [None]:
# Drop 'id': it is of no use for predicting.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run; the in-place drop raised KeyError on a second execution
# because the column was already gone.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Preview the first rows after the 'id' column has been dropped.
df.head(n=5)

In [None]:
# Number of missing values in each column.
df.isna().sum()

The `bmi` column has 201 missing values. Because BMI can be an important predictor of stroke, we fill these in before modelling.

In [None]:
# Rows whose BMI is missing, and how those rows split across the stroke classes.
bmi_none = df.loc[df['bmi'].isna()]
bmi_none.loc[:, 'stroke'].value_counts()

In [None]:
# Impute missing BMI with the mean BMI of the row's own stroke class (0 or 1).
# The fill is restricted to the 'bmi' column via a dict: a bare scalar
# fillna() would also overwrite NaNs in any other column with a BMI mean.
# NOTE(review): the fill value depends on the target 'stroke', so label
# information leaks into a feature — consider an overall (or train-only) mean.
df1 = df[df['stroke'] == 1].fillna({'bmi': df.loc[df['stroke'] == 1, 'bmi'].mean()})
df2 = df[df['stroke'] == 0].fillna({'bmi': df.loc[df['stroke'] == 0, 'bmi'].mean()})

In [None]:
# Stack the two imputed per-class frames back into a single frame
# (stroke == 1 rows first, then stroke == 0 rows; original index labels kept).
result_df = pd.concat(objs=[df1, df2], axis=0)

In [None]:
# Per-column count of missing values in the combined frame.
result_df.isna().sum(axis=0)

In [None]:
# Number of rows in each stroke class.
result_df.loc[:, 'stroke'].value_counts()

In [None]:
import seaborn as sns
# Pairwise relationships between the columns of df, coloured by the stroke label.
sns.pairplot(data=df, hue="stroke")

Since this dataset is imbalanced, we have to balance it using sampling techniques.

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each category so the dummy columns are not redundant.
result_df = pd.get_dummies(data=result_df, drop_first=True)

In [None]:
# Preview the frame after one-hot encoding.
result_df.head(n=5)

In [None]:
# Column names after encoding; the feature list in the next cell is copied from this output.
result_df.columns

In [None]:
# Feature matrix: every column of the encoded frame except the target.
x = result_df.loc[:, [
    'age',
    'hypertension',
    'heart_disease',
    'avg_glucose_level',
    'bmi',
    'gender_Male',
    'gender_Other',
    'ever_married_Yes',
    'work_type_Never_worked',
    'work_type_Private',
    'work_type_Self-employed',
    'work_type_children',
    'Residence_type_Urban',
    'smoking_status_formerly smoked',
    'smoking_status_never smoked',
    'smoking_status_smokes',
]]
# Target vector: the stroke label.
y = result_df.loc[:, 'stroke']

### Using SMOTE for oversampling

In [None]:
from imblearn.over_sampling import SMOTE
# Seed SMOTE so the synthetic minority samples — and every metric computed
# downstream — are reproducible on Restart & Run All (the original was unseeded).
# NOTE(review): oversampling is applied to the full data set before the
# train/test split in a later cell, so the held-out split will contain
# synthetic rows; consider resampling only the training portion.
sampling = SMOTE(random_state=0)
x, y = sampling.fit_resample(x, y)

In [None]:
# Class counts of the target after resampling.
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.3,
)

Since we will use LightGBM, which is a decision-tree-based algorithm, there is no need to scale the feature values.

### Model Building using LightGBM

LightGBM is a gradient boosting model that uses tree-based algorithms. It is much faster than the usual tree-based algorithms like Decision Trees, Random Forests, etc. It has the following advantages over the traditional machine learning algorithms.

* Faster training speed with better efficiency. 
* Lower memory usage.
* Supports GPU processing. 
* Highly scalable and efficiently handles large datasets

In [None]:
import lightgbm as lgb

In [None]:
# Wrap the training split in LightGBM's Dataset container.
train_data = lgb.Dataset(data=x_train, label=y_train)

#### Setting Parameters

In [None]:
# LightGBM training parameters: binary objective, evaluated with AUC.
param = {
    'num_leaves': 100,
    'objective': 'binary',
    'max_depth': 5,
    'learning_rate': 0.05,
    'metric': ['auc'],
}

In [None]:
# Fit the booster on the training set for 5 boosting rounds.
lgbm = lgb.train(param, train_data, num_boost_round=5)

### Making Predictions

In [None]:
# Raw model scores for the held-out rows (thresholded into labels in the next cell).
y_pred = lgbm.predict(data=x_test)

In [None]:
# Convert each score to a hard label: 1 when strictly above 0.5, otherwise 0.
y_preds = [1 if score > 0.5 else 0 for score in y_pred]

### Evaluating Results

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# scikit-learn's signature is (y_true, y_pred). The original passed the
# predictions first, which swaps precision with recall and reports support
# counted from the predictions instead of the true labels.
print(classification_report(y_test, y_preds))

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual class and
# columns the predicted class. The original passed them swapped, which
# transposed the matrix. fmt='d' prints the cell counts as plain integers,
# and the axis labels make the orientation explicit.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d').set(
    xlabel='Predicted label', ylabel='True label'
);