# Introduction

Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score

Loading CSV Data

In [2]:
df = pd.read_csv("strokeprediction.csv") # Load the dataset

In [3]:
df.head() # Display the first 5 rows of the dataset

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Female,58.0,0,0,Yes,Govt_job,Urban,82.27,31.8,never smoked,0
1,Female,59.0,0,0,Yes,Self-employed,Rural,104.89,22.4,never smoked,0
2,Male,28.0,0,0,No,Private,Urban,65.99,27.5,formerly smoked,0
3,Male,67.0,0,1,Yes,Self-employed,Urban,90.69,24.9,smokes,0
4,Male,45.0,0,0,Yes,Private,Rural,77.83,35.5,formerly smoked,0


Checking Null Values

In [4]:
df.isnull().sum() # Check for missing values

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

Encoding the object dtype columns

In [5]:
le = LabelEncoder() # Label Encoder object
for column in df.columns:
    if df[column].dtype == "object": # If the column is of type object
        df[column] = le.fit_transform(df[column]) # Fit and transform the column

---

# Light Gradient Boosting Classification Model

Defining Constants

In [6]:
SPLITS = 10 # Number of KFolds
RANDOM_STATE = 42 # Random State for reproducibility

## Without KFold Cross Validation

Defining the Training and Target Data

In [7]:
X = df.copy() # Copy of the dataframe
y = df["stroke"] # Target column
X.drop(["stroke"], axis = 1, inplace = True) # Drop the target column

Spliting Data into Training and Testing

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = RANDOM_STATE)

Fitting the Model

In [9]:
lgbm = LGBMClassifier() # LGBM Classifier object
lgbm.fit(X_train, y_train) # Fit the model on the training data

Predicting the Test Values

In [10]:
pred = lgbm.predict(X_test)  # Predict on the test data

Accuracy Score of The Model

In [11]:
print("The accuracy score of LGBM without KFold is", accuracy_score(y_test, pred)) # Print the accuracy score

The accuracy score of LGBM without KFold is 0.8026143790849674


## With KFold Cross Validation

Defining KFold for the Regression Model

In [12]:
kf = StratifiedKFold(n_splits = SPLITS, shuffle = True, random_state = RANDOM_STATE) # Stratified KFold object

Defining the Training and Target Data

In [13]:
X = df.copy() # Copy of the dataframe
y = df["stroke"] # Target column
X.drop(["stroke"], axis = 1, inplace = True) # Drop the target column

In [14]:
score = [] # List to store the accuracy scores

Spliting Data & Fitting the Model

In [15]:
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    lgbr = LGBMClassifier()
    lgbr.fit(X_train, y_train)
    pred = lgbr.predict(X_test)
    score.append(accuracy_score(y_test, pred))

Mean Accuracy Score of the Model

In [16]:
print("The accuracy score of LGBM with KFold is", np.mean(score)) # Print the mean of the accuracy scores

The accuracy score of LGBM with KFold is 0.7919916628068552
