In [69]:
import warnings 
# Suppress every warning for the rest of the session: no category or module
# filter is given, so the 'ignore' action applies to all of them.
warnings.filterwarnings('ignore')

In [70]:
import pickle as pkl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [71]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder

In [72]:
# Load the raw stroke dataset. The path is assembled from components so the
# notebook also runs on non-Windows systems, where a backslash is not a path
# separator and the original '..\\Data\\...' string would not resolve.
from pathlib import Path

df = pd.read_csv(Path("..") / "Data" / "healthcare-dataset-stroke-data.csv")
df.head()

In [73]:
# Heatmap of the boolean null mask: one row per record, one column per field.
null_mask = df.isnull()
sns.heatmap(null_mask, cmap='summer', cbar=False, yticklabels=False)

In [74]:
# Replace missing BMI values with the column mean (computed over all rows of df).
bmi_mean = df["bmi"].mean()
df["bmi"] = df["bmi"].fillna(bmi_mean)

In [75]:
# Re-draw the null mask after the BMI imputation to check what is still missing.
remaining_nulls = df.isnull()
sns.heatmap(remaining_nulls, cbar=False, yticklabels=False)

In [76]:
# Column names grouped by kind: numeric columns first, then the categorical ones.
df_numerical = [
    'id',
    'hypertension',
    'heart_disease',
    'age',
    'avg_glucose_level',
    'bmi',
    'stroke',
]
df_categorical = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

In [77]:
# Box plots of the three continuous features before outlier handling.
# The panels occupy the first three slots of a 3x4 subplot grid.
plt.figure(figsize=(15, 8))
box_columns = ['age', 'bmi', 'avg_glucose_level']
for slot, column in enumerate(box_columns, start=1):
    plt.subplot(3, 4, slot)
    sns.boxplot(y=df[column])
    plt.title(f"{column}")
plt.tight_layout()
plt.show()

In [78]:
# Columns checked for outliers, and the (initially empty) list of counts.
l = ['bmi', 'avg_glucose_level', 'age']
clist = []
def outlier_counter(l, data=None):
    """Count IQR-rule outliers in each of the given numeric columns.

    A value counts as an outlier when it lies above ``Q3 + 1.5 * IQR`` or
    below ``Q1 - 1.5 * IQR``. These are the same two fences that
    ``outlier_removal`` caps values to, so the reported counts match what
    that function will modify.

    Parameters
    ----------
    l : iterable of str
        Names of the columns to inspect.
    data : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        ``df`` is used.

    Returns
    -------
    list of int
        One outlier count per column, in the order given by ``l``. A fresh
        list is built on every call, so repeated calls do not accumulate
        results in shared state.
    """
    frame = df if data is None else data
    counts = []
    for column in l:
        values = frame[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        upper = q3 + 1.5 * iqr
        lower = q1 - 1.5 * iqr
        # Vectorised comparison: independent of the frame's index labels and
        # avoids a Python-level loop over every row.
        outside = (values > upper) | (values < lower)
        counts.append(int(outside.sum()))
    return counts
# Count outliers for the columns in `l` and print them; con[0], con[1] and
# con[2] follow the order of `l` (bmi, avg_glucose_level, age).
con = outlier_counter(l)
print(f"Outlier count: \nBMI = {con[0]}\navg_glucose_level = {con[1]}\nage = {con[2]}")

In [79]:
def outlier_removal(l, data=None):
    """Cap outliers in the given columns at the IQR fences, in place.

    Despite the name, no rows are removed: values above ``Q3 + 1.5 * IQR``
    are replaced by that upper fence and values below ``Q1 - 1.5 * IQR`` by
    the lower fence. Values inside the fences are left unchanged.

    Parameters
    ----------
    l : iterable of str
        Names of the columns to cap.
    data : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    None
        The selected columns of the frame are overwritten.
    """
    frame = df if data is None else data
    for column in l:
        q1 = frame[column].quantile(0.25)
        q3 = frame[column].quantile(0.75)
        iqr = q3 - q1
        upper = q3 + 1.5 * iqr
        lower = q1 - 1.5 * iqr
        # Series.clip does the two-sided capping in one call.
        frame[column] = frame[column].clip(lower=lower, upper=upper)
# Apply the IQR capping to the columns listed in `l`; this overwrites those columns of `df`.
outlier_removal(l)

In [80]:
# Same box plots as before, redrawn after the outlier capping step.
# The panels occupy the first three slots of a 3x4 subplot grid.
plt.figure(figsize=(15, 8))
capped_columns = ['age', 'bmi', 'avg_glucose_level']
for slot, column in enumerate(capped_columns, start=1):
    plt.subplot(3, 4, slot)
    sns.boxplot(y=df[column])
    plt.title(f"{column}")
plt.tight_layout()
plt.show()

In [81]:
from sklearn.preprocessing import LabelEncoder

# Replace every object-dtype column of df with integer codes, keeping one
# fitted LabelEncoder per column so the mapping can be reused later.
label_encoders = {}
object_columns = list(df.select_dtypes(include=['object']).columns)
for column_name in object_columns:
    encoder = LabelEncoder()
    label_encoders[column_name] = encoder
    df[column_name] = encoder.fit_transform(df[column_name])
df

In [82]:
# Display the dict of fitted encoders, keyed by column name.
label_encoders

In [83]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import NearMiss

In [84]:
# Feature matrix: every column of df except the target.
# NOTE(review): only 'stroke' is dropped, so the 'id' column (listed in
# df_numerical) stays in as a model feature - confirm this is intended.
x = df.drop(columns=['stroke'])
x

In [85]:
# Target vector: the 'stroke' column of df.
target_column = 'stroke'
y = df[target_column]
y

In [86]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [87]:
nm=NearMiss(n_neighbors=5,sampling_strategy='majority',version=1)
x_resampled,y_resampled=nm.fit_resample(x_scaled,y)

In [88]:
xtrain,xtest,ytrain,ytest = train_test_split(x_resampled, y_resampled, test_size=0.1, random_state=3)
print(xtrain.shape)
print(xtest.shape)

In [89]:
# Candidate estimators, each paired with its constructor keyword arguments.
models = [
    (RandomForestClassifier, dict(n_estimators=100, random_state=42)),
    (DecisionTreeClassifier, dict(random_state=42)),
    (GradientBoostingClassifier, dict(random_state=42)),
    (AdaBoostClassifier, dict(n_estimators=50, random_state=42)),
    (xgb.XGBClassifier, dict(n_estimators=100, random_state=42)),
    (LogisticRegression, dict(max_iter=1000, random_state=42)),
    (SVC, dict(kernel="linear")),
    (KNeighborsClassifier, dict(n_neighbors=5)),
    (GaussianNB, dict()),
]

# Fit each candidate on the training split and record its accuracy on the
# test split, keyed by the estimator's class name.
results = {}
for estimator_cls, estimator_kwargs in models:
    model = estimator_cls(**estimator_kwargs)
    model.fit(xtrain, ytrain)
    results[estimator_cls.__name__] = accuracy_score(ytest, model.predict(xtest))

for model_name, accuracy in results.items():
    print(f"{model_name}: Accuracy = {accuracy}")

In [90]:
# Final model: AdaBoost with more estimators and a lower learning rate than
# the candidate above. random_state is fixed (42, as in the candidate list)
# so that re-running the notebook produces the same fitted model that gets
# pickled below.
model = AdaBoostClassifier(n_estimators=300, learning_rate=0.2, random_state=42)
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)
# Share of matching predictions, reported as a percentage.
print("Test prediction accuracy =",np.mean(ypred == ytest)*100)
print(classification_report(ytest,ypred))

In [91]:
import pickle

# Bundle the fitted classifier together with the scaler it was trained
# behind, so both can be serialised as one object. The fitted
# `label_encoders` dict is serialised separately; the former
# `label_encoders = label_encoders` self-assignment did nothing and was dropped.
trainedstrokemodel = {
    'model': model,
    'scaler': scaler,
}

In [92]:
from pathlib import Path

# Write the artefacts to ../pkl_dump. The path is built with pathlib because
# the original backslash-separated string is only a directory path on
# Windows; elsewhere it would name a single file in the working directory.
dump_dir = Path("..") / "pkl_dump"
dump_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

with open(dump_dir / "trainedstrokemodel.pkl", "wb") as file:
    pickle.dump(trainedstrokemodel, file)
with open(dump_dir / "trainedlabelencoders.pkl", "wb") as file:
    pickle.dump(label_encoders, file)