# Import libraries

In [None]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Model libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Model testing libraries
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Import data

In [None]:
# Collect the full path of every file found under the Kaggle input directory.
# os.walk yields directory entries in arbitrary order, so the result is sorted
# to make positional lookups such as paths[0] reproducible between runs.
paths = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)

In [None]:
# Read the first discovered file into the working DataFrame and preview its
# first five rows.
data = pd.read_csv(filepath_or_buffer=paths[0])
data.head(n=5)

# Data preprocessing

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Show descriptive statistics for the DataFrame's columns.
data.describe()

In [None]:
# Label-encode each categorical column in place and remember the original
# category labels per column, so the integer codes can be interpreted later.
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
classes = {}
lb = LabelEncoder()
for column in categorical_columns:
    data[column] = lb.fit_transform(data[column])
    classes[column] = lb.classes_

# Replace missing BMI values with the column mean. The filled column is
# assigned back instead of calling fillna(inplace=True) on data['bmi']: that
# form is chained assignment on an intermediate Series, which pandas 2.x warns
# about and which leaves the DataFrame unchanged under Copy-on-Write.
mean_value = data['bmi'].mean()
data['bmi'] = data['bmi'].fillna(mean_value)

# Show which original labels each encoded column contains.
for key, value in classes.items():
    print(key, "\t", value)
data.describe()

# Data visualization

In [None]:
# Remove the 'id' column from the DataFrame in place.
data.drop(columns=['id'], inplace=True)

In [None]:
# Render the pairwise column correlations of the DataFrame as a heatmap.
sns.heatmap(data=data.corr())

In [None]:
# Draw one box plot per selected column on a 2x2 grid of axes.
fig, ax = plt.subplots(2, 2, figsize=(8, 8))

# (column, grid position, box colour) for each panel.
boxplot_specs = (
    ('avg_glucose_level', (0, 0), 'purple'),
    ('smoking_status', (0, 1), 'red'),
    ('age', (1, 0), 'orange'),
    ('bmi', (1, 1), 'pink'),
)
for column, (row, col), colour in boxplot_specs:
    sns.boxplot(data=data[column], ax=ax[row, col], color=colour)

plt.show()

# Testing different models

In [None]:
# Split the DataFrame into the 'stroke' target and the remaining predictor
# columns, then standardise the predictors.
# NOTE(review): the scaler is fit on the full dataset here, before the
# train/test split performed later in the notebook, so test rows contribute to
# the scaling statistics.
y = data['stroke']
std = StandardScaler()
X = std.fit_transform(data.drop(columns=['stroke']))

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Parallel result lists that test_model appends to: a model's display name and
# its test accuracy share the same index.
model_name, model_accuracy = [], []
def test_model(model, name):
    """Fit ``model`` on the training split and report its test performance.

    The test accuracy is printed, and ``name`` and the accuracy are appended
    to the module-level ``model_name`` and ``model_accuracy`` lists. A
    confusion matrix for the test split is then plotted and shown.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Returns nothing, so a notebook cell ending in a call to this function
    does not echo a value.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and, once fitted,
            ``classes_``.
        name: Display name recorded for this model in ``model_name``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # accuracy_score is documented as (y_true, y_pred); pass the ground truth
    # first, matching the confusion_matrix call below.
    accuracy = accuracy_score(y_test, y_pred)
    model_name.append(name)
    model_accuracy.append(accuracy)

    # Use the fitted model's own class order for both the matrix rows/columns
    # and the tick labels so they stay aligned.
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    print(accuracy)
    disp.plot()
    plt.show()

# 1. Random Forest Classifier

In [None]:
# Evaluate a 10-tree random forest. The forest is seeded (like the train/test
# split and the decision tree elsewhere in this notebook) so that its reported
# accuracy is repeatable from run to run.
model = RandomForestClassifier(n_estimators=10, random_state=0)
test_model(model, "Random Forest Classifier")

# 2. Support Vector Machine

In [None]:
# Evaluate a support vector classifier with an RBF kernel and C = 1.2.
model = SVC(kernel='rbf', C=1.2)
test_model(model=model, name="Support Vector Machine")

# 3. Decision Tree Classifier

In [None]:
# Evaluate a single decision tree, seeded so the fitted tree is repeatable.
model = DecisionTreeClassifier(random_state=0)
test_model(model=model, name="Decision Tree Classifier")

# 4. K-Nearest Neighbors Classifier

In [None]:
# Evaluate a k-nearest-neighbours classifier that uses 3 neighbours.
model = KNeighborsClassifier(n_neighbors=3)
test_model(model=model, name="KNeighbors Classifier")

# 5. Gaussian Naive Bayes

In [None]:
# Evaluate a Gaussian naive Bayes classifier with its default settings.
model = GaussianNB()
test_model(model=model, name="Gaussian Naive Bayes")

# 6. Logistic Regression

In [None]:
# Evaluate a logistic regression classifier with its default settings.
model = LogisticRegression()
test_model(model=model, name="Logistic Regression")

# Plotting accuracies of different models

In [None]:
# Compare the recorded test accuracies of all evaluated models in one bar
# chart, one bar per entry in model_name / model_accuracy.
fig = plt.figure(figsize=(20, 5))
plt.bar(x=model_name, height=model_accuracy, width=0.2, color='green')

# Both axis labels share the same font size.
for set_axis_label, text in ((plt.xlabel, "Models"), (plt.ylabel, "accuracy")):
    set_axis_label(text, fontsize=15)

plt.show()