# 1. Data Preprocessing

In [None]:
import os

# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

### i) All Data

In [None]:
# Read the stroke dataset into `data` and print its column/dtype summary.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv',
)
data.info()

In [None]:
# Replace missing bmi values with the column mean.
# NOTE(review): the mean is taken over the whole dataset, i.e. before the
# train/test split further down — confirm this is acceptable.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

# Integer-encode every object-typed column in place.
le = LabelEncoder()
object_col = list(data.columns[data.dtypes == 'object'])
for col in object_col:
    # fit_transform refits the shared encoder each time, so after the loop
    # `le` only holds the classes of the last encoded column.
    data[col] = le.fit_transform(data[col])
data.info()

In [None]:
# Medical subset: an independent copy of `data` without the three
# non-medical columns.
med_data = data.drop(columns=['ever_married', 'work_type', 'Residence_type'])

### ii) Medical Data

In [None]:
# Show summary statistics of the medical subset (`data` without
# ever_married, work_type and Residence_type) as the cell output.
med_data.describe()

In [None]:
# Non-medical subset: an independent copy of `data` with the medical
# columns removed.
medical_columns = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
]
nonm_data = data.drop(columns=medical_columns)

### iii) Non-Medical Data

In [None]:
# Show summary statistics of the non-medical subset (`data` with the
# medical columns removed) as the cell output.
nonm_data.describe()

## Train/Test Dataset

### i) All Data

In [None]:
# Features are every column except the last one; the last column is the target.
X = data.iloc[:, :-1]
# Take the target as a 1-D Series. A one-column DataFrame (iloc[:, -1:]) is a
# column vector and makes scikit-learn estimators emit DataConversionWarning.
y = data.iloc[:, -1]
# Hold out 42% of the rows; the fixed seed keeps the split reproducible.
x_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.42, random_state=42)

# Remove the 'id' column from both feature sets before modelling.
x_train = x_train.drop(columns=['id'])
x_test = X_test.drop(columns=['id'])
# Row labels shared by all result frames.
indexes = ['accuracy', 'f1_score']

### ii) Medical Data

In [None]:
# Features are every column of the medical subset except the last one;
# the last column is the target.
mX = med_data.iloc[:, :-1]
# Take the target as a 1-D Series. A one-column DataFrame (iloc[:, -1:]) is a
# column vector and makes scikit-learn estimators emit DataConversionWarning.
my = med_data.iloc[:, -1]
# Same test fraction and seed as the other splits in this notebook.
mx_train, mX_test, my_train, my_test = train_test_split(mX, my, test_size=0.42, random_state=42)

# Remove the 'id' column from both feature sets before modelling.
mx_train = mx_train.drop(columns=['id'])
mx_test = mX_test.drop(columns=['id'])
# Row labels shared by all result frames.
indexes = ['accuracy', 'f1_score']

### iii) Non-Medical Data

In [None]:
# Features are every column of the non-medical subset except the last one;
# the last column is the target.
nmX = nonm_data.iloc[:, :-1]
# Take the target as a 1-D Series. A one-column DataFrame (iloc[:, -1:]) is a
# column vector and makes scikit-learn estimators emit DataConversionWarning.
nmy = nonm_data.iloc[:, -1]
# Same test fraction and seed as the other splits in this notebook.
nmx_train, nmX_test, nmy_train, nmy_test = train_test_split(nmX, nmy, test_size=0.42, random_state=42)

# Remove the 'id' column from both feature sets before modelling.
nmx_train = nmx_train.drop(columns=['id'])
nmx_test = nmX_test.drop(columns=['id'])
# Row labels shared by all result frames.
indexes = ['accuracy', 'f1_score']

## 2. Classification

## 2-1. Multinomial Naive Bayes

### i) All Data

In [None]:
# Multinomial Naive Bayes trained and scored on the all-features split.
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
mnb_pred = mnb.predict(x_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
mnb_acc = accuracy_score(y_test, mnb_pred)
mnb_f1 = f1_score(y_test, mnb_pred)
# One-column result frame; the rows are labelled by `indexes`.
mnb_frame = pd.DataFrame({'All Data': [mnb_acc, mnb_f1]}, index=indexes)

### ii) Medical Data

In [None]:
# Multinomial Naive Bayes trained and scored on the medical-features split.
mnb_m = MultinomialNB()
mnb_m.fit(mx_train, my_train)
mnb_m_pred = mnb_m.predict(mx_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
mnb_m_acc = accuracy_score(my_test, mnb_m_pred)
mnb_m_f1 = f1_score(my_test, mnb_m_pred)
# One-column result frame; the rows are labelled by `indexes`.
mnb_m_frame = pd.DataFrame({'Medical Data': [mnb_m_acc, mnb_m_f1]}, index=indexes)

### iii) Non-Medical Data

In [None]:
# Multinomial Naive Bayes trained and scored on the non-medical-features split.
mnb_nm = MultinomialNB()
mnb_nm.fit(nmx_train, nmy_train)
mnb_nm_pred = mnb_nm.predict(nmx_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
mnb_nm_acc = accuracy_score(nmy_test, mnb_nm_pred)
mnb_nm_f1 = f1_score(nmy_test, mnb_nm_pred)
# One-column result frame; the rows are labelled by `indexes`.
mnb_nm_frame = pd.DataFrame({'Non-Medical Data': [mnb_nm_acc, mnb_nm_f1]}, index=indexes)

## 2-2. KNN

### i) All Data

In [None]:
# k-nearest-neighbours (default settings) trained and scored on the
# all-features split.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
knn_acc = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
# One-column result frame; the rows are labelled by `indexes`.
knn_frame = pd.DataFrame({'All Data': [knn_acc, knn_f1]}, index=indexes)

### ii) Medical Data

In [None]:
# k-nearest-neighbours (default settings) trained and scored on the
# medical-features split.
knn_m = KNeighborsClassifier()
knn_m.fit(mx_train, my_train)
knn_m_pred = knn_m.predict(mx_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
knn_m_acc = accuracy_score(my_test, knn_m_pred)
knn_m_f1 = f1_score(my_test, knn_m_pred)
# One-column result frame; the rows are labelled by `indexes`.
knn_m_frame = pd.DataFrame({'Medical Data': [knn_m_acc, knn_m_f1]}, index=indexes)


### iii) Non-Medical Data

In [None]:


# k-nearest-neighbours (default settings) trained and scored on the
# non-medical-features split.
knn_nm = KNeighborsClassifier()
knn_nm.fit(nmx_train, nmy_train)
knn_nm_pred = knn_nm.predict(nmx_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
knn_nm_acc = accuracy_score(nmy_test, knn_nm_pred)
knn_nm_f1 = f1_score(nmy_test, knn_nm_pred)
# One-column result frame; the rows are labelled by `indexes`.
knn_nm_frame = pd.DataFrame({'Non-Medical Data': [knn_nm_acc, knn_nm_f1]}, index=indexes)


## 2-3. Decision Tree


### i) All Data

In [None]:
# Decision tree trained and scored on the all-features split.
# The seed makes the fitted tree, and therefore the reported scores,
# reproducible between runs; it matches the seed used for the splits.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
dt_pred = dt.predict(x_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
dt_acc = accuracy_score(y_test, dt_pred)
dt_f1 = f1_score(y_test, dt_pred)
# One-column result frame; the rows are labelled by `indexes`.
dt_frame = pd.DataFrame({'All Data': [dt_acc, dt_f1]}, index=indexes)



### ii) Medical Data

In [None]:
# Decision tree trained and scored on the medical-features split.
# The seed makes the fitted tree, and therefore the reported scores,
# reproducible between runs; it matches the seed used for the splits.
dt_m = DecisionTreeClassifier(random_state=42)
dt_m.fit(mx_train, my_train)
dt_m_pred = dt_m.predict(mx_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
dt_m_acc = accuracy_score(my_test, dt_m_pred)
dt_m_f1 = f1_score(my_test, dt_m_pred)
# One-column result frame; the rows are labelled by `indexes`.
dt_m_frame = pd.DataFrame({'Medical Data': [dt_m_acc, dt_m_f1]}, index=indexes)

### iii) Non-Medical Data

In [None]:
# Decision tree trained and scored on the non-medical-features split.
# The seed makes the fitted tree, and therefore the reported scores,
# reproducible between runs; it matches the seed used for the splits.
dt_nm = DecisionTreeClassifier(random_state=42)
dt_nm.fit(nmx_train, nmy_train)
dt_nm_pred = dt_nm.predict(nmx_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
dt_nm_acc = accuracy_score(nmy_test, dt_nm_pred)
dt_nm_f1 = f1_score(nmy_test, dt_nm_pred)
# One-column result frame; the rows are labelled by `indexes`.
dt_nm_frame = pd.DataFrame({'Non-Medical Data': [dt_nm_acc, dt_nm_f1]}, index=indexes)

## 3. Evaluation

In [None]:
# Naive Bayes scores for the three feature sets, one column per set.
print("1. Multinomial Naive Bayes")
pd.concat(
    [mnb_frame, mnb_m_frame, mnb_nm_frame],
    axis='columns',
)

In [None]:
# KNN scores for the three feature sets, one column per set.
print("2. KNN")
pd.concat(
    [knn_frame, knn_m_frame, knn_nm_frame],
    axis='columns',
)

In [None]:
# Decision-tree scores for the three feature sets, one column per set.
print("3. Decision Tree")
pd.concat(
    [dt_frame, dt_m_frame, dt_nm_frame],
    axis='columns',
)

In [None]:
# All-features scores of the three classifiers; `keys` adds the model
# name as an outer column level.
print("4. All Data")
pd.concat(
    [mnb_frame, knn_frame, dt_frame],
    axis='columns',
    keys=['Multinomial NB', 'KNN', 'Decision Tree'],
)

In [None]:
# Medical-features scores of the three classifiers; `keys` adds the model
# name as an outer column level.
print("5. Medical Data")
pd.concat(
    [mnb_m_frame, knn_m_frame, dt_m_frame],
    axis='columns',
    keys=['Multinomial NB', 'KNN', 'Decision Tree'],
)

In [None]:
# Non-medical-features scores of the three classifiers; `keys` adds the
# model name as an outer column level.
print("6. Non-Medical Data")
pd.concat(
    [mnb_nm_frame, knn_nm_frame, dt_nm_frame],
    axis='columns',
    keys=['Multinomial NB', 'KNN', 'Decision Tree'],
)

## Additionally (rows with missing BMI dropped instead of imputed)

In [None]:
# Reload the raw dataset and keep only the rows that have a bmi value.
raw = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
data_d = raw.dropna(subset=["bmi"])
data_d.info()

In [None]:
# Integer-encode every object-typed column of `data_d` in place.
le = LabelEncoder()
object_col = list(data_d.columns[data_d.dtypes == 'object'])
for col in object_col:
    # fit_transform refits the shared encoder each time, so after the loop
    # `le` only holds the classes of the last encoded column.
    encoded = le.fit_transform(data_d[col])
    data_d[col] = encoded
data_d.info()

In [None]:
# Features are every column of `data_d` except the last one; the last
# column is the target. This rebinds X, y and the split variables used by
# the earlier "All Data" cells.
X = data_d.iloc[:, :-1]
# Take the target as a 1-D Series. A one-column DataFrame (iloc[:, -1:]) is a
# column vector and makes scikit-learn estimators emit DataConversionWarning.
y = data_d.iloc[:, -1]
# Same test fraction and seed as the other splits in this notebook.
x_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.42, random_state=42)

# Remove the 'id' column from both feature sets before modelling.
x_train = x_train.drop(columns=['id'])
x_test = X_test.drop(columns=['id'])
# Row labels shared by all result frames.
indexes = ['accuracy', 'f1_score']

In [None]:
# Multinomial Naive Bayes on the current x_train/x_test split. When the
# cells run in order, that split comes from `data_d`; this overwrites the
# earlier `mnb*` results.
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
mnb_pred = mnb.predict(x_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
mnb_acc = accuracy_score(y_test, mnb_pred)
mnb_f1 = f1_score(y_test, mnb_pred)
# One-column result frame; the rows are labelled by `indexes`.
mnb_frame = pd.DataFrame({'All Data': [mnb_acc, mnb_f1]}, index=indexes)

In [None]:
# k-nearest-neighbours on the current x_train/x_test split. When the cells
# run in order, that split comes from `data_d`; this overwrites the earlier
# `knn*` results.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
knn_acc = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
# One-column result frame; the rows are labelled by `indexes`.
knn_frame = pd.DataFrame({'All Data': [knn_acc, knn_f1]}, index=indexes)

In [None]:
# Decision tree on the current x_train/x_test split. When the cells run in
# order, that split comes from `data_d`; this overwrites the earlier `dt*`
# results.
# The seed makes the fitted tree, and therefore the reported scores,
# reproducible between runs; it matches the seed used for the splits.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
dt_pred = dt.predict(x_test)
# scikit-learn metrics are defined as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
dt_acc = accuracy_score(y_test, dt_pred)
dt_f1 = f1_score(y_test, dt_pred)
# One-column result frame; the rows are labelled by `indexes`.
dt_frame = pd.DataFrame({'All Data': [dt_acc, dt_f1]}, index=indexes)

In [None]:
# Scores of the three classifiers held in the current `*_frame` variables;
# `keys` adds the model name as an outer column level.
print("A. All Data")
pd.concat(
    [mnb_frame, knn_frame, dt_frame],
    axis='columns',
    keys=['Multinomial NB', 'KNN', 'Decision Tree'],
)