In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier


import os
# Print the full path of every file found under the Kaggle input directory.
for pasta, _, arquivos in os.walk('/kaggle/input'):
    for arquivo in arquivos:
        caminho = os.path.join(pasta, arquivo)
        print(caminho)

In [None]:
# Load the stroke dataset from the Kaggle input folder and preview the first rows.
caminho_dados = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(caminho_dados)
df.head()

In [None]:
# The "id" column is discarded before any analysis.
df = df.drop("id", axis=1)

In [None]:
# Summary statistics for the columns pandas treats as numeric.
df.describe()

Verificando os dados faltantes

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Distinct values present in the gender column.
df.gender.unique()

In [None]:
# Remove, in place, every row whose gender is "Other", then list what remains.
df.drop(index=df.index[df["gender"] == "Other"], inplace=True)
df.gender.unique()

In [None]:
# Distinct values present in the work_type column.
df.work_type.unique()

In [None]:
# Display the bmi column.
df.bmi

In [None]:
# Display the avg_glucose_level column.
df.avg_glucose_level

In [None]:
# Count of missing values per column.
df.isnull().sum()

Substituindo os dados faltantes de bmi pela média

In [None]:
# Fill missing values with the mean of their own column. The frame still holds
# text columns at this point, so the mean is restricted to numeric columns:
# without numeric_only=True, pandas >= 2.0 raises a TypeError when it tries to
# average the string columns.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Re-count missing values per column after the fill.
df.isnull().sum()

# Visualização

In [None]:
# Age distribution: one panel per gender, bars split side by side by stroke.
sns.displot(data=df, x="age", hue="stroke", col="gender", multiple="dodge")

In [None]:
# Smoking-status counts: one panel per gender, bars split side by side by stroke.
sns.displot(data=df, x="smoking_status", col="gender", hue="stroke", multiple="dodge")


In [None]:
# BMI distribution: one panel per gender, bars split side by side by stroke.
sns.displot(data=df, x="bmi", col="gender", hue="stroke", multiple="dodge")

In [None]:
# Average glucose distribution: one panel per gender, bars split side by side by stroke.
sns.displot(data=df, x="avg_glucose_level", col="gender", hue="stroke", multiple="dodge")

In [None]:
# Residence-type counts: one panel per gender, bars split side by side by stroke.
sns.displot(data=df, x="Residence_type", col="gender", hue="stroke", multiple="dodge")

* id = remover
* age = classificar em categorias com intervalos de 20 anos
* avg_glucose_level = classificar em categorias pelo box plot
* bmi = classificar em categorias pelo box plot
* work_type = one-hot encoding
* smoking_status = one-hot encoding
* ever_married = binário
* Residence_type = binário

In [None]:
def classificadorIdade(x):
    """Map an age to the integer code of its 20-year band.

    Codes: 1 for under 20, 2 for 20-39, 3 for 40-59, 4 for 60-79 and
    5 for 80 up to and including 100. Any value that satisfies none of
    these comparisons (above 100, or NaN) yields None.
    """
    # (exclusive upper bound, band code), checked in ascending order.
    faixas = ((20, 1), (40, 2), (60, 3), (80, 4))
    for limite, codigo in faixas:
        if x < limite:
            return codigo
    # The last band is closed at 100; anything else falls through to None.
    return 5 if x <= 100 else None

In [None]:
# Replace every age with the code returned by classificadorIdade.
df['age'] = df['age'].apply(classificadorIdade)

In [None]:
# Distinct age codes present after the classification.
df.age.unique()

In [None]:
# Summary statistics of bmi.
# NOTE(review): presumably the source of the cut points hard-coded in classificadorBMI — confirm.
df.bmi.describe()

In [None]:
# Summary statistics of avg_glucose_level.
# NOTE(review): presumably the source of the cut points hard-coded in classificadorAVG — confirm.
df.avg_glucose_level.describe()

In [None]:
def classificadorAVG(x):
    """Return the bucket label for an average-glucose value.

    The three hard-coded cut points are inclusive upper bounds, checked in
    ascending order. A value that satisfies none of them (larger than the
    last cut point, or NaN) gets the final label.
    """
    limites = (
        (77.245, '<= 25%'),
        (91.885, 'entre 26% a 50%'),
        (114.09, 'entre 50% a 75%'),
    )
    for teto, rotulo in limites:
        if x <= teto:
            return rotulo
    return '>= 76%'
    
    
def classificadorBMI(x):
    """Return the bucket label for a BMI value.

    The three hard-coded cut points are inclusive upper bounds, checked in
    ascending order. A value that satisfies none of them (larger than the
    last cut point, or NaN) gets the final label.
    """
    limites = (
        (23.8, '<= 25%'),
        (28.4, 'entre 26% a 50%'),
        (32.8, 'entre 50% a 75%'),
    )
    for teto, rotulo in limites:
        if x <= teto:
            return rotulo
    return '>= 76%'

In [None]:
# Swap the raw glucose and BMI readings for their bucket labels.
df['avg_glucose_level'] = df['avg_glucose_level'].apply(classificadorAVG)
df['bmi'] = df['bmi'].apply(classificadorBMI)

In [None]:
# Show the distinct values now stored in bmi and avg_glucose_level.
print(f'BMI:\n{df.bmi.unique()} \n\navg_glucose_level:\n{df.avg_glucose_level.unique()}')

In [None]:
# Integer-encode gender, ever_married and Residence_type. The single encoder is
# re-fitted on each column in turn, so afterwards it only remembers the classes
# of the last column processed.
labelencoder = LabelEncoder()

for coluna in ("gender", "ever_married", "Residence_type"):
    df[coluna] = labelencoder.fit_transform(df[coluna])


In [None]:
# Build the one-hot frame for the glucose buckets.
df['avg_glucose_level'] = df['avg_glucose_level'].astype('category')
dfavg = pd.get_dummies(df['avg_glucose_level'], prefix='avg_glucose_level_cod')
dfavg

In [None]:
# Build the one-hot frame for the BMI buckets.
df['bmi'] = df['bmi'].astype('category')
dfbmi = pd.get_dummies(df['bmi'], prefix='bmi_cod')
dfbmi

In [None]:
# Build the one-hot frame for work_type.
# The dummies must come from the work_type column itself: building them from
# smoking_status (as before) produced a mislabelled copy of the smoking dummies
# and left work_type out of the feature set entirely.
df['work_type'] = pd.Categorical(df['work_type'])
dfwork = pd.get_dummies(df['work_type'], prefix = 'work_type_cod')
dfwork

In [None]:
# Build the one-hot frame for smoking_status.
df['smoking_status'] = df['smoking_status'].astype('category')
dfsmoke = pd.get_dummies(df['smoking_status'], prefix='smoking_status_cod')
dfsmoke

In [None]:
# Build the one-hot frame for the age codes.
df['age'] = df['age'].astype('category')
dfage = pd.get_dummies(df['age'], prefix='age_cod')
dfage

In [None]:
# Remove the five raw categorical columns from df in a single in-place call.
colunas_brutas = ["bmi", "work_type", "smoking_status", "avg_glucose_level", "age"]
df.drop(columns=colunas_brutas, inplace=True)

In [None]:
# Append every dummy frame to df column-wise in one concat, keeping the same
# left-to-right order: work, glucose, bmi, age, smoking.
df = pd.concat([df, dfwork, dfavg, dfbmi, dfage, dfsmoke], axis=1)
df

# Modelo

In [None]:
# Separate the target column ("stroke") from the predictor columns.
classe = df["stroke"]
previsores = df.drop("stroke", axis=1)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(
    previsores,
    classe,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Random forest with 100 trees. The seed is fixed so the forest, and therefore
# the confusion matrix and accuracy computed from it, is the same on every run;
# without it each run of the notebook reports different numbers.
floresta = RandomForestClassifier(n_estimators = 100, random_state = 0)
floresta.fit(X_treinamento, y_treinamento)

In [None]:
# Predict the test rows with the random forest and build its confusion matrix.
previsoes = floresta.predict(X_teste)
confusao = confusion_matrix(y_true=y_teste, y_pred=previsoes)
confusao

In [None]:
# Accuracy of the current predictions on the test set.
taxa_acerto = accuracy_score(y_true=y_teste, y_pred=previsoes)
taxa_acerto

In [None]:
# Gaussian naive Bayes fitted on the training split.
nb = GaussianNB()
nb.fit(X=X_treinamento, y=y_treinamento)

In [None]:
# Predict the test rows with naive Bayes and build its confusion matrix.
previsoes = nb.predict(X_teste)
confusao = confusion_matrix(y_true=y_teste, y_pred=previsoes)
confusao

In [None]:
# Accuracy of the current predictions on the test set.
taxa_acerto = accuracy_score(y_true=y_teste, y_pred=previsoes)
taxa_acerto

In [None]:
# k-nearest-neighbours classifier (k = 3) fitted on the training split.
neighbors = KNeighborsClassifier(n_neighbors=3)
neighbors.fit(X=X_treinamento, y=y_treinamento)

In [None]:
# Predict the test rows with k-NN and build its confusion matrix.
previsoes = neighbors.predict(X_teste)
confusao = confusion_matrix(y_true=y_teste, y_pred=previsoes)
confusao

In [None]:
# Accuracy of the current predictions on the test set.
taxa_acerto = accuracy_score(y_true=y_teste, y_pred=previsoes)
taxa_acerto

In [None]:
# Decision tree (seeded) fitted on the training split.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(X=X_treinamento, y=y_treinamento)

In [None]:
# Predict the test rows with the decision tree and build its confusion matrix.
previsoes = tree.predict(X_teste)
confusao = confusion_matrix(y_true=y_teste, y_pred=previsoes)
confusao

In [None]:
# Accuracy of the current predictions on the test set.
taxa_acerto = accuracy_score(y_true=y_teste, y_pred=previsoes)
taxa_acerto