In [None]:
#--------------------------------- Dataset manipulation ---------------------------------#
import pandas as pd
import numpy as np

#------------------------------------- Graphics -----------------------------------------#
import matplotlib.pyplot as plt
import seaborn as sns

#--------------------------------- Features balancing -----------------------------------#
from imblearn.over_sampling import SMOTE


#---------------------------------- Machine learning ------------------------------------#
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

### Dataset initial information

In [None]:
# Read the stroke-prediction CSV into the working frame and preview its first rows.
csv_path = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Column dtypes and non-null counts; used here to spot features with missing values.
df.info()

This initial summary shows that the "bmi" feature has some missing values. To decide how to fill them, let's look at some other features and see whether they are correlated with "bmi".

###  Continuous variables

In [None]:
# Distribution (histogram + KDE) of each continuous feature, one panel per feature.
continuous = ["age", "bmi", "avg_glucose_level"]

fig, axes = plt.subplots(1, 3, figsize = (12, 7))

# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is its
# documented replacement and draws the same kind of density histogram.
for col, ax in zip(continuous, axes):
    sns.histplot(x = df[col], kde = True, stat = "density", ax = ax)
    ax.set_xlabel(col)

### Correlation between continuous variables

In [None]:
# Pearson correlation matrix of the continuous features, drawn as an annotated heatmap.
plt.figure(figsize = (12, 7))

correlation = df[continuous].corr()

# The matrix is already the first (positional) `data` argument of `heatmap`;
# passing `data=` again as a keyword raised
# "TypeError: heatmap() got multiple values for argument 'data'".
# NOTE: vmin = 0.3 clips the colour scale, so every coefficient below 0.3 gets
# the same colour; the annotations still show the exact values.
sns.heatmap(correlation, cmap='viridis', annot=True, vmax = 1.0,
            vmin = 0.3, linewidths=.5)

This matrix shows that the linear correlations among the continuous variables are not very strong. However, there may still be a non-linear relationship between them.

In [None]:
# Pairwise scatter plots of the continuous features, coloured by stroke outcome.
continuous = ["age", "bmi", "avg_glucose_level"]
fig, ax = plt.subplots(1, 3, figsize = (17, 7))

# (x column, y column) for each of the three panels, left to right.
scatter_pairs = [
    ("age", "bmi"),
    ("age", "avg_glucose_level"),
    ("avg_glucose_level", "bmi"),
]

for panel, (x_col, y_col) in zip(fig.axes, scatter_pairs):
    sns.scatterplot(x = df[x_col], y = df[y_col], ax = panel, data = df, hue = "stroke")

### Correlations between bmi and categorical variables

In [None]:
# Mean "bmi" per class of every categorical column.
#
# Columns are selected by name. The previous version walked `df.columns` and a
# hand-written list in lockstep (with a "_" sentinel), which only worked for one
# specific column order, and it overwrote the global `continuous` list that the
# plotting cells rely on.
not_categorical = {"id", "age", "avg_glucose_level", "bmi"}
categorical = [col for col in df.columns if col not in not_categorical]

for cat in categorical:
    print("#---------------------", cat, "-----------------------#")
    print(df.groupby([cat])["bmi"].mean())
    print("")
            

### Correlations between age and categorical variables

In [None]:
# Mean "age" per class of every categorical column.
#
# Columns are selected by name. The previous version walked `df.columns` and a
# hand-written list in lockstep (with a "_" sentinel), which only worked for one
# specific column order, and it overwrote the global `continuous` list that the
# plotting cells rely on.
not_categorical = {"id", "age", "avg_glucose_level", "bmi"}
categorical = [col for col in df.columns if col not in not_categorical]

for cat in categorical:
    print("#---------------------", cat, "-----------------------#")
    print(df.groupby([cat])["age"].mean())
    print("")

An interesting point in these results is a possible relationship between "work_type" and "bmi". People classified as 'children' have an average age of 6.8 years and a low 'bmi', which is expected because children have a low body mass index. For the 'Never_worked' class the average age is about 16 years, which suggests a considerable number of teenagers. This group should also have a low 'bmi', although higher than that of the 'children' group. The remaining three "work_type" groups seem to consist mostly of adults, so they should have roughly the same 'bmi' as each other, and a higher one than 'children' and 'Never_worked'.

### Filling missing values on "bmi"

In [None]:
# Fill missing "bmi" values with one representative value per work_type group.
# NOTE(review): the three constants are presumably the rounded group means
# printed by the bmi-by-category cell — confirm if the dataset changes.
BMI_FILL_CHILDREN = 20.03
BMI_FILL_NEVER_WORKED = 25.54
BMI_FILL_DEFAULT = 30.30

filter_children = df["work_type"] == "children"
filter_teenager = df["work_type"] == "Never_worked"
bmi_missing = df["bmi"].isna()

# Assign through a single .loc call. The chained form `df["bmi"][mask] = ...`
# writes to a temporary object: it emits SettingWithCopyWarning and does not
# modify `df` at all when pandas copy-on-write is enabled.
df.loc[filter_children & bmi_missing, "bmi"] = BMI_FILL_CHILDREN
df.loc[filter_teenager & bmi_missing, "bmi"] = BMI_FILL_NEVER_WORKED
df["bmi"] = df["bmi"].fillna(BMI_FILL_DEFAULT)
print("Total de valores missing em 'bmi': ", df["bmi"].isnull().sum())

### Checking "Unknown's" amount in 'gender' and 'smoking_status'

In [None]:
# Class frequencies of the two features that contain placeholder categories.
gender_counts = df["gender"].value_counts()
smoking_counts = df["smoking_status"].value_counts()

# One print call; the empty string yields the blank line between the two reports.
print(
    "#------------------------- Gender ------------------------------#",
    gender_counts,
    "",
    "#------------------------- Smoking ------------------------------#",
    smoking_counts,
    sep = "\n",
)

In [None]:
# Bar chart of how many rows fall into each "smoking_status" class.
fig, ax = plt.subplots(figsize = (12, 7))
sns.countplot(data = df, x = "smoking_status", ax = ax)

The 'gender' situation is easier than the 'smoking_status' one, because only one case is classified as 'Other'. In 'smoking_status' there are 1544 cases classified as 'Unknown', which corresponds to 30.21% of the entire feature.

For "smoking_status", a possible solution would be to replace the "Unknown" values with the "never smoked" status. From a statistical point of view, "never smoked" is the most frequent class in the dataset. The practical justification for this choice is that people are increasingly aware of the dangers of smoking, so many avoid it. For now, however, let's keep the "Unknown" class as it is.

### Correlation between stroke cases and continuous variables

In [None]:
# Mean of each continuous feature per stroke class; error bars show the
# standard deviation (ci = "sd").
features_num = ["age", "bmi", "avg_glucose_level"]

fig, ax = plt.subplots(1, 3, figsize = (20, 12))

for panel, feature in zip(fig.axes, features_num):
    g = sns.barplot(data = df, y = feature, x = "stroke", ci = "sd",
                    capsize=.2, ax = panel)
    g.set_title(feature, fontsize = 16)

In [None]:
# Box plot of each continuous feature split by stroke class.
# Relies on `features_num` defined in the bar-plot cell.
fig, ax = plt.subplots(1, 3, figsize = (20, 12))

for panel, feature in zip(fig.axes, features_num):
    g = sns.boxplot(data = df, y = feature, x = "stroke", ax = panel)
    g.set_title(feature, fontsize = 16)

Up to this point we can see that the "age" variable is strongly correlated with the stroke classification, so we must pay special attention to it. We can filter out implausible rows by combining "age" with other features. For example, it is rather unlikely to find a child under 15 years old who smokes or formerly smoked. Likewise, a 'bmi' above 35 for a 15-year-old is also hard to find.

In [None]:
#---------------------------------------- Filtering "age" feature ----------------------------------#
# Rows to discard: anyone aged 15 or younger who smokes / formerly smoked or
# has a bmi of 35 or more, plus every row whose gender is "Other".
is_young         = df["age"] <= 15
smokes_or_smoked = df["smoking_status"].isin(["smokes", "formerly smoked"])
has_high_bmi     = df["bmi"] >= 35
is_gender_other  = df["gender"] == "Other"

implausible = (is_young & (smokes_or_smoked | has_high_bmi)) | is_gender_other

new_df = df.drop(index = df.index[implausible])
new_df.info()



### Correlation between stroke cases and categorical variables

In [None]:
# Count of stroke / no-stroke rows for every class of each categorical feature.
cat_var = ["gender", "hypertension", "heart_disease", "ever_married", "work_type",
           "Residence_type","smoking_status"]

fig, ax = plt.subplots(2, 4, figsize = (17, 12))

# The grid has eight panels for seven features; zip stops after the seventh,
# so the last panel is left empty.
for panel, feature in zip(fig.axes, cat_var):
    sns.countplot(x = feature, hue = "stroke", data = new_df, ax = panel)

A countplot is one way to look for possible correlations between variables, but I don't think it is an easy one to read. Another way to check for correlation is to look at the frequency of stroke and no-stroke cases within each class of a feature.

In [None]:
#----------------------- Encoding hypertension and heart_disease features ------------------------#
# Readable class names, so every class gets a unique column label in `df_prop`.
new_df["New_hypertension"] = new_df["hypertension"].map({0: "no_hyper", 1: "hyper"})
new_df["New_heart_disease"] = new_df["heart_disease"].map({0: "no_disease", 1: "disease"})
new_df["New_ever_married"] = new_df["ever_married"].map({"Yes": "married", "No": "no_married"})

#------------------------------------- List of categorical features ----------------------------------------#
cat_var = ["gender", "New_hypertension", "New_heart_disease", "New_ever_married", "work_type",
           "Residence_type","smoking_status"]

#--------------------- Extract proportion of stroke cases in each feature's classification ------------------#
# For every class: [share of no-stroke rows, share of stroke rows].
#
# The shares come from a row-normalised contingency table that is looked up BY
# CLASS LABEL. The previous version indexed `value_counts()` by position while
# iterating `unique()`: the former is ordered by frequency and the latter by
# first appearance, so proportions were attached to the wrong class and the
# totals mixed counts of different classes. It also built its masks from `df`
# instead of `new_df`.
dict_prop = {}
for var in cat_var:
    prop_table = (
        pd.crosstab(new_df[var], new_df["stroke"], normalize = "index")
        # Guarantee both outcome columns exist, in [no stroke, stroke] order.
        .reindex(columns = [0, 1], fill_value = 0.0)
    )

    # Iterate in order of first appearance to keep the column order of `df_prop`.
    for classif in new_df[var].dropna().unique():
        dict_prop[classif] = prop_table.loc[classif].tolist()

#------------------------- Converting dictionary of proportions to dataframe ------------------------------#
# One column per class; row 0 = no-stroke share, row 1 = stroke share.
df_prop = pd.DataFrame(dict_prop)
df_prop.head()

In [None]:
#------------------------------------- Parameters of Pie plot --------------------------------------------#
# One pie per column of `df_prop`, laid out on an m-column grid. The number of
# rows is derived from the data instead of being hard-coded, so the cell keeps
# working if the number of classes changes.
m = 2
var = df_prop.columns
n = max(1, (len(var) + m - 1) // m)   # ceil(len(var) / m), at least one row
fig , ax = plt.subplots(n, m, figsize = (15, 25), squeeze = False)
explode = (0, 0.1)


#------------------------------------------ Iterative plot -----------------------------------------------#
# `ax.flat` walks the grid row by row, left to right.
for cont, panel in enumerate(ax.flat):

    if cont < len(var):
        panel.pie(df_prop[var[cont]], labels = ["No_stroke", "Stroke"], explode=explode,
                  autopct='%1.1f%%', shadow=True, startangle=60)

        panel.axis('equal')
        panel.set_title(var[cont], loc = "left", fontsize = 16)

    else:
        # Spare panel (odd number of classes): hide it rather than show an empty frame.
        panel.axis('off')

Now it is easy to see which features are correlated with stroke cases. For example, the features "hypertension" and "heart_disease" are correlated with stroke: the proportions of "stroke" and "no stroke" cases differ between the classes of these features. On the other hand, the feature "gender" doesn't seem to make much difference to stroke cases, since the frequencies in both classes are almost the same.

### Proportion of stroke and not stroke cases

In [None]:
#---------------------------------------- Parameters of Pie plot ---------------------------------------------#
# value_counts() orders classes by frequency, so the labels below assume that
# class 0 ("No") is the majority class.
count_stroke = new_df["stroke"].value_counts()
labels = ["No", "Yes"]
explode = (0, 0.13)
text = {"color": 'w', "fontsize": 16}

fig, ax = plt.subplots(figsize = (12, 7))


#---------------------------------------------- Pie plot -----------------------------------------------------#
ax.pie(count_stroke,
       labels = labels,
       explode = explode,
       autopct='%1.1f%%',
       shadow=True,
       startangle = 0,
       textprops = text)


#------------------------------------------- Pie plot legend -------------------------------------------------#
ax.legend(labels,
          title="Classification",
          loc="upper right",
          prop={'size': 18},
          bbox_to_anchor=(0.7, 0., 0.75, 0.7))

ax.set_title("Frequency of stroke cases", fontsize = 18)

It is very clear that the stroke target is imbalanced. This can be a problem when building a model, so we have to balance the target.

In [None]:
#--------------------------------- Features to be encoded ---------------------------------#
columns = ["Residence_type", "gender", "hypertension", "heart_disease",
           "ever_married","work_type", "smoking_status"]


#-------------------------------- Features to be eliminated -------------------------------#
# The target, the row identifier and the helper columns added for the pie charts.
eliminate = ["stroke", "id", "New_hypertension",
             "New_heart_disease", "New_ever_married"]


#-----------------------------------Target and Dataframe ----------------------------------#
# NOTE: `new_df` is rebound without "stroke", so this cell cannot be re-run
# without first re-running the cells that build `new_df`.
target = new_df["stroke"]
new_df = new_df.drop(columns = eliminate)


#---------------------------------------- Encoder -----------------------------------------#
# Replace each category by an integer code (one code per distinct value).
le = preprocessing.OrdinalEncoder()
new_df[columns] = le.fit_transform(new_df[columns])


#------------------------------------Train and test data ----------------------------------#
x_train, x_test, y_train, y_test = train_test_split(new_df, target,
                                                    test_size = 0.4, random_state = 0)


#------------------------------------ Target balancing ------------------------------------#
# Oversample the training split only, so the test split keeps the real class
# ratio. SMOTE is seeded like every other stochastic step in this notebook;
# without a seed the synthetic samples, and every score below, change per run.
oversample = SMOTE(random_state = 0)
x, y = oversample.fit_resample(x_train, y_train)

plt.figure(figsize = (12, 7))
sns.countplot(x = y)

### Machine learning models 

In [None]:
def model(x_train, x_test, y_train, y_test):
    """Fit six classifiers on the training data and score them on the test data.

    The classifiers are, in this order: random forest, decision tree, Gaussian
    naive Bayes, support vector machine, gradient boosting and ridge classifier.
    Every estimator that accepts a seed is built with ``random_state = 0``.

    Parameters
    ----------
    x_train, y_train
        Features and target used to fit every classifier.
    x_test, y_test
        Features and target used to evaluate every classifier.

    Returns
    -------
    accuracy : list
        One ``accuracy_score`` per classifier, in the order listed above.
    matrix : list
        One ``confusion_matrix`` per classifier, in the same order.
    """
    estimators = [
        RandomForestClassifier(random_state = 0),
        DecisionTreeClassifier(random_state = 0),
        GaussianNB(),
        SVC(random_state = 0),
        GradientBoostingClassifier(random_state = 0),
        RidgeClassifier(random_state = 0),
    ]

    #---------------------------------- Fit models --------------------------------#
    fitted = [estimator.fit(x_train, y_train) for estimator in estimators]

    #--------------------------------- Predictions --------------------------------#
    predictions = [clf.predict(x_test) for clf in fitted]

    #-------------------------------- Accuracy ------------------------------------#
    accuracy = [accuracy_score(y_test, predicted) for predicted in predictions]

    #---------------------------- Confusion matrix --------------------------------#
    matrix = [confusion_matrix(y_test, predicted) for predicted in predictions]

    return accuracy, matrix


#----------------------------- Applying 'model' function --------------------------#
# Fit on the SMOTE-balanced training set (x, y) and evaluate on the test split
# (x_test, y_test), which is not resampled.
accuracy, matrix = model(x, x_test, y, y_test)   

### Accuracy of each predictive model

In [None]:
def plot(acc, labels):
    """Draw a horizontal bar chart with one accuracy bar per model.

    Parameters
    ----------
    acc : sequence of float
        Accuracy values, in the same order as ``labels``.
    labels : sequence of str
        Model names, used as the y-axis tick labels.
    """
    fig, ax = plt.subplots(figsize = (12, 7))
    x = np.arange(len(labels))

    # Plot the `acc` argument. The previous version read the notebook-level
    # `accuracy` variable here and silently ignored what the caller passed in.
    ax.barh(x, acc)
    ax.set_yticks(x)
    ax.set_yticklabels(labels, fontsize = 16)
    ax.set_xlabel("Accuracy", fontsize = 16)

    
    
    
    
#------------------------------ Applying 'plot' function ----------------------------#
# Display names in the same order as the lists returned by `model`
# (RFC, DTC, GNB, SVM, GBC, RC). `labels` is reused as panel titles by the
# confusion-matrix cell.
labels = ('Random Forest', 'Decision Tree', 'Naive Bayes', 
          'Support Vector Machine', "Gradient Boosting",
          "Ridge Regression") 

plot(accuracy, labels)

### Confusion matrix for each predictive model

In [None]:
# One annotated heatmap per confusion matrix, titled with the matching model name.
fig, ax = plt.subplots(2, 3, figsize = (12, 7))

# zip stops at the shorter sequence, so panels without a matrix stay empty.
for idx, (panel, cm) in enumerate(zip(fig.axes, matrix)):
    g = sns.heatmap(cm, annot=True, fmt='', ax = panel)
    g.set_title(labels[idx])

Here we have an interesting point. Accuracy suggests that 'Random Forest' and 'Gradient Boosting' are the best models. However, looking at the confusion matrices, we can see that these models are good at classifying no_stroke cases but very bad at classifying stroke cases. From a practical point of view, a model that is good at detecting stroke cases is much more valuable than one that is good at detecting no_stroke cases, because stroke is a terrible disease. So "Naive Bayes", "Ridge" and "SVM" are the more suitable models.