# Importing and Looking at The Last 5 Rows of The Dataset

In [None]:
# pandas provides the DataFrame that every later cell works on.
import pandas as pd

# Read the stroke dataset from the Kaggle input directory and preview its last five rows.
df=pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
df.tail()

# Data Cleaning

As we saw above, the "id" column will not be of much help to us, so we can drop it.

In [None]:
# Drop the identifier column. Rebinding `df` (instead of inplace=True) together
# with errors="ignore" keeps this cell re-runnable once "id" is already gone.
df = df.drop(columns="id", errors="ignore")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
df.info()


We can also see that the only column with missing data is the body mass index (`bmi`). Since only a small number of rows are affected, we can drop them.

In [None]:
# Keep only the rows that have a body mass index value.
df = df.dropna(subset=["bmi"])

# Visualizing The Data

In [None]:
# Summary statistics for `df` as it stands after the cleaning cells above.
df.describe()

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Draw a seaborn pair plot of `df` to eyeball pairwise relationships between columns.
sns.pairplot(df)
plt.show()

# Visualizing The Distribution of The Data

In [None]:
# One figure with a 6x2 grid of panels, one panel per column of the dataset.
fig, ax=plt.subplots(nrows=6, ncols=2, figsize=(20, 30))
fig.tight_layout(h_pad=9, w_pad=7)


def count_summary(series, labels_by_value):
    """Return one "label: count" line per entry of `labels_by_value`.

    `labels_by_value` maps a raw value of `series` to the label to display.
    Values that do not occur in `series` are reported with a count of 0
    instead of raising.
    """
    counts=series.value_counts()
    return "\n".join("{}: {}".format(label, counts.get(raw, 0))
                     for raw, label in labels_by_value.items())


def label_panel(panel, title, xlabel=None, ylabel=None):
    """Set the panel title and, when given, its axis labels."""
    panel.set_title(title, fontsize=20)
    if xlabel is not None:
        panel.set_xlabel(xlabel)
    if ylabel is not None:
        panel.set_ylabel(ylabel)


def pie_panel(panel, title, counts, labels, explode=None):
    """Draw `counts` as a pie chart with percentage annotations."""
    panel.set_title(title, fontsize=20)
    panel.pie(x=counts, labels=labels, autopct="%1.1f%%", explode=explode)


# Gender: bar chart, with an empty legend used as a text box for the exact counts.
sns.countplot(x=df.gender, ax=ax[0, 0])
ax[0, 0].legend(title=count_summary(df.gender, {"Female": "Female", "Male": "Male", "Other": "Other"}),
                labels=[])
label_panel(ax[0, 0], "Gender", "Gender", "Count")

# Age: histogram with a kernel density estimate overlaid.
sns.histplot(df.age, ax=ax[0, 1], kde=True)
label_panel(ax[0, 1], "Age", "Age", "Count")
ax[0, 1].legend(labels=[], title="Line: Kernel Density Estimation")

# Hypertension and heart disease are both 0/1 flags and share the same layout.
for panel, column, title in [(ax[1, 0], "hypertension", "Hypertension"),
                             (ax[1, 1], "heart_disease", "Heart Disease")]:
    sns.countplot(x=df[column], ax=panel)
    panel.set_xticks([0, 1])
    panel.set_xticklabels(["No", "Yes"])
    panel.legend(labels=[], title=count_summary(df[column], {0: "No", 1: "Yes"}))
    label_panel(panel, title, "Answer", "Count")

# Ever married.
sns.countplot(x=df.ever_married, ax=ax[2, 0], order=["No", "Yes"])
label_panel(ax[2, 0], "Ever Married?", "Answer", "Count")

# Work type: the display labels are looked up per category, so every wedge gets
# the right name whatever order value_counts() returns the categories in.
work_type_names={"Private": "Private", "Self-employed": "Self Employed", "children": "Children",
                 "Govt_job": "Government Jobs", "Never_worked": "Never Worked"}
work_type_counts=df.work_type.value_counts()
pie_panel(ax[2, 1], "Work Type", work_type_counts,
          [work_type_names.get(raw, raw) for raw in work_type_counts.index],
          explode=len(work_type_counts)*[.03])

# Residence type.
residence_counts=df.Residence_type.value_counts()
pie_panel(ax[3, 0], "Residence Type", residence_counts, residence_counts.index,
          explode=len(residence_counts)*[.03])

# Average glucose level.
sns.histplot(x=df.avg_glucose_level, ax=ax[3, 1], kde=True)
label_panel(ax[3, 1], "Average Glucose Level In Blood", "Level", "Count")

# Body mass index.
sns.histplot(x=df.bmi, kde=True, ax=ax[4, 0])
label_panel(ax[4, 0], "Body Mass Index", "", "Count")

# Smoking status.
sns.countplot(x=df.smoking_status, ax=ax[4, 1])
label_panel(ax[4, 1], "Smoking Status", "", "Count")

# Stroke (target): labels follow the actual 0/1 values rather than an assumed order.
stroke_names={0: "No", 1: "Yes"}
stroke_counts=df.stroke.value_counts()
pie_panel(ax[5, 0], "Stroke", stroke_counts,
          [stroke_names.get(raw, raw) for raw in stroke_counts.index])

# The grid has one panel more than there are columns to plot.
ax[5, 1].set_visible(False)

plt.show()

# Encoding Process

In [None]:
# Show each column's dtype; the next cell encodes the columns whose dtype is "object".
df.dtypes

In [None]:
# Work on a copy so the original frame keeps its readable values.
categorical_df = df.copy()
# column name -> {integer code: original value}, used later to decode results.
dictionary_of_encodes = {}

object_columns = categorical_df.select_dtypes("object").columns
for column in object_columns:
    as_category = categorical_df[column].astype("category")
    # Remember which code stands for which category before replacing the values.
    dictionary_of_encodes[column] = dict(enumerate(as_category.cat.categories))
    categorical_df[column] = as_category.cat.codes

categorical_df.dtypes

In [None]:
# Inspect the spread of the three continuous columns before binning them.
categorical_df[["age", "avg_glucose_level", "bmi"]].describe()

#### Creating Certain Ranges For Numeric Columns

In [None]:
# Bin edges for each continuous column.
bin_edges = {
    "age": range(0, 101, 10),
    "avg_glucose_level": range(55, 301, 20),
    "bmi": range(10, 101, 10),
}

for column, edges in bin_edges.items():
    binned = pd.cut(x=categorical_df[column], bins=list(edges))
    # Save the code -> interval mapping, then store only the integer codes.
    dictionary_of_encodes[column] = dict(enumerate(binned.cat.categories))
    categorical_df[column] = binned.cat.codes

categorical_df[["age", "avg_glucose_level", "bmi"]]

# Adjusting Training Data

After having seen the distribution of the data we know that we have little data on people who had strokes.
 
Instead of oversampling, we will randomly draw the same number of rows from people who had a stroke and from people who did not, and use them for training. The remaining data will then be used for testing. (The test data will consist overwhelmingly of people who did not have a stroke; even so, this process lets us see the precision of several classification algorithms.)

In [None]:
# Number of rows per value of the target column `stroke`.
categorical_df.stroke.value_counts()

In [None]:
#Adjusting Train Data

import numpy as np

# Rows taken from each class for the balanced training set.
TRAIN_ROWS_PER_CLASS=101
# Fixed seed so the split, and every score reported below, is reproducible
# after "Restart & Run All".
SPLIT_SEED=1

rng=np.random.default_rng(SPLIT_SEED)

# Positional row numbers (for .iloc) of each class, in random order.
stroke_flags=categorical_df.stroke.to_numpy()
type1indices=rng.permutation(np.flatnonzero(stroke_flags==1)).tolist()
type0indices=rng.permutation(np.flatnonzero(stroke_flags!=1)).tolist()

# The first TRAIN_ROWS_PER_CLASS rows of each class are used for training;
# everything else is kept for validation.
train_positions=type0indices[:TRAIN_ROWS_PER_CLASS]+type1indices[:TRAIN_ROWS_PER_CLASS]
val_positions=type0indices[TRAIN_ROWS_PER_CLASS:]+type1indices[TRAIN_ROWS_PER_CLASS:]

# The last column is the target; all columns before it are the features.
train_x=categorical_df.iloc[train_positions, 0:-1]
train_y=categorical_df.iloc[train_positions, -1]
val_x=categorical_df.iloc[val_positions, 0:-1]
val_y=categorical_df.iloc[val_positions, -1]

# Classification By Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import RandomForestClassifier

# `stroke` is a binary label, so fit a classifier rather than a regressor.
model=RandomForestClassifier(random_state=1)
model.fit(train_x, train_y)
# Keep the estimated probability of the positive class (stroke == 1) so that
# `pred_y` stays a continuous score that can be thresholded into 0/1 afterwards.
positive_column=list(model.classes_).index(1)
pred_y=model.predict_proba(val_x)[:, positive_column]

pred_y

## Creation of a New Data Frame

In [None]:
# Actual labels of the validation rows, in a frame of their own.
comparing_results_df=val_y.to_frame(name="actual")
# Scores above the mean score count as a predicted stroke (1), the rest as 0.
mean_score=np.mean(pred_y)
comparing_results_df["pred"]=np.where(pred_y<=mean_score, 0, 1)

### Decoding Values From Encoded Columns

In [None]:
# Validation features next to the actual and predicted labels, renumbered from 0.
results_df=pd.concat([val_x, comparing_results_df], axis=1).reset_index(drop=True)

# Translate each encoded column back to its original values with one vectorised
# lookup per column. This replaces a cell-by-cell .loc loop that was slow and
# wrote strings/intervals one at a time into integer columns.
# Codes without an entry in the mapping become NaN.
for column, code_to_value in dictionary_of_encodes.items():
    results_df[column]=results_df[column].map(code_to_value)

## Precision Display

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def plotAndSaveConfusionMatrix(name_of_model, name_of_pic, actual_arr, pred_arr):
    """Plot a confusion matrix beside its classification report and save the figure.

    name_of_model: model name inserted into the figure title.
    name_of_pic:   output file name without extension; ".png" is appended.
    actual_arr:    true labels.
    pred_arr:      predicted labels, in the same order as `actual_arr`.
    """
    figure, (matrix_ax, report_ax)=plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
    figure.suptitle("Confusion Matrix of "+name_of_model+" Model", fontsize=23)

    # Left panel: the confusion matrix as an annotated heatmap.
    sns.heatmap(confusion_matrix(actual_arr, pred_arr), annot=True, fmt="g", cbar=False, ax=matrix_ax)
    matrix_ax.set_ylabel("True Values")
    matrix_ax.set_xlabel("Predicted Values")

    # Right panel: the text classification report on an otherwise blank axes.
    report_ax.set_axis_off()
    report_ax.text(0, 0, classification_report(actual_arr, pred_arr), fontsize=20)

    figure.savefig(name_of_pic+'.png')
    plt.show()
    
# Plot the random-forest confusion matrix (also saved as cm_rf.png), then show the
# last 20 validation rows whose actual `stroke` value is 1.
plotAndSaveConfusionMatrix("Random Forest", "cm_rf", comparing_results_df['actual'], comparing_results_df['pred'])
results_df[results_df.actual==1].tail(n=20)

# Classification By Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression 

# Fit on the balanced training rows and predict the validation rows.
model=LogisticRegression(max_iter=440).fit(train_x, train_y)
pred_y=model.predict(val_x)
# Replace the previous model's predictions in the results table.
results_df["pred"]=pred_y

pred_y

## Precision Display

In [None]:
# Plot the logistic-regression confusion matrix (also saved as cm_lr.png), then show
# the last 20 validation rows whose actual `stroke` value is 1.
plotAndSaveConfusionMatrix("Logistic Regression", "cm_lr", results_df['actual'], pred_y)
results_df[results_df.actual==1].tail(n=20)

# Classification By Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit on the balanced training rows and predict the validation rows.
model=MultinomialNB(alpha=1).fit(train_x, train_y)
pred_y=model.predict(val_x)
# Replace the previous model's predictions in the results table.
results_df["pred"]=pred_y

pred_y

## Precision Display

In [None]:
# Plot the naive-Bayes confusion matrix (also saved as cm_nb.png), then show the
# last 20 validation rows whose actual `stroke` value is 1.
plotAndSaveConfusionMatrix("Naive Bayes", "cm_nb", results_df['actual'], pred_y)
results_df[results_df.actual==1].tail(n=20)

# Classification By K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from scipy import stats

# Try every neighbour count from 1 to 100 and record the share of validation
# rows each one predicts correctly.
# NOTE(review): the neighbour count is chosen on the same validation rows that
# are later used to report this model's scores.
accuracy_by_n={}
for n_neighbors in range(1, 101):
    candidate=KNeighborsClassifier(n_neighbors=n_neighbors).fit(train_x, train_y)
    accuracy_by_n[n_neighbors]=np.sum(candidate.predict(val_x)==val_y)/len(val_y)

# max() returns the first key with the highest accuracy, i.e. the smallest such n.
best_n=max(accuracy_by_n, key=accuracy_by_n.get)
best_score=accuracy_by_n[best_n]

# Refit with the selected neighbour count and store its predictions.
model=KNeighborsClassifier(n_neighbors=best_n)
model.fit(train_x, train_y)
pred_y=model.predict(val_x)
results_df["pred"]=pred_y

print(stats.describe(pred_y), "\nBest N: ",best_n, sep="")

## Precision Display

In [None]:
# Plot the k-nearest-neighbours confusion matrix (also saved as cm_knb.png), then show
# the last 20 validation rows whose actual `stroke` value is 1.
plotAndSaveConfusionMatrix("K-Nearest Neighbors", "cm_knb", results_df['actual'], pred_y)
results_df[results_df.actual==1].tail(n=20)

# Visualization of The Precision of All The Algorithms Used

In [None]:
import matplotlib.image as mpimg

# Stack the four saved confusion-matrix pictures in one column.
fig, ax=plt.subplots(nrows=4, ncols=1, figsize=(20, 40))
fig.tight_layout(h_pad=-80)

saved_pictures=['cm_rf.png', 'cm_lr.png', 'cm_nb.png', 'cm_knb.png']
for panel, picture in zip(ax, saved_pictures):
    panel.imshow(mpimg.imread(picture))
    panel.set_axis_off()