In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots that follow.
sns.set_style('whitegrid')

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory.
df = pd.read_csv(
    '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Heatmap of the null mask: shows where values are missing in each column.
null_mask = df.isnull()
sns.heatmap(
    null_mask,
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

In [None]:
# Class distribution of the target column.
sns.countplot(x='stroke', data=df)

In [None]:
# Exact number of rows per value of the 'stroke' target column.
df['stroke'].value_counts()

In [None]:
# Stroke counts, split by gender.
sns.countplot(
    x='stroke',
    hue='gender',
    data=df,
    palette='rainbow',
)

In [None]:
# Stroke counts, split by the hypertension flag.
sns.countplot(
    x='stroke',
    hue='hypertension',
    data=df,
    palette='rainbow',
)

In [None]:
# Stroke counts, split by the heart_disease flag.
sns.countplot(
    x='stroke',
    hue='heart_disease',
    data=df,
    palette='rainbow',
)

In [None]:
# Pairwise relationships between the columns, coloured by the stroke label.
sns.pairplot(
    data=df,
    hue='stroke',
    palette='bwr',
)

In [None]:
# Impute missing BMI values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['bmi'] selection: that form is chained
# in-place assignment, which pandas deprecates and which leaves df unchanged
# when Copy-on-Write is enabled.
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)

In [None]:
# Re-draw the null mask to check the frame after the BMI imputation.
null_mask = df.isnull()
sns.heatmap(
    null_mask,
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

In [None]:
# Preview the first rows after filling the missing BMI values.
df.head()

In [None]:
# Column dtypes and non-null counts for the frame.
df.info()

In [None]:
# One-hot encode each categorical column into its own frame.
# drop_first=True omits the first level of every column.
Gen, Married, Work, Res, Smoking = (
    pd.get_dummies(df[column], drop_first=True)
    for column in (
        'gender',
        'ever_married',
        'work_type',
        'Residence_type',
        'smoking_status',
    )
)

In [None]:
# Remove the original categorical columns now that their dummy frames exist.
categorical_cols = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
df.drop(columns=categorical_cols, inplace=True)

In [None]:
# Preview the frame after dropping the raw categorical columns.
df.head()

In [None]:
# Append the dummy-encoded columns to the remaining columns, side by side.
dummy_frames = [Gen, Married, Work, Res, Smoking]
df = pd.concat([df, *dummy_frames], axis=1)

In [None]:
# Preview the frame with the dummy columns attached.
df.head()

In [None]:
# Drop the 'id' column from the frame in place.
df.drop(columns=['id'], inplace=True)

In [None]:
# Preview the frame after removing the 'id' column.
df.head()

# We have an imbalanced dataset here. To balance it, we can undersample the majority class.

In [None]:
# Shuffle the dataset (fixed seed so the result is reproducible).
shuffled_df = df.sample(frac=1, random_state=4)

# Put all rows of the stroke class in a separate frame.
str_yes_df = shuffled_df.loc[shuffled_df['stroke'] == 1]

# Randomly draw as many no-stroke (majority class) rows as there are stroke
# rows. The size is derived from the data instead of being hard-coded.
str_no_df = shuffled_df.loc[shuffled_df['stroke'] == 0].sample(
    n=len(str_yes_df), random_state=42
)

# Concatenate both class subsets into the balanced frame.
normalized_df = pd.concat([str_yes_df, str_no_df])

# Plot the class counts after undersampling.
# x is passed by keyword: recent seaborn releases take `data` as the first
# positional argument, so a positional 'stroke' would collide with data=.
plt.figure(figsize=(8, 8))
sns.countplot(data=normalized_df, x='stroke')
plt.title('Balanced Classes')
plt.show()

In [None]:
# Column dtypes and non-null counts for the undersampled frame.
normalized_df.info()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate predictors from the target, then hold out 30% of the rows for testing.
features = normalized_df.drop(columns='stroke')
target = normalized_df['stroke']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=101,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter is raised above scikit-learn's default of 100: no scaling step is
# applied to the features in the visible cells, and on unscaled inputs the
# default solver is prone to hitting the iteration limit before converging.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test rows.
predictions = logmodel.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Text report of the classification metrics on the test split.
report_text = classification_report(y_test, predictions)
print(report_text)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy score of the predictions against the test labels.
accuracy_score(y_test,predictions)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Raw confusion matrix of test labels versus predictions.
confusion_matrix(y_test,predictions)

In [None]:
# Same summary of the undersampled frame as shown before the train/test split.
normalized_df.info()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Build the confusion matrix with the label order taken from the fitted model,
# then render it as a labelled plot.
class_labels = logmodel.classes_
cm = confusion_matrix(y_test, predictions, labels=class_labels)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=class_labels,
)
disp.plot()