## Importing Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the stroke dataset from the Kaggle input directory and preview its first rows.
data_path = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(data_path)
df.head()

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# Count the missing (NaN) values in each column.
df.isna().sum()

In [None]:
# Bar chart of how many rows fall under each value of the stroke target.
sns.set_style("darkgrid")
sns.countplot(x = "stroke" , data = df);

In [None]:
# For every column print the number of distinct values, the distinct values
# themselves, and how often each value occurs.
separator = "*" * 100
for column in df.columns:
    distinct = df[column].unique()
    frequencies = df[column].value_counts()
    print(f"The number of unique values in {column} column is/are {len(distinct)}")
    print("\n")
    print(f"The unique values in {column} column is/are {distinct}")
    print("\n")
    print(f"The value counts for each value in {column} column is/are :  \n{frequencies}")
    print("\n\n")
    print(separator)
    print("\n\n")

In [None]:
# Every id value is unique, so the column carries no signal for modelling: drop it.
df.drop(columns = ["id"] , inplace = True)
df.info()

In [None]:
# Names of the categorical columns (those stored with object dtype).
cats = df.select_dtypes(include = "object").columns
cats

In [None]:
# Names of all non-object columns (integer and float alike, not only integers).
ints = df.select_dtypes(exclude = "object").columns
ints

In [None]:
# Preview the first rows of df.
df.head()

## EDA (Exploratory data analysis)

In [None]:
# Grid of pairwise relationships between the columns of df.
sns.pairplot(df)

In [None]:
# Custom pair grid: histograms on the diagonal, scatter plots above it and
# kernel-density plots below it.
g = sns.PairGrid(df)
g.map_diag(plt.hist).map_upper(plt.scatter)
g.map_lower(sns.kdeplot)

In [None]:
# One count plot per categorical column, laid out on a shared 2x3 grid.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Count plots of categorical columns')

# Flatten the 2-D axes array so panels can be filled row by row.
panels = axes.ravel()
for ax, column in zip(panels, categorical_columns):
    sns.countplot(ax = ax, data = df, x = column)

# Five plots on six axes: hide the leftover panel instead of showing an empty frame.
for unused_ax in panels[len(categorical_columns):]:
    unused_ax.set_visible(False)

In [None]:
# Draw a separate count plot, titled with the column name, for each categorical column.
for column in cats:
    plt.figure()
    plt.title(column)
    sns.countplot(x = column , data = df)

In [None]:
# Draw a separate histogram with a KDE overlay, titled with the column name,
# for each non-object column.
for column in ints:
    plt.figure()
    plt.title(column)
    sns.histplot(df[column] , kde = True)

In [None]:
# Correlation heatmap of the non-object columns. The string columns are
# excluded explicitly because DataFrame.corr() no longer drops them silently:
# from pandas 2.0 it raises when it meets non-numeric data.
numeric_df = df.select_dtypes(exclude = ["object"])
sns.heatmap(numeric_df.corr() , annot = True , cmap = "coolwarm")

In [None]:
# Re-display the categorical column names for reference.
cats

In [None]:
# Re-display the non-object column names for reference.
ints

In [None]:
# bmi distribution per gender; each violin is split by ever_married.
sns.violinplot(
    data = df ,
    x = "gender" ,
    y = "bmi" ,
    hue = "ever_married" ,
    split = True ,
    palette = 'rainbow' ,
)

In [None]:
# bmi histograms faceted by work_type (columns) and Residence_type (rows),
# coloured by hypertension.
g = sns.FacetGrid(df, col="work_type",  row="Residence_type" , hue = "hypertension")
# Without a legend the hue colours cannot be mapped back to hypertension values.
g = g.map(plt.hist, "bmi").add_legend()

In [None]:
# 30-bin histogram of bmi, coloured by gender.
sns.histplot(data = df , x = "bmi" , bins = 30 , hue = "gender")

In [None]:
# Box plot of bmi against each categorical column, one figure per column.
for column in cats:
    plt.figure()
    plt.title(column)
    sns.boxplot(data = df , x = column , y = "bmi")

In [None]:
# Recompute the names of the non-object columns and display them.
ints = df.select_dtypes(exclude = ["object"]).columns
ints

In [None]:
# Show the median and the mean of bmi side by side.
df["bmi"].median() , df["bmi"].mean()

In [None]:
# Fill the missing bmi values with the column median. The result is assigned
# back to the column: calling fillna(inplace=True) on the df["bmi"] selection is
# chained assignment, which is deprecated in pandas 2.x and leaves df unchanged
# under Copy-on-Write.
df["bmi"] = df["bmi"].fillna(df["bmi"].median())
df.isna().sum()

In [None]:
# Count the missing values per column again.
df.isna().sum()

In [None]:
# Recode rows whose gender is "Other" as "Female".
# NOTE(review): folding "Other" into "Female" is an arbitrary choice - confirm it is intended.
df["gender"] = df["gender"].replace("Other" , "Female")

In [None]:
# Frequency of each gender value.
df["gender"].value_counts()

## Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string categories of these columns with integer codes. The same
# encoder object is refit for each column in turn.
lb = LabelEncoder()
for column in ("gender" , "ever_married" , "Residence_type"):
    df[column] = lb.fit_transform(df[column])

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Print, for every column, the number of distinct values, the distinct values,
# and the frequency of each value.
for name in df.columns:
    series = df[name]
    unique_values = series.unique()
    print(f"The number of unique values in {name} column is/are {len(unique_values)}")
    print("\n")
    print(f"The unique values in {name} column is/are {unique_values}")
    print("\n")
    print(f"The value counts for each value in {name} column is/are :  \n{series.value_counts()}")
    print("\n\n")
    print("*" * 100)
    print("\n\n")

In [None]:
# Count the missing values per column.
df.isna().sum()

In [None]:
# One-hot encode work_type and smoking_status, then drop one dummy column from
# each group (Govt_job and Unknown).
dummy_columns = ["work_type" , "smoking_status"]
df = pd.get_dummies(df , columns = dummy_columns , prefix = dummy_columns)
df = df.drop(columns = ["work_type_Govt_job" , "smoking_status_Unknown"])
df.head()

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the stroke target.
X = df.drop(columns = ["stroke"])
y = df["stroke"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits; the target is imbalanced (the notebook rebalances it
# with SMOTE), so an unstratified split can leave the test set with a distorted
# share of positive cases.
X_train , X_test , y_train , y_test = train_test_split(
    X , y , test_size = 0.2 , random_state = 42 , stratify = y
)

In [None]:
# Number of rows in the training and test splits.
len(X_train) , len(X_test)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Take explicit copies before assigning columns: the frames returned by
# train_test_split are selections of X, and writing into them directly can
# raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardise the continuous columns. The scaler is fitted on the training
# split only and then applied to the test split, so no test statistics leak in.
scaler = StandardScaler()
cols = ["age" , "avg_glucose_level" , "bmi"]
X_train[cols] = scaler.fit_transform(X_train[cols])
X_test[cols] = scaler.transform(X_test[cols])

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first rows of the test features.
X_test.head()

## Upsampling data with SMOTE 

In [None]:
from imblearn.over_sampling import SMOTE
# Class counts of the training target before any resampling.
y_train.value_counts()

In [None]:
# Oversample the training split with SMOTE; the test split is left untouched.
sm = SMOTE(random_state = 42)
# Series.ravel() is deprecated in recent pandas; to_numpy() gives the same 1-D array.
X_train_1, y_train_1 = sm.fit_resample(X_train , y_train.to_numpy())
# Show both class counts of the *resampled* target so the balance is actually
# verified (the second count previously came from the original y_train).
int((y_train_1 == 1).sum()) , int((y_train_1 == 0).sum())

## Model Fitting and Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import confusion_matrix , roc_auc_score , precision_score , recall_score , f1_score , accuracy_score , classification_report , roc_curve , auc
from sklearn.model_selection import cross_val_score

In [None]:
# Candidate classifiers as [display name, estimator] pairs.
# - XGBoost: 'logloss' is the binary log-loss metric that matches the
#   'binary:logistic' objective ('mlogloss' is its multi-class counterpart).
# - AdaBoost: seeded like the other estimators so reruns are reproducible.
models = [
    ['XGBClassifier', XGBClassifier(learning_rate = 0.1 , objective = 'binary:logistic' , random_state = 42 , eval_metric = 'logloss')],
    ['Logistic Regression', LogisticRegression(random_state = 42)],
    ['SVM', SVC(random_state = 42)],
    ['KNeigbors', KNeighborsClassifier()],
    ['RandomForest', RandomForestClassifier(random_state = 42)],
    ['AdaBoostClassifier', AdaBoostClassifier(random_state = 42)],
]

In [None]:
# Fit every candidate on the SMOTE-balanced training data, score it on the
# untouched test split, print a report, and collect one metrics row per model
# in sam1 (consumed by the model-selection table).
from imblearn.pipeline import Pipeline


def _positive_class_scores(estimator, features):
    """Return continuous positive-class scores (probabilities when available)."""
    if hasattr(estimator, "predict_proba"):
        return estimator.predict_proba(features)[:, 1]
    # Estimators without predict_proba (e.g. SVC with probability=False)
    # still provide a ranking through their decision function.
    return estimator.decision_function(features)


sam1 = []
for name, model in models:
    model.fit(X_train_1 , y_train_1)

    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test , y_pred)
    accuracy = accuracy_score(y_test , y_pred)

    # Cross-validate on the ORIGINAL training data with SMOTE as a pipeline
    # step, so synthetic samples are generated from each training fold only.
    # Cross-validating data that was resampled beforehand leaks synthetic
    # neighbours of validation rows into training and inflates the scores.
    cv_pipeline = Pipeline([("smote", SMOTE(random_state = 42)), ("model", model)])
    accuracies = cross_val_score(estimator = cv_pipeline, X = X_train , y = y_train, cv = 5)

    # ROC AUC is a ranking metric: feed it continuous scores, not hard 0/1 labels.
    roc = roc_auc_score(y_test , _positive_class_scores(model , X_test))
    precision = precision_score(y_test , y_pred)
    recall = recall_score(y_test , y_pred)
    f1 = f1_score(y_test , y_pred)

    print(name, ':')
    print(cm)
    print('Accuracy Score: ' , accuracy)
    print('\n')
    print('K-Fold Validation Mean Accuracy: {:.2f} %'.format(accuracies.mean()*100))
    print('\n')
    print('Standard Deviation: {:.2f} %'.format(accuracies.std()*100))
    print('\n')
    # The metrics below are fractions in [0, 1]; scale them so the "%" suffix is true.
    print('ROC AUC Score: {:.2f} %'.format(roc*100))
    print('\n')
    print('Precision: {:.2f} %'.format(precision*100))
    print('\n')
    print('Recall: {:.2f} %'.format(recall*100))
    print('\n')
    print('F1 Score: {:.2f} %'.format(f1*100))
    print("\n")
    print('*'*40)
    print('\n\n\n')

    # Row layout matches the column order of the model-selection table.
    # Accuracy figures are stored in percent, the remaining metrics as fractions.
    sam1.append([
        name,
        accuracy*100,
        accuracies.mean()*100,
        accuracies.std()*100,
        roc,
        precision,
        recall,
        f1,
    ])

## Model Selection

In [None]:
# Tabulate the per-model metrics and rank the models, best first, by F1 score,
# then ROC AUC, then cross-validated accuracy, then test accuracy.
metric_columns = ['Model','Accuracy','K-Fold Mean Accuracy','Std.Deviation','ROC_AUC','Precision','Recall','F1 Score']
ranking_keys = ["F1 Score" , "ROC_AUC" , 'K-Fold Mean Accuracy' , "Accuracy"]

df2 = pd.DataFrame(sam1 , columns = metric_columns).sort_values(by = ranking_keys , ascending = False)
df2

In [None]:
# Compare the models by their ROC AUC.
sns.barplot(data = df2 , x = "Model" , y = "ROC_AUC")
plt.title("Model Compare");

In [None]:
# Refit logistic regression on the SMOTE-balanced training data, then report
# its test-set metrics, confusion matrix and ROC curve.
linear = LogisticRegression(random_state = 42)
linear.fit(X_train_1 , y_train_1)

y_pred = linear.predict(X_test)
y_prob = linear.predict_proba(X_test)[ : , 1]   # second probability column
cm = confusion_matrix(y_test , y_pred)

spacer = "\n\n\n"
print(classification_report(y_test , y_pred))
print(spacer)
print(f'ROC AUC score: {roc_auc_score(y_test , y_prob)}')
print('Accuracy Score: ' , accuracy_score(y_test , y_pred))
print(spacer)

# Confusion matrix heatmap
plt.figure(figsize = (8 , 5))
sns.heatmap(
    cm ,
    annot = True ,
    fmt = 'd' ,
    cmap = 'coolwarm' ,
    cbar = False ,
    linewidths = 5 ,
    annot_kws = {'fontsize': 15} ,
    xticklabels = ['Predicted no stroke', 'Predicted stroke'] ,
    yticklabels = ['No stroke', 'Stroke'] ,
)
plt.yticks(rotation = 0)
plt.show()

# ROC curve, with the area under it shown in the legend
fp_rate , tp_rate , thresholds = roc_curve(y_test , y_prob)
roc_auc = auc(fp_rate , tp_rate)

sns.set_theme(style = 'white')
plt.figure(figsize = (8 , 8))
plt.plot(fp_rate , tp_rate , color = '#b01717' , label = f'AUC = {roc_auc:0.3f}')
plt.legend(loc = 'lower right')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1] , [0, 1] , linestyle = '--' , color = '#174ab0')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

In [None]:
# Count how many test rows the logistic regression model predicts as 0 and as 1.
preds = linear.predict(X_test)
sum(preds == 0) , sum(preds == 1)

In [None]:
# ROC curve of the logistic regression model, built from the second column of
# its predicted probabilities.
predicted_probab = linear.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, predicted_probab)

plt.plot(fpr, tpr, label="Logistic Regressor", color='green', marker='.')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Logistic Regression model with an AUC of 80.96 percent