## About the Data
<b>Data Dictionary</b><br>
1. id: unique identifier
2. gender: "Male", "Female" or "Other"
3. age: age of the patient
4. hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
5. heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
6. ever_married: "No" or "Yes"
7. work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
8. Residence_type: "Rural" or "Urban"
9. avg_glucose_level: average glucose level in blood
10. bmi: body mass index
11. smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown" ("Unknown" means that the smoking information is unavailable for this patient)
12. stroke: 1 if the patient had a stroke or 0 if not

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, plot_confusion_matrix, roc_curve, roc_auc_score, auc

%matplotlib inline

In [None]:
# Location of the stroke dataset CSV. Defaults to the Kaggle input path; set the
# STROKE_DATA_PATH environment variable to run the notebook somewhere else.
DATA_PATH = os.environ.get(
    "STROKE_DATA_PATH",
    "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv",
)

In [None]:
# Read the CSV at DATA_PATH into a DataFrame and preview its first rows.
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Compare the number of distinct ids with the number of rows: equal values mean
# no id occurs twice. dropna=False counts NaN as a value, like len(unique()).
n_unique_ids = data["id"].nunique(dropna = False)
print(f"Number of unique id's in data: {n_unique_ids} (rows: {len(data)})")

<b>OBSERVATION: </b>The number of unique ids is equal to the number of rows. Therefore, there are no duplicate records in the data.

In [None]:
# Dropping the id column.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it no longer raises KeyError once "id" is gone.
data = data.drop(columns = ["id"], errors = "ignore")
data.head()

In [None]:
# Heuristic split of the columns: a column with at most MAX_CATEGORY_LEVELS
# distinct values (NaN counted as a value) is treated as categorical, every
# other column as continuous.
MAX_CATEGORY_LEVELS = 5

all_columns = list(data.columns)
categorical_data_cols = [
    column for column in all_columns
    if data[column].nunique(dropna = False) <= MAX_CATEGORY_LEVELS
]
continuous_data_cols = [column for column in all_columns if column not in categorical_data_cols]
print(f"Continuous Data Columns: {', '.join(continuous_data_cols)}")
print(f"Categorical Data Columns: {', '.join(categorical_data_cols)}")

## Univariate Analysis

In [None]:
# Class balance of the target column.
stroke_val_counts = data["stroke"].value_counts()

# Share of each class. The ".2%" format multiplies by 100 and appends "%", so the
# printed number really is a percentage (the raw fraction was printed before).
stroke_share = stroke_val_counts / stroke_val_counts.sum()
print(f"Non Stroke: {stroke_share.loc[0]:.2%}")
print(f"Stroke: {stroke_share.loc[1]:.2%}")

stroke_val_counts.plot(kind = "bar")
plt.show()

<b>OBSERVATION: </b>From the above plot we can clearly see that the dataset is highly imbalanced. We need to do some upsampling to balance the data.

In [None]:
# Missing-value count for every categorical column except the last entry of
# categorical_data_cols (presumably the stroke target - confirm column order).
categorical_nan_counts = data[categorical_data_cols[:-1]].isnull().sum()
for column, n_missing in categorical_nan_counts.items():
    print(f"Number of NaN values in {column}: {n_missing}")

In [None]:
# One count plot per categorical feature on a 4x2 grid, bars split by stroke outcome.
plt.figure(figsize = (17,19))
for panel, column in enumerate(categorical_data_cols[:-1], start = 1):
    plt.subplot(4, 2, panel)
    sns.countplot(x = data[column], hue = data["stroke"])
plt.show()

<b>OBSERVATIONS:</b><br>
1. The numbers of males and females who had a stroke are almost equal.
2. People with and without hypertension look almost the same and show hardly any sign of stroke. This may be because the number of records with stroke = 1 is very small.
3. Married people show more signs of stroke.
4. People who have private jobs appear more prone to stroke.

In [None]:
# One horizontal bar chart per categorical feature: the bar length is the sum of
# the stroke column within each category of that feature.
for column in categorical_data_cols[:-1]:
    stroke_cases = data.groupby(column)["stroke"].sum()

    plt.figure(figsize = (9,5))
    plt.barh(stroke_cases.index, stroke_cases.values)

    # Write each total at the end of its bar.
    for position, cases in enumerate(stroke_cases.values):
        plt.text(cases, position, cases)

    plt.title(f"{column} vs Stroke")
    plt.show()

In [None]:
# List the distinct values of every categorical column.
for column in categorical_data_cols:
    levels = ", ".join(map(str, data[column].unique()))
    print(f"Unique values in {column} are: {levels}")

<b>OBSERVATION: </b>Here smoking status is treated as an ordinal variable and the remaining ones as nominal variables. Let us use ordinal encoding for the ordinal variable and one-hot encoding for the nominal variables.

In [None]:
# Hand-written integer codes for the two binary text columns.
married_map = {
    "Yes":1,
    "No":0
}
residence_map = {
    "Urban":1,
    "Rural":2
}

# Explicit category order for smoking_status. Without it OrdinalEncoder sorts the
# labels lexicographically ("Unknown" < "formerly smoked" < "never smoked" < "smokes"),
# which ranks former smokers below never-smokers. "Unknown" keeps code 0 as before.
smoking_order = ["Unknown", "never smoked", "formerly smoked", "smokes"]

ord_encoder = OrdinalEncoder(categories = [smoking_order])
# fit_transform returns an (n_samples, 1) array; ravel() turns it into the 1-D column.
data["smoking_status"] = ord_encoder.fit_transform(data[["smoking_status"]]).ravel()

data["ever_married"] = data["ever_married"].map(married_map)
data["Residence_type"] = data["Residence_type"].map(residence_map)

# One-hot encode the nominal columns; drop_first removes one dummy per column.
data = pd.get_dummies(data, columns = ["gender", "work_type"], drop_first = True)
data.head()

In [None]:
# Missing-value count for every continuous column.
continuous_nan_counts = data[continuous_data_cols].isnull().sum()
for column, n_missing in continuous_nan_counts.items():
    print(f"Number of NaN values in {column}: {n_missing}")

In [None]:
# Distribution check: a 50-bin histogram per continuous column on a 2x2 grid.
plt.figure(figsize = (11, 9))
for panel, column in enumerate(continuous_data_cols, start = 1):
    plt.subplot(2, 2, panel)
    sns.histplot(data[column], bins = 50)
plt.show()

In [None]:
# Distribution check: a kernel density estimate per continuous column on a 2x2 grid.
plt.figure(figsize = (11, 9))
for panel, column in enumerate(continuous_data_cols, start = 1):
    plt.subplot(2, 2, panel)
    sns.kdeplot(data[column])
plt.show()

In [None]:
# Outlier check: a box plot per continuous column on a 2x2 grid.
plt.figure(figsize = (11, 9))
for panel, column in enumerate(continuous_data_cols, start = 1):
    plt.subplot(2, 2, panel)
    sns.boxplot(x = data[column])
plt.show()

<b>OBSERVATION: </b>Since the BMI data is right-skewed, let us fill the missing values in bmi with the median, because the median is not affected by outliers.

In [None]:
# Fill missing BMI values with the column median.
# Assigning the result back (rather than calling fillna(inplace=True) on the
# selected column) is what reliably updates `data`: the chained in-place form is
# deprecated and has no effect on the frame under pandas copy-on-write.
data["bmi"] = data["bmi"].fillna(data["bmi"].median())
print(f"Number of missing values in BMI: {data['bmi'].isnull().sum()}")

In [None]:
# Re-check the distributions: kernel density estimate per continuous column (2x2 grid).
plt.figure(figsize = (11, 9))
for panel, column in enumerate(continuous_data_cols, start = 1):
    plt.subplot(2, 2, panel)
    sns.kdeplot(data[column])
plt.show()

The BMI column approximately follows a log-normal distribution. Let us apply a log transformation to bring it closer to a normal distribution.

In [None]:
# Replace bmi by its natural logarithm and plot the density of the transformed column.
# Note: the cell overwrites bmi, so running it twice applies the log twice.
log_bmi = np.log(data["bmi"])
data["bmi"] = log_bmi
sns.kdeplot(log_bmi)
plt.show()

## Bivariate Analysis

In [None]:
# Pairwise scatter/density plots of the continuous columns, coloured by stroke.
# pairplot is a figure-level seaborn function that creates its own figure, so a
# preceding plt.figure(...) call only produced an extra empty figure; it is removed.
sns.pairplot(data[continuous_data_cols+["stroke"]], hue = "stroke")
plt.show()

In [None]:
# Annotated correlation heatmap of the continuous columns, colour scale centred at 0.
continuous_corr = data[continuous_data_cols].corr()
plt.figure(figsize = (9,7))
sns.heatmap(continuous_corr, annot = True, center = 0)
plt.show()

In [None]:
# Preview the frame in its current state before building X and y.
data.head()

In [None]:
# Target vector: the stroke column. Feature matrix: every other column.
y = data["stroke"]
X = data.drop("stroke", axis = 1)

## Splitting the data for training and testing

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the rare stroke class at
# the same proportion in both splits, which a plain random split does not guarantee.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 24, stratify = y
)
print(f"Train Data: {X_train.shape}, {y_train.shape}")
print(f"Test Data: {X_test.shape}, {y_test.shape}")

## Upsampling using SMOTE

In [None]:
# Class counts of the training labels before resampling.
counter = Counter(y_train)
print(f"Before Upsampling: {counter}")

# SMOTE creates synthetic samples at random, so a fixed random_state is needed
# for the notebook to give the same result on every run.
# Only the training split is resampled; the test split is left as it is.
upsample = SMOTE(random_state = 24)
X_train, y_train = upsample.fit_resample(X_train, y_train)
counter = Counter(y_train)
print(f"After Upsampling: {counter}")

In [None]:
# After Upsampling: shapes of both splits (the second line reports the test split,
# so it is labelled "Test Data").
print(f"Train Data: {X_train.shape}, {y_train.shape}")
print(f"Test Data: {X_test.shape}, {y_test.shape}")

## Scaling the Data

In [None]:
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted scaler is then applied to both splits.
std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)

## Checking for best baseline model using Cross Validation

In [None]:
# Candidate baseline models, keyed by a display name.
all_models = {
    "xgb_model": XGBClassifier(eval_metric = "logloss", random_state = 18, use_label_encoder = False),
    "rf_model": RandomForestClassifier(random_state = 18),
    "logistic_model": LogisticRegression(),
    "svm_model": SVC(),
    "ada_model": AdaBoostClassifier(RandomForestClassifier(random_state = 18)),
}

# 5-fold cross-validation of every candidate on the current training data.
# NOTE(review): X_train appears to be SMOTE-resampled at this point, so folds may
# contain synthetic samples - confirm this is intended.
for model_name, estimator in all_models.items():
    print(f"Model Name: {model_name}")
    cv_score = cross_val_score(estimator, X_train, y_train, cv = 5)
    print(cv_score)
    print(f"Mean Score: {cv_score.mean()}")
    print()

## SVM Model

In [None]:
# Default SVC fitted on the training split.
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Same report for both splits: test split first, then the training split.
evaluation_splits = (
    ("On Test Data", X_test, y_test),
    ("On Train Data", X_train, y_train),
)
for split_number, (heading, features, labels) in enumerate(evaluation_splits):
    if split_number > 0:
        print()  # blank line between the two reports

    print(heading)
    predictions = svm_model.predict(features)
    print(f"Accuracy: {accuracy_score(labels, predictions)}")
    print(f"F1 Score: {f1_score(labels, predictions)}")
    print(f"Precision: {precision_score(labels, predictions)}")
    print(f"Recall: {recall_score(labels, predictions)}")
    plot_confusion_matrix(svm_model, features, labels)
    plt.show()

## Hyperparameter Tuning for SVM Model

In [None]:
# Search space for an RBF-kernel SVC: 5 values of C x 5 values of gamma.
param_grid = dict(
    C = [0.1, 1, 10, 100, 1000],
    gamma = [1, 0.1, 0.01, 0.001, 0.0001],
    kernel = ['rbf'],
)

# Exhaustive search with GridSearchCV's default cross-validation; refit=True
# retrains the best configuration once the search is done.
grid = GridSearchCV(estimator = SVC(), param_grid = param_grid, refit = True, verbose = 0)
grid.fit(X_train, y_train)

print("Best Params:",grid.best_params_)
print("Best Estimator", grid.best_estimator_)

In [None]:
# SVC with hand-entered hyperparameters.
# NOTE(review): C and gamma are presumably copied from the grid-search output -
# confirm they still match grid.best_params_.
svm_model = SVC(C = 10, gamma = 1)
svm_model.fit(X_train, y_train)

# Same report for both splits: test split first, then the training split.
evaluation_splits = (
    ("On Test Data", X_test, y_test),
    ("On Train Data", X_train, y_train),
)
for split_number, (heading, features, labels) in enumerate(evaluation_splits):
    if split_number > 0:
        print()  # blank line between the two reports

    print(heading)
    predictions = svm_model.predict(features)
    print(f"Accuracy: {accuracy_score(labels, predictions)}")
    print(f"F1 Score: {f1_score(labels, predictions)}")
    print(f"Precision: {precision_score(labels, predictions)}")
    print(f"Recall: {recall_score(labels, predictions)}")
    plot_confusion_matrix(svm_model, features, labels)
    plt.show()

In [None]:
def plot_roc_auc(model, X, y):
    """Print the ROC AUC of ``model`` on ``(X, y)`` and draw its ROC curve.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    X : features passed to ``model.predict_proba``.
    y : true labels passed to ``roc_curve`` and ``roc_auc_score``.

    The second column of ``predict_proba(X)`` is used as the score. The function
    prints the AUC, shows the figure and returns nothing.
    """
    positive_scores = model.predict_proba(X)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y, positive_scores)
    curve_area = auc(false_pos_rate, true_pos_rate)

    print("AUC Score",roc_auc_score(y, positive_scores))

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, 'b', label = 'AUC = %0.2f' % curve_area)
    plt.legend(loc = 'lower right')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

## KNN Classifier

In [None]:
# Fit one KNN classifier per k in 1..49 and record its misclassification rate on X_test.
# NOTE(review): k is compared on the test split here - consider a validation
# split or cross-validation for choosing k.
k_values = range(1, 50)
error_rate = []
for k in k_values:
    knn_candidate = KNeighborsClassifier(n_neighbors = k)
    knn_candidate.fit(X_train, y_train)
    predictions = knn_candidate.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy at k = {k} is {accuracy}")
    error_rate.append(np.mean(predictions != y_test))

# Error rate as a function of k.
plt.figure(figsize=(10,6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

# error_rate[0] belongs to k = 1, hence the +1 when turning the index into k.
lowest_error = min(error_rate)
print("Minimum error:-",lowest_error,"at K =",error_rate.index(lowest_error)+1)

In [None]:
# KNN classifier with k = 2, fitted on the training split.
knn_model = KNeighborsClassifier(n_neighbors = 2)
knn_model.fit(X_train, y_train)

# Same report for both splits: test split first, then the training split.
evaluation_splits = (
    ("On Test Data", X_test, y_test),
    ("On Train Data", X_train, y_train),
)
for split_number, (heading, features, labels) in enumerate(evaluation_splits):
    if split_number > 0:
        print()  # blank line between the two reports

    print(heading)
    predictions = knn_model.predict(features)
    print(f"Accuracy: {accuracy_score(labels, predictions)}")
    print(f"F1 Score: {f1_score(labels, predictions)}")
    print(f"Precision: {precision_score(labels, predictions)}")
    print(f"Recall: {recall_score(labels, predictions)}")
    plot_confusion_matrix(knn_model, features, labels)
    plt.show()

## Random Forest Model

In [None]:
# Random forest with default hyperparameters and a fixed seed.
rf_model = RandomForestClassifier(random_state = 24)
rf_model.fit(X_train, y_train)

# Metrics, confusion matrix and ROC curve: test split first, then the training split.
evaluation_splits = (
    ("On Test Data", X_test, y_test),
    ("On Train Data", X_train, y_train),
)
for split_number, (heading, features, labels) in enumerate(evaluation_splits):
    if split_number > 0:
        print()  # blank line between the two reports

    print(heading)
    predictions = rf_model.predict(features)
    print(f"Accuracy: {accuracy_score(labels, predictions)}")
    print(f"F1 Score: {f1_score(labels, predictions)}")
    print(f"Precision: {precision_score(labels, predictions)}")
    print(f"Recall: {recall_score(labels, predictions)}")
    plot_confusion_matrix(rf_model, features, labels)
    plt.show()
    plot_roc_auc(rf_model, features, labels)

## XGBoost Model

In [None]:
# XGBoost classifier with log-loss as evaluation metric and a fixed seed.
xgb_model = XGBClassifier(eval_metric = "logloss", random_state = 18, use_label_encoder = False)
xgb_model.fit(X_train, y_train)

# Metrics, confusion matrix and ROC curve: test split first, then the training split.
evaluation_splits = (
    ("On Test Data", X_test, y_test),
    ("On Train Data", X_train, y_train),
)
for split_number, (heading, features, labels) in enumerate(evaluation_splits):
    if split_number > 0:
        print()  # blank line between the two reports

    print(heading)
    predictions = xgb_model.predict(features)
    print(f"Accuracy: {accuracy_score(labels, predictions)}")
    print(f"F1 Score: {f1_score(labels, predictions)}")
    print(f"Precision: {precision_score(labels, predictions)}")
    print(f"Recall: {recall_score(labels, predictions)}")
    plot_confusion_matrix(xgb_model, features, labels)
    plt.show()
    plot_roc_auc(xgb_model, features, labels)

## Adaboost Model

In [None]:
# AdaBoost classifier with its default settings.
ada_model = AdaBoostClassifier()
ada_model.fit(X_train, y_train)

# Metrics, confusion matrix and ROC curve: test split first, then the training split.
evaluation_splits = (
    ("On Test Data", X_test, y_test),
    ("On Train Data", X_train, y_train),
)
for split_number, (heading, features, labels) in enumerate(evaluation_splits):
    if split_number > 0:
        print()  # blank line between the two reports

    print(heading)
    predictions = ada_model.predict(features)
    print(f"Accuracy: {accuracy_score(labels, predictions)}")
    print(f"F1 Score: {f1_score(labels, predictions)}")
    print(f"Precision: {precision_score(labels, predictions)}")
    print(f"Recall: {recall_score(labels, predictions)}")
    plot_confusion_matrix(ada_model, features, labels)
    plt.show()
    plot_roc_auc(ada_model, features, labels)

In [None]:
# Voting ensemble (VotingClassifier's default voting mode) over three base models.
ensemble_members = [
    ("svm_model", SVC()),
    ("xgb_model", XGBClassifier(eval_metric = "logloss", random_state = 18, use_label_encoder = False)),
    ("ada_model", AdaBoostClassifier()),
]
voting_model = VotingClassifier(estimators = ensemble_members)

voting_model.fit(X_train, y_train)

# Same report for both splits: test split first, then the training split.
evaluation_splits = (
    ("On Test Data", X_test, y_test),
    ("On Train Data", X_train, y_train),
)
for split_number, (heading, features, labels) in enumerate(evaluation_splits):
    if split_number > 0:
        print()  # blank line between the two reports

    print(heading)
    predictions = voting_model.predict(features)
    print(f"Accuracy: {accuracy_score(labels, predictions)}")
    print(f"F1 Score: {f1_score(labels, predictions)}")
    print(f"Precision: {precision_score(labels, predictions)}")
    print(f"Recall: {recall_score(labels, predictions)}")
    plot_confusion_matrix(voting_model, features, labels)
    plt.show()
