# INDEX
- ## 1. Library Management
- ## 2. Data Sourcing
- ## 3. Data Cleaning
- ## 4. Data Preparation (For Modelling)
- ## 5. Data Modelling

# 1. Library Management

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import os
import warnings
# Suppress every warning for cleaner notebook output (note: this also hides deprecation warnings).
warnings.filterwarnings('ignore')
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Default figure size applied through seaborn's rc settings.
sns.set(rc={'figure.figsize': (15, 10)})

In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.linear_model import LogisticRegression

# 2. Data Sourcing

In [None]:
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Load the stroke dataset and preview its first ten rows.
data_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
raw_data = pd.read_csv(data_path)
raw_data.head(10)

In [None]:
# Report the dimensions of the freshly loaded dataset.
raw_data_row_count = raw_data.shape[0]
raw_data_column_count = raw_data.shape[1]
print('Row Count:', raw_data_row_count)
print('Column Count:', raw_data_column_count)

# 3. Data Cleaning

## 3.1. ID Column

In [None]:
raw_data.id.isnull().sum()

In [None]:
# Summary statistics of the id column (this section inspects id; age is examined in section 3.3).
raw_data['id'].describe()

In [None]:
len(raw_data.id.unique())

##### We see that the number of unique values equals the total number of rows.
##### This indicates that it is probably a Patient / Customer ID. We do not need it, as the dataframe index already serves as a unique identifier.
##### Thus, we will drop this column

In [None]:
# Remove the identifier column; it carries no predictive information.
raw_data = raw_data.drop('id', axis=1)

In [None]:
raw_data.shape

## 3.2. Gender

In [None]:
raw_data.gender.isnull().sum()

In [None]:
raw_data.gender.value_counts()

##### Gender needs to be categorized as Categorical Nominal Variable. For this, we would be using Dummy Variable Method.
##### Also, from the analysis perspective, it will be tedious to create another dummy variable just for one row value (of Other). Therefore, we will impute this value with the mode of this column.
#### Therefore, conversion will be as follows:
- 1. Male: 1
- 2. Female: 0
- 3. Others: Mode Value of column

##### Replacing Other value with mode

In [None]:
# Replace the 'Other' entry with the most frequent gender value.
most_common_gender = raw_data['gender'].mode().iloc[0]
raw_data['gender'] = raw_data['gender'].replace('Other', most_common_gender)

##### Verifying if the value was imputed appropriately

In [None]:
raw_data.gender.value_counts()

In [None]:
# Encode gender as a binary flag: Male -> 1, Female -> 0.
gender_codes = {'Male': 1, 'Female': 0}
raw_data['gender'] = raw_data['gender'].map(gender_codes)

## 3.3. Age

In [None]:
raw_data.age.isnull().sum()

In [None]:
raw_data.age.dtypes

In [None]:
raw_data.age.describe()

In [None]:
pd.cut(raw_data['age'], bins=np.arange(0, 100, 10)).value_counts(sort=False)

In [None]:
# Histogram of patient ages.
axis_label_font = {'fontsize': 12}
sns.displot(raw_data['age'])
plt.title('Age Distribution Plot', fontdict={'fontsize': 20})
plt.xlabel('Age', fontdict=axis_label_font)
plt.ylabel('Patient Count', fontdict=axis_label_font)
plt.show()

In [None]:
# Horizontal box plot of age to inspect spread and outliers.
plt.figure(figsize=(15,6))
# Pass the series by keyword: a positional argument is read as `x` by older seaborn
# releases but as `data` by newer ones, which may flip the plot orientation and
# leave the 'Age' label on the wrong axis.
sns.boxplot(x=raw_data.age)
plt.title('Age Distribution Box Plot', fontdict={'fontsize': 20})
plt.xlabel('Age', fontdict={'fontsize': 12})
plt.show()

## 3.4. HyperTension

In [None]:
raw_data.hypertension.isnull().sum()

In [None]:
raw_data.hypertension.value_counts()

##### Nothing needs to be done with this data column

## 3.5 Heart Disease

In [None]:
raw_data.heart_disease.isnull().sum()

In [None]:
raw_data.heart_disease.value_counts()

##### Nothing needs to be done with this data column

## 3.6. Ever Married

In [None]:
raw_data.ever_married.isnull().sum()

In [None]:
raw_data.ever_married.value_counts()

In [None]:
# Encode marital history as a binary flag: Yes -> 1, No -> 0.
married_codes = {'Yes': 1, 'No': 0}
raw_data['ever_married'] = raw_data['ever_married'].map(married_codes)

## 3.7. Work Type

In [None]:
raw_data.work_type.isnull().sum()

In [None]:
raw_data.work_type.value_counts()

##### This column is a categorical nominal variable. Hence we will keep all of its categories and encode them with dummy variables.

In [None]:
# One-hot encode work_type (first category dropped as the baseline) and
# replace the original column with the resulting dummy columns.
dummy_train_df = pd.get_dummies(raw_data['work_type'], drop_first=True)
raw_data = pd.concat([raw_data.drop(columns=['work_type']), dummy_train_df], axis=1)

## 3.8. Residence Type

In [None]:
raw_data.Residence_type.isnull().sum()

In [None]:
raw_data.Residence_type.value_counts()

In [None]:
# Encode residence type as a binary flag: Rural -> 0, Urban -> 1.
residence_codes = {'Rural': 0, 'Urban': 1}
raw_data['Residence_type'] = raw_data['Residence_type'].map(residence_codes)

## 3.9. Average Glucose Level

In [None]:
raw_data.avg_glucose_level.isnull().sum()

In [None]:
raw_data.avg_glucose_level.dtypes

In [None]:
raw_data.avg_glucose_level.describe()

In [None]:
pd.cut(raw_data['avg_glucose_level'], bins=np.arange(50, 300, 25)).value_counts(sort=False)

In [None]:
# Histogram of average glucose levels.
axis_label_font = {'fontsize': 12}
sns.displot(raw_data['avg_glucose_level'])
plt.title('Average Glucose Distribution Plot', fontdict={'fontsize': 20})
plt.xlabel('Average Glucose', fontdict=axis_label_font)
plt.ylabel('Patient Count', fontdict=axis_label_font)
plt.show()

In [None]:
# Horizontal box plot of average glucose level to inspect spread and outliers.
plt.figure(figsize=(15,6))
# Pass the series by keyword: a positional argument is read as `x` by older seaborn
# releases but as `data` by newer ones, which may flip the plot orientation and
# leave the 'Average Glucose' label on the wrong axis.
sns.boxplot(x=raw_data.avg_glucose_level)
plt.title('Average Glucose Box Plot', fontdict={'fontsize': 20})
plt.xlabel('Average Glucose', fontdict={'fontsize': 12})
plt.show()

## 3.10. BMI

In [None]:
raw_data.bmi.dtypes

In [None]:
raw_data.bmi.describe()

In [None]:
raw_data.bmi.isnull().sum()

##### The count of missing values is too high to simply drop the affected rows. Therefore, we will use the mean value to impute these null values.

In [None]:
# Impute missing BMI values with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form operates on an intermediate object (chained
# assignment) and is not guaranteed to update raw_data in recent pandas versions.
raw_data['bmi'] = raw_data['bmi'].fillna(raw_data['bmi'].mean())

In [None]:
raw_data.bmi.isnull().sum()

In [None]:
pd.cut(raw_data['bmi'], bins=np.arange(10, 110, 10)).value_counts(sort=False)

In [None]:
# Histogram of BMI values.
axis_label_font = {'fontsize': 12}
sns.displot(raw_data['bmi'])
plt.title('BMI Distribution Plot', fontdict={'fontsize': 20})
plt.xlabel('BMI', fontdict=axis_label_font)
plt.ylabel('Patient Count', fontdict=axis_label_font)
plt.show()

In [None]:
# Horizontal box plot of BMI to inspect spread and outliers.
plt.figure(figsize=(15,6))
# Pass the series by keyword: a positional argument is read as `x` by older seaborn
# releases but as `data` by newer ones, which may flip the plot orientation and
# leave the 'BMI' label on the wrong axis.
sns.boxplot(x=raw_data.bmi)
plt.title('BMI Box Plot', fontdict={'fontsize': 20})
plt.xlabel('BMI', fontdict={'fontsize': 12})
plt.show()

##### From the box plot, we can see that there are many outliers present in the higher region. Technically, we could drop the top outliers before predicting the output.
##### But from an application perspective, we can have patients with such BMI levels, and if we drop these values, our model won't extrapolate to higher values. Therefore, we will keep these outlier values.

## 3.11. Smoking Status

In [None]:
raw_data.smoking_status.isnull().sum()

In [None]:
raw_data.smoking_status.value_counts()

##### These values don't have any definite order. Hence we will assume them to be a Categorical Nominal Variable

In [None]:
# One-hot encode smoking_status (first category dropped as the baseline) and
# replace the original column with the resulting dummy columns.
dummy_train_df = pd.get_dummies(raw_data['smoking_status'], drop_first=True)
raw_data = pd.concat([raw_data.drop(columns=['smoking_status']), dummy_train_df], axis=1)
raw_data

## 3.12. Stroke  Status

In [None]:
raw_data.stroke.isnull().sum()

In [None]:
raw_data.stroke.value_counts()

# 4. Data Preparation (For Modelling)

In [None]:
raw_data.head()

## 4.1. Defining Input / Output Data

In [None]:
# Feature matrix: every column except the target.
X = raw_data.drop('stroke', axis=1)

In [None]:
# Target vector: the stroke indicator column.
y = raw_data['stroke']

## 4.2. Splitting Train & Test Data

In [None]:
# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=14,
)

## 4.3. Data Scaling

In [None]:
# std_scaler = StandardScaler()
# X_train = pd.DataFrame(std_scaler.fit_transform(X_train), columns = X_train.columns)
# X_test = pd.DataFrame(std_scaler.transform(X_test),columns = X_test.columns)

# 5. Data Modelling

## 5.1. Data Modelling on Trained Data

In [None]:
# Names of the feature columns fed to the model.
col = X_train.columns.tolist()

In [None]:
# Fit a logistic regression with default settings on the training features,
# then obtain class probabilities for the same training rows.
model = LogisticRegression().fit(X_train, y_train)
pred_probs_train = model.predict_proba(X_train[col])

In [None]:
# Collect the actual labels and the predicted stroke probability for the training set.
y_train_pred_final = pd.DataFrame(y_train)
y_train_pred_final['stroke_probability'] = pred_probs_train[:, 1]
# Candidate probability cut-offs: 0.000, 0.001, ..., 0.999.
numbers = np.arange(0.0, 1.0, 0.001)
# Add one 0/1 prediction column per cut-off, labelled by the cut-off value itself.
for threshold in numbers:
    above_threshold = y_train_pred_final['stroke_probability'] > threshold
    y_train_pred_final[threshold] = above_threshold.astype('int64')

In [None]:
# For every candidate cut-off compute accuracy, sensitivity and specificity
# from the confusion matrix of actual vs. thresholded predictions.
cutoff_df = pd.DataFrame(columns=['prob', 'accuracy', 'sensi', 'speci'])
for i in numbers:
    cm1 = metrics.confusion_matrix(y_train_pred_final.stroke, y_train_pred_final[i])
    # Row 0 holds the actual negatives, row 1 the actual positives.
    true_neg, false_pos = cm1[0, 0], cm1[0, 1]
    false_neg, true_pos = cm1[1, 0], cm1[1, 1]
    accuracy = (true_neg + true_pos) / cm1.sum()
    speci = true_neg / (true_neg + false_pos)
    sensi = true_pos / (false_neg + true_pos)
    cutoff_df.loc[i] = [i, accuracy, sensi, speci]
# Show the cut-offs whose sensitivity lies strictly between 0.7 and 0.8.
sensitivity_in_band = (cutoff_df['sensi'] > 0.7) & (cutoff_df['sensi'] < 0.8)
cutoff_df[sensitivity_in_band]

In [None]:
# Plot accuracy, sensitivity and specificity against the probability cut-off.
metric_columns = ['accuracy', 'sensi', 'speci']
cutoff_df.plot(kind='line', x='prob', y=metric_columns)
plt.xlabel('Probability', fontdict={'fontsize': 15})
plt.title('Cut-Off for Logisitic Regression Model', fontdict={'fontsize': 20})
plt.show()

##### From the above graph, probability cut off of 0.064 seems to be respectable enough to behave as threshold value above which patient is likely to have stroke

In [None]:
cut_off = 0.064

In [None]:
# Evaluate the model on the training set at the chosen cut-off.
# Threshold the probabilities directly instead of looking up the pre-computed
# column labelled with the float cut-off: a float column label only matches when
# cut_off is bit-identical to a value on the 0.001 grid, so any other cut-off
# would raise a KeyError. This also mirrors how the test set is scored.
train_predicted = (y_train_pred_final.stroke_probability > cut_off).astype(int)

conf_matrix = metrics.confusion_matrix(y_train_pred_final.stroke, train_predicted)

# sklearn lays the matrix out as [[TN, FP], [FN, TP]].
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]
TP = conf_matrix[1, 1]

# All scores below are stored as percentages rounded to two decimals.
accuracy_score = metrics.accuracy_score(y_train_pred_final.stroke, train_predicted)
accuracy_score = round(accuracy_score*100, 2)

precision_score = metrics.precision_score(y_train_pred_final.stroke, train_predicted)
precision_score = round(precision_score*100, 2)

recall_score = metrics.recall_score(y_train_pred_final.stroke, train_predicted)
recall_score = round(recall_score*100, 2)

sensitivity = TP / float(FN + TP)
sensitivity = round(sensitivity*100, 2)

specificity = TN / float(TN + FP)
specificity = round(specificity*100, 2)

f1_score = metrics.f1_score(y_train_pred_final.stroke, train_predicted)
f1_score = round(f1_score*100, 2)

# AUC is threshold-independent, so it uses the raw probabilities.
auc_score = metrics.roc_auc_score(y_train_pred_final.stroke, y_train_pred_final.stroke_probability)
auc_score = round(auc_score*100, 2)

In [None]:
# ROC curve for the training set, with the chance diagonal for reference.
fpr, tpr, thresholds = metrics.roc_curve(y_train_pred_final.stroke, y_train_pred_final.stroke_probability, drop_intermediate=False)
# auc_score is stored as a percentage; convert it back to the 0-1 scale so the
# legend reports a valid area under the curve.
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (auc_score / 100))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate or [1 - True Negative Rate]', fontdict={'fontsize': 15})
plt.ylabel('True Positive Rate', fontdict={'fontsize': 15})
plt.title('ROC (Receiver Operating Characteristic) Curve - Train Data\nLogisitic Regression Model', fontdict={'fontsize': 20})
plt.legend(loc="lower right")
plt.show()

In [None]:
# Bar chart of the training-set metrics (all values are percentages),
# ordered alphabetically by metric name.
metric_names = ['Accuracy', 'Sensitivity', 'Specificity', 'Precision Score', 'Recall Score', 'F1 Score', 'AUC Score']
data = pd.DataFrame({'Parameter': metric_names,
                     'Value': [accuracy_score, sensitivity, specificity, precision_score, recall_score, f1_score, auc_score]}, index=metric_names)
data = data.groupby(by='Parameter').Value.sum().sort_index()
graph = sns.barplot(x=data.index, y=data.values)
plt.title('Model Metrices (With Train Data)\nLogisitic Regression Model', fontdict={'fontsize': 20})
plt.xlabel('Parameters', fontdict={'fontsize': 15})
plt.ylabel('Score Value (In Percent)', fontdict={'fontsize': 15})
# Vertical offset so each value label sits slightly above its bar.
label_deviation_above_y_axis = data.max() * 0.015
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
for position, (_, score) in enumerate(data.items()):
    graph.text(position, score + label_deviation_above_y_axis, str(round(score, 1))+'%', color='black', ha="center")
plt.show()

## 5.2. Data Modelling on Test Data

In [None]:
pred_probs_test = model.predict_proba(X_test[col])

In [None]:
# Collect actual labels, predicted probabilities and the thresholded 0/1
# prediction for the test set.
y_test_pred_final = pd.DataFrame(y_test)
y_test_pred_final['stroke_probability'] = pred_probs_test[:, 1]
exceeds_cut_off = y_test_pred_final['stroke_probability'] > cut_off
y_test_pred_final['stroke_predicted'] = exceeds_cut_off.astype('int64')

In [None]:
# Evaluate the model on the test set at the chosen cut-off.
conf_matrix_test = metrics.confusion_matrix(y_test_pred_final.stroke, y_test_pred_final.stroke_predicted)

# Read the cells from the TEST confusion matrix (the training matrix was used
# here before, which made test sensitivity/specificity repeat the train values).
# sklearn lays the matrix out as [[TN, FP], [FN, TP]].
TN_test = conf_matrix_test[0, 0]
FP_test = conf_matrix_test[0, 1]
FN_test = conf_matrix_test[1, 0]
TP_test = conf_matrix_test[1, 1]

# All scores below are stored as percentages rounded to two decimals.
accuracy_score_test = metrics.accuracy_score(y_test_pred_final.stroke, y_test_pred_final.stroke_predicted)
accuracy_score_test = round(accuracy_score_test*100, 2)

precision_score_test = metrics.precision_score(y_test_pred_final.stroke, y_test_pred_final.stroke_predicted)
precision_score_test = round(precision_score_test*100, 2)

recall_score_test = metrics.recall_score(y_test_pred_final.stroke, y_test_pred_final.stroke_predicted)
recall_score_test = round(recall_score_test*100, 2)

sensitivity_test = TP_test / float(FN_test + TP_test)
sensitivity_test = round(sensitivity_test*100, 2)

specificity_test = TN_test / float(TN_test + FP_test)
specificity_test = round(specificity_test*100, 2)

f1_score_test = metrics.f1_score(y_test_pred_final.stroke, y_test_pred_final.stroke_predicted)
f1_score_test = round(f1_score_test*100, 2)

# AUC is threshold-independent, so it uses the raw probabilities.
auc_score_test = metrics.roc_auc_score(y_test_pred_final.stroke, y_test_pred_final.stroke_probability)
auc_score_test = round(auc_score_test*100, 2)

In [None]:
# ROC curve for the test set, with the chance diagonal for reference.
fpr, tpr, thresholds = metrics.roc_curve(y_test_pred_final.stroke, y_test_pred_final.stroke_probability, drop_intermediate=False)
# auc_score_test is stored as a percentage; convert it back to the 0-1 scale so
# the legend reports a valid area under the curve.
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (auc_score_test / 100))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate or [1 - True Negative Rate]', fontdict={'fontsize': 15})
plt.ylabel('True Positive Rate', fontdict={'fontsize': 15})
plt.title('ROC (Receiver Operating Characteristic) Curve - Test Data\nLogisitic Regression Model', fontdict={'fontsize': 20})
plt.legend(loc="lower right")
plt.show()

In [None]:
# Bar chart of the test-set metrics (all values are percentages),
# ordered alphabetically by metric name.
metric_names = ['Accuracy', 'Sensitivity', 'Specificity', 'Precision Score', 'Recall Score', 'F1 Score', 'AUC Score']
data = pd.DataFrame({'Parameter': metric_names,
                     'Value': [accuracy_score_test, sensitivity_test, specificity_test, precision_score_test, recall_score_test, f1_score_test, auc_score_test]}, index=metric_names)
data = data.groupby(by='Parameter').Value.sum().sort_index()
graph = sns.barplot(x=data.index, y=data.values)
plt.title('Model Metrices (With Test Data)\nLogisitic Regression Model', fontdict={'fontsize': 20})
plt.xlabel('Parameters', fontdict={'fontsize': 15})
plt.ylabel('Score Value (In Percent)', fontdict={'fontsize': 15})
# Vertical offset so each value label sits slightly above its bar.
label_deviation_above_y_axis = data.max() * 0.015
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
for position, (_, score) in enumerate(data.items()):
    graph.text(position, score + label_deviation_above_y_axis, str(round(score, 1))+'%', color='black', ha="center")
plt.show()

In [None]:
# Grouped bar chart comparing every metric on train vs. test data (values in percent).
metric_labels = ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1 Score', 'AUC Score']
train_scores = [accuracy_score, sensitivity, specificity, precision_score, recall_score, f1_score, auc_score]
test_scores = [accuracy_score_test, sensitivity_test, specificity_test, precision_score_test, recall_score_test, f1_score_test, auc_score_test]
train_test_score_df = pd.DataFrame({'train': train_scores, 'test': test_scores}, index=metric_labels)

fig, ax = plt.subplots()
x = np.arange(len(train_test_score_df.index))
width = 0.35

ax.set_ylabel('Score (In %)', fontdict={'fontsize': 15})
ax.set_xlabel('Parameters', fontdict={'fontsize': 15})
ax.set_title('Logisitic Regression Model', fontdict={'fontsize': 20})
ax.set_xticks(x)
ax.set_xticklabels(train_test_score_df.index)

# Draw the train bars to the left and the test bars to the right of each tick,
# writing each bar's height just above it.
bar_groups = ((-width / 2, 'train', 'Train Score'), (width / 2, 'test', 'Test Score'))
for shift, column, legend_label in bar_groups:
    for rect in ax.bar(x + shift, train_test_score_df[column], width, label=legend_label):
        height = rect.get_height()
        ax.annotate('{}'.format(height), xy=(rect.get_x() + rect.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

ax.legend()
plt.show()

## 5.3. Model Parameters

In [None]:
model.intercept_

In [None]:
model.coef_

In [None]:
X_train.columns