In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, roc_curve, roc_auc_score

from pandas_profiling import ProfileReport

from scipy.stats import norm

In [None]:
# Load the stroke-prediction CSV into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Structural overview of the raw frame: size, column names, dtypes,
# summary statistics and per-column missing-value counts.
print("Shape")
print(df.shape)
print("-" * 100)
print("Columns")
columns = df.columns
print(columns)
print("-" * 100)
print("Data information: ")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()
print("-" * 100)
print("Data description: ")
print(df.describe())
print("-" * 100)
print("Null values count: ")
print(df.isnull().sum())
print("-" * 100)

In [None]:
# Print the distinct values of each categorical / binary column, one per line.
for label, column in [
    ("Gender: ", "gender"),
    ("Hypertension: ", "hypertension"),
    ("Heart Disease: ", "heart_disease"),
    ("Ever Married: ", "ever_married"),
    ("Work Type: ", "work_type"),
    ("Residence type: ", "Residence_type"),
    ("Smoking status: ", "smoking_status"),
    ("Stroke: ", "stroke"),
]:
    print(label, df[column].unique())

# Data visualization
## Count plots

### 1. Count of gender

In [None]:
# Switch seaborn to the dark-grid theme, then tabulate and plot the gender counts.
sns.set(style="darkgrid")
print(df["gender"].value_counts())
sns.countplot(data=df, x="gender")

### 2. Count of hypertension

In [None]:
# Tabulate and plot the number of records per hypertension flag.
print(df["hypertension"].value_counts())
sns.countplot(data=df, x="hypertension")

### 3. Heart Disease

In [None]:
# Tabulate and plot the number of records per heart-disease flag.
print(df["heart_disease"].value_counts())
sns.countplot(data=df, x="heart_disease")

### 4. Married status

In [None]:
# Tabulate and plot the number of records per marital-status value.
print(df["ever_married"].value_counts())
sns.countplot(data=df, x="ever_married")

### 5. Work type

In [None]:
# Tabulate and plot the number of records per work type.
print(df["work_type"].value_counts())
sns.countplot(data=df, x="work_type")

### 6. Residence type

In [None]:
# Tabulate and plot the number of records per residence type.
print(df["Residence_type"].value_counts())
sns.countplot(data=df, x="Residence_type")

### 7. Smoking status

In [None]:
# Tabulate and plot the number of records per smoking status.
print(df["smoking_status"].value_counts())
sns.countplot(data=df, x="smoking_status")

### 8. Stroke

In [None]:
# Tabulate and plot the class balance of the target column.
print(df["stroke"].value_counts())
sns.countplot(data=df, x="stroke")

## Relation between categories and stroke

### Hypertension vs stroke

In [None]:
# Record counts per hypertension flag, split by stroke outcome.
sns.countplot(data=df, x="hypertension", hue="stroke")

### Gender vs Stroke

In [None]:
# Record counts per gender, split by stroke outcome.
sns.countplot(data=df, x="gender", hue="stroke")

### Heart disease vs stroke

In [None]:
# Record counts per heart-disease flag, split by stroke outcome.
sns.countplot(data=df, x="heart_disease", hue="stroke")

### Married type vs Stroke

In [None]:
# Record counts per marital-status value, split by stroke outcome.
sns.countplot(data=df, x="ever_married", hue="stroke")

### Work type vs stroke

In [None]:
# Record counts per work type, split by stroke outcome.
sns.countplot(data=df, x="work_type", hue="stroke")

### Residence type vs Stroke

In [None]:
# Record counts per residence type, split by stroke outcome.
sns.countplot(data=df, x="Residence_type", hue="stroke")

### Smoking status vs Stroke

In [None]:
# Record counts per smoking status, split by stroke outcome.
sns.countplot(data=df, x="smoking_status", hue="stroke")

## Distplot

In [None]:
# Distribution of age with a fitted normal curve overlaid.
sns.distplot(a=df["age"], fit=norm)

In [None]:
# Distribution of BMI with a fitted normal curve overlaid.
sns.distplot(a=df["bmi"], fit=norm)

In [None]:
# Distribution of average glucose level with a fitted normal curve overlaid.
sns.distplot(a=df["avg_glucose_level"], fit=norm)

## Outlier analysis

In [None]:
# Box plot of age to reveal potential outliers.
sns.boxplot(data=df, x="age")

In [None]:
# Box plot of BMI to reveal potential outliers.
sns.boxplot(data=df, x="bmi")

In [None]:
# Box plot of average glucose level to reveal potential outliers.
sns.boxplot(data=df, x="avg_glucose_level")

## Correlation

In [None]:
# Pearson correlation between the numeric columns, drawn as an annotated heatmap.
# Only numeric columns are handed to corr(): the frame still contains string
# categoricals (gender, work_type, ...) at this point, and pandas >= 2.0 raises
# on non-numeric columns instead of silently dropping them.
data = df.select_dtypes(include="number").corr(method='pearson')
fig = plt.figure(figsize=(15,8))
sns.heatmap(data,annot=True,cbar=True,linewidths=1)

# Data preprocessing

### 1. Handling missing values

#### We have recorded some missing values in BMI category.

In [None]:
# Number of missing BMI values prior to imputation.
print("Missing values in bmi before: ", df["bmi"].isna().sum())

In [None]:
# Impute missing BMI values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df["bmi"] selection: that chained in-place call
# operates on a temporary object and leaves `df` unchanged once pandas
# Copy-on-Write is active (it already warns on pandas 2.x).
df["bmi"] = df["bmi"].fillna(value=df["bmi"].mean())

In [None]:
# Number of missing BMI values once imputation has run.
print("Missing values in bmi after: ", df["bmi"].isna().sum())

### 2. Outlier Removal
There are outliers present in the bmi category. To remove them we can use either the z-score or the IQR (interquartile range).
I have used the IQR for the removal of the outliers.

1. IQR = Q3-Q1
2. Upper bound = Q3+(1.5*IQR)
3. Lower bound = Q1-(1.5*IQR)

In [None]:
# Quartiles of BMI and the 1.5 * IQR fences used to flag outliers.
bmi_quartiles = np.percentile(df["bmi"], [25, 75])
q1, q3 = bmi_quartiles
print(q1, q3)
iqr = q3 - q1
whisker = 1.5 * iqr
lower_bound = q1 - whisker
upper_bound = q3 + whisker
print("upper bound: {}, lower bound: {}".format(upper_bound, lower_bound))

In [None]:
# Remove, in place, every row whose BMI lies above the upper fence or below
# the lower fence.
bmi_outlier_mask = (df['bmi'] > upper_bound) | (df['bmi'] < lower_bound)
df.drop(index=df.index[bmi_outlier_mask], inplace=True)

In [None]:
# Side-by-side box plot and distribution of BMI once the outliers are gone.
print("After outlier removal")
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
box_ax, dist_ax = axes
sns.boxplot(data=df, x="bmi", ax=box_ax)
sns.distplot(a=df["bmi"], fit=norm, ax=dist_ax)

### 3. Handling categorical values
#### 1. One hot encoding

In [None]:
# The gender column contains a single "Other" record; remove it in place.
other_gender_rows = df.index[df["gender"] == "Other"]
df.drop(index=other_gender_rows, inplace=True)

In [None]:
# One-hot encode gender; drop_first keeps a single indicator column, which is
# appended to the frame.
sex = pd.get_dummies(data=df["gender"], drop_first=True)
df = pd.concat(objs=[df, sex], axis="columns")

In [None]:
# One-hot encode ever_married; drop_first keeps a single indicator column,
# which is appended to the frame.
married_status = pd.get_dummies(data=df["ever_married"], drop_first=True)
df = pd.concat(objs=[df, married_status], axis="columns")

In [None]:
# One-hot encode Residence_type; drop_first keeps a single indicator column,
# which is appended to the frame.
residence = pd.get_dummies(data=df["Residence_type"], drop_first=True)
df = pd.concat(objs=[df, residence], axis="columns")

#### 2. Label (ordinal) encoding

In [None]:
# Map each work-type category to a fixed integer code in a new column.
df["Work_Type"] = df["work_type"].map(
    {
        'Private': 0,
        'Self-employed': 1,
        'Govt_job': 2,
        'children': 3,
        'Never_worked': 4,
    }
)
# Map each smoking-status category to a fixed integer code in a new column.
df["Smoking_Status"] = df["smoking_status"].map(
    {
        'formerly smoked': 0,
        'never smoked': 1,
        'smokes': 2,
        'Unknown': 3,
    }
)

In [None]:
# Inspect the frame with the newly added encoded columns.
df.head(n=5)

##### Now that we have converted all the categorical values to numerical ones, we should drop the original columns.

In [None]:
# Drop the identifier and the original string columns that now have encoded
# counterparts, then preview the result.
df.drop(
    columns=[
        "id",
        "gender",
        "ever_married",
        "work_type",
        "Residence_type",
        "smoking_status",
    ],
    inplace=True,
)
df.head(n=5)

#### 3. Renaming the columns

In [None]:
# Give the indicator columns produced by get_dummies descriptive names.
df.rename(
    columns={
        "Male": "Gender",
        "Yes": "Ever_Married",
        "Urban": "Residence_type",
    },
    inplace=True,
)
df.head(n=5)

### 4. Balancing the dataset

We can see in the data visualization that our target category ("stroke") is highly imbalanced. So, before moving any further, we should first balance the dataset. This can be done by two methods.

        --> 1. Undersampling
        --> 2. Oversampling

We will be using oversampling, i.e. bringing the minority class up to the size of the majority class. For this purpose I will be using a very well-known technique called Synthetic Minority Oversampling Technique, or SMOTE.

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE, then plot and print the resulting class counts of the
# target.
# NOTE(review): resampling runs on the full dataset, before the train/test
# split performed later in the notebook, so synthetic samples may be derived
# from rows that end up in the test set — confirm whether that is intended.
smote = SMOTE(random_state=42)
y = df['stroke']
X = df.drop(columns=['stroke'])
X, y = smote.fit_resample(X, y)
y = pd.DataFrame({'stroke': y})
sns.countplot(data=y, x='stroke', y=None)
plt.show()
print(y.value_counts())

##### It is clearly visible that our data is completely balanced. Now let's join back the updated dataset

In [None]:
# Stitch the resampled features and target back into a single frame.
df = pd.concat(objs=[X, y], axis="columns")
df.head(n=5)

### 5. Standardizing the data.

In [None]:
# Separate the target column from the feature columns.
y = df["stroke"]
X = df.drop(columns=["stroke"])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature; X is replaced by the transformed output.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook — confirm whether that is intended.
ss = StandardScaler()
ss.fit(X)
X = ss.transform(X)

## Model Building
#### Classification Models

#### Train Test Splitting

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Instantiate every classifier first ...
lr = LogisticRegression(solver="liblinear")
gnb = GaussianNB()
knnc = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42, verbose=False)
xgbc = XGBClassifier()
catbc = CatBoostClassifier(verbose=False)

# ... then fit each one on the training split, in the same order.
for estimator in (lr, gnb, knnc, dtc, rfc, xgbc, catbc):
    estimator.fit(x_train, y_train)

In [None]:
# The fitted estimator objects themselves (not their names), in reporting order.
model_names = [
    lr,
    gnb,
    knnc,
    dtc,
    rfc,
    xgbc,
    catbc,
]

In [None]:
# Print hold-out and cross-validated metrics for every fitted classifier.
for model in model_names:
    name = model.__class__.__name__
    # Hold-out predictions from the already-fitted model.
    predict = model.predict(x_test)
    # cross_val_score clones and refits the estimator on every fold, so it is
    # run on the training split; the test split stays untouched by any
    # refitting and is used only for the hold-out metrics below.
    CV = cross_val_score(model, x_train, y_train, cv=10, verbose=False).mean()
    error = -cross_val_score(
        model, x_train, y_train, cv=10, scoring="neg_mean_squared_error", verbose=False
    ).mean()
    print(name + ": ")
    print("-" * 50)
    print("Accuracy Score: ",accuracy_score(y_test,predict))
    print("Cross Validation Score: ",CV)
    # Square root of the cross-validated mean squared error.
    print("Error: ",np.sqrt(error))
    print("R-square value: ",r2_score(y_test,predict))
    print("Confusion matrix: ")
    print(confusion_matrix(y_test,predict))
    print("-" * 100)

### Model Comparison

In [None]:
# Hold-out accuracy (in percent) of every model, shown as a horizontal bar chart.
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row from an empty one
# also leaves the numeric column with object dtype.
accuracy_rows = []
for model in model_names:
    name = model.__class__.__name__
    predict = model.predict(x_test)
    accuracy = accuracy_score(y_test,predict)
    accuracy_rows.append({"MODELS": name, "Accuracy": accuracy*100})
# NOTE(review): this rebinds `df`, which held the dataset up to this point.
df = pd.DataFrame(accuracy_rows, columns=["MODELS","Accuracy"])

figure = plt.figure(figsize=(20,8))
sns.barplot(x="Accuracy",y="MODELS",data=df,color="k")
plt.xlabel("ACCURACY")
plt.ylabel("MODELS")
plt.xlim(0,100)
plt.title("MODEL ACCURACY COMPARISON")
plt.show()

### Model Cross Validation Comparison

In [None]:
# Mean 10-fold cross-validation score (in percent) of every model, shown as a
# horizontal bar chart.
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0.
cv_rows = []
for model in model_names:
    name = model.__class__.__name__
    # cross_val_score refits a clone of the estimator on each fold, so it is
    # run on the training split rather than on the held-out test split.
    CV = cross_val_score(model, x_train, y_train, cv=10, verbose=False).mean()
    cv_rows.append({"MODELS": name, "CV": CV*100})
df = pd.DataFrame(cv_rows, columns=["MODELS","CV"])

figure = plt.figure(figsize=(20,8))
sns.barplot(x="CV",y="MODELS",data=df,color="k")
plt.xlabel("CV")
plt.ylabel("MODELS")
plt.xlim(0,100)
plt.title("MODEL CROSS VALIDATION COMPARISON")
plt.show()

# ROC_ Curve and ROC_AUC_SCORE
### Prediction Probabilities

In [None]:
# Baseline scores: a constant 0 for every test sample, and the AUC they yield.
r_prob = [0] * len(y_test)
r_auc = roc_auc_score(y_true=y_test, y_score=r_prob)


### AUROC Score

In [None]:
# Print the AUROC of every model, computed from the second column of
# predict_proba on the test split.
for model in model_names:
    predict = model.predict_proba(x_test)[:, 1]
    name = type(model).__name__
    auroc_score = roc_auc_score(y_true=y_test, y_score=predict)
    print(name + " score: ", auroc_score)
    print("-" * 50)

### ROC Curve

In [None]:
# ROC points of the constant baseline.
r_fpr, r_tpr, _ = roc_curve(y_true=y_test, y_score=r_prob)

# ROC points of every model, keyed by the model's class name as [fpr, tpr].
model_dict = {}
for model in model_names:
    predict = model.predict_proba(x_test)[:, 1]
    name = type(model).__name__
    fpr, tpr, _ = roc_curve(y_true=y_test, y_score=predict)
    model_dict[name] = [fpr, tpr]

In [None]:
plt.figure(figsize=(15,8))
# Dashed reference line: ROC of the constant baseline prediction.
plt.plot(r_fpr,r_tpr,linestyle="--")
# One dotted curve per entry of model_dict, labelled with its key. Iterating
# the dict (insertion-ordered) replaces seven copy-pasted plot calls with
# hard-coded names, so the figure follows whatever models were evaluated.
for model_name, (model_fpr, model_tpr) in model_dict.items():
    plt.plot(model_fpr, model_tpr, linestyle='dotted', label=model_name)

plt.title("ROC plot")
plt.xlabel("False positive rate.")
plt.ylabel("True positive rate.")
plt.legend()
plt.show()