### Importing libraries and dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all warnings so library deprecation messages do not clutter the
# notebook output.
warnings.filterwarnings(action="ignore")
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so use whichever one is installed.
bright_style = (
    "seaborn-v0_8-bright"
    if "seaborn-v0_8-bright" in plt.style.available
    else "seaborn-bright"
)
plt.style.use([bright_style, "dark_background"])

In [None]:
# Read the stroke CSV into a DataFrame and preview its first rows.
csv_path = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
data.info()

### Checking the percentage of missing values in each column.

In [None]:
# Report, for every column, the percentage of rows whose value is missing.
n_rows = len(data)
for column in data.columns:
    n_missing = data[column].isnull().sum()
    share = (n_missing / n_rows) * 100
    print("Missing data in column {} = {}%".format(column, share))

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the frame.
data.describe()

In [None]:
from sklearn.impute import SimpleImputer

# Fill the missing "bmi" entries with the column mean.
# NOTE(review): the mean is computed on the full data set, i.e. before the
# train/test split made later in the notebook.
val = ["bmi"]
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
bmi_filled = imputer.fit_transform(data[val])
data[val] = bmi_filled

In [None]:
# Partition the column names by dtype: object columns are treated as
# categorical, everything else as numerical.
cat_data, num_data = [], []
for col in data.columns:
    if data[col].dtype == "object":
        cat_data.append(col)
    else:
        num_data.append(col)

In [None]:
# List the distinct values of every categorical column.
for column in cat_data:
    levels = data[column].unique()
    print(column, " = ", levels)

#### Visualizing categorical values using countplot.

In [None]:
# Draw one count plot per categorical column.
# The column is passed as ``x=`` because seaborn >= 0.12 interprets the first
# positional argument as ``data`` and no longer accepts the plotted vector
# positionally.
for i in cat_data:
    plt.figure(figsize=(8, 5))
    sns.countplot(x=data[i])
    plt.title(i, fontsize=15, color="lime")
    plt.show()

In [None]:
# Number of rows for each gender value.
data['gender'].value_counts()

In [None]:
# (rows, columns) of the frame.
data.shape

#### Removing the row whose gender value is "Other", as it occurs only once.

In [None]:
# Keep only the rows whose gender is not "Other". The explicit copy makes
# ``data`` an independent frame, so the column assignments performed later in
# the notebook do not act on a slice of the original frame.
data = data[data.gender != "Other"].copy()

#### Plotting histograms for numerical values.

In [None]:
# Histogram with a KDE overlay for every column listed in ``num_data``.
for column in num_data:
    plt.figure(figsize=(8, 5))
    sns.histplot(data[column], kde=True)
    plt.title(column, color="lime", fontsize=15)
    plt.show()

#### Plotting boxplots to check for outliers.

In [None]:
# From here on ``num_data`` holds only the continuous columns.
num_data = ["age", "avg_glucose_level", "bmi"]
# Box plot of each continuous column per work type, split by gender.
# ``x`` and ``y`` are passed by keyword: seaborn >= 0.12 accepts only ``data``
# positionally, so the old positional call raises a TypeError there.
for i in num_data:
    plt.figure(figsize=(8, 5))
    sns.boxplot(x=data["work_type"], y=data[i], hue=data["gender"])
    plt.title(i, fontsize=15, color="lime")
    plt.show()

#### Handling outliers by capping them at the IQR fences.

In [None]:
def find_outliers(feature, whisker=1.5):
    """Return the IQR-based outlier fences for one column of ``data``.

    Parameters
    ----------
    feature : str
        Name of a column in the module-level ``data`` frame.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added above the third quartile
        and subtracted below the first quartile.

    Returns
    -------
    tuple
        ``(upper, lower)`` -- note that the upper fence comes first.
    """
    q1 = data[feature].quantile(0.25)
    q3 = data[feature].quantile(0.75)
    iqr = q3 - q1
    return q3 + whisker * iqr, q1 - whisker * iqr

In [None]:
def func(feature):
    """Cap the values of ``data[feature]`` at its outlier fences, in place.

    Values above the upper fence are replaced by the upper fence and values
    below the lower fence by the lower fence; the fences come from
    ``find_outliers``. The module-level ``data`` frame is modified.
    """
    hi, lo = find_outliers(feature)
    # Cap from above first, then from below, on the already capped values.
    capped = np.where(data[feature] > hi, hi, data[feature])
    data[feature] = np.where(capped < lo, lo, capped)

In [None]:
# Cap the outliers of every column listed in ``num_data``.
for column in num_data:
    func(column)

In [None]:
# Show the list of categorical column names collected earlier.
cat_data

#### Creating dummies.

In [None]:
# One-hot encode the categorical columns. The first group drops the first
# level of each column; the second group keeps an indicator for every level.
drop_first_cols = ["gender", "ever_married", "Residence_type"]
keep_all_cols = ["work_type", "smoking_status"]
data = pd.get_dummies(data, columns=drop_first_cols, drop_first=True)
data = pd.get_dummies(data, columns=keep_all_cols, drop_first=False)

In [None]:
# The "id" column is not used as a feature, so drop it.
data = data.drop("id", axis=1)

#### Plotting the correlation heatmap.

In [None]:
# Pairwise correlations of all columns, drawn as an annotated heat map.
corr_matrix = data.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, cmap="rainbow")
plt.title("Correleation Heatmap", color="c", fontsize=20)
plt.show()

In [None]:
# Target vector and feature matrix (every column except the target).
Y = data["stroke"]
X = data.drop("stroke", axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
x1_train, x1_test, y1_train, y1_test = train_test_split(
    X, Y, random_state=100, test_size=0.2
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees. The forest is seeded, like the train/test
# split and SMOTE elsewhere in the notebook, so repeated runs give the same
# scores.
RFC = RandomForestClassifier(n_estimators=100, random_state=100)

#### Train score.

In [None]:
# Fit on the (imbalanced) training split and show the accuracy on that same
# split; ``fit`` returns the estimator, so the calls can be chained.
RFC.fit(x1_train, y1_train).score(x1_train, y1_train)

In [None]:
# Wrap the training target in a one-column frame and show how many rows fall
# into each class.
y1_train = pd.DataFrame(y1_train, columns=["stroke"])
y1_train["stroke"].value_counts()

#### Test score.

In [None]:
# Mean accuracy of the fitted forest on the held-out split.
RFC.score(x1_test,y1_test)

In [None]:
# Per-class probabilities for the test rows; column 1 is used by the curve
# plots further down.
prob1 = RFC.predict_proba(x1_test)

#### Confusion matrix.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrices of the forest on the training and the test split.
train_cm = confusion_matrix(y1_train, RFC.predict(x1_train))
test_cm = confusion_matrix(y1_test, RFC.predict(x1_test))
print("For train data \n", train_cm)
print("For test data \n", test_cm)

#### Classification Report.

In [None]:
# Per-class precision, recall and f1-score on the training and test split.
train_report = classification_report(y1_train, RFC.predict(x1_train))
test_report = classification_report(y1_test, RFC.predict(x1_test))
print("For train data \n", train_report)
print("For test data \n", test_report)

#### The precision, recall and f1-score for class 1 are very poor because our dataset is imbalanced. So now we will balance the dataset using the imblearn library.

In [None]:
from imblearn.over_sampling import SMOTE

# Same 80/20 split as before (identical seed), under new names.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, random_state=100, test_size=0.2
)
# Oversample the training split with SMOTE, then show the resulting shapes.
sm = SMOTE(random_state=27)
x_train, y_train = sm.fit_resample(x_train, y_train)
x_train.shape, y_train.shape


In [None]:
# Wrap the resampled training target in a one-column frame named "stroke".
y_train = pd.DataFrame(y_train, columns = ['stroke'])

In [None]:
# Class counts of the resampled training target.
y_train.stroke.value_counts()

In [None]:
# The test split is deliberately NOT resampled. Oversampling it with SMOTE
# would score the model on synthetic rows and on a class balance that does
# not occur in the real data, which makes the reported metrics misleading.
# Only the training split should be balanced.
sm = SMOTE(random_state=27)  # kept so the name stays defined for later use
x_test.shape, y_test.shape

In [None]:
# Wrap the test target in a one-column frame and show its class counts.
y_test = pd.DataFrame(y_test, columns=['stroke'])
y_test['stroke'].value_counts()

In [None]:
# ``y_train`` may be a one-column DataFrame at this point; flatten it so the
# estimator receives the 1-D target it expects instead of a column vector.
# This refits the same ``RFC`` object, replacing the previously fitted model.
RFC.fit(x_train, np.ravel(y_train))

In [None]:
# Mean accuracy on the SMOTE-resampled training data.
RFC.score(x_train,y_train)

In [None]:
# Mean accuracy of the refitted forest on the test split.
RFC.score(x_test,y_test)

In [None]:
# Per-class probabilities from the refitted forest; column 1 is used by the
# curve plots further down.
prob = RFC.predict_proba(x_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrices of the refitted forest on the training and test data.
train_cm = confusion_matrix(y_train, RFC.predict(x_train))
test_cm = confusion_matrix(y_test, RFC.predict(x_test))
print("For train data \n", train_cm)
print("For test data \n", test_cm)

In [None]:
# Per-class precision, recall and f1-score of the refitted forest.
train_report = classification_report(y_train, RFC.predict(x_train))
test_report = classification_report(y_test, RFC.predict(x_test))
print("For train data \n", train_report)
print("For test data \n", test_report)

### Precision-Recall curve for imbalanced dataset.

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision and recall of the first (imbalanced) model at every threshold.
precision_points, recall_points, threshold_points = precision_recall_curve(y1_test, prob1[:, 1])
plt.figure(dpi=100, figsize=(6, 6))
# precision/recall carry one more entry than the thresholds, hence [:-1].
plt.plot(threshold_points, precision_points[:-1], color='r', label='Precision')
plt.plot(threshold_points, recall_points[:-1], color='b', label='Recall')
plt.xlabel('Threshold')
# The y-axis shows precision/recall values, not frequencies.
plt.ylabel('Score')
plt.title('Precision-Recall Curve for X1_Test')
plt.legend()
plt.show()

### Precision-Recall curve for balanced dataset.

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision and recall of the refitted (balanced) model at every threshold.
precision_points, recall_points, threshold_points = precision_recall_curve(y_test, prob[:, 1])
plt.figure(dpi=100, figsize=(6, 6))
# precision/recall carry one more entry than the thresholds, hence [:-1].
plt.plot(threshold_points, precision_points[:-1], color='r', label='Precision')
plt.plot(threshold_points, recall_points[:-1], color='b', label='Recall')
plt.xlabel('Threshold')
# The y-axis shows precision/recall values, not frequencies.
plt.ylabel('Score')
plt.title('Precision-Recall Curve for X_Test')
plt.legend()
plt.show()

### AUC-ROC curve for imbalanced dataset.

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score

# ROC curve of the first (imbalanced) model's class-1 probabilities.
fpr, tpr, threshold = roc_curve(y1_test, prob1[:, 1])
# Compute the area under the curve and show it in the legend.
# np.ravel accepts the target whether it is a Series or a one-column frame.
auc_value = roc_auc_score(np.ravel(y1_test), prob1[:, 1])
plt.figure(dpi=100, figsize=(8, 6))
plt.plot(fpr, tpr, color='r', label=f'FPR-TPR (AUC = {auc_value:.3f})')
plt.plot([0, 1], [0, 1], color='g', label='Baseline')
plt.title('AUC-ROC Curve for X1_Test')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
plt.show()

### AUC-ROC curve for balanced dataset.

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score

# ROC curve of the refitted (balanced) model's class-1 probabilities.
fpr, tpr, threshold = roc_curve(y_test, prob[:, 1])
# Compute the area under the curve and show it in the legend.
# np.ravel accepts the target whether it is a Series or a one-column frame.
auc_value = roc_auc_score(np.ravel(y_test), prob[:, 1])
plt.figure(dpi=100, figsize=(8, 6))
plt.plot(fpr, tpr, color='r', label=f'FPR-TPR (AUC = {auc_value:.3f})')
plt.plot([0, 1], [0, 1], color='g', label='Baseline')
plt.title('AUC-ROC Curve for X_Test')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
plt.show()