In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Attribute Information
1) id: unique identifier

2) gender: "Male", "Female" or "Other"

3) age: age of the patient

4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

6) ever_married: "No" or "Yes"

7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"

8) Residence_type: "Rural" or "Urban"

9) avg_glucose_level: average glucose level in blood

10) bmi: body mass index

11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*

12) stroke: 1 if the patient had a stroke or 0 if not

*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
#sns.set_style ('dark')

#sns.set_palette('RdYlGn')
# Custom nine-colour palette applied to every seaborn plot in this notebook.
colors = [
    '#B90276',
    '#50237F',
    '#005691',
    '#008ECF',
    '#E20015',
    '#00A8B0',
    '#78BE20',
    '#006249',
    '#525F6B',
]

custom_palette = sns.color_palette(colors)
sns.set_palette(custom_palette)

# Reading the data

In [None]:
# Read the stroke dataset from the Kaggle input mount and preview the first rows.
csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path, sep=',')
df = pd.DataFrame(data)
df.head()

In [None]:
# Print column dtypes and non-null counts; used here to spot columns with missing values.
df.info()

# Filling missing values with ffill method

In [None]:
# Forward-fill missing BMI values: each gap takes the last valid BMI above it.
# `Series.ffill()` replaces the deprecated `fillna(method='ffill')` spelling and
# produces the same result.
# NOTE(review): forward fill depends on row order; a median/group-wise imputation
# may be more defensible if the rows are not meaningfully ordered.
df['bmi'] = df['bmi'].ffill()

In [None]:
# Shape of the bmi column after filling (number of rows).
df['bmi'].shape

In [None]:
# Display the column labels of the working frame.
df.columns

In [None]:
# Cast age to integer; fractional ages are truncated towards zero by astype(int).
df['age'] = df['age'].astype(int)
#df.avg_glucose_level = df.avg_glucose_level.astype(int)

In [None]:
# Labels of the string-typed (object dtype) columns, i.e. the categorical features.
object1 = df.select_dtypes(include='object').columns

In [None]:
# Labels of the float64 columns.
othercol = df.select_dtypes(include='float64').columns

In [None]:
# For each categorical column, print its name followed by its distinct values.
for column_name in object1:
    distinct_values = df[column_name].unique()
    print(column_name, '\n', distinct_values, '\n')

# What does our target data contain?
It is highly imbalanced: far more patients without a stroke than with one.

In [None]:
# Bar chart of how many rows fall into each target class (stroke = 0 / 1).
plt.title('Count of Stroke')
sns.countplot(data=df, x='stroke')

# Visualization

In [None]:
# 2x2 overview figure. Each entry is (panel title, seaborn plotter, column mapping);
# the panels are drawn in order into subplot slots 1..4.
overview_panels = [
    ('Age & Gender Count', sns.histplot, {'x': 'age', 'hue': 'gender'}),
    ('Affect of Age BMI on Stroke', sns.scatterplot, {'x': 'age', 'y': 'bmi', 'hue': 'stroke'}),
    ('Which Gender has more chances of stroke', sns.countplot, {'x': 'gender', 'hue': 'stroke'}),
    ('Count of Gender', sns.histplot, {'x': 'gender'}),
]

plt.figure(figsize=(15, 10))
for slot, (heading, draw, mapping) in enumerate(overview_panels, start=1):
    plt.subplot(2, 2, slot)
    plt.title(heading)
    draw(data=df, **mapping)
plt.show()

In [None]:
# 3x3 figure relating gender, residence, lifestyle and health columns.
# Each entry is (panel title, seaborn plotter, column mapping); panels fill
# subplot slots 1..9 in the order listed.
category_panels = [
    ('Gender & Avg Glucose Level & Stroke', sns.boxplot,
     {'x': 'gender', 'y': 'avg_glucose_level', 'hue': 'stroke'}),
    ('How many of them smoke', sns.countplot,
     {'x': 'gender', 'hue': 'smoking_status'}),
    ('Which Residence_type has more chances of stroke', sns.countplot,
     {'x': 'Residence_type', 'hue': 'stroke'}),
    ('Count of Gender with Residence type', sns.histplot,
     {'x': 'Residence_type', 'hue': 'gender'}),
    ('Which Residence type is smoking', sns.histplot,
     {'x': 'Residence_type', 'hue': 'smoking_status'}),
    ('Married or Unmarried', sns.countplot,
     {'x': 'gender', 'hue': 'ever_married'}),
    ('Ever Worked?', sns.countplot,
     {'x': 'gender', 'hue': 'work_type'}),
    ('Having Hypertension?', sns.countplot,
     {'x': 'gender', 'hue': 'hypertension'}),
    ('Having heart_disease?', sns.scatterplot,
     {'x': 'age', 'y': 'bmi', 'hue': 'heart_disease'}),
]

plt.figure(figsize=(20, 15))
for slot, (heading, draw, mapping) in enumerate(category_panels, start=1):
    plt.subplot(3, 3, slot)
    plt.title(heading)
    draw(data=df, **mapping)
plt.show()

In [None]:
# One countplot of the target per categorical column, laid out on a 3x2 grid;
# the bars are split (hue) by the values of that column.
plt.figure(figsize=(10, 15))
for slot, column_name in enumerate(object1, start=1):
    plt.subplot(3, 2, slot)
    sns.countplot(x='stroke', hue=df[column_name], data=df)

In [None]:
# Age density per category level, coloured by stroke outcome.
# One FacetGrid is drawn per categorical column, with one facet per level of that column.
# FacetGrid creates and owns its own figure, so the surrounding plt.figure(...) calls
# only produced empty figures and have been removed. To resize the grids, pass
# `height=` / `aspect=` to FacetGrid instead.
for col in object1:
    g = sns.FacetGrid(df, col=col, hue="stroke")
    g.map(sns.kdeplot, "age", alpha=.7)
    g.add_legend()

# 3D Visualization

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D

# Two side-by-side 3D scatter plots of age (x) against BMI (z), with a binary
# health flag on the y axis: hypertension on the left, heart disease on the right.
fig = plt.figure(figsize=(15, 9))

# (subplot position, y column, y-axis label, panel title)
panels_3d = [
    (121, 'hypertension', "Hypertension", "AGE, Hypertension & BMI"),
    (122, 'heart_disease', "heart_disease", "AGE, Heart Disease & BMI"),
]

for position, flag_column, y_label, panel_title in panels_3d:
    ax = fig.add_subplot(position, projection='3d')

    x, y, z = df['age'], df[flag_column], df['bmi']

    ax.scatter(x, y, z)
    ax.set_xlabel("Age")
    ax.set_ylabel(y_label)
    ax.set_zlabel("BMI")
    # plt.title targets the current axes, which is the subplot just added.
    plt.title(panel_title)

plt.show()

In [None]:
# Interactive 3D scatter of age, average glucose level and BMI, coloured by stroke.
fig = px.scatter_3d(
    df,
    x='age',
    y='avg_glucose_level',
    z='bmi',
    color='stroke',
)
# Shrink the markers so the dense point cloud stays readable.
fig.update_traces(marker={'size': 2})
fig.show()


# Label Encoder fitting on categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column of df with integer codes.
# A separate LabelEncoder is fitted per column and kept in `encoders`, so the
# code -> label mapping stays available (`encoders[name].classes_`,
# `encoders[name].inverse_transform(...)`). Previously one shared encoder was
# refit for every column, so only the last mapping survived, and the loop
# variable shadowed the list it was iterating over.
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
encoders = {}
for name in categorical_cols:
    le = LabelEncoder()
    df[name] = le.fit_transform(df[name])
    encoders[name] = le


# Correlation plot
***No feature shows a strong linear correlation with stroke.***

In [None]:
# Pairwise correlation heatmap of every column except the row identifier.
# `columns=` replaces the positional axis argument (`drop(['id'], 1)`), which
# pandas 2.x no longer accepts.
df1 = df.drop(columns=['id'])
plt.figure(figsize=(12, 8))
corr = df1.corr()
# `linewidths` is the documented seaborn keyword for the gap between cells.
sns.heatmap(corr, linewidths=0.7, annot=True, cmap='coolwarm')


In [None]:
# Feature matrix: every column of df1 except the target.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
X = df1.drop(columns=['stroke'])
# Target vector: 1 if the patient had a stroke, 0 otherwise.
y = df1['stroke']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The target is heavily imbalanced, so the
# split is stratified on y to keep the same stroke rate in both partitions;
# an unstratified split can leave the test set with very few positive cases.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=43, test_size=0.2, stratify=y
)

# Without under or over Sampling

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

# Baseline: random forest trained on the original (imbalanced) training split.
# A fixed seed makes the reported scores reproducible across runs.
rfc = RandomForestClassifier(random_state=42)

# fit the predictor and target
rfc.fit(X_train, y_train)

# Hard class labels for accuracy / F1.
rfc_predict = rfc.predict(X_test)
# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class rather than on thresholded 0/1 labels.
rfc_proba = rfc.predict_proba(X_test)[:, 1]

# check performance
print('ROCAUC score:',roc_auc_score(y_test, rfc_proba))
print('Accuracy score:',accuracy_score(y_test, rfc_predict))
print('F1 score:',f1_score(y_test, rfc_predict))

# Without under- or over-sampling we get a very high accuracy score, because very few patients in the data had a stroke and predicting "no stroke" for everyone is already mostly right. Accuracy is therefore misleading here; ML models generally work better with balanced data.

# Under Sampling the Data & fitting it to our Model


***Under-sampling: the simplest technique removes random records from the majority class, which can cause loss of information.***

In [None]:
# import library
from imblearn.under_sampling import RandomUnderSampler

# Split BEFORE resampling so the held-out test set keeps the real class balance
# and never shares rows with the training data. Resampling the whole dataset
# first (and with replacement) can put copies of the same row on both sides of
# the split and makes the test scores look better than they are.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=43, test_size=0.2, stratify=y
)

# Randomly under-sample the majority class of the TRAINING partition only.
rus = RandomUnderSampler(random_state=42, replacement=True)
X_rus, y_rus = rus.fit_resample(X_train, y_train)

# Downstream cells train on the balanced training set and evaluate on the
# untouched test set.
X_train, y_train = X_rus, y_rus



***Under sampling improves the F1 Score***

In [None]:
# Refit the random forest on the under-sampled training data.
rfc.fit(X_train, y_train)

# Hard class labels for accuracy / F1.
rfc_predict = rfc.predict(X_test)
# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class rather than on thresholded 0/1 labels.
rfc_proba = rfc.predict_proba(X_test)[:, 1]

# check performance
print('ROCAUC score:',roc_auc_score(y_test, rfc_proba))
print('Accuracy score:',accuracy_score(y_test, rfc_predict))
print('F1 score:',f1_score(y_test, rfc_predict))

# Fitting the undersampled data on multiple models

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score , plot_roc_curve

from sklearn import metrics
from sklearn.metrics import mean_squared_error
# Candidate classifiers; seeds are fixed where the estimator is stochastic so
# the comparison is reproducible.
rf = RandomForestClassifier(random_state=42)
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn, while the first
# positional slot works with both spellings.
ad = AdaBoostClassifier(rf, random_state=42)
dt = DecisionTreeClassifier(random_state=42)
kn = KNeighborsClassifier()
#gnb = GaussianProcessClassifier()
svc = SVC()

models = [rf,ad, dt, kn, svc]
for model in models:
    # Fit once on the under-sampled training data and predict the held-out set.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Continuous scores for ROC AUC: positive-class probability when available,
    # otherwise the decision function (e.g. SVC without probability=True).
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)

    # Cross-validate on the same training data the model is fitted on, not on
    # the full imbalanced X / y (cross_val_score clones the estimator, so the
    # fitted `model` above is left untouched).
    scores = cross_val_score(model, X_train, y_train, cv=5).mean().round(3)
    f1score = metrics.f1_score(y_test, y_pred).round(3)
    #accuracy = metrics.classification_report(y_test, y_pred)
    print(model, '\n', 'mean_score:',scores, '\n', 'ROCAUC score:',roc_auc_score(y_test, y_score).round(3),'\n',"F1_SCORE:", f1score,'\n' )
   

# Adaboost Classifier gives better F1 Score  compared to other models

In [None]:
# Class predictions of the fitted AdaBoost model for the held-out rows.
y_pred_ada = ad.predict(X=X_test)

# Predictions with IDs

In [None]:
# Pair every prediction with the id of the row it was made for.
# Assigning df['id'] directly aligns on the new frame's 0..n-1 index and simply
# copies the first n ids of the dataset, which are not the test rows. Look the
# ids up through X_test's index labels instead.
# NOTE(review): this assumes X_test still carries df's row labels; if an upstream
# resampler reset the index, carry the `id` column through the split instead.
prediction = pd.DataFrame(y_pred_ada, columns=['Predicted'])
prediction['ID'] = df.loc[X_test.index, 'id'].to_numpy()
prediction.head()

In [None]:
from yellowbrick.classifier import class_prediction_error
# Class prediction error for a fresh 100-estimator AdaBoost model.
# The held-out set is passed explicitly; without X_test / y_test the quick
# method scores the model on the data it was trained on.
visualizer = class_prediction_error(
    AdaBoostClassifier(n_estimators=100),
    X_train, y_train,
    X_test=X_test, y_test=y_test,
)
visualizer.show()

In [None]:
from yellowbrick.classifier import classification_report
# Precision / recall / F1 heatmap for a fresh 100-estimator AdaBoost model.
# The held-out set is passed explicitly; without X_test / y_test the quick
# method scores the model on the data it was trained on.
visualizer = classification_report(
    AdaBoostClassifier(n_estimators=100),
    X_train, y_train,
    X_test=X_test, y_test=y_test,
)
visualizer.show()


In [None]:
from yellowbrick.classifier import confusion_matrix
# Confusion matrix for a fresh 100-estimator AdaBoost model.
# The held-out set is passed explicitly; without X_test / y_test the quick
# method scores the model on the data it was trained on.
visualizer = confusion_matrix(
    AdaBoostClassifier(n_estimators=100),
    X_train, y_train,
    X_test=X_test, y_test=y_test,
)
visualizer.show()

In [None]:
from yellowbrick.classifier import roc_auc
# ROC curves for a fresh 100-estimator AdaBoost model.
# The held-out set is passed explicitly; without X_test / y_test the quick
# method scores the model on the data it was trained on.
visualizer = roc_auc(
    AdaBoostClassifier(n_estimators=100),
    X_train, y_train,
    X_test=X_test, y_test=y_test,
)
visualizer.show()


In [None]:
from yellowbrick.classifier import discrimination_threshold
# Discrimination-threshold diagnostic for a fresh 100-estimator AdaBoost model,
# computed from the training features and labels.
threshold_estimator = AdaBoostClassifier(n_estimators=100)
visualizer = discrimination_threshold(threshold_estimator, X_train, y_train)
visualizer.show()

In [None]:
# Save the predictions (without the row index) to submission.csv in the working directory.
prediction.to_csv(path_or_buf='submission.csv', index=False)

# If you have come so far please do not forget to vote, Thank you !!