In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

In [None]:
# reading csv file
stroke_data = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
# checking the first five rows
stroke_data.head()

In [None]:
# shape of data
stroke_data.shape

In [None]:
# gathering the information
stroke_data.info()

In [None]:
stroke_data.describe()

In [None]:
# checking the null value of every column
stroke_data.isnull().sum()

In [None]:
# bmi contains missing values; fill every numeric NaN with its column mean.
# numeric_only=True is required because the frame also holds string columns
# (gender, work_type, ...) and DataFrame.mean() raises on those in pandas >= 2.0.
stroke_data=stroke_data.fillna(stroke_data.mean(numeric_only=True))

In [None]:
# checking null values
stroke_data.isnull().sum()

In [None]:
# Pearson correlation is only defined for numeric data; select the numeric
# columns explicitly so the string columns do not make corr() raise (pandas >= 2.0).
correlation=stroke_data.select_dtypes(include='number').corr()

In [None]:
sns.heatmap(correlation,annot=True)

In [None]:
stroke_data.nunique()

In [None]:
stroke_data['smoking_status'].unique()

In [None]:
stroke_data['Residence_type'].unique()

In [None]:
stroke_data['work_type'].unique()

In [None]:
stroke_data['ever_married'].unique()

In [None]:
stroke_data['gender'].unique()

In [None]:
# count of smoking status
sns.countplot(x=stroke_data['smoking_status'])

In [None]:
# Treat 'Unknown' as a missing value and impute it with the most frequent
# *known* category (computing the mode over all rows could return 'Unknown'
# itself, which would make the replacement a no-op).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on stroke_data['smoking_status'] is chained assignment and is not guaranteed
# to update stroke_data (it does nothing under pandas Copy-on-Write).
known_smoking=stroke_data.loc[stroke_data['smoking_status']!='Unknown','smoking_status']
stroke_data['smoking_status']=stroke_data['smoking_status'].replace('Unknown',known_smoking.mode()[0])

In [None]:
sns.countplot(x=stroke_data['Residence_type'])

In [None]:
sns.countplot(x=stroke_data['work_type'])

In [None]:
sns.countplot(x=stroke_data['ever_married'])

In [None]:
sns.countplot(x=stroke_data['gender'])

In [None]:
sns.countplot(x=stroke_data['heart_disease'])

In [None]:
sns.countplot(x=stroke_data['hypertension'])

In [None]:
sns.boxplot(x='bmi',data=stroke_data)

In [None]:
def remove_outliers(data):
    """Cap outliers at the 1.5 * IQR fences and return the values as a list.

    Despite the name, no element is dropped: values below
    ``Q1 - 1.5 * IQR`` are raised to that lower fence and values above
    ``Q3 + 1.5 * IQR`` are lowered to the upper fence, so the output has the
    same length and order as the input.

    Parameters
    ----------
    data : array-like of numbers
        One-dimensional values (e.g. a DataFrame column).

    Returns
    -------
    list of float
        The capped values. NaN entries are passed through unchanged and are
        ignored when the quartiles are computed.
    """
    values=np.asarray(data,dtype=float)
    # Nothing to compute the quartiles from: return the input as-is.
    if values.size==0 or np.isnan(values).all():
        return values.tolist()
    # nanpercentile keeps a single NaN from turning both fences into NaN,
    # which would silently disable the capping.
    q1,q3=np.nanpercentile(values,[25,75])
    iqr=q3-q1
    lower_fence=q1-(1.5*iqr)
    upper_fence=q3+(1.5*iqr)
    return np.clip(values,lower_fence,upper_fence).tolist()


In [None]:
stroke_data['bmi']=remove_outliers(stroke_data['bmi'])

In [None]:
sns.boxplot(x='bmi',data=stroke_data)

In [None]:
stroke_data.head()

In [None]:
sns.boxplot(x='avg_glucose_level',data=stroke_data)

In [None]:
stroke_data['avg_glucose_level']=remove_outliers(stroke_data['avg_glucose_level'])

In [None]:
sns.boxplot(x='avg_glucose_level',data=stroke_data)

In [None]:
# relation between work type and stroke
sns.countplot(x='work_type',hue='stroke',data=stroke_data)

In [None]:
# relationship between hypertension and stroke
sns.countplot(hue='hypertension',x='stroke',data=stroke_data)


In [None]:
sns.countplot(x='work_type',hue='stroke',data=stroke_data)

In [None]:
sns.countplot(x='heart_disease',hue='stroke',data=stroke_data)

In [None]:
stroke_data.columns

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
label_encoder=LabelEncoder()

In [None]:
# Label-encode only the categorical (non-numeric) columns.
# Applying LabelEncoder to every column would also replace the continuous
# features (age, avg_glucose_level, bmi) with the rank index of each distinct
# value, discarding their real magnitudes before modelling.
data=stroke_data.copy()
categorical_columns=data.select_dtypes(exclude='number').columns
for column in categorical_columns:
    # the encoder is re-fitted for each column, so codes are per-column
    data[column]=label_encoder.fit_transform(data[column])

In [None]:
data.head()

In [None]:
data['smoking_status'].unique()

In [None]:
correl=data.corr(method='pearson')
#sns.heatmap(correl,annot=True)
correl

In [None]:
# id is a row identifier, not a feature. Assign the result instead of dropping
# in place so that re-running this cell does not fail with a KeyError.
data=data.drop(columns=['id'],errors='ignore')

In [None]:
data.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x=data.drop('stroke',axis=1)
y=data['stroke']

In [None]:
x.shape

In [None]:
y.shape

In [None]:
from imblearn.over_sampling import RandomOverSampler
rus = RandomOverSampler(random_state=42)
x,y=rus.fit_resample(x, y)

In [None]:
x.shape

In [None]:
y.shape

In [None]:
x_Train,x_Test,y_Train,y_Test= train_test_split(x,y,test_size=0.4,random_state=40)

In [None]:
features=list(x_Train.columns)

In [None]:
features

# Decision Tree

In [None]:
# Fix the seed so the fitted tree (and every metric below) is reproducible
# across runs, matching the seeded random forest later in the notebook.
dtree = DecisionTreeClassifier(random_state=42)
dtree = dtree.fit(x_Train,y_Train)

In [None]:
pred1=dtree.predict(x_Test)


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
print(accuracy_score(y_Test,pred1))

In [None]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_Test,pred1)
print(cm)

In [None]:
def precision(cm):
    """Return the precision TP / (TP + FP) of the positive class.

    Parameters
    ----------
    cm : 2x2 array-like
        Binary confusion matrix indexed as ``cm[true][predicted]``
        (the layout produced by ``sklearn.metrics.confusion_matrix``).

    Returns
    -------
    float
        The precision, or 0.0 when no sample was predicted positive
        (avoids a division by zero).
    """
    true_positives=cm[1][1]
    predicted_positives=cm[1][1]+cm[0][1]
    if predicted_positives==0:
        return 0.0
    return true_positives/predicted_positives

print(precision(cm))

In [None]:
def recall(cm):
    """Return the recall (sensitivity) TP / (TP + FN) of the positive class.

    Parameters
    ----------
    cm : 2x2 array-like
        Binary confusion matrix indexed as ``cm[true][predicted]``
        (the layout produced by ``sklearn.metrics.confusion_matrix``).

    Returns
    -------
    float
        The recall, or 0.0 when there is no actual positive sample
        (avoids a division by zero).
    """
    true_positives=cm[1][1]
    actual_positives=cm[1][1]+cm[1][0]
    if actual_positives==0:
        return 0.0
    return true_positives/actual_positives

recall(cm)                                    

In [None]:
def specificity(cm):
    """Return the specificity TN / (TN + FP), i.e. the recall of the negative class.

    Parameters
    ----------
    cm : 2x2 array-like
        Binary confusion matrix indexed as ``cm[true][predicted]``
        (the layout produced by ``sklearn.metrics.confusion_matrix``).

    Returns
    -------
    float
        The specificity, or 0.0 when there is no actual negative sample
        (avoids a division by zero).
    """
    true_negatives=cm[0][0]
    actual_negatives=cm[0][0]+cm[0][1]
    if actual_negatives==0:
        return 0.0
    return true_negatives/actual_negatives

specificity(cm)

In [None]:
f1_score(y_Test,pred1)

#  Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [None]:
# x=data.drop('stroke',axis=1)
# y=data['stroke']


In [None]:
# sc=MinMaxScaler(feature_range=(0,1))
# X=sc.fit_transform(X)

In [None]:
#x_Train,x_Test,y_Train,y_Test= train_test_split(x,y,test_size=0.4,random_state=42)

In [None]:
lg=LogisticRegression(max_iter=10000)

In [None]:
lg=lg.fit(x_Train,y_Train)

In [None]:
pred2=lg.predict(x_Test)

In [None]:
accuracy_score(y_Test,pred2)

In [None]:
cm2=confusion_matrix(y_Test,pred2)
cm2

In [None]:
precision(cm2)

In [None]:
recall(cm2)

In [None]:
specificity(cm2)

In [None]:
f1_score(y_Test,pred2)

# Random Forest Classifier 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf=RandomForestClassifier(n_estimators=100,random_state=42)

In [None]:
rf=rf.fit(x_Train,y_Train)

In [None]:
pred3=rf.predict(x_Test)

In [None]:
feature_important=pd.Series(rf.feature_importances_,index=x.columns).sort_values(ascending=False)
feature_important

In [None]:
sns.barplot(x=feature_important,y=feature_important.index)

In [None]:
accuracy_score(y_Test,pred3)

In [None]:
cm3=confusion_matrix(y_Test,pred3)
cm3

In [None]:
precision(cm3)

In [None]:
recall(cm3)

In [None]:
specificity(cm3)

In [None]:
f1_score(y_Test,pred3)