In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the stroke-prediction CSV into the working DataFrame used by every later cell.
health = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
# Preview the first ten rows.
health.head(n=10)

In [None]:
# Missing-value count per column.
health.isna().sum()

In [None]:
# Mean BMI (missing values are skipped); used below as the fill value.
health['bmi'].mean()

In [None]:
# Fill missing BMI values with the column mean. The result is assigned back to
# the frame instead of calling fillna(inplace=True) on the `health.bmi`
# accessor: under pandas copy-on-write an in-place change made through a
# selected column is not written back to the parent DataFrame.
health['bmi'] = health['bmi'].fillna(health['bmi'].mean())

In [None]:
# Frequency of each smoking-status category.
health['smoking_status'].value_counts()

In [None]:
# Drop the 'id' column. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError because 'id' is already gone.
health = health.drop(columns=['id'], errors='ignore')

In [None]:
# Frequency of each work-type category.
health['work_type'].value_counts()

In [None]:
#health.smoking_status.replace({'Unknown':'smokes'} , inplace=True)

In [None]:
# Smallest age present in the data.
health['age'].min()

## Removing age outliers among people who had a stroke

In [None]:
def remove_outliers(df , col , k):
    """Keep only the rows whose ``col`` value lies strictly within ``k``
    standard deviations of the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the numeric column to test.
    k : float
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        Rows with ``mean - k*sd < df[col] < mean + k*sd``, original index
        labels preserved. Rows where ``col`` is NaN fail both comparisons and
        are dropped.
    """
    values = df[col]
    center = values.mean()
    spread = values.std()
    # A boolean mask replaces the previous list-building + isin() round trip
    # and avoids publishing the result through a module-level global.
    in_band = (values > center - k * spread) & (values < center + k * spread)
    return df.loc[in_band]
# Trim age outliers (outside 2 standard deviations) from the stroke-positive
# rows only, then recombine them with the untouched stroke-negative rows.
stroke_rows = health[health.stroke == 1]
no_stroke_rows = health[health.stroke == 0]
health = pd.concat(
    [remove_outliers(stroke_rows, 'age', 2.0), no_stroke_rows],
    axis=0,
)

#  EDA 

### Percentage of records in each stroke category

In [None]:

# Number of stroke-positive and stroke-negative records, in that order.
dist = [health.stroke[health.stroke == 1].count(), health.stroke[health.stroke == 0].count()]
# Label the slices (same order as `dist`) so the chart can be read on its own.
plt.pie(dist, labels=['stroke', 'no stroke'], autopct='%1.1f%%');

### Relation between stroke and age 

In [None]:
# Age distribution split by stroke outcome, drawn as polygons.
age_ax = sns.histplot(data=health, x='age', hue='stroke', element='poly')
age_ax.set_title("Age Vs Stroke Histogram")

In [None]:
# Age spread for each stroke outcome.
box_ax = sns.boxplot(data=health, x='stroke', y='age')
box_ax.set_title("Age Vs Stroke Boxplot ")

#### Here we can see that the age distribution of people who had a stroke is left-skewed: people aged 55 and above are much more prone to stroke.

## Smoking status of people who had a stroke

In [None]:
# Smoking-status breakdown among stroke-positive rows.
data = health[health.stroke == 1].smoking_status.value_counts()
# value_counts() orders categories by frequency, so slice sizes and labels are
# both taken from the same Series. A hand-written label tuple would silently
# attach to the wrong slices whenever the frequency order changes.
pie_data = data.tolist()
label = tuple(data.index)
plt.pie(pie_data , labels=label , autopct='%1.0f%%' );
plt.title("Somking status of people with stoke");

## Plot between stroke and Gender Count

In [None]:
# Record counts per gender, split by stroke outcome.
gender_ax = sns.countplot(data=health, x='gender', hue='stroke')
gender_ax.set_title("Gender vs Stroke");

#### Here we can see that the number of stroke cases is higher among females than among males.

## Relationship between stroke and BMI

In [None]:
# BMI distribution split by stroke outcome, on a wider canvas with a grid.
plt.figure(figsize=(10, 5))
bmi_ax = sns.histplot(data=health, x='bmi', hue='stroke', element='poly')
bmi_ax.set_title("Bmi vs Stroke")
bmi_ax.grid()

#### People with a BMI between 20 and 40 are more prone to stroke.

## Relation between heart disease and stroke

In [None]:
# Record counts per heart-disease flag, split by stroke outcome.
plt.figure(figsize=(10, 5))
heart_ax = sns.countplot(data=health, x='heart_disease', hue='stroke')
heart_ax.set_title("Histogram of Heart Disease vs Stoke");

#### Here, most stroke cases occur in people without heart disease. Note that this plot compares raw counts, not the stroke rate within each group.

## Hypertension VS stroke

In [None]:
# Record counts per hypertension flag, split by stroke outcome.
plt.figure(figsize=(10,5))
sns.countplot(x='hypertension' , hue='stroke' , data=health)
# Titled like the other plots in this notebook so the figure stands alone.
plt.title("Hypertension vs Stroke")
plt.grid()

#### Here, most stroke cases occur in people without hypertension. Note that this plot compares raw counts, not the stroke rate within each group.

## Avg_glucose_level count and stroke

In [None]:
# Average-glucose-level distribution split by stroke outcome.
plt.figure(figsize=(10, 5))
glucose_ax = sns.histplot(data=health, x='avg_glucose_level', hue='stroke', element='poly');
glucose_ax.set_title("AVg_glucose_level count and stroke");

## Work Type and Stroke

In [None]:
# Horizontal count bars per work type, split by stroke outcome.
sns.catplot(
    data=health,
    y="work_type",
    hue="stroke",
    kind="count",
    edgecolor=".6",
)

In [None]:
plt.figure(figsize=(16,8))
# Correlate the numeric columns only. At this point the categorical columns
# are still strings (they are label-encoded further down), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
sns.heatmap(health.select_dtypes(include='number').corr(),cmap="Blues");

# Converting Labeled Data to Numerical Data

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder instance used by the next cell to turn string categories into integer codes.
le = LabelEncoder()

In [None]:
# Replace every object-dtype column with integer codes.
# Each column gets its own fitted encoder, kept in `label_encoders`, so the
# code -> category mapping stays available (via `.classes_` or
# `.inverse_transform`). Refitting one shared encoder per column would retain
# only the last column's mapping.
categorical_cols = health.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    health[col] = label_encoders[col].fit_transform(health[col])

In [None]:
# Preview the frame after label encoding.
health.head(n=5)

# Feature Selection 

## Boruta

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Separate predictors from the target and convert both to NumPy arrays
# before handing them to BorutaPy.
health_x = np.array(health.drop(['stroke'] , axis=1))
health_y = np.array(health.stroke)

# Fixed seeds so the selected feature set is reproducible between runs.
rf = RandomForestClassifier(random_state=555)
boruta = BorutaPy(rf, max_iter=25, random_state=555)
boruta.fit(health_x, health_y)

In [None]:
# Pair each predictor name with the value BorutaPy stored for it in `support_`.
predictor_names = health.drop(['stroke'], axis=1).columns
features = pd.DataFrame({"Features": predictor_names, "Score": boruta.support_})

In [None]:
# Display the Boruta result table built in the previous cell.
features

## Chi 2 test 

In [None]:
# Rebuild predictors / target as pandas objects (the Boruta cell turned them into arrays).
health_x = health.drop(columns=['stroke'])
health_y = health['stroke']

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Score every feature with the chi-squared statistic (k='all' keeps them all).
# The selector is deliberately not called `test`: that name is later bound to
# the hold-out DataFrame, and re-running this cell would otherwise clobber it.
chi2_selector = SelectKBest(score_func=chi2 , k='all')

fitted = chi2_selector.fit(health_x , health_y)
print(fitted.scores_)

In [None]:
# Tabulate the chi-squared score of each predictor.
chi2_table = {"Features": health_x.columns, "Importance": fitted.scores_}
Feature_imp = pd.DataFrame(chi2_table)

In [None]:
# Bar chart of chi-squared scores, highest first, with vertical tick labels.
ranked_features = Feature_imp.sort_values(['Importance'], ascending=False)
sns.catplot(data=ranked_features, x='Features', y='Importance', kind='bar')
plt.xticks(rotation=90);

# Test train Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows. Stratifying on the target keeps the stroke /
# no-stroke ratio the same in both parts, which matters because the positive
# class is rare enough that this notebook upsamples it later.
# NOTE: compared with an unstratified split this changes which rows land in
# each part, so row counts quoted further down may shift.
train  , test  = train_test_split(health , test_size = 0.2 , random_state=555 , stratify=health['stroke'])

In [None]:
# Training predictors and target.
train_x = train.drop(columns=['stroke'])
train_y = train['stroke']

In [None]:
# Hold-out predictors and target.
test_x = test.drop(columns=['stroke'])
test_y = test['stroke']


# Model and Prediction

### Random forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix , accuracy_score , classification_report
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Class-weighted random forest, seeded so the reported metrics are reproducible.
rfc = RandomForestClassifier(criterion='entropy' , max_depth=6 , class_weight='balanced' , random_state=555)
rfc.fit(train_x , train_y)
pred_rfc = rfc.predict(test_x)
# scikit-learn's signature is confusion_matrix(y_true, y_pred). Passing the
# predictions first transposes the matrix; with this order rows are actual
# classes and columns are predicted classes.
confution_rfc = confusion_matrix(test_y , pred_rfc)
print(confution_rfc)
repo = classification_report(test_y , pred_rfc)
print(repo)
Accuracy_rfc = accuracy_score(test_y , pred_rfc)
print(Accuracy_rfc)
plt.figure(figsize=(5,2))
ax = sns.heatmap(confution_rfc, annot=True ,fmt='' )
ax.set(xlabel='Predicted' , ylabel='Actual');

In [None]:
# Random-forest feature importances, most important first.
importance_table = {"Feature": train_x.columns, "Importance": rfc.feature_importances_}
rfc_feature = pd.DataFrame(importance_table)
rfc_feature.sort_values(by='Importance', ascending=False)

 <i>Here we can see that age, avg_glucose_level, bmi and work_type are the most important columns.</i>

### upsampling class 1

In [None]:
# Upsample the stroke-positive class by appending 22 extra copies of every
# positive training row.
N_EXTRA_COPIES = 22
# Appended copies reuse the index labels of their source rows, so dropping
# repeated labels first restores the original split. On a fresh run this is a
# no-op; on a re-run it stops the copies from being multiplied again.
train = train[~train.index.duplicated(keep='first')]
df2 = train[train.stroke == 1]
train = pd.concat([train] + [df2] * N_EXTRA_COPIES , axis=0)
train_x = train.drop(['stroke'] , axis=1)
train_y = train.stroke

In [None]:
# Stroke-positive rows currently in the training frame.
df2 = train[train.stroke == 1]
# A cell displays only its final expression, so both shapes are shown together
# as (positive rows shape, full training shape).
df2.shape, train.shape

<i>After upsampling, the training data has 8018 rows.</i>

### Logistic Regression 

In [None]:
# AdaBoost over logistic-regression base learners, seeded for reproducibility.
lr = LogisticRegression()
adb= AdaBoostClassifier(lr , random_state=555)
adb.fit(train_x , train_y)
pred_lr = adb.predict(test_x)
# confusion_matrix expects (y_true, y_pred): rows are actual classes,
# columns are predicted classes.
confution_lr = confusion_matrix(test_y , pred_lr)
print(confution_lr)
repo = classification_report(test_y , pred_lr)
print(repo)
# Stored under its own name so the random-forest accuracy is not overwritten.
Accuracy_lr = accuracy_score(test_y , pred_lr)
print(Accuracy_lr)
plt.figure(figsize=(5,2))
ax = sns.heatmap(confution_lr , annot=True ,fmt='' )
ax.set(xlabel='Predicted' , ylabel='Actual');



### Random Forest  with AdaBoost

In [None]:
# AdaBoost over random-forest base learners.
# The base estimator gets its own name: rebinding `rfc` to this unfitted
# forest would break any later re-run of the cell that reads
# `rfc.feature_importances_`.
rfc_base = RandomForestClassifier(max_depth=6 , criterion='entropy' , class_weight='balanced' , random_state=555)
adb = AdaBoostClassifier(rfc_base , random_state=555)
adb.fit(train_x , train_y)
pred_adb = adb.predict(test_x)
# confusion_matrix expects (y_true, y_pred): rows are actual classes,
# columns are predicted classes.
confution_adb = confusion_matrix(test_y , pred_adb)
print(confution_adb)
repo = classification_report(test_y , pred_adb)
print(repo)
# Stored under its own name so the random-forest accuracy is not overwritten.
Accuracy_adb = accuracy_score(test_y , pred_adb)
print(Accuracy_adb)
plt.figure(figsize=(5,2))
ax = sns.heatmap(confution_adb, annot=True ,fmt='' )
ax.set(xlabel='Predicted' , ylabel='Actual');

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Depth-limited decision tree, seeded so tie-breaking between equally good
# splits is reproducible.
dt = DecisionTreeClassifier(min_samples_split=3 , criterion='entropy' , max_depth=6 , random_state=555)
dt.fit(train_x  , train_y)
pred_dt = dt.predict(test_x)
# confusion_matrix expects (y_true, y_pred): rows are actual classes,
# columns are predicted classes.
conf_dt = confusion_matrix(test_y , pred_dt)
print(conf_dt)
repo = classification_report(test_y , pred_dt)
print(repo)
# Stored under its own name so the random-forest accuracy is not overwritten.
Accuracy_dt = accuracy_score(test_y , pred_dt)
print(Accuracy_dt)
plt.figure(figsize=(5,2))
ax = sns.heatmap(conf_dt, annot=True ,fmt='' )
ax.set(xlabel='Predicted' , ylabel='Actual');

### Extra Tree Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Class-weighted extra-trees estimator, seeded for reproducible results.
et = ExtraTreesClassifier(class_weight='balanced' , random_state=555)

In [None]:
# AdaBoost over extra-trees base learners. `et` is not fitted on its own
# first: AdaBoostClassifier trains fresh clones of the estimator it is given,
# so a separate et.fit() would be discarded work.
adb = AdaBoostClassifier(et , learning_rate=2 , random_state=555)
adb.fit(train_x , train_y)
pred_et = adb.predict(test_x)
# confusion_matrix expects (y_true, y_pred): rows are actual classes,
# columns are predicted classes.
conf_et = confusion_matrix(test_y , pred_et)
print(conf_et)

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with default hyper-parameters, seeded for reproducibility.
grd = GradientBoostingClassifier(random_state=555)
grd.fit(train_x , train_y)
pred_grd = grd.predict(test_x)
# confusion_matrix expects (y_true, y_pred): rows are actual classes,
# columns are predicted classes.
conf_grd = confusion_matrix(test_y , pred_grd)
repo_grd = classification_report(test_y , pred_grd)
acc_grd = accuracy_score(test_y , pred_grd)
print(conf_grd)
print(repo_grd)
print(acc_grd)
plt.figure(figsize=(5,2))
ax = sns.heatmap(conf_grd, annot=True ,fmt='' )
ax.set(xlabel='Predicted' , ylabel='Actual');

## Naive Bayes 

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial naive Bayes on the encoded features.
NB = MultinomialNB()
NB.fit(train_x , train_y)
pred_NB = NB.predict(test_x)
# confusion_matrix expects (y_true, y_pred): rows are actual classes,
# columns are predicted classes.
conf_NB = confusion_matrix(test_y , pred_NB)
print(conf_NB)
# Stored under its own name so the gradient-boosting accuracy is not overwritten.
acc_NB = accuracy_score(test_y , pred_NB)
print(acc_NB)
repo_NB= classification_report(test_y , pred_NB)
print(repo_NB)
plt.figure(figsize=(5,2))
ax = sns.heatmap(conf_NB, annot=True ,fmt='' )
ax.set(xlabel='Predicted' , ylabel='Actual');

In [None]:
import xgboost as xg
# Gradient-boosted trees via XGBoost (900 rounds, learning rate 0.1), seeded
# for reproducibility.
xgboost=xg.XGBClassifier(n_estimators=900,learning_rate=0.1,random_state=555)
xgboost.fit(train_x , train_y)

predictions = xgboost.predict(test_x)
# confusion_matrix expects (y_true, y_pred): rows are actual classes,
# columns are predicted classes.
conf_tab = confusion_matrix(test_y , predictions)
conf_tab