![](http://)

In [None]:
"""1) id: unique identifier
2) gender: "Male", "Female" or "Other"
3) age: age of the patient
4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
6) ever_married: "No" or "Yes"
7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
8) Residence_type: "Rural" or "Urban"
9) avg_glucose_level: average glucose level in blood
10) bmi: body mass index
11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
12) stroke: 1 if the patient had a stroke or 0 if not
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient"""

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


import warnings
warnings.simplefilter("ignore")

In [None]:
# Read the stroke dataset from the relative input directory into a DataFrame.
csv_path = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)

In [None]:
df.head(5)

In [None]:
# Keep only the columns with a numeric dtype and preview the first row.
numeric_data = df.select_dtypes(include=[np.number])
numeric_col = numeric_data.columns
numeric_data.head(1)

In [None]:
# Keep only the columns whose dtype is not numeric and preview the first row.
categorical_data = df.select_dtypes(exclude=[np.number])
categorical_col = categorical_data.columns
categorical_data.head(1)

In [None]:
# Report the number of distinct values per column (a missing value counts as one category).
for column_name in df.columns:
    n_categories = df[column_name].nunique(dropna=False)
    print(f"{column_name} : {n_categories} categories")

In [None]:
# Single-column annotated heatmap showing how many values are missing in each feature.
missing_counts = df.isna().sum().to_frame()
ax = sns.heatmap(missing_counts, annot=True, fmt='d', cmap="BuPu")
ax.set_title('Missing Value Status', fontweight='bold')
ax.set_xlabel('Amount Missing')
plt.show()

The `bmi` feature has 201 missing values.

In [None]:
#Just for learning purpose. Actually not needed.

# Share of rows (in percent) whose bmi is missing: the mean of the boolean mask.
bmi_percent_null = df['bmi'].isna().mean() * 100
bmi_percent_null

In [None]:
# Replace the missing bmi values with the mean of the observed ones.
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df['age'].min()  

Open question (TODO): should records with such a low minimum age be kept in the analysis?

In [None]:
# Bar chart of the gender distribution, followed by the exact counts.
# The column is passed by keyword: seaborn's first positional parameter is
# `data`, so a bare Series passed positionally is not treated as the x variable.
sns.countplot(x='gender', data=df)
df['gender'].value_counts()

In [None]:
df.columns

In [None]:
# Age distribution as a density histogram with a KDE overlay, then the
# frequency of each age value. `histplot` replaces the deprecated `distplot`;
# stat='density' and kde=True reproduce distplot's default appearance.
sns.histplot(data=df, x='age', bins=100, kde=True, stat='density');
df['age'].value_counts()


In [None]:
# Bar chart of smoking status, followed by the exact counts.
# Keyword arguments are used because seaborn's first positional parameter is `data`.
sns.countplot(x='smoking_status', data=df);
df['smoking_status'].value_counts()

People who never smoked make up the largest group.

In [None]:
# Bar chart of marital status, followed by the exact counts.
# Keyword arguments are used because seaborn's first positional parameter is `data`.
sns.countplot(x='ever_married', data=df);
df['ever_married'].value_counts()

In [None]:
# Bar chart of residence type, followed by the exact counts.
# Keyword arguments are used because seaborn's first positional parameter is `data`.
sns.countplot(x='Residence_type', data=df);
df['Residence_type'].value_counts()

In [None]:
# Bar chart of work type, followed by the exact counts.
# Keyword arguments are used because seaborn's first positional parameter is `data`.
sns.countplot(x='work_type', data=df);
df['work_type'].value_counts()

In [None]:
# Bar chart of the hypertension flag (0 = no, 1 = yes).
# Keyword arguments are used because seaborn's first positional parameter is `data`.
sns.countplot(x='hypertension', data=df);


### Outlier detection

In [None]:

#For bmi

In [None]:
# Box plot of bmi on a 10x7 inch figure to make the outliers visible.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=df, x="bmi", color='green', ax=ax);

In [None]:
def outliers(df, variable, distance=1.5):
    """Return the IQR-based outlier boundaries of one numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    variable : str
        Name of the column in ``df``.
    distance : float, default 1.5
        Multiple of the inter-quartile range added above the third quartile
        and subtracted below the first quartile.

    Returns
    -------
    tuple
        ``(upper_boundary, lower_boundary)`` -- the upper bound comes first.
    """
    # Compute both quartiles in a single pass instead of one call per use.
    q1, q3 = df[variable].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_boundary = q1 - (iqr * distance)
    upper_boundary = q3 + (iqr * distance)

    return upper_boundary, lower_boundary


In [None]:
# IQR-based bounds for bmi with a 1.5 x IQR margin; outliers() returns the upper bound first.
upper_limit, lower_limit = outliers(df, "bmi", 1.5)
upper_limit, lower_limit

In [None]:
# Select the rows whose bmi exceeds the hard-coded threshold of 50 and show how many there are.
# NOTE(review): the IQR-based upper_limit computed in the previous cell is not used here;
# confirm that 50 is the intended cut-off.
outliers_bmi=df.loc[df['bmi']>50]
outliers_bmi['bmi'].shape

In [None]:
# Stroke outcome counts among the rows selected as bmi outliers (bmi > 50).
outliers_bmi['stroke'].value_counts()


In [None]:
# Cap bmi at 50: values above 50 become 50, everything else is left unchanged.
# Series.clip is the vectorised equivalent of applying "50 if x > 50 else x"
# element by element.
df["bmi"] = df["bmi"].clip(upper=50)

# Missing bmi values were already replaced with the column mean in an earlier
# cell, so instead of imputing a second time just verify that none remain.
assert df["bmi"].notna().all(), "bmi still contains missing values"


In [None]:
# Box plot of bmi again, this time after the values were capped.
sns.boxplot(data=df, x="bmi", color='green');

### According to the National Institutes of Health (NIH): a BMI of less than 18.5 means that a person is underweight, a BMI between 18.5 and 24.9 is ideal, and a BMI between 25 and 29.9 is overweight.

In [None]:
# Pie chart of the target distribution, followed by the raw class counts.
stroke_counts = df['stroke'].value_counts()
fig, ax = plt.subplots(figsize=(7, 7))
stroke_counts.plot.pie(autopct='%1.1f%%', colors=['grey', 'b'], ax=ax)
ax.set_title("Stroke status", fontdict={'fontsize': 14})

stroke_counts

Only 4.9% of the people in the dataset had a stroke, so the two target classes are heavily imbalanced.

In [None]:
# Bar chart of the heart-disease flag (0 = no, 1 = yes), followed by the exact counts.
# Keyword arguments are used because seaborn's first positional parameter is `data`.
sns.countplot(x='heart_disease', data=df);
df['heart_disease'].value_counts()

In [None]:
# Line plot of the stroke indicator (y) against age (x).
sns.lineplot(x='age', y='stroke', data=df)

People between the ages of 60 and 80 have the highest chance of having a stroke.

In [None]:
# Gender counts split by stroke outcome.
# Columns are referenced by name with keyword arguments (seaborn's first
# positional parameter is `data`), and the plot uses the full figure instead of
# the second slot of an otherwise empty 1x2 grid.
sns.countplot(x='gender', hue='stroke', data=df)

In [None]:

# Hypertension counts split by stroke outcome; columns are referenced by name
# with keyword arguments because seaborn's first positional parameter is `data`.
sns.countplot(x='hypertension', hue='stroke', data=df)

In [None]:
# Heart-disease counts split by stroke outcome.
# Columns are referenced by name with keyword arguments (seaborn's first
# positional parameter is `data`), and the plot uses the full figure instead of
# the second slot of an otherwise empty 1x2 grid.
sns.countplot(x='heart_disease', hue='stroke', data=df)

People with no heart disease have a very high chance of not having a stroke.

In [None]:
# Marital-status counts split by stroke outcome, followed by the overall counts.
# Columns are referenced by name with keyword arguments (seaborn's first
# positional parameter is `data`), and the plot uses the full figure instead of
# the second slot of an otherwise empty 1x2 grid.
sns.countplot(x='ever_married', hue='stroke', data=df)

df['ever_married'].value_counts()

Unmarried people have a lower chance of having a stroke.

In [None]:
# Work-type counts split by stroke outcome.
sns.countplot(data=df, x="work_type", hue="stroke")

In [None]:
# Residence-type counts split by stroke outcome.
sns.countplot(data=df, x="Residence_type", hue="stroke")

There is not much difference in stroke occurrence between the two residence types (Rural and Urban).

In [None]:
"""Observations

1.Females are more prone to have a stroke.
2.More than 25% of stroke case patients have hypertension.
3.Very few cases of people who have a heart disease have had a stroke.
4.Most of the patients who have a stroke were married.
5.Doing private work increases chances of having a stroke. Those who have never worked barely have experienced a stroke.
6.The type of residence did not impact the chances of having a stroke.
7.Being a smoker or a former smoker increases your risk of having a stroke.
"""

# Data Preprocessing

In [None]:
df.head(5)

In [None]:
# Names of all object-dtype (string) columns, as a plain Python list.
categorical_variable = list(df.select_dtypes(include='object').columns)

In [None]:
categorical_variable

In [None]:
# Show the distinct values of every categorical feature before encoding.
# Each pair is (label printed in front of the values, column name in df).
labelled_columns = [
    ("Gender", 'gender'),
    ("Ever_married", 'ever_married'),
    ("worktype", 'work_type'),
    ("residence_type", 'Residence_type'),
    ("smokingstatus", 'smoking_status'),
]
for label, column in labelled_columns:
    print(label, df[column].unique())

### The conversion of categorical data into numerical data is called Categorical Encoding.

Label Encoding

In [None]:
from sklearn import preprocessing

# Integer-encode every categorical feature. Each column gets its own fitted
# LabelEncoder, kept in `label_encoders`, so the category <-> code mapping of
# every column stays available (encoder.classes_, encoder.inverse_transform).
# Refitting one shared encoder would keep only the last column's mapping.
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
label_encoders = {}
for feature in categorical_features:
    encoder = preprocessing.LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])
    label_encoders[feature] = encoder


In [None]:
#So label encoding is done

In [None]:
df.head()

Standard scaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous features with StandardScaler.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so statistics of the future test rows influence the features --
# consider fitting it on the training rows only.
s = StandardScaler()
columns = ['avg_glucose_level', 'bmi', 'age']
stand_scaled = pd.DataFrame(
    s.fit_transform(df[columns]),
    columns=columns,
    index=df.index,  # keep df's row labels so the later pd.concat aligns row-for-row
)

# Remove the unscaled originals; the scaled versions are concatenated back later.
df = df.drop(columns=columns)

In [None]:
stand_scaled.head()

In [None]:
#Now add scaled to df

In [None]:
# Append the scaled columns to df column-wise (axis=1) and preview the result.
df = pd.concat([df, stand_scaled], axis=1)
df.head(5)

In [None]:
#lets drop id

In [None]:
# The row identifier is removed from the frame before modelling.
df = df.drop(columns=['id'])

# Modelling

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target column from the feature matrix.
y = df['stroke']
X = df.drop(columns='stroke')

In [None]:
# Split into train and test partitions using train_test_split's default test size.
# NOTE(review): these four variables are reassigned by the train_test_split call
# in the Decision Tree cell below before any model is fit, so this split is unused.
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state = 0)

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Hold out 30% of the rows for testing. Stratifying on y keeps the rare
# positive (stroke) class at the same proportion in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=124, stratify=y
)

# Seed the tree so that the fitted model is reproducible across runs.
model = DecisionTreeClassifier(random_state=124)
model.fit(X_train, y_train)

In [None]:
# Decision-tree class predictions for the held-out test rows.
prediction = model.predict(X_test)

In [None]:
# Confusion matrix of the decision-tree predictions on the test split.
from sklearn.metrics import classification_report, confusion_matrix
print (confusion_matrix(y_test, prediction))


In [None]:
print (classification_report (y_test, prediction))

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, prediction)

Logistic Regression

In [None]:
# Fit a logistic-regression classifier (liblinear solver) on the training split
# and display the fitted estimator.
from sklearn.linear_model import LogisticRegression
loj = LogisticRegression(solver = "liblinear")
loj_model = loj.fit(X_train,y_train)
loj_model

In [None]:
y_pred_loj = loj_model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
accuracy_score(y_test, y_pred_loj)

In [None]:
print("Training Accuracy :", loj_model.score(X_train, y_train))
print("Testing Accuracy :", loj_model.score(X_test, y_test))

In [None]:
print(classification_report(y_test, y_pred_loj))

XGBoost

In [None]:
# Fit an XGBoost classifier with its default hyper-parameters on the training split.
from xgboost import XGBClassifier
import xgboost as xgb  # NOTE(review): the `xgb` alias is not used in any visible cell
xgb_model = XGBClassifier().fit(X_train, y_train)

In [None]:
y_pred_xgb_model = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred_xgb_model)

In [None]:
print(classification_report(y_test, y_pred_xgb_model))

In [None]:
print (confusion_matrix(y_test, y_pred_xgb_model))

In [None]:
from xgboost import XGBClassifier
# Model Tuning
# Fit a second XGBoost classifier (XGB_model, upper-case) with hand-picked
# hyper-parameters; the default-parameter model is the lower-case xgb_model.
# NOTE(review): fit() receives no eval_set, so eval_metric='auc' presumably has
# no visible effect here -- confirm.
XGB_model = XGBClassifier(random_state = 42, max_depth = 8, n_estimators = 3000, 
                          reg_lambda = 1.2, reg_alpha = 1.2, 
                          min_child_weight = 1,objective = 'binary:logistic',
                         learning_rate = 0.15, gamma = 0.3, colsample_bytree = 0.5,
                          eval_metric = 'auc').fit(X_train, y_train)

In [None]:
y_pred_XGB_model = XGB_model.predict(X_test)
accuracy_score(y_test, y_pred_XGB_model)

In [None]:
# Horizontal bar chart of the ten largest feature importances.
# NOTE(review): this reads `xgb_model` (the default-parameter model), while the
# neighbouring cells evaluate the tuned `XGB_model` -- confirm which one was intended.
feat_importances = pd.Series(xgb_model.feature_importances_, index=X_train.columns)
feat_importances.nlargest(10).plot(kind='barh')
#feat_importances.nsmallest(20).plot(kind='barh')
plt.show()

In [None]:
print(classification_report(y_test, y_pred_XGB_model))

In [None]:
# Compare the test-set accuracy of the fitted models.
# The loop variable is deliberately NOT called `model`: reusing that name would
# rebind the decision-tree variable `model` to the last list entry, so that
# re-running this cell would silently compare a different set of models.
models = [
    model,
    xgb_model,
    loj_model,
]

for fitted_model in models:
    model_name = fitted_model.__class__.__name__
    y_pred = fitted_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("-"*28)
    print(model_name + ":" )
    print("Accuracy: {:.4%}".format(accuracy))