In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings  
# Suppresses every warning for the rest of the session. This also hides
# deprecation and pandas copy warnings raised by later cells.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

**CONTEXT**

According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
*This dataset is used to predict whether a patient is likely to get a stroke based on input parameters like gender, age, various diseases, and smoking status. Each row in the data provides relevant information about the patient.*

*A stroke occurs when the blood supply to part of your brain is interrupted or reduced, preventing brain tissue from getting oxygen and nutrients. Brain cells begin to die in minutes*
*There can be various factors related to the occurrence of a stroke. Using the given data, we try to list the potential factors with various visualization techniques.*

**READ DATA**

In [None]:
# Load the stroke dataset from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
# Bare expression: the notebook displays the frame as the cell output.
data

**CHECK FOR NULL VALUES**

In [None]:
# Number of missing values in each column of the raw data.
data.isnull().sum()

*AS WE CAN SEE BMI CONTAINS NULL VALUES AND WE NEED TO FIX THIS*

***FILLING THE NULL VALUES WITH AVERAGE OF THE BMI'S***

In [None]:
# Mean of the 'bmi' column (pandas skips missing values by default).
avg = data['bmi'].mean()
# Display the computed mean.
avg

In [None]:
# Impute missing BMI values with the column mean computed from the data itself,
# instead of a hand-typed constant that can drift from the actual mean.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

FOR ADULTS THE NORMAL BMI RANGE IS BETWEEN 18.5 AND 24.9. THE AVERAGE BMI CALCULATED ABOVE IS HIGHER THAN THIS RANGE, WHICH SUGGESTS THAT A LARGE PROPORTION OF THE POPULATION IN THE GIVEN DATASET IS OVERWEIGHT

In [None]:
# Re-count missing values per column after the BMI imputation.
data.isnull().sum()

SO NOW NO NULL VALUES PRESENT

In [None]:
data.info()  # prints each column's dtype and non-null count

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

So from above statistical description of the dataset we can see that mean age of people is around 43 years and mean bmi is more than normal

WHILE PLOTTING WE NEED TO KEEP IN MIND THAT AGAINST WHICH TYPE OF VARIABLES WE ARE PLOTTING THEN ONLY WE CAN DRAW INSIGHT FROM IT

**COUNTPLOT TO SEE THE DISTRIBUTION OF WORK_TYPE**

In [None]:
# Count of rows per work_type. The column is passed by keyword because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(x='work_type', data=data)

A LARGE NUMBER OF PEOPLE WORK IN THE PRIVATE SECTOR

In [None]:
# Count of rows per Residence_type. The column is passed by keyword because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(x='Residence_type', data=data)

THERE ARE ALMOST SAME NUMBER OF PEOPLE LIVING IN BOTH URBAN AND RURAL AREAS

In [None]:
# Count of rows per smoking_status. The column is passed by keyword because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(x='smoking_status', data=data)

GOOD TO SEE THAT MOST NUMBER OF PEOPLE NEVER SMOKED AS "SMOKING KILLS"

In [None]:
# Count of rows per target class. The column is passed by keyword because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(x='stroke', data=data)

THE ABOVE PLOT SHOWS THAT THERE IS A **HIGH IMBALANCE** BETWEEN THE TWO TARGET CLASSES, AND WE NEED TO RESOLVE THIS ISSUE BEFORE APPLYING ANY ALGORITHM

In [None]:
# Count of rows per ever_married value. The column is passed by keyword because
# seaborn >= 0.12 binds the first positional argument to `data`, not `x`.
sns.countplot(x='ever_married', data=data)

In [None]:
# Range of the average glucose level. The vectorised Series reductions are used
# instead of the builtins, which iterate element by element in Python and give
# order-dependent results if a NaN is present.
min_avg_glucose_level = data['avg_glucose_level'].min()
max_avg_glucose_level = data['avg_glucose_level'].max()
print(min_avg_glucose_level)
print(max_avg_glucose_level)

THE ABOVE MINIMUM AND MAXIMUM VALUES OF AVERAGE GLUCOSE LEVEL SHOWS THAT THE COLUMN NEEDS TO BE STANDARDIZED AS THERE IS VERY HIGH DIFFERENCE BETWEEN THEM

In [None]:
# Density-normalised histogram of age with a KDE overlay. `histplot` replaces
# the deprecated `distplot`; stat='density' keeps the same y-axis scale.
sns.histplot(data=data, x='age', kde=True, stat='density')

In [None]:
# Density-normalised histogram of average glucose level with a KDE overlay.
# `histplot` replaces the deprecated `distplot`; stat='density' keeps the scale.
sns.histplot(data=data, x='avg_glucose_level', kde=True, stat='density')

**MAPPING OF CATEGORICAL VARIABLES**

In [None]:
# Replace each work_type label with an integer code. Labels that are not keys
# of this dict become NaN after `.map`.
work_type_codes = {
    'Private': 0,
    'Self-employed': 1,
    'Govt_job': 2,
    'children': 3,
    'Never_worked': 4,
}
data['work_type'] = data['work_type'].map(work_type_codes)

In [None]:
# Integer codes for the remaining categorical columns. Any value that is not a
# key of its column's dict becomes NaN after `.map`.
category_codes = {
    'gender': {'Male':0, 'Female':1},
    'Residence_type': {'Urban':0, 'Rural':1},
    'smoking_status': {'formerly smoked':0, 'never smoked':1, 'smokes':2, 'Unknown':3},
    'ever_married': {'Yes':0, 'No':1},
}
for column, codes in category_codes.items():
    data[column] = data[column].map(codes)

DATASET AFTER MAPPING OF CATEGORICAL VARIABLES

In [None]:
# Display the frame after the categorical columns were replaced by integer codes.
data

**CORRELATION HEATMAP** TO CHECK FOR ANY CORRELATION BETWEEN VARIABLES

In [None]:
# Pearson correlation between every pair of columns, drawn as an annotated
# heatmap on a 16x10 inch figure.
corr_matrix = data.corr(method='pearson')
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corr_matrix, annot=True, ax=ax)

FROM THE ABOVE FIGURE WE CAN SEE THAT-
WORK_TYPE AND BMI - NEGATIVE CORRELATION
STROKE AND AGE HAS A POSITIVE CORRELATION
SIMILARLY MANY OTHER VARIABLES HAVE SUCH CORRELATION VALUES WE CANNOT REMOVE ANY VARIABLES. WE HAVE TO CONSIDER ALL THE VARIABLES FOR OUR MODEL

In [None]:
# Age against average glucose level, one point per row.
sns.scatterplot(data=data, x='age', y='avg_glucose_level')


FROM THE ABOVE SCATTER PLOT IT IS QUITE VISIBLE THAT AS AGE INCREASES, GLUCOSE LEVEL TENDS TO INCREASE AS WELL

In [None]:
# Bar height is the mean age for each heart_disease value, split by work_type code.
sns.catplot(
    data=data,
    kind='bar',
    x='heart_disease',
    y='age',
    hue='work_type',
)

PEOPLE WHO ARE SELF EMPLOYED ARE THE ONES WHO HAVE HEART DISEASE AND OBVIOUSLY LEAST NUMBERS ARE OF CHILDREN

In [None]:
# Bar height is the mean age for each hypertension value, split by work_type code.
sns.catplot(
    data=data,
    kind='bar',
    x='hypertension',
    y='age',
    hue='work_type',
)

In [None]:
# Bar height is the mean of `stroke` (i.e. the stroke rate) for each
# smoking_status code, split by work_type code.
sns.catplot(
    data=data,
    kind='bar',
    x='smoking_status',
    y='stroke',
    hue='work_type',
)

AGAIN SELF-EMPLOYED PEOPLE HAVE HIGHER RISK OF STROKE. THIS SHOWS THAT THESE PEOPLE ARE MORE VULNERABLE TO DIFFERENT DISEASES AS THEY CARRY LOT OF TENSION OF EARNINGS AND FAMILY INCOME

In [None]:
# Bar height is the mean of `stroke` (i.e. the stroke rate) for each
# hypertension value, split by work_type code.
sns.catplot(
    data=data,
    kind='bar',
    x='hypertension',
    y='stroke',
    hue='work_type',
)

In [None]:
# Bar height is the mean of `stroke` (i.e. the stroke rate) for each
# Residence_type code, split by work_type code.
sns.catplot(
    data=data,
    kind='bar',
    x='Residence_type',
    y='stroke',
    hue='work_type',
)

TYPE OF RESIDENCE HARDLY MAKES ANY DIFFERENCE TO DISEASE

In [None]:
# Distribution of average glucose level for each stroke class, as box plots.
sns.catplot(
    data=data,
    kind='box',
    x='stroke',
    y='avg_glucose_level',
)

PEOPLE HAVING HIGHER GLUCOSE LEVEL ARE AT HIGH RISK OF STROKE

In [None]:
# Distribution of age for each stroke class, one box per gender code.
sns.catplot(
    data=data,
    kind='box',
    x='stroke',
    y='age',
    hue='gender',
)

HIGH AGE FEMALES ARE AT THE RISK TO STROKE

In [None]:
# Distribution of age for each stroke class, one box per work_type code.
sns.catplot(
    data=data,
    kind='box',
    x='stroke',
    y='age',
    hue='work_type',
)

**APPLY MACHINE LEARNING ALGORITHM FOR PREDICTION**

DIVIDING THE DATASET INTO FEATURES AND LABEL

In [None]:
# Target column and model input columns. 'id' stays in the feature list at this
# point; later cells read it for the submission file and then drop it.
label = ['stroke']
features = [
    'id', 'age', 'hypertension', 'heart_disease', 'ever_married',
    'Residence_type', 'avg_glucose_level', 'bmi', 'gender',
    'work_type', 'smoking_status',
]

# Selecting with a list of names makes both X and y DataFrames.
X, y = data[features], data[label]

ONCE AGAIN CHECK FOR NULL VALUES IN THE DATASET

In [None]:
X.isnull().sum()  # 'gender' still has one null: values outside the Male/Female mapping became NaN

In [None]:
# Fill the unmapped gender value(s) with code 1 ('Female' in the gender mapping).
# A new frame is built with `.assign` rather than attribute-assigning into X:
# X was selected from `data`, and writing into it that way is the pattern
# pandas flags with SettingWithCopyWarning.
X = X.assign(gender=X['gender'].fillna(1))

In [None]:
# Confirm that no feature column has missing values left.
X.isnull().sum()

SINCE THE TARGET CLASS IS HIGHLY IMBALANCED, WE NEED TO TREAT IT, AS ITS PRESENCE WILL LEAD TO POOR PERFORMANCE OF THE MODEL. HERE I HAVE USED SMOTE (Synthetic Minority Oversampling Technique). SMOTE WORKS BY RANDOMLY PICKING A POINT FROM THE MINORITY CLASS, COMPUTING THE K NEAREST NEIGHBOURS OF THIS POINT, AND ADDING SYNTHETIC POINTS BETWEEN THE CHOSEN POINT AND ITS NEIGHBOURS.

**TREATING IMBALANCE CLASS USING SMOTE**

In [None]:
from imblearn.over_sampling import SMOTE

# Seed the sampler so the synthetic minority rows are the same on every run.
smote = SMOTE(random_state=42)
# NOTE(review): oversampling the full data set before a train/test split puts
# synthetic rows into the evaluation data; any split of x_smote / y_smote
# inherits that leak.
x_smote, y_smote = smote.fit_resample(X, y)

SPLITTING OF DATASET INTO TRAIN AND TEST

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split the ORIGINAL (un-resampled) data first so the validation and test sets
# contain only real patients. Splitting already-oversampled data leaks synthetic
# points, interpolated from neighbours, across the train/test boundary and
# inflates every reported metric. Stratify so each part keeps the rare class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

# Oversample the minority class on the training portion only.
# NOTE(review): 'id' is still a column here, so it takes part in SMOTE's
# neighbour search; consider excluding it before resampling.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Keep the test rows' 'id' column; it is paired with predictions in the submission file.
testing = X_test['id'] #taking ID column for the purpose of submission
# Display the saved ids.
testing

AS THE ID COLUMN CARRIES NO PREDICTIVE INFORMATION, WE DROP IT BEFORE TRAINING

In [None]:
# Remove the identifier column from both the training and the test features.
X_train, X_test = (
    frame.drop(columns=['id']) for frame in (X_train, X_test)
)

STANDARDIZATION OF THE DATA IS REQUIRED AS DATA ARE IN DIFFERENT SCALES

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean and standard deviation from the training set only,
# then apply that same scaling to the training and the test features.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

DATA POINTS AFTER STANDARDIZATION:

In [None]:
# Display the standardized training features.
X_train

In [None]:
# Display the test features after applying the scaler fitted on the training set.
X_test

LET'S APPLY **LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression; `fit` returns the estimator itself.
log_reg = LogisticRegression().fit(X_train, y_train)
# Show the fitted estimator as the cell output.
log_reg


In [None]:
# Hard class predictions of the logistic regression on the test features.
y_pred_log_reg = log_reg.predict(X_test)
# Display the predicted labels.
y_pred_log_reg

CLASSIFICATION REPORT OF LOGISTIC REGRESSION

In [None]:
from sklearn.metrics import f1_score, roc_auc_score,accuracy_score,confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report
# Keep the text report under its own name. Assigning it to
# `classification_report` would replace the imported function with a string,
# so any later call to the function would fail.
log_reg_report = classification_report(y_test, y_pred_log_reg)
print(log_reg_report)

VALUES OF F1 SCORE SHOWS THAT THE MODEL IS PERFORMING QUITE WELL

In [None]:
# ROC AUC is a ranking metric, so it needs the positive-class probabilities.
# Scoring the hard 0/1 predictions only evaluates a single threshold.
# The result has its own name so the imported sklearn `auc` function is not shadowed.
log_reg_auc = roc_auc_score(y_test, log_reg.predict_proba(X_test)[:, 1])
log_reg_auc

AUC SCORE OF AROUND 80% IS QUITE GOOD. MODEL IS ABLE TO CLASSIFY THE CLASSES VERY WELL

In [None]:
# Confusion matrix of the logistic regression (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred_log_reg)
# Display the matrix.
cm

In [None]:
# Positive-class probability (second column of predict_proba) for each test row.
predicted_probab_log = log_reg.predict_proba(X_test)[:, 1]
# Keep the false/true positive rates of the ROC curve; thresholds are not needed.
fpr, tpr = roc_curve(y_test, predicted_probab_log)[:2]

**ROC CURVE**

In [None]:
from matplotlib import pyplot

# Draw the logistic-regression ROC curve on an explicit Axes object.
_fig, roc_ax = pyplot.subplots()
roc_ax.plot(fpr, tpr, marker='.', label='Logistic Regression')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
pyplot.show()

LOGISTIC REGRESSION IS PERFORMING WELL, BUT CAN WE IMPROVE PERFORMANCE USING ANOTHER MODEL? LET'S APPLY ANOTHER ALGORITHM

**RANDOM FOREST**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so that bootstrap samples and feature subsets, and therefore
# the reported scores, are the same on every run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Hard class predictions, reused by the confusion-matrix cell.
rfc_predict = rfc.predict(X_test)
# ROC AUC is computed from the positive-class probabilities. Scoring the 0/1
# predictions would only evaluate a single threshold.
roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])

AUC SCORE HAS INCREASED TO **94%**. AMAZING!!

In [None]:
# Confusion matrix of the random forest (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test, rfc_predict)
# Display the matrix.
cm

CALCULATION OF F1 SCORE

In [None]:
# F1 score of the random forest for the positive (stroke) class. sklearn's
# `f1_score` replaces the hand-rolled precision/recall arithmetic, which
# duplicated library code and had no guard against division by zero.
f1score = f1_score(y_test, rfc_predict)
print(f1score)

In [None]:
# Class probabilities of the random forest for each test row (one column per class).
predicted_probab = rfc.predict_proba(X_test)

In [None]:
# Keep only the second column: the probability of class 1.
predicted_probab = predicted_probab[:, 1]

In [None]:
# False/true positive rates of the random forest ROC curve; thresholds are discarded.
fpr, tpr, _ = roc_curve(y_test, predicted_probab)

**ROC CURVE**

In [None]:
from matplotlib import pyplot

# Draw the random-forest ROC curve on an explicit Axes object.
_fig, roc_ax = pyplot.subplots()
roc_ax.plot(fpr, tpr, marker='.', color='red', label='Random Forest')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
pyplot.show()

LET'S SEE IF WE CAN IMPROVE IT FURTHER USING ANOTHER MODEL

**XGBOOST CLASSIFIER**

In [None]:
import xgboost as xgb

In [None]:
# Fit an XGBoost classifier with default settings; `fit` returns the estimator.
model = xgb.XGBClassifier().fit(X_train, y_train)
# Show the fitted estimator as the cell output.
model

In [None]:
# Hard class predictions of the XGBoost model on the test features.
y_pred1 = model.predict(X_test)

In [None]:
# ROC AUC is computed from the positive-class probabilities. Scoring the 0/1
# predictions would only evaluate a single threshold.
roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

WOW!! AUC SCORE INCREASED

In [None]:
# Confusion matrix of the XGBoost model (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred1)
# Display the matrix.
cm

In [None]:
# F1 score of the XGBoost model for the positive (stroke) class. sklearn's
# `f1_score` replaces the hand-rolled precision/recall arithmetic, which
# duplicated library code and had no guard against division by zero.
f1score = f1_score(y_test, y_pred1)
print(f1score)

AMAZING F1 SCORE

In [None]:
# Positive-class probability (second column of predict_proba) for each test row.
predicted_probab = model.predict_proba(X_test)[:, 1]
# Keep the false/true positive rates of the ROC curve; thresholds are not needed.
fpr, tpr = roc_curve(y_test, predicted_probab)[:2]

**ROC CURVE**

In [None]:
from matplotlib import pyplot

# Draw the XGBoost ROC curve on an explicit Axes object.
_fig, roc_ax = pyplot.subplots()
roc_ax.plot(fpr, tpr, marker='.', color='green',label='XGB Classifier')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
pyplot.show()

In [None]:

# Pair each test row's id with the XGBoost prediction for that row.
my_submission = pd.DataFrame({'Id': testing, 'Stroke': y_pred1})
# Write the submission without the index column.
my_submission.to_csv('submission.csv', index=False)
# Read the file back so the displayed frame is exactly what was saved.
my_submission = pd.read_csv('submission.csv')
my_submission


**LET'S FIND OUT THE BEST PARAMETERS**

In [None]:
# Candidate values for each XGBoost hyper-parameter explored by the search.
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [None]:
# Base estimator for the hyper-parameter search.
# NOTE: the name `xgb` is rebound here from the xgboost module alias to a
# classifier instance, because the search cell passes `xgb` as the estimator.
# The class is reached through a fresh `import xgboost` so that re-running this
# cell does not fail on `xgb.XGBClassifier` once `xgb` is the instance.
# `verbosity=0` and `n_jobs=1` replace the deprecated `silent=True` / `nthread=1`.
import xgboost
xgb = xgboost.XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                            verbosity=0, n_jobs=1)

**APPLYING RANDOMIZED SEARCH**

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
folds = 5       # number of cross-validation folds
param_comb = 5  # number of parameter combinations sampled from `params`

# Shuffled, seeded, stratified folds so every fold keeps the class proportions.
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# Pass the splitter via `cv`. Without it `skf` is never used and the search
# silently falls back to the default, unshuffled cross-validation.
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf, verbose=3, random_state=1001 )
random_search.fit(X_train, y_train)

In [None]:
# Full per-candidate cross-validation results (a dict of arrays).
print('\n All results:')
print(random_search.cv_results_)
# Estimator configured with the best parameter combination found.
print('\n Best estimator:')
print(random_search.best_estimator_)
# The search is scored with ROC AUC, so 2 * AUC - 1 converts the best score to a normalized Gini.
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
# Save the full results table to the working directory as CSV.
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

**CONCLUSION**

IN THE GIVEN DATASET WE FIRST APPLIED GENERAL PREPROCESSING TO REMOVE/IMPUTE MISSING VALUES. STANDARDIZATION WAS IMPORTANT AS THE INDEPENDENT FEATURES WERE ON DIFFERENT SCALES.
WE MUST MAKE SURE THAT THE TARGET CLASS IS NOT IMBALANCED, AND IF IT IS, WE MUST HANDLE IT USING AN APPROPRIATE TECHNIQUE.
AMONG THE THREE MODELS APPLIED, XGBOOST WAS FOUND TO BE THE MOST SUCCESSFUL WITH AN F1 SCORE OF AROUND 95%. IN A DATASET LIKE THIS, WHERE THERE IS HIGH CLASS IMBALANCE, THE ACCURACY METRIC SHOULD NOT BE RELIED ON. WE MUST LOOK AT THE CONFUSION MATRIX FOR A CLEAR INSIGHT INTO HOW THE MODEL IS PERFORMING.


IF YOU FIND THIS NOTEBOOK USEFUL THEN PLEASE UPVOTE!!



THANK YOU..