#  Stroke prediction

In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,OrdinalEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier,VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score,confusion_matrix,plot_confusion_matrix,classification_report,accuracy_score,recall_score,precision_score
from imblearn.over_sampling import RandomOverSampler

## Data information

In [None]:
stroke_data=pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
stroke_data.head()

In [None]:
stroke_data.info()

In [None]:
stroke_data_replaced=stroke_data.copy()

In [None]:
# Fill missing values in every numeric column with that column's mean.
# numeric_only=True restricts the mean to numeric columns; without it recent
# pandas raises a TypeError because the frame also holds string columns
# (e.g. gender, smoking_status). Reassigning instead of inplace=True keeps
# the cell's data lineage explicit.
stroke_data_replaced = stroke_data_replaced.fillna(
    value=stroke_data_replaced.mean(axis=0, numeric_only=True)
)

In [None]:
# Re-label the 0/1 indicator columns as 'No'/'Yes' strings so that they end
# up as object-dtype columns.
yes_no_labels = {0: 'No', 1: 'Yes'}
for flag_col in ('hypertension', 'heart_disease', 'stroke'):
    relabelled = stroke_data_replaced[flag_col].replace(yes_no_labels)
    stroke_data_replaced[flag_col] = relabelled

In [None]:
stroke_data_replaced.describe()

In [None]:
stroke_data_replaced.info()

In [None]:
stroke_data_replaced.head()

In [None]:
# Partition the column names by dtype: object-typed columns count as
# categorical, every other column as numerical.
column_names = list(stroke_data_replaced.dtypes.index)
categorical_col = [name for name in column_names
                   if stroke_data_replaced[name].dtype == 'object']
numerical_col = [name for name in column_names
                 if not (stroke_data_replaced[name].dtype == 'object')]

In [None]:
print("Numerical Columns in DataFrame are :",numerical_col, '\n')
print("Categorical Columns in DataFrame are :",categorical_col)

## Visualization

In [None]:
def bar_plot(data,x,y='id',hue='stroke',group='stroke' ,title=None):
    """
        function use to plot barplot by grouping the data

    """
    try:
        group=data.groupby([x,group],as_index=False)['id'].count()
        sns.barplot(data=group,x=x,y=y,hue=hue,)
        plt.title("Stroke count based on "+ x , fontdict={'size' :16,'color':'red'})
        plt.ylabel('Count',fontdict={'size':12,'color':'blue'})
        plt.xlabel(x,fontdict={'size':12,'color':'blue'})
        print('-'*100)
        return display(group) ,plt.show()
    except:
        pass

In [None]:
for col in categorical_col:
    bar_plot(stroke_data_replaced,col)

Things that can be assumed on the basis of above plots:

   - Stroke count based on gender
        * Around 5% of males and 4% of females in the data have had a stroke.

        * This can be an important feature, as a 1% difference is significant.
        
   - Stroke count based on hypertension
        * People diagnosed with hypertension are more likely to have a stroke (~ 13%).

        * This also agrees with prior domain knowledge.
        
   - Stroke count based on heart_disease
        * People with heart disease are at high risk of stroke (~ 17%).

        * This also agrees with prior domain knowledge.
        
   - Stroke count based on marital status
        * 6% of married people have had a stroke.

        * Note also that the data is biased towards married people: there are nearly 2000 more entries for married people.

        * This needs to be investigated further.
        
   - Stroke count based on work type
        * The data contains more records for private-sector jobs than for any other work type.

        * It is possible that people with government jobs are more relaxed than people with private-sector jobs; on the other hand, people with no job show no strokes, even though they would presumably be more worried about getting a job.

        * Children have also tested positive for stroke.

        * In conclusion, stroke does not appear to depend on work type.
        
   - Stroke count based on residence type
        * People living in urban areas have about 1% more strokes than people living in rural areas.

        * Urban areas have higher pollution levels and a busier lifestyle, which can have a great impact on a person's health.
        
   - Stroke count based on smoking status
        * Similarly, people who formerly smoked or currently smoke are more likely to have a stroke than people who never smoked.

        * Nothing can be concluded for people with Unknown smoking status.

Based on the above study, we drop the feature Work Type, as it does not appear to contribute to prediction.

The feature Marital Status needs to be investigated further.

In [None]:
Feature_to_drop=['work_type']

In [None]:
def scatter_plot(data,x,y='stroke',title=None):
    """
    Scatter-plot column ``y`` against column ``x`` of ``data``.

    Parameters
    ----------
    data : DataFrame (or other seaborn-compatible data) holding both columns.
    x : name of the column drawn on the x-axis.
    y : name of the column drawn on the y-axis (default ``'stroke'``).
    title : optional plot title; defaults to "Relationship of <y> v/s <x>".

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    if title is None:
        title = "Relationship of "+ y + " v/s "+ x
    plt.figure(figsize=(10,7))
    plt.title(title, fontdict={'size':18,'color':'red'})
    # Plot the requested y column; it used to be hard-coded to 'stroke', so
    # the y-axis label and the plotted data could disagree.
    sns.scatterplot(data=data,x=x,y=y)
    plt.xlabel(x,fontdict={'size':12,"color":'blue'})
    plt.ylabel(y,fontdict={'size':12,"color":'blue'})
    print('-'*100)
    return plt.show()

In [None]:
for col in numerical_col:
    scatter_plot(stroke_data_replaced,col)

Things that can be assumed on the basis of above plots:
    
   - Relationship of Stroke and id
        - The id feature is irrelevant for prediction, as it adds no information to the data.
        - Every person has a different id, so no pattern can form.
   - Relationship of Stroke and age
        - According to the trend, older people are at higher risk of having a stroke.
   - Relationship of Stroke and average glucose level
        - People with a glucose level around 150 mg/dl appear less prone to stroke,
        - while people with levels above or below this appear more prone to stroke.
   - Relationship of Stroke and average BMI
        - We can't really tell whether this is a good predictor or not.
        - Further investigation is needed.

In [None]:
stroke_data_replaced

In [None]:
# Queue 'id' for removal as well. The membership check keeps the list free of
# duplicates when this cell is executed more than once.
if 'id' not in Feature_to_drop:
    Feature_to_drop.append('id')

In [None]:
# Remove the rejected features. Reassigning (instead of inplace=True) and
# errors='ignore' make the cell re-runnable: dropping columns that are already
# gone would otherwise raise a KeyError.
stroke_data_replaced = stroke_data_replaced.drop(Feature_to_drop, axis=1, errors='ignore')

In [None]:
sns.pairplot(stroke_data_replaced,hue='stroke')

In [None]:
sample=stroke_data_replaced.sample(n=100,random_state=72018,)
sample

In [None]:
def outlier_detection(data,x):
    """
    Draw a box plot of column ``x`` of ``data`` so its outliers can be seen.

    Prints a dashed separator line and returns whatever ``plt.show()`` returns.
    """
    heading_font = {'size':18,'color':'red'}
    axis_font = {'size':12,"color":'blue'}

    plt.figure(figsize=(10,7))
    heading = "Outliers for " + x
    plt.title(heading, fontdict=heading_font)
    sns.boxplot(x=x, data=data)
    plt.xlabel(x, fontdict=axis_font)
    print('-'*100)
    shown = plt.show()
    return shown

In [None]:
# numerical_col was collected before 'id' was dropped from the frame, so that
# name is filtered out before plotting.
for column in filter(lambda name: name != 'id', numerical_col):
    outlier_detection(stroke_data_replaced, column)

From this we can clearly see that the columns bmi and average glucose level have a lot of outliers.
We could handle them with suitable techniques, such as replacing them with the mean or median, or dropping them;
in this case we are leaving these outliers as they are,

in the hope that extreme values of the target variable depend strongly on the pattern of these features.

## Data Transformation

In [None]:
# Boolean mask over the columns: True where the column has object dtype,
# i.e. the columns treated as categorical.
mask = stroke_data_replaced.dtypes == np.object_

In [None]:
# Use the mask to split the column labels: masked (object-dtype) columns are
# categorical, all remaining columns are numerical.
categorical_col = stroke_data_replaced.columns[mask.values]
numerical_col = stroke_data_replaced.columns[~mask.values]

In [None]:
# Split the categorical columns by cardinality: columns with fewer than three
# distinct values are one-hot encoded, the remaining ones (three or more
# values, as seen in the plots) are handled as ordinal.
level_counts = [(name, stroke_data_replaced[name].nunique()) for name in categorical_col]
onehot_categorical_columns = [name for name, levels in level_counts if levels < 3]
ordinal_categorical_columns = [name for name, levels in level_counts if not levels < 3]

In [None]:
# Manual stand-in for OrdinalEncoder: this version of scikit-learn does not
# support the handle_unknown option of OrdinalEncoder, so the labels are
# mapped to integer codes by hand.
converter={'gender':{'Other':0,'Male':1,'Female':2},'smoking_status':{'formerly smoked':1, 'never smoked':2, 'smokes':3, 'Unknown':4}}
for i in ordinal_categorical_columns:
    # Only convert columns that still hold the string labels. Mapping a column
    # that was already converted (cell executed twice) would turn every value
    # into NaN, because the integer codes are not keys of the converter.
    if stroke_data_replaced[i].dtype == np.object_:
        stroke_data_replaced[i]=stroke_data_replaced[i].map(converter[i])

In [None]:
# Shared preprocessing objects used by the column transformer and the
# train/test split helper below.
# Ordinal encoder (disabled: the manual `converter` mapping is used instead)
#OD=OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=5)

# Min-max scaler for the numerical columns
MM=MinMaxScaler()

# Label encoder for the target variable
LE=LabelEncoder()

# One-hot encoder for the low-cardinality categorical columns; categories not
# seen during fit are ignored instead of raising
OHE=OneHotEncoder(handle_unknown='ignore')

In [None]:
#target variable
y=stroke_data_replaced['stroke']

# The target must not be encoded as a feature, so take it out of the one-hot
# list. Filtering instead of list.remove keeps the cell re-runnable: remove()
# raises ValueError once 'stroke' is already gone.
onehot_categorical_columns=[name for name in onehot_categorical_columns if name != 'stroke']

In [None]:
# Column-wise preprocessing applied inside every pipeline:
#   numerical -> min-max scaled
#   ordinal   -> already integer-coded by the manual `converter` mapping, so
#                passed through unchanged
#   onehot    -> one-hot encoded
# ColumnTransformer drops every column that is not listed (its default is
# remainder='drop'). The ordinal columns were integer-coded after
# numerical_col was computed, so they are in neither of the other two lists
# and must be named here explicitly, otherwise they never reach the models.
transformer=ColumnTransformer(transformers=[('numerical',MM,numerical_col),
                                            ('ordinal','passthrough',ordinal_categorical_columns),
                                            ('onehot',OHE,onehot_categorical_columns)])

In [None]:
#feature columns
X=stroke_data_replaced.drop('stroke',axis=1)

In [None]:
#used for oversampling the data as it is highly imbalanced
ROS=RandomOverSampler(random_state=42)

In [None]:
def transform_and_resample_data(X,y):
    """
    Build the modelling splits from the features ``X`` and the target ``y``.

    Steps:
      1. stratified 80/20 train/test split (random_state=42);
      2. label-encode the target with the module-level ``LE`` (fitted on the
         training labels, then applied to the test labels);
      3. over-sample the training split with the module-level ``ROS``.

    The test split is not resampled.

    Returns
    -------
    tuple : (resampled train features, resampled train labels,
             test features, encoded test labels)
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    features_train, features_test, labels_train, labels_test = split

    labels_train = LE.fit_transform(labels_train)
    labels_test = LE.transform(labels_test)

    balanced_features, balanced_labels = ROS.fit_resample(features_train, labels_train)
    return balanced_features, balanced_labels, features_test, labels_test

##  Model training and prediction

In [None]:
def prune_tree(train_X,train_y,transformer):
    """
    Fit a gini decision tree (no depth or feature limits passed) behind
    ``transformer`` and report how large it grew.

    The two numbers are used as upper bounds when building the parameter
    ranges for tuning the decision tree.

    Returns
    -------
    tuple : (max_depth, n_features) read from the fitted tree.
    """
    reference_tree = DecisionTreeClassifier(criterion='gini',random_state=42)
    steps = [('transform',transformer), ('model',reference_tree)]

    # Pipeline fits the step objects themselves, so the tree can be inspected
    # directly afterwards.
    Pipeline(steps=steps).fit(train_X,train_y)

    fitted = reference_tree.tree_
    return fitted.max_depth, fitted.n_features

In [None]:
def model_selection(train_X,train_y,transformer,current=None):
    """
    Return the estimator, its grid-search parameter grid and a display label
    for the model selected by ``current``.

    ``current`` values: 0 = logistic regression, 1 = k-nearest neighbours,
    2 = RBF support vector classifier, 3 = decision tree (its ranges are
    derived from ``prune_tree`` on the training data).

    Raises
    ------
    ValueError : if ``current`` is none of the values above.
    """
    if current == 0:
        grid = {'model__C':[0.001,0.01,0.1,1,10],
                'model__penalty':['l1','l2']}
        return LogisticRegression(solver='liblinear'), grid, 'Logistic Regression'

    if current == 1:
        grid = {'model__n_neighbors':list(range(2,20))}
        return KNeighborsClassifier(weights='distance'), grid, 'KNeighborsClassifier'

    if current == 2:
        grid = {'model__C':[0.001,0.01,0.1,1,10]}
        return SVC(kernel='rbf'), grid, 'SVC'

    if current == 3:
        tree = DecisionTreeClassifier(criterion='entropy',random_state=42)
        depth_cap, feature_cap = prune_tree(train_X,train_y,transformer)
        grid = {'model__max_depth':list(range(1,depth_cap+1,2)),
                'model__max_features':list(range(1,feature_cap))}
        return tree, grid, 'DecisionTreeClassifier'

    raise ValueError('No value passed for current')

In [None]:
def scoring(model , params, true, pred):
    """
    Summarise one model's predictions as a single record.

    Parameters
    ----------
    model : label stored under 'model'.
    params : value stored under 'params'.
    true : original labels.
    pred : predicted labels.

    Returns
    -------
    pd.Series with 'model', 'params', then recall, precision and f1 for the
    positive class (suffix ``_Yes``, pos_label=1) and the negative class
    (suffix ``_No``, pos_label=0), and finally 'accuracy'.
    """
    per_class_metrics = (('recall', recall_score),
                         ('precision', precision_score),
                         ('f1', f1_score))
    class_labels = (('Yes', 1), ('No', 0))

    record = {'model':model, 'params': params}
    for prefix, metric in per_class_metrics:
        for suffix, positive in class_labels:
            record[prefix + '_' + suffix] = metric(true, pred, pos_label=positive)
    record['accuracy'] = accuracy_score(true, pred)

    return pd.Series(record)

In [None]:
# Columns of the final performance table, in display order
performance_columns = ['model','params','recall_Yes','recall_No','precision_Yes','precision_No','f1_Yes','f1_No','accuracy']

#get train and test set
# NOTE(review): the training split is over-sampled before cross-validation, so
# copies of the same minority row can land in both the fitting and the
# validation folds; the CV scores are therefore likely optimistic. The test
# split is not resampled.
train_X,train_y,test_X,test_y = transform_and_resample_data(X,y)

# Scoring metrics for the grid search; the best model is refit on recall ('r').
# Defined once because it does not change between models.
score = {'r':'recall','p':'precision'}

# One record per model. The table is built once after the loop instead of
# calling DataFrame.append per iteration (removed in pandas 2.0, and it copies
# the whole frame on every call).
performance_rows = []

#call the different models for training and testing
for i in range(4):

    model, param, label = model_selection(train_X, train_y, transformer, current=i)
    pipeline = Pipeline(steps = [('transform',transformer),
                             ('model',model)])

    grid = GridSearchCV(pipeline,param,cv=4,scoring=score, refit='r')

    grid.fit(train_X,train_y)

    pred = grid.predict(test_X)

    # tuned estimator (without the preprocessing step), reused later for voting
    para = grid.best_estimator_.named_steps['model']

    performance_rows.append(scoring(label, para, test_y, pred).to_dict())

    string = 'This results are for '+label
    print(19*'* '+string+19*' *')
    print('----- Classification report -----')
    print(classification_report(test_y,pred))
    print('----- Confusion matrix -----')
    print(confusion_matrix(test_y,pred))

#final dataframe which stores the performance of every model
performance = pd.DataFrame(performance_rows, columns=performance_columns)

In [None]:
performance


Observing the table, the recall, precision and accuracy of SVC are higher than those of the other models,

while KNeighborsClassifier performs the worst.

## Voting classifier

In [None]:
# Unpack the four tuned estimators stored in the 'params' column, in training
# order: logistic regression, k-nearest neighbours, SVC, decision tree.
v_1,v_2,v_3,v_4 = performance['params'].values

In [None]:
# Models taking part in the vote: logistic regression, SVC and decision tree.
# The k-nearest-neighbours model (v_2) is left out.
estimators = [('lr',v_1),('svc',v_3),('dt',v_4)]

In [None]:
# Hard-voting ensemble (voting='hard') over the selected estimators, placed
# behind the same column transformer as the individual models.
vote = VotingClassifier(estimators,voting='hard')
pipeline = Pipeline(steps = [('transform',transformer),
                             ('model',vote)])

In [None]:
#fit the model
pipeline.fit(train_X,train_y)

In [None]:
# Predictions made by the voting pipeline on the test split
pred=pipeline.predict(test_X)

In [None]:
#classification report 
print(classification_report(test_y,pred))

In [None]:
# Plot the confusion matrix of the voting pipeline on the test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions the replacement is ConfusionMatrixDisplay.from_estimator - confirm
# against the installed version before upgrading.
plot_confusion_matrix(pipeline,test_X,test_y)

We can see that with voting, the recall, precision and accuracy have actually increased by a fine margin.