# Stroke Visualisation and Prediction

Hello everyone! Welcome to my notebook, where today we will be visualising the different data about strokes, and then trying to predict whether a patient has a stroke or not.

<img src="https://www.cheyenneregional.org/wp-content/uploads/2020/02/strokeservicestoo.jpg" width="500px"/>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from scipy import stats
from collections import Counter
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from plotly.subplots import make_subplots
from sklearn.svm import LinearSVC
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Here we gather our dataset and fill in the missing values using a KNNImputer.

In [None]:
# Read the stroke dataset and discard the 'id' column.
raw = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
df = raw.drop('id', axis=1)

# Fill the missing BMI values. The imputer expects a 2-D array, so 'bmi' is
# passed as a single-column matrix and flattened back to 1-D afterwards.
# NOTE(review): only the 'bmi' column is given to the imputer, so no other
# feature informs the neighbour search -- confirm this is intended.
bmi_matrix = df[['bmi']].to_numpy()
df['bmi'] = KNNImputer().fit_transform(bmi_matrix).ravel()

# Pre-split the patients by outcome; the plotting helpers below read these globals.
has_stroke = df['stroke'] == 1
df_stroke = df[has_stroke]
no_stroke = df[df['stroke'] == 0]
x_axes = ['stroke', 'not_stroke']

df.head()

### Description of dataset
* gender - "Male", "Female" or "Other"
* age - age of patient
* hypertension - whether somebody has hypertension: 1/0
* heart_disease - whether somebody has heart disease: 1 yes, 0 no
* ever_married - "Yes" or "No"
* work_type - type of profession
* Residence_type - "Urban" or "Rural"
* avg_glucose_level - average glucose level of patient
* bmi - BMI of patient
* smoking_status - smoking history of person
* stroke - whether somebody has a stroke: 1 yes, 0 no

In [None]:
def pie(col):
    """Show two pie charts of ``col``, one per outcome group.

    The left pie counts the values of ``col`` among the module-level
    ``no_stroke`` frame, the right pie among ``df_stroke``.

    Args:
        col: name of the column whose value counts are plotted.
    """
    fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "pie"}]])

    # (source frame, trace name, horizontal span) for the left and right pies.
    panels = [
        (no_stroke, 'no stroke', [0, 0.5]),
        (df_stroke, 'with stroke', [0.5, 1.0]),
    ]
    for position, (frame, label, x_span) in enumerate(panels, start=1):
        tally = Counter(frame[col])
        table = pd.DataFrame({'number of people': tally.values(), col: tally.keys()})
        wedge = go.Pie(
            values=table['number of people'],
            labels=table[col],
            domain=dict(x=x_span),
            name=label)
        fig.add_trace(wedge, row=1, col=position)

    legend_font = dict(family="sans-serif", size=20)
    fig.update_layout(legend_title=dict(text=col, font=legend_font))
    fig.show()

def dists(col, num=-3, arr=None):
    """Plot side-by-side bar charts of ``col`` counted in buckets of ten.

    The left chart covers the module-level ``no_stroke`` frame and the right
    chart ``df_stroke``. Each value is assigned to the bucket named after the
    lower bound of its ten-wide interval ('00', '10', ..., '100', '110', ...).

    Args:
        col: name of the numeric column to bucket.
        num: unused; kept so existing calls that pass it keep working.
        arr: mapping whose keys name the buckets to display, in display order.
            Only the keys are used; counts always start from zero. Defaults to
            the ten buckets '00' through '90'.
    """
    if arr is None:
        # Built per call instead of as a shared mutable default argument.
        arr = {'00': 0, '10': 0, '20': 0, '30': 0, '40': 0,
               '50': 0, '60': 0, '70': 0, '80': 0, '90': 0}

    fig = make_subplots(rows=1, cols=2)

    # Left panel: patients without a stroke; right panel: patients with one.
    panels = [(no_stroke, x_axes[1]), (df_stroke, x_axes[0])]
    for position, (data, label) in enumerate(panels, start=1):
        counts = dict.fromkeys(arr, 0)
        for value in data[col]:
            # Zero entries are left out for every column except glucose.
            if col != 'avg_glucose_level' and value == 0.0:
                continue
            # Bucket by the numeric lower bound of the ten-wide interval.
            # Taking the first character of the number instead would put
            # single-digit values in the wrong bar (5 -> '50').
            lower_bound = int(value // 10) * 10
            key = str(lower_bound).zfill(2)
            # A value outside the supplied buckets gets its own bar rather
            # than raising KeyError.
            counts[key] = counts.get(key, 0) + 1
        bars = go.Bar(x=list(counts.keys()), y=list(counts.values()), name=label)
        fig.append_trace(bars, 1, position)

    fig.update_layout(go.Layout(title=col))
    fig.show()

def bin_cols(col):
    """Draw stacked percentage bars for a 0/1 column, one bar per outcome group.

    For each of the module-level ``df_stroke`` and ``no_stroke`` frames, the
    bar is split into the percentage of rows where ``col`` sums in (the 1s)
    and the remainder.

    Args:
        col: name of the binary column to summarise.
    """
    fig = go.Figure()
    for label, data in zip(x_axes, (df_stroke, no_stroke)):
        positive_pct = (data[col].sum()*100)/len(data[col])
        negative_pct = 100-positive_pct
        # The "no <col>" segment is added first so it sits at the bottom of the stack.
        fig.add_trace(go.Bar(x=[label], y=[negative_pct], name='no ' + col))
        fig.add_trace(go.Bar(x=[label], y=[positive_pct], name=col))
    fig.update_layout(go.Layout(title=col, barmode='stack'))
    fig.show()
    
def cat_col(col):
    """Draw stacked percentage bars of a categorical column per outcome group.

    One stacked bar is drawn for the module-level ``df_stroke`` frame and one
    for ``no_stroke``; each segment is the percentage of rows in that group
    holding a given value of ``col``.

    Args:
        col: name of the categorical column to summarise.
    """
    fig = go.Figure()
    for label, data in zip(x_axes, (df_stroke, no_stroke)):
        tally = Counter(data[col])
        group_size = pd.Series(tally.values()).sum()
        for category, occurrences in tally.items():
            share = (occurrences*100)/group_size
            fig.add_trace(go.Bar(x=[label], y=[share], name=category))
    fig.update_layout(go.Layout(title=col, barmode='stack'))
    fig.show()
    
def scatter(col1, col2):
    """Scatter ``col1`` against ``col2`` in two panels, one per outcome group.

    The left panel uses the module-level ``df_stroke`` frame and the right
    panel ``no_stroke``.

    Args:
        col1: column plotted on the x axis.
        col2: column plotted on the y axis.
    """
    panel_specs = [[{"type": "scatter"}, {"type": "scatter"}]]
    fig = make_subplots(rows=1, cols=2, specs=panel_specs)

    for position, data in enumerate([df_stroke, no_stroke], start=1):
        points = go.Scatter(
            x=data[col1],
            y=data[col2],
            mode='markers',
            name=x_axes[position-1])
        fig.add_trace(points, row=1, col=position)

    fig.update_layout(go.Layout(title=col1+' and '+col2))
    fig.show()

# Pie plots

In the following pie charts we will see how each feature in our dataset compares to the patients with and without strokes. Those without strokes are represented by the pie chart on the left, and those with are shown on the right.

## gender

We firstly analyse the gender of our patients, seeing that there isn't much of a difference in our graphs.

In [None]:
# Gender breakdown as pie charts: patients without a stroke (left) vs. with a stroke (right).
pie('gender')

## ever_married

Next, as we visualise whether the people have been married, we can see that the share of unmarried people among patients without strokes is about twice the share among patients with strokes. This could be because people usually have strokes later in life, which also gives them more time to get married.

In [None]:
# Marital-status breakdown as pie charts: patients without a stroke (left) vs. with a stroke (right).
pie('ever_married')

## work_type

Now, we take a look at the type of work which our patients do. Over half of the people in both graphs are in private jobs, followed by self-employed. However, for the cases with strokes, there are more self-employed people and fewer people who devote themselves to their children than those without strokes.

In [None]:
# Work-type breakdown as pie charts: patients without a stroke (left) vs. with a stroke (right).
pie('work_type')

## Residence_type

Subsequently, we visualise whether the patients in our dataset are in urban or rural places.

In [None]:
# Residence-type breakdown as pie charts: patients without a stroke (left) vs. with a stroke (right).
pie('Residence_type')

## smoking_status

Finally, we look at smoking status: the largest group in both cases is people who have never smoked.

In [None]:
# Smoking-status breakdown as pie charts: patients without a stroke (left) vs. with a stroke (right).
pie('smoking_status')

# Bar charts

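Now we switch our attention to bar charts. In the distribution plots (age, bmi and avg_glucose_level) the patients without strokes are on the left graph and those with strokes on the right; in the stacked percentage charts that follow, the stroke group is the left bar and the non-stroke group is the right bar.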

## age

Our age feature tells us that most of the patients who have strokes are in their 50s or older.

In [None]:
# Bucketed age counts as bar charts: patients without a stroke (left) vs. with a stroke (right).
dists('age')

## bmi

Furthermore, we take a look at our BMI. Here we see that most samples have BMIs from 20 to 40.

In [None]:
# Bucketed BMI counts as bar charts: patients without a stroke (left) vs. with a stroke (right).
dists('bmi')

## avg_glucose_level

The majority of our non-stroke examples have average glucose levels from the 60s to the 100s. The same is true for those with strokes, however there is also a significant increase in the glucose levels from 160 to 240.

In [None]:
# Glucose levels get their own bucket labels: '50', '60', ... up to '270',
# instead of the default '00'-'90' buckets.
dists('avg_glucose_level', arr={str(lower): 0 for lower in range(50, 280, 10)})

## hypertension

The graph below tells us that a quarter of the people in our dataset with strokes have hypertension, while less than a tenth of those without a stroke also have hypertension.

In [None]:
# Stacked percentage bars: share of the stroke and not_stroke groups with and without hypertension.
bin_cols('hypertension')

## heart_disease

We can see in our data that almost a fifth of people with strokes have heart disease, while only 5% of those without strokes have heart disease.

In [None]:
# Stacked percentage bars: share of the stroke and not_stroke groups with and without heart disease.
bin_cols('heart_disease')

## ever_married

In [None]:
# Stacked percentage bars of marital status for the stroke and not_stroke groups.
cat_col('ever_married')

## gender

In [None]:
# Stacked percentage bars of gender for the stroke and not_stroke groups.
cat_col('gender')

## work_type

In [None]:
# Stacked percentage bars of work type for the stroke and not_stroke groups.
cat_col('work_type')

## Residence_type

In [None]:
# Stacked percentage bars of residence type for the stroke and not_stroke groups.
cat_col('Residence_type')

## smoking_status

In [None]:
# Stacked percentage bars of smoking status for the stroke and not_stroke groups.
cat_col('smoking_status')

# Scatter graphs

Since we have multiple numerical features in our dataset, we will want to plot them against each other using a scatter graph. Our samples of patients with strokes are on the left, and those without strokes are on the right.

## avg_glucose_level and bmi

Firstly we plot out the average glucose level with the BMI.

In [None]:
# avg_glucose_level (x) against bmi (y): stroke patients in the left panel, non-stroke in the right.
scatter('avg_glucose_level', 'bmi')

## age and avg_glucose_level

Here we can see that while there doesn't seem to be much of a pattern for the glucose level in both groups, the age of our patients with strokes is almost exclusively 50 and older.

In [None]:
# age (x) against avg_glucose_level (y): stroke patients in the left panel, non-stroke in the right.
scatter('age', 'avg_glucose_level')

## age and bmi

Afterwards, we scatter the age and the BMI.

In [None]:
# age (x) against bmi (y): stroke patients in the left panel, non-stroke in the right.
scatter('age', 'bmi')

# Data cleaning

Now we'll switch our attention to cleaning the data in a way to input it into our classifiers.

## Transformations

Here we analyse how the distribution of the 'age', 'avg_glucose_level' and 'bmi' variables change under log transform and box cox. My conclusion is that 'age' works best with log, 'avg_glucose_level' does best with box cox and 'bmi' is good with box cox.

In [None]:
# For each numeric feature, compare the raw values with their log and Box-Cox
# transforms: first as value-frequency bar charts, then as box plots.
for col in ['age', 'avg_glucose_level', 'bmi']:
    # Both transforms are applied to the values shifted up by one.
    shifted = df[col]+1
    variants = [
        ('Normal', df[col], 'lightgreen'),
        ('Log transform', shifted.transform(np.log), 'pink'),
        ('Box Cox', stats.boxcox(shifted)[0], 'skyblue'),
    ]

    # Row of bar charts: how often each distinct value occurs.
    fig, axes = plt.subplots(1, 3, figsize=(15, 6))
    for ax, (label, values, colour) in zip(axes, variants):
        frequencies = Counter(values)
        ax.bar(frequencies.keys(), frequencies.values(), color=colour)
        ax.set_xlabel(label)
        ax.set_title('Distribution of '+col)
    plt.show()

    # Row of horizontal box plots for the same three variants.
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, (label, values, _) in zip(axes, variants):
        sns.boxplot(data=values, orient="h", ax=ax)
        ax.set_xlabel(label)
    plt.show()

## Bins

Another important piece of data cleaning is to split our numerical features into bins: 20 for 'age' and 10 each for 'avg_glucose_level' and 'bmi'.

In [None]:
# Add a '<feature> bin' column for each numeric feature and show how many
# rows fall into each bin. 'age' is cut into 20 steps, the others into 10.
steps_per_feature = {'age': 20, 'avg_glucose_level': 10, 'bmi': 10}
for col, bins in steps_per_feature.items():
    lowest, highest = df[col].min(), df[col].max()
    # Equal-width edges starting at the column minimum.
    edges = np.arange(lowest, highest, (highest-lowest)/bins)
    df[col+' bin'] = np.digitize(df[col], edges)

    count = Counter(df[col+' bin'])
    data = pd.DataFrame({'Bin': count.keys(), 'Number of values': count.values()})
    fig = px.pie(data, names='Bin', values='Number of values', title=col+' bins')
    fig.show()

Now in preparation for our classifiers, we convert our "ever_married" column from categorical to numerical, one-hot encode the remaining categorical columns and apply the previous data transforms. We also use SMOTE to balance the target classes and split the data into training and test sets.

In [None]:
# Turn the "ever_married" labels into integer codes.
df['ever_married'] = LabelEncoder().fit_transform(df['ever_married'])

# Transform the continuous features (values are shifted by one first).
df['avg_glucose_level'] = (df['avg_glucose_level']+1).transform(np.log)
df['age'] = stats.boxcox(df['age']+1)[0]
# This line used to repeat the 'age' transform, which Box-Cox'd age twice and
# left bmi untransformed.
df['bmi'] = stats.boxcox(df['bmi']+1)[0]
# NOTE(review): the notebook text recommends log for 'age' and Box-Cox for
# 'avg_glucose_level', the reverse of what is applied here -- confirm which
# is intended.

# One-hot encode the remaining categorical columns and drop the originals.
for col in ['gender', 'work_type', 'Residence_type', 'smoking_status']:
    df = pd.merge(df, pd.get_dummies(df[col]), left_index=True, right_index=True)
    df = df.drop(col, axis=1)

X, y = df.drop('stroke', axis=1), df['stroke']

# Split BEFORE oversampling. Resampling the whole dataset first would put
# synthetic rows in the test set and let SMOTE interpolate between rows that
# end up on opposite sides of the split, inflating every test metric.
# Stratifying keeps the minority-class share equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# Balance the classes in the training data only; the test set stays real data.
smote = SMOTE()
X_train, y_train = smote.fit_resample(X_train, y_train)

# Predicting data

Here, we train the XGBoost, Linear SVC and Random Forest models on our dataset.

In [None]:
# Names of the four scores collected per model; results[k] holds the values of
# metrics[k], one entry per model in training order.
metrics = ['Model score', 'Cross validation score', 'F1 score', 'ROC AUC score']
results = [[] for _ in metrics]

models = [XGBClassifier(eval_metric='logloss', use_label_encoder=False), LinearSVC(),
          RandomForestClassifier()]
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC needs a ranking score, not hard 0/1 labels. Use the positive-class
    # probability where available; LinearSVC only offers decision_function.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)

    scores = [
        model.score(X_test, y_test),
        # cross_val_score refits clones of the model on each fold, so it is run
        # on the training split; the test split is kept for held-out evaluation.
        cross_val_score(model, X_train, y_train).mean(),
        f1_score(y_test, y_pred),
        roc_auc_score(y_test, y_score),
    ]

    # Pair scores with their result lists by position. Looking the position up
    # with list.index(value) files a score under the wrong metric whenever two
    # scores happen to be equal.
    for bucket, value in zip(results, scores):
        bucket.append(value)
        print(value)

Finally, we evaluate the performance of our models using bar charts.

In [None]:
# One bar chart per metric, comparing the three models.
model_names = ['XGBClassifier', 'LinearSVC', 'RandomForestClassifier']
# Walk metric names and score lists together. results.index(scores) compares
# lists by equality, so it would return the wrong title if two metrics ever
# had identical score lists.
for metric_name, scores in zip(metrics, results):
    data = pd.DataFrame({'Results': scores, 'Metric': metric_name,
                         'Models': model_names})
    fig = px.bar(data, 'Models', 'Results', color='Results')
    fig.update_layout(go.Layout(title=metric_name))
    fig.show()

<img src="https://3.bp.blogspot.com/-BxTaAtexRn4/T3Vn0FgFYtI/AAAAAAAAI1E/NVc9YEYXrnw/s1600/3ojlj8.jpg" width="500px"/>

## Thank you for reading this notebook.
## If you enjoyed this notebook and found it helpful, please give it an upvote and provide feedback, as it would help me make more of these.