In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw stroke dataset from the read-only Kaggle input directory.
data_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
stroke = pd.read_csv(data_path)

In [None]:
# Preview the first few rows of the raw frame.
stroke.head()

In [None]:
# Summarise column dtypes and non-null counts to spot missing values.
stroke.info()


* There are 12 columns, of which 7 are numerical and 5 are categorical
* Right off the bat we can see that BMI is the only column with Null values, all the other columns don't seem to have any.
* id is a useless column for modeling as it is unique for all 5110 rows. 
* There are 5 object columns (gender, ever_married, work_type, Residence_type, and smoking_status) that will need to be converted before we can create a model.

## Analyzing Categorical Columns

In [None]:
# Number of distinct values per column; low counts indicate categorical columns.
stroke.nunique()

In [None]:
# Treat every column with fewer than 6 distinct values as categorical.
distinct_counts = stroke.nunique()
cat_cols = distinct_counts[distinct_counts < 6].index
cat_cols

* Interestingly, gender has 3 unique values, which should be checked out in case it indicates some sort of missing values.
* Smoking status has 4 unique values that probably need to be checked out

In [None]:
# Print the distinct values of each categorical column.
for col in cat_cols:
    levels = stroke[col].unique()
    print('The unique values in',col,'are:',levels)

* Looking at the values above, all the columns check out so far, although maybe the number of unique values in smoking_status can be reduced  

In [None]:
# Frequency of each gender label, to see how many records carry the third value.
stroke['gender'].value_counts()

In [None]:
# One bar chart per categorical column showing how often each level occurs.
for col in cat_cols:
    level_counts = stroke[col].value_counts()
    ax = level_counts.plot.bar()
    ax.set_title(col)
    plt.show()

* In gender there is only 1 record having the label 'Other'.
* 'never_worked' also has a similar problem where only a few records have that label. 
* I might drop all records that contain these labels just to make the columns more stratified.
* The number of people who have had a stroke is very small.

In [None]:
# Stroke rate, in percent, for every level of each categorical feature.
for col in cat_cols:
    # Skip the target itself: grouping 'stroke' by 'stroke' only yields 0 and 1.
    if col == 'stroke':
        continue
    print('The % of each unique value having a stroke by',col,'is:')
    # The mean of a 0/1 column is a fraction; scale by 100 so it matches the '%' label.
    print(stroke.groupby(col)['stroke'].mean() * 100)
    print('-'*100)

* Funnily enough, married people are more likely to have a stroke! None of the other columns have such stark differences in mean

## Analyzing Numerical Columns

In [None]:
# Summary statistics for the numeric columns; 'id' is excluded because it is only a row identifier.
stroke.drop(columns='id').describe()

* The ages of the patients range from 0.08 to 82 years old. Mean and median are pretty close so there isn't a lot of skew.
* The mean for hypertension is 0.097, so only 9.7% of the 5110 records have high blood pressure. The heart_disease and stroke columns have the same issue. This indicates a class imbalance.
* avg_glucose_level averages around 106 and 75% of the data is below 114, so I'll assume that this was taken after a meal. The normal range after a meal is 70-140.
* Median bmi of 28 is concerning because that means over half the dataset is overweight
* Max of bmi is 97.6 which is wild and should be looked into

In [None]:
# Continuous feature names: every column that is neither categorical nor the 'id' identifier.
# A list comprehension keeps the DataFrame's own column order; the previous set
# difference produced an arbitrary order that could change between kernel restarts.
excluded_cols = set(cat_cols) | {'id'}
int_cols = [col for col in stroke.columns if col not in excluded_cols]
int_cols

In [None]:
# Stroke outcome counts among records whose BMI exceeds 50.
high_bmi = stroke['bmi'] > 50
stroke.loc[high_bmi, 'stroke'].value_counts()

#### According to this dataset, there are 79 people with a BMI of over 50, but only 1 of them has had a stroke. This doesn't seem right to me, since a higher BMI is generally linked to an increased chance of a stroke. A record of the heights/weights of these people would help resolve this issue.

In [None]:
# Correlation heatmap of the numeric columns.
# Only numeric dtypes are passed to corr(): on pandas >= 2.0 calling corr() on a frame
# that still holds string columns (gender, work_type, ...) raises instead of skipping them.
sns.heatmap(stroke.select_dtypes(include='number').corr(),annot=True)

In [None]:
# Oldest age recorded under the 'children' work type.
is_child = stroke['work_type'] == 'children'
stroke.loc[is_child, 'age'].max()

The 'children' work type goes up to the age of 16 and stops there.

In [None]:
# Work-type distribution for records aged exactly 17.
is_seventeen = stroke['age'] == 17
stroke.loc[is_seventeen, 'work_type'].value_counts()

Seems like all (except 3 people) aged 17 have jobs

In [None]:
# Keep every record except those whose work_type is 'Never_worked'.
keep_rows = stroke['work_type'] != 'Never_worked'
stroke_drop = stroke[keep_rows]

In [None]:
# Remove the records whose gender is 'Other'.
# .copy() makes stroke_drop an independent frame, so the column assignments made to it
# later in the notebook do not raise SettingWithCopyWarning on a filtered slice.
stroke_drop = stroke_drop[stroke_drop['gender']!='Other'].copy()

In [None]:
# Count the stroke cases that remain after the row filters.
had_stroke = stroke_drop['stroke'] == 1
stroke_drop.loc[had_stroke, 'stroke'].count()

The code above drops all records of people who have never worked or whose gender is 'Other'. This does not result in a loss of information about strokes, since none of these people have had a stroke.

In [None]:
# Fold 'formerly smoked' into 'smokes' so both share one category.
smoking_relabel = {'formerly smoked':'smokes'}
stroke_drop['smoking_status'] = stroke_drop['smoking_status'].replace(smoking_relabel)

This was done to reduce unnecessary clutter in the number of categories. I combined the people who currently smoke and those who formerly smoked into one category.

In [None]:
# Mean of the 0/1 stroke column (i.e. the stroke rate) for each smoking_status level.
stroke_drop.groupby('smoking_status')['stroke'].mean()

In [None]:
# Summary statistics of bmi in the filtered frame.
stroke_drop['bmi'].describe()

In [None]:
# Isolate the records whose bmi is missing and preview them.
missing_bmi = stroke_drop['bmi'].isna()
bmi_na = stroke_drop[missing_bmi]
bmi_na.head()

In [None]:
# Summary statistics for the rows with missing bmi, for comparison with the full frame.
bmi_na.describe()

* 23% of the people without a bmi have had hypertension, which is much larger than the 9% in the df. 
* 16% had heart disease compared to the 5% in the df. 
* Almost 20% of these people have also had a stroke.

In [None]:
# Mean and median BMI of the hypertension == 1 group (computed on the unfiltered frame).
bmi_by_hypertension = stroke.groupby('hypertension')['bmi']
print('Avg BMI of people with hypertension is:',bmi_by_hypertension.mean().loc[1])
print('Median BMI of people with hypertension is:',bmi_by_hypertension.median().loc[1])

In [None]:
# Mean and median BMI of the stroke == 1 group (computed on the unfiltered frame).
bmi_by_stroke = stroke.groupby('stroke')['bmi']
print('Avg BMI of people who have had a stroke is:',bmi_by_stroke.mean().loc[1])
print('Median BMI of people who have had a stroke is:',bmi_by_stroke.median().loc[1])

In [None]:
# Mean and median BMI of the heart_disease == 1 group (computed on the unfiltered frame).
bmi_by_heart_disease = stroke.groupby('heart_disease')['bmi']
print('Avg BMI of people with heart disease is:',bmi_by_heart_disease.mean().loc[1])
print('Median BMI of people with heart disease is:',bmi_by_heart_disease.median().loc[1])

I will fill in all the missing values in bmi with 28, which is roughly the median BMI in the dataset.

In [None]:
# Impute missing bmi with a fixed value; the notebook's earlier summary puts the median BMI at about 28.
BMI_FILL_VALUE = 28
stroke_drop['bmi'] = stroke_drop['bmi'].fillna(BMI_FILL_VALUE)

In [None]:
# Count remaining missing values per column to confirm the imputation covered everything.
stroke_drop.isna().sum()

No missing values left

In [None]:
# Split the cleaned frame into the target vector and the feature matrix.
target_col = 'stroke'
y = stroke_drop[target_col]
X = stroke_drop.drop(columns=target_col)

In [None]:
# Distinct-value count per feature column, to see how many dummy columns encoding will create.
X.nunique()

In [None]:
# One-hot encode the string-valued columns; numeric columns are passed through unchanged.
X = pd.get_dummies(X)

In [None]:
# Preview the encoded feature matrix.
X.head()

In [None]:
# Drop the identifier plus one dummy from each two-level feature, since the
# remaining dummy already carries the same information.
redundant_cols = ['id','gender_Female','ever_married_No','Residence_type_Rural']
X = X.drop(columns=redundant_cols)

Dropped all the redundant columns because they contain the same information as the columns already in the table

In [None]:
# Skewness of the continuous columns before any transformation.
X[int_cols].skew()

Columns like glucose levels and bmi seem to be pretty skewed, but age is fine

In [None]:
from sklearn.preprocessing import PowerTransformer

In [None]:
# Power transformer used below to reduce skew in the continuous features.
# Box-Cox only accepts strictly positive inputs (see the scikit-learn documentation).
pt = PowerTransformer(method='box-cox')

In [None]:
# Overwrite both skewed columns in X with their Box-Cox-transformed values.
# NOTE(review): the transformer is fitted on every row of X, before the train/test split
# further down, so test rows influence the fitted parameters — consider fitting on the
# training split only. Re-running this cell would transform already-transformed values.
X[['avg_glucose_level','bmi']] = pt.fit_transform(X[['avg_glucose_level','bmi']])

In [None]:
# Skewness of the same columns after the power transform, for comparison.
X[int_cols].skew()

This takes care of the skewness problem and makes our data more normally distributed.
Now it's time to make the model!

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,train_test_split

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the same in
# both splits, and the fixed random_state makes the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

In [None]:
# Logistic regression classifier with the solver's iteration limit set to 1000.
logreg = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training split only.
logreg.fit(X_train,y_train)

In [None]:
# Mean accuracy on each split. The second line reports the held-out test split,
# so it is labelled accordingly (it was previously also labelled 'Training score:').
print('Training score:',logreg.score(X_train,y_train))
print('Testing score:',logreg.score(X_test,y_test))

95% accuracy on our testing set, wow!
Not going to lie, this feels weird because the correlation between the variables was very low. In addition to this, only about 5% of the people in the dataset had a stroke, which is really low too. I was expecting a very bad model.

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Predicted class labels for the held-out test split.
y_pred = logreg.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split.
# zero_division=0 reports 0.0 for a class with no predicted samples without emitting
# UndefinedMetricWarning, which happens here when the model never predicts a stroke.
print(classification_report(y_test,y_pred,zero_division=0))

In [None]:
# Number of test rows the model predicted as stroke (class 1).
y_pred.sum()

And now we see why! Our model isn't predicting any 1s at all. This is definitely because of how unbalanced the dataset is. There are some potential solutions to this and I would like to visit them at another time. 