In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D


In [None]:
# Read the Kaggle stroke-prediction CSV into `df` and preview the first rows.
csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(csv_path)
df.head()


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of df's columns.
df.describe()

### As you can see, the mean average glucose level is 15 points higher than the median, so the data is right-skewed.


In [None]:
# Class balance of the target: number of rows per value of `stroke`.
df['stroke'].value_counts()

In [None]:
# Boolean row masks reused by later cells: stroke cases and non-stroke cases.
stroke = df['stroke'].eq(1)
non_stroke = df['stroke'].eq(0)

# Summary statistics restricted to the stroke cases.
df.loc[stroke].describe()

In [None]:
# Overlay the average-glucose distributions of non-stroke and stroke patients.
#
# The style and palette are configured BEFORE the figure is created: seaborn
# styles only apply to axes created afterwards, and sns.color_palette() merely
# returns a palette without activating it (sns.set_palette() activates it).
sns.set_style('whitegrid')
sns.set_palette('tab10')

fig, ax = plt.subplots(figsize=(10,8))

# Non-stroke patients use 10 bins; the stroke group keeps seaborn's automatic
# binning, so the two histograms have different bar widths.
sns.histplot(df[non_stroke].avg_glucose_level, bins=10, label='No stroke', ax=ax)
sns.histplot(df[stroke].avg_glucose_level, color='yellow', alpha=.7, label='Stroke', ax=ax)

ax.set_xlabel('Average glucose level')
ax.set_title('Average glucose level distribution by stroke status')
ax.legend();

In [None]:
# Print the frequency of every level for each categorical column.
for col in ['gender','ever_married','work_type','Residence_type','smoking_status']:
    print(col+':\n')
    level_counts = df[col].value_counts()
    print(level_counts)
    print()

##### Convert categorical variables to numbers

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Replace the string-valued categorical columns with ordinal codes, in place.
# The code assigned to each level is its position in `enc.categories_`.
cat_cols = ['gender','ever_married','work_type','Residence_type','smoking_status']

enc = OrdinalEncoder()
df[cat_cols] = enc.fit_transform(df[cat_cols])

# Only a cell's last expression is rendered, so this intermediate result is
# printed explicitly instead of being left as a bare (discarded) expression.
print(df.smoking_status.value_counts())

# Learned levels per column, in the same order as `cat_cols`.
enc.categories_


In [None]:
# Heatmap of the pairwise correlations between df's columns.
corr_matrix = df.corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix)

### Interestingly, age and marital status (`ever_married`) have the highest correlation


In [None]:
# Scatter BMI against age: non-stroke patients are drawn first, stroke
# patients are drawn on top in the next default colour.
fig, ax = plt.subplots(figsize=(10,8))

no_stroke_rows = df[non_stroke]
stroke_rows = df[stroke]

ax.scatter(no_stroke_rows.age, no_stroke_rows.bmi, c=no_stroke_rows.stroke, cmap='icefire')
ax.scatter(stroke_rows.age, stroke_rows.bmi)

ax.set_xlabel('Age')
ax.set_ylabel('BMI')
ax.set_ylim((5,75))
ax.set_title('BMI to Age separating out people who had strokes vs those that haven\'t')

In [None]:
# Violin plots of BMI by stroke status, drawn on one shared axes:
#   * married patients in forestgreen,
#   * never-married patients in yellow, split by the `young` flag.
fig, ax = plt.subplots(figsize=(10,8))
married = df.ever_married == 1
non_married = df.ever_married == 0

# Flag patients younger than 45 (1 = younger than 45, 0 = otherwise).
# Vectorised comparison: equivalent to the previous row-by-row loop but
# independent of the index labels and far faster.
df['young'] = (df.age < 45).astype(int)

# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.violinplot(x=df[married].stroke, y=df[married].bmi, color='forestgreen', ax=ax)
sns.violinplot(x=df[non_married].stroke, y=df[non_married].bmi, color='yellow',
               hue=df[non_married].young, split=True, ax=ax)

# Legend handles listed in the same order as the labels below, so each label
# sits next to the colour of the group it describes (green = married,
# yellow shades = not married).
custom_lines = [Line2D([0], [0], color=(.45, .54, .3, .99), lw=4),   # green
                Line2D([0], [0], color=(.99, .99, .0, .99), lw=4),   # yellow
                Line2D([0], [0], color=(.99, .99, .88,.99), lw=4)]   # pale yellow

ax.legend(custom_lines, ['Married', 'Not Married- Younger than 45','Not Married- Older than 45'])

ax.set_xlabel('stroke')
ax.set_ylabel('BMI')
ax.set_ylim((5,75))
ax.set_title('BMI and marriage distribution with respect to age and stroke status')


Stroke patients who are not married seem to be less frequent than married stroke patients.

In [None]:
# Mean age and BMI of the stroke patients, then the mean BMI of everyone whose
# age exceeds the 35th percentile of the stroke patients' ages.
stroke_patients = df.loc[stroke]
print('average age of stroke patient is:', stroke_patients['age'].mean())
print('average bmi of stroke patient is:', stroke_patients['bmi'].mean())

above_stroke_age = stroke_patients['age'].quantile(.35)
above_df = df['age'] > above_stroke_age  # boolean row mask, not a DataFrame
print('average bmi of patients above 35th percentile of stroke patient age is:',df.loc[above_df, 'bmi'].mean())

In [None]:
# Average BMI and stroke rate (in %) per 10-year age band, keyed by the band's
# upper edge (10, 20, ..., 90).
#
# The stroke rates are stored in `stroke_pct` rather than `stroke`, so the
# boolean mask `stroke` defined earlier in the notebook is not overwritten.
bmi = {}
stroke_pct = {}
for upper in range(10, 100, 10):
    lower = upper - 10
    # Half-open band [lower, upper): ages that fall exactly on a boundary
    # (10, 20, ...) are counted once instead of being dropped from every band.
    band = df.loc[(df.age >= lower) & (df.age < upper)]
    mean_bmi = band.bmi.mean()
    print(f'average bmi of ages {lower} - {upper} is {mean_bmi}')
    bmi[upper] = mean_bmi
    stroke_pct[upper] = band.stroke.mean() * 100

fig, ax = plt.subplots(figsize=(10,8))
# Labels are attached to the lines themselves, so the legend colours always
# match what is drawn.
sns.lineplot(x=list(bmi.keys()), y=list(bmi.values()), label='BMI', ax=ax)
sns.lineplot(x=list(stroke_pct.keys()), y=list(stroke_pct.values()), label='Stroke Percentage', ax=ax)

ax.legend()
ax.set_xlabel('Age group (upper bound, years)')
ax.set_title('Average BMI per age group and Percentage of strokes in Age group')

In [None]:
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Fit two L1-regularised linear regressions on the 0/1 `stroke` target and
# compare their mean squared error on a held-out split:
#   * `lasso`   - Lasso with scikit-learn's default regularisation strength,
#   * `lassoCV` - LassoCV, which selects the strength by cross-validation.
feature_cols = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
                'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
                'smoking_status']

lasso = Lasso(max_iter=5000)
lassoCV = LassoCV(max_iter=5000)

# Drop every row with a missing value once; X and y are then taken from the
# same frame, so they stay row-aligned without any further dropna calls.
df = df.dropna()

X = df[feature_cols]
# 1-D target: scikit-learn expects y of shape (n_samples,), and a single-column
# DataFrame triggers a column-vector conversion warning in LassoCV.
y = df['stroke']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=74,test_size = .22)

lasso.fit(X_train,y_train)
lassoCV.fit(X_train,y_train)

pred = lasso.predict(X_test)
predCV = lassoCV.predict(X_test)

# scikit-learn metric convention: (y_true, y_pred).
score = mean_squared_error(y_test, pred)
scoreCV = mean_squared_error(y_test, predCV)

print('mean squared error of lasso model: {}'.format(score))
print('mean squared error of lasso model with cross validation: {}'.format(scoreCV))
