# Stroke Prediction Dataset
### 11 clinical features for predicting stroke events

https://www.kaggle.com/fedesoriano/stroke-prediction-dataset

1) id: unique identifier    
2) gender: "Male", "Female" or "Other"    
3) age: age of the patient    
4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension    
5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease    
6) ever_married: "No" or "Yes"    
7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"    
8) Residence_type: "Rural" or "Urban"     
9) avg_glucose_level: average glucose level in blood    
10) bmi: body mass index    
11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*    
12) stroke: 1 if the patient had a stroke or 0 if not    
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient    

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Print the first three raw lines of the CSV file to inspect its layout.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
with open(csv_path, 'r') as raw_file:
    preview = [raw_file.readline() for _ in range(3)]
for raw_line in preview:
    print(raw_line)

In [None]:
ds = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv', delimiter=',')

In [None]:
ds.head(3)

In [None]:
ds.drop(['id'], axis=1, inplace=True)

In [None]:
ds.shape

In [None]:
ds.info()

#### gender

In [None]:
# Encode gender as integers (Male -> 0, Female -> 1); any other value is kept.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the attribute-accessed Series: that chained
# in-place form does not update `ds` when pandas Copy-on-Write is active.
ds['gender'] = ds['gender'].replace({'Male': 0, 'Female': 1})

- Male - 0    
- Female - 1

In [None]:
ds.gender.value_counts()

In [None]:
ds.stroke[ds.gender == 'Other']

#### "Other" in gender is an anomaly    
The value is most likely a data-entry accident (the cell was not filled in properly).    
It occurs in a single row, and that row has target (stroke) == 0,    
so this row can be removed.

In [None]:
# Drop the row(s) whose gender is still the un-encoded string 'Other'.
# Selecting by value instead of a hard-coded index label keeps the cell
# correct if the file is re-ordered or the row is already gone.
ds.drop(ds.index[ds.gender == 'Other'], inplace=True)

In [None]:
ds.gender = ds.gender.astype('int32', copy=False)

In [None]:
# Gender counts for each target class, side by side.
# The Series is passed as `x=`: since seaborn 0.12 the only positional
# argument of countplot is `data`.
plt.figure(figsize=(15, 4))
plt.subplot(121)
sns.countplot(x=ds.gender[ds.stroke == 0])
plt.title('stroke = 0')
plt.subplot(122)
sns.countplot(x=ds.gender[ds.stroke == 1])
plt.title('stroke = 1');

#### hypertension

In [None]:
ds.hypertension.unique()

In [None]:
# Hypertension counts for each target class, side by side.
# The Series is passed as `x=`: since seaborn 0.12 the only positional
# argument of countplot is `data`.
plt.figure(figsize=(15, 4))
plt.subplot(121)
sns.countplot(x=ds.hypertension[ds.stroke == 0].sort_values())
plt.title('stroke = 0')
plt.subplot(122)
sns.countplot(x=ds.hypertension[ds.stroke == 1].sort_values())
plt.title('stroke = 1')
plt.show()

#### heart_disease

In [None]:
ds.heart_disease.unique()

In [None]:
# Heart-disease counts for each target class, side by side.
# The Series is passed as `x=`: since seaborn 0.12 the only positional
# argument of countplot is `data`.
plt.figure(figsize=(15, 4))
plt.subplot(121)
sns.countplot(x=ds.heart_disease[ds.stroke == 0].sort_values())
plt.title('stroke = 0')
plt.subplot(122)
sns.countplot(x=ds.heart_disease[ds.stroke == 1].sort_values())
plt.title('stroke = 1')
plt.show()

#### ever_married

In [None]:
ds.ever_married.unique()

- Yes - 1     
- No - 0

In [None]:
# Encode ever_married as integers (Yes -> 1, No -> 0).
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the attribute-accessed Series: that chained
# in-place form does not update `ds` when pandas Copy-on-Write is active.
ds['ever_married'] = ds['ever_married'].replace({'Yes': 1, 'No': 0})

In [None]:
ds.ever_married = ds.ever_married.astype('int32', copy=False)

In [None]:
ds.ever_married.value_counts()

In [None]:
# Marital-status counts for each target class, side by side.
# The Series is passed as `x=`: since seaborn 0.12 the only positional
# argument of countplot is `data`.
plt.figure(figsize=(15, 4))
plt.subplot(121)
sns.countplot(x=ds.ever_married[ds.stroke == 0])
plt.title('stroke = 0')
plt.subplot(122)
sns.countplot(x=ds.ever_married[ds.stroke == 1])
plt.title('stroke = 1');

#### work_type

In [None]:
ds.work_type.hist();

In [None]:
ds.work_type[ds.work_type == 'Never_worked'].shape

In [None]:
# Work-type counts for each target class, side by side.
# Values are sorted first so both panels list the categories in the same order.
# The Series is passed as `x=`: since seaborn 0.12 the only positional
# argument of countplot is `data`.
plt.figure(figsize=(15, 4))
plt.subplot(121)
sns.countplot(x=ds.work_type[ds.stroke == 0].sort_values())
plt.title('stroke = 0')
plt.subplot(122)
sns.countplot(x=ds.work_type[ds.stroke == 1].sort_values())
plt.title('stroke = 1')
plt.show()

- Govt_job - 0     
- Never_worked - 1     
- Private - 2    
- Self-employed - 3    
- children - 4

In [None]:
# Overall work-type counts. Passed as `x=` because since seaborn 0.12 the
# only positional argument of countplot is `data`.
sns.countplot(x=ds.work_type);

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
ord_enc = OrdinalEncoder()

In [None]:
ds['work_type'] = ord_enc.fit_transform(ds[['work_type']])

#### Residence_type

In [None]:
# Overall residence-type counts. Passed as `x=` because since seaborn 0.12
# the only positional argument of countplot is `data`.
sns.countplot(x=ds.Residence_type);

- Urban - 0    
- Rural - 1

In [None]:
# Encode Residence_type as integers (Urban -> 0, Rural -> 1).
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the attribute-accessed Series: that chained
# in-place form does not update `ds` when pandas Copy-on-Write is active.
# The explicit cast mirrors the gender / ever_married cells and fails loudly
# if an unmapped string is left in the column.
ds['Residence_type'] = (
    ds['Residence_type'].replace({'Urban': 0, 'Rural': 1}).astype('int32')
)

In [None]:
# Residence-type counts for each target class, side by side.
# The Series is passed as `x=`: since seaborn 0.12 the only positional
# argument of countplot is `data`.
plt.figure(figsize=(15, 4))
plt.subplot(121)
sns.countplot(x=ds.Residence_type[ds.stroke == 0])
plt.title('stroke = 0')
plt.subplot(122)
sns.countplot(x=ds.Residence_type[ds.stroke == 1])
plt.title('stroke = 1')
plt.show()

#### BMI

In [None]:
plt.figure(figsize=(12, 6))
plt.subplot(221)
plt.hist(ds.bmi[ds.stroke == 0], bins=100)
plt.subplot(223)
plt.hist(ds.bmi[ds.stroke == 1], bins=100)
plt.subplot(122)
sns.boxplot(data=ds, y='stroke', x='bmi', orient='h')
plt.show()

#### smoking_status

In [None]:
# Smoking-status counts for each target class, side by side.
# Values are sorted first so both panels list the categories in the same order.
# The Series is passed as `x=`: since seaborn 0.12 the only positional
# argument of countplot is `data`.
plt.figure(figsize=(15, 4))
plt.subplot(121)
sns.countplot(x=ds.smoking_status[ds.stroke == 0].sort_values())
plt.title('stroke = 0')
plt.subplot(122)
sns.countplot(x=ds.smoking_status[ds.stroke == 1].sort_values())
plt.title('stroke = 1')
plt.show()

In [None]:
# Encode smoking_status as integers:
# Unknown -> 0, formerly smoked -> 1, never smoked -> 2, smokes -> 3.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the attribute-accessed Series: that chained
# in-place form does not update `ds` when pandas Copy-on-Write is active.
# The explicit cast mirrors the gender / ever_married cells and fails loudly
# if an unmapped string is left in the column.
ds['smoking_status'] = (
    ds['smoking_status']
    .replace({'Unknown': 0, 'formerly smoked': 1, 'never smoked': 2, 'smokes': 3})
    .astype('int32')
)

- Unknown - 0    
- formerly smoked - 1           
- never smoked - 2      
- smokes - 3

In [None]:
ds.head()

In [None]:
sns.pairplot(ds, hue='stroke');

In [None]:
# Heatmap of the number of missing values per column
# (ds.isnull().sum() turned into a one-column frame so it can be annotated).
plt.title('null is data', fontweight='bold')
sns.heatmap(ds.isnull().sum().to_frame(), annot=True, fmt='d', cmap='vlag')
plt.show()

In [None]:
# Class balance of the target: share of each class (pie) and raw counts (bars).
plt.figure(figsize=(15, 6))
plt.subplot(121)
plt.pie(ds.stroke.value_counts(), explode=(0.25, 0.0), autopct='%1.1f%%', startangle=60)
plt.title('Pie stroke')
plt.subplot(122)
# Passed as `x=`: since seaborn 0.12 the only positional argument of
# countplot is `data`.
sns.countplot(x=ds.stroke)
plt.show()

In [None]:
ds.stroke.value_counts(normalize=True)

In [None]:
ds.stroke[ds.stroke == 1].count()

#### Where is null

In [None]:
# Target distribution among the rows whose bmi is missing.
# isnull() selects those rows directly, so no sentinel value (9999) is needed;
# the Series is passed as `x=` because since seaborn 0.12 the only positional
# argument of countplot is `data`.
sns.countplot(x=ds.stroke[ds.bmi.isnull()]);

#### correlation

In [None]:
plt.figure(figsize=(12, 6))
sns.heatmap(ds.corr(), annot=True, fmt='.2f');

In [None]:
plt.figure(figsize=(18, 6))
# One box plot per numeric feature, split by the target class.
col = ['age', 'avg_glucose_level', 'bmi']

for position, feature in enumerate(col, start=1):
    plt.subplot(1, len(col), position)
    sns.boxplot(data=ds, x='stroke', y=feature)
    plt.title(f'--- {feature} ---')

#### fillna -> bmi

In [None]:
def replace_nan(data, to_replace, replacement_data, random_state=None):
    """Fill missing values of one column with random integers, group by group.

    Rows are grouped by the distinct values of ``replacement_data``. Inside
    each group, missing entries of ``to_replace`` are replaced by integers
    drawn uniformly from ``[Q1 - 0.5 * IQR, Q3 + 0.5 * IQR)``, where Q1 / Q3
    are the group's 25% / 75% quantiles of ``to_replace`` and IQR = Q3 - Q1.
    Both bounds are truncated to ``int``; the upper bound is exclusive.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is not modified.
    to_replace : str
        Name of the numeric column whose NaNs are filled.
    replacement_data : str
        Name of the grouping column (for example a binary target).
    random_state : int, optional
        Seed for reproducible draws. When omitted, NumPy's global random
        state is used.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``data`` in which the NaNs of ``to_replace`` are filled
        for every row whose grouping value is not missing.

    Raises
    ------
    ValueError
        If a group has missing values to fill but no observed values to
        derive the sampling range from.
    """
    filled = data.copy(deep=True)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    column_is_null = filled[to_replace].isnull()
    # Sorted so groups are always processed in the same order, which keeps
    # seeded output stable.
    groups = sorted(filled[replacement_data].dropna().unique())

    for group in groups:
        in_group = filled[replacement_data] == group
        to_fill = in_group & column_is_null
        count = int(to_fill.sum())
        if count == 0:
            continue

        observed = filled.loc[in_group & ~column_is_null, to_replace]
        if observed.empty:
            raise ValueError(
                'cannot fill {!r}: group {!r} of {!r} has no observed values'.format(
                    to_replace, group, replacement_data
                )
            )

        q1 = observed.quantile(0.25)
        q3 = observed.quantile(0.75)
        # Compute the margin once from the untouched quartiles so the range is
        # widened by the same amount on both sides.
        margin = (q3 - q1) * 0.5
        low = int(q1 - margin)
        # A constant (or very narrow) group would give an empty range; fall
        # back to the single value `low` in that case.
        high = max(int(q3 + margin), low + 1)

        # Single .loc write on the copy; avoids chained assignment.
        filled.loc[to_fill, to_replace] = rng.choice(np.arange(low, high), count)

    return filled

In [None]:
ds = replace_nan(ds, 'bmi', 'stroke')

In [None]:
ds.info()

In [None]:
ds.head(3)

In [None]:
ds.columns

In [None]:
columns_numeric = ['age', 'avg_glucose_level', 'bmi']
columns_categorical = ['gender', 'hypertension', 'heart_disease', 'ever_married', 
                       'work_type', 'Residence_type', 'smoking_status']
columns_target = ['stroke']

### small conclusions      
- People who have hypertension and heart disease are predisposed to stroke.     
- The data shows that single people are less susceptible to stroke, although this is possibly because in adulthood almost all people have families, and single people are usually young.        
- Age data shows that people are susceptible to stroke from the age of 40 to the end of their lives.           
- The influence of the profession. The data show that self-employed people are more likely to have a stroke.      
- Type of residence. Urban residents are slightly more susceptible to stroke than rural residents.       
- BMI - has almost no effect on the disease.       
- Smoking. Former smokers are more susceptible to stroke, although it is possible that they quit because they already had health problems, in particular with the heart, which inflates the share of this factor.      