In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings  
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting and modelling calls below.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sn

# Stroke
**Also called: CVA, cerebrovascular accident**


Damage to the brain from interruption of its blood supply.
A stroke is a medical emergency.
Symptoms of stroke include trouble walking, speaking and understanding, as well as paralysis or numbness of the face, arm or leg

#### Importing Data

In [None]:
# Read the stroke dataset from the Kaggle input directory and display it.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)
data

In [None]:
# Count the missing values in each column.
data.isna().sum()

The `bmi` column has null values, so we fill them with the column mean.

In [None]:
# Mean BMI; pandas skips missing entries by default.
avg = data.bmi.mean()
avg

In [None]:
# Fill missing BMI values with the column mean computed from the data itself,
# rather than a hand-typed rounded constant. Filling with the mean leaves the
# mean unchanged, so re-running this cell is harmless.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

In [None]:
# Summary statistics produced by describe() with its default settings.
data.describe()

The summary shows that the average age of people in the dataset is about 43 years.

In [None]:
# Bar chart of how many rows fall into each work type.
# The column is passed via the x/data keywords: a positional Series is
# deprecated in seaborn 0.11 and read as `data` from 0.12 onward.
sn.countplot(x='work_type', data=data)

So most people in the dataset work in the private sector.

In [None]:
# Bar chart of how many rows fall into each smoking status.
# The column is passed via the x/data keywords: a positional Series is
# deprecated in seaborn 0.11 and read as `data` from 0.12 onward.
sn.countplot(x='smoking_status', data=data)

It shows that the largest group of people has never smoked.

In [None]:
# Bar chart of the target classes (stroke vs. no stroke).
# The column is passed via the x/data keywords: a positional Series is
# deprecated in seaborn 0.11 and read as `data` from 0.12 onward.
sn.countplot(x='stroke', data=data)

There is a large imbalance between the two stroke classes, which needs to be addressed before modelling.

In [None]:
# Bar chart of how many rows fall into each marital-status value.
# The column is passed via the x/data keywords: a positional Series is
# deprecated in seaborn 0.11 and read as `data` from 0.12 onward.
sn.countplot(x='ever_married', data=data)

In [None]:
# Report the smallest and largest average glucose level, one per line.
glucose = data.avg_glucose_level
min_avg_glucose_level, max_avg_glucose_level = min(glucose), max(glucose)
print(min_avg_glucose_level, max_avg_glucose_level, sep='\n')

This column needs to be standardized.

In [None]:
# Age distribution as a density histogram with a KDE overlay.
# histplot replaces the deprecated distplot; stat='density' keeps the
# y-axis on the density scale that distplot used.
sn.histplot(data=data, x='age', kde=True, stat='density')

## Categorical values need to be mapped to numeric codes

In [None]:
# Integer code for each work-type label; labels missing from the map become NaN.
work_type_codes = {
    'Private': 0,
    'Self-employed': 1,
    'Govt_job': 2,
    'children': 3,
    'Never_worked': 4,
}
data['work_type'] = data['work_type'].map(work_type_codes)

In [None]:
# Integer codes for the remaining categorical columns, applied in the same
# column order as before. Labels missing from a map become NaN.
category_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'Residence_type': {'Urban': 0, 'Rural': 1},
    'smoking_status': {'formerly smoked': 0, 'never smoked': 1, 'smokes': 2, 'Unknown': 3},
    'ever_married': {'Yes': 0, 'No': 1},
}
for column, codes in category_codes.items():
    data[column] = data[column].map(codes)

In [None]:
# Display the frame after the categorical columns were replaced by integer codes.
data

In [None]:
# Scatter plot of average glucose level against age.
sn.scatterplot(data=data, x='age', y='avg_glucose_level')

We can see that glucose levels tend to increase as age increases.

In [None]:
# Bar plot of age by heart-disease flag, split by work-type code.
sn.catplot(
    data=data,
    kind="bar",
    x='heart_disease',
    y='age',
    hue="work_type",
)

People who are self-employed tend to have heart disease.

In [None]:
# Bar plot of stroke rate by smoking-status code, split by work-type code.
sn.catplot(
    data=data,
    kind="bar",
    x="smoking_status",
    y="stroke",
    hue="work_type",
)

People who are self-employed have a higher risk of stroke and are vulnerable to other diseases, as they are more stressed and under a lot of tension.

In [None]:
# Box plot of average glucose level for each stroke class.
sn.catplot(
    data=data,
    kind="box",
    x='stroke',
    y="avg_glucose_level",
)

A high glucose level increases the chance of stroke, as high glucose levels can damage blood vessels and nerves.

## Now using Machine Learning Algorithm For Prediction

In [None]:
# Column holding the label to predict (kept as a list, so y is a one-column DataFrame).
target = ['stroke']

# Predictor columns; 'id' is still included here and is removed later on.
features = [
    'id',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'gender',
    'work_type',
    'smoking_status',
]

X, y = data[features], data[target]

In [None]:
# Count the missing values in each feature column.
X.isna().sum()

In [None]:
# Gender labels outside the Male/Female map became NaN during encoding;
# fill them with 1, the code assigned to 'Female'.
# assign() builds a new frame instead of writing into a slice of `data`,
# which avoids pandas' SettingWithCopyWarning.
X = X.assign(gender=X['gender'].fillna(1))

In [None]:
# Re-check the feature columns for missing values after the fill.
X.isna().sum()

Now there are no Null values in our Data

**As we saw earlier, our target is highly imbalanced, so we'll use SMOTE to fix it.**
The SMOTE function oversamples your rare event by using bootstrapping and k-nearest neighbor to synthetically create additional observations of that event. The definition of rare event is usually attributed to any outcome/dependent/target/response variable that happens less than 15% of the time.


In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
x_smote, y_smote = smote.fit_resample(X, y)

In [None]:
X_train,X_test, y_train,y_test=train_test_split(x_smote,y_smote,test_size=0.33,random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [None]:
X_train = X_train.drop(columns=['id'])
X_test = X_test.drop(columns=['id'])

We don't need the `id` column, so we drop it.

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

Standardization is required because the features are on different scales.

**Now we will use logistic regression**

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier with default settings on the training
# data; fit() returns the estimator, which is then displayed.
md = LogisticRegression().fit(X_train, y_train)
md

In [None]:
# Predict class labels for the scaled test features and display them.
y_pred = md.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import f1_score, roc_auc_score,accuracy_score,confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report 
# Store the report under its own name so the imported classification_report
# function is not overwritten by its string result.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# ROC AUC measures how well the model ranks samples, so score it on the
# predicted probability of the positive class rather than on hard 0/1 labels.
# The result gets its own name so the imported sklearn `auc` function is not
# overwritten.
roc_auc = roc_auc_score(y_test, md.predict_proba(X_test)[:, 1])
roc_auc

## Our AUC score is around 81%, which is considered good.