# IMPORTS

In [None]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score

# LOAD DATA

In [None]:
# Read the stroke dataset CSV; the path is relative to the notebook's working directory.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
risk_avc_df = pd.read_csv(csv_path)

# DATA DESCRIPTION

In [None]:
# Preview the first five rows of the raw data
risk_avc_df.head()

In [None]:
# Same first-five-rows preview of risk_avc_df
risk_avc_df.head()

## Data Dimension

In [None]:
# DataFrame.shape is a (rows, columns) tuple; unpack it once and report both.
n_rows, n_cols = risk_avc_df.shape
print('number of rows: {}'.format(n_rows))
print('number of cols: {}'.format(n_cols))

## DataTypes

In [None]:
# Show the dtype pandas inferred for each column
risk_avc_df.dtypes

In [None]:
# The id column is not relevant to the analysis, so remove it.
# Rebinding the result with errors='ignore' keeps this cell safe to re-run:
# an in-place drop raises KeyError once 'id' has already been removed.
risk_avc_df = risk_avc_df.drop(columns=['id'], errors='ignore')

In [None]:
# Cast age to integer. astype(int) truncates toward zero, so any fractional
# ages lose their decimal part.
risk_avc_df = risk_avc_df.astype({'age': int})
risk_avc_df.head()

##  Check NA

In [None]:
# Count missing values per column
risk_avc_df.isna().sum()

In [None]:
# Impute missing BMI values with the column mean. The result is assigned back
# to the column instead of calling fillna(inplace=True) on an attribute-accessed
# Series: that chained in-place form is deprecated and does not modify the
# DataFrame when pandas copy-on-write is active.
risk_avc_df['bmi'] = risk_avc_df['bmi'].fillna(risk_avc_df['bmi'].mean())
risk_avc_df.head()

In [None]:
# Count each class once, then report absolute numbers and rounded percentages.
total_rows = len(risk_avc_df)
stroke_rows = len(risk_avc_df[risk_avc_df['stroke'] == 1])
no_stroke_rows = len(risk_avc_df[risk_avc_df['stroke'] == 0])

print('Total = ', total_rows)
print('Total people with stroke: = ', stroke_rows)
print('percentage with Stroke: =', round((stroke_rows / total_rows) * 100))
print('Total people without Stroke =', no_stroke_rows)
print('percentage without Stroke = ', round((no_stroke_rows / total_rows) * 100))

In [None]:
# Re-check the column dtypes of risk_avc_df at this point in the notebook
risk_avc_df.dtypes

## Descriptive Statistics

In [None]:
# Separate numerical and categorical variables for further analysis.
numeric_cols = ['age', 'avg_glucose_level', 'bmi']
categorical_cols = [
    'gender', 'hypertension', 'heart_disease', 'ever_married',
    'work_type', 'Residence_type', 'smoking_status',
]

num_attributes = risk_avc_df[numeric_cols]
cat_attributes = risk_avc_df[categorical_cols]

In [None]:
# Show one randomly chosen row of the categorical columns
cat_attributes.sample()

In [None]:
# Show one randomly chosen row of the numerical columns
num_attributes.sample()

# FEATURE ENGINEERING

In [None]:
# Work on an independent (deep) copy so later changes do not touch risk_avc_df
df1_avc = risk_avc_df.copy(deep=True)

In [None]:
# Preview the first five rows of the copy
df1_avc.head()

# EXPLORATORY DATA ANALYSIS

In [None]:
# Independent (deep) copy of df1_avc for the exploratory analysis stage
df2_avc = df1_avc.copy(deep=True)

In [None]:
# Preview the first five rows of the copy
df2_avc.head()

## Numerical Variable

In [None]:
# One histogram per numerical column; the trailing ';' suppresses the axes repr
num_attributes.hist(bins=25, figsize=(20,10));

In [None]:
# Number of rows per value of the stroke target
plt.figure(figsize=(15, 5))
sns.countplot(data=risk_avc_df, x='stroke', palette='Set2');

## BIVARIATE ANALYSIS

In [None]:
# Box plot of age for each gender category
plt.figure(figsize=(15, 5))
sns.boxplot(y=cat_attributes['gender'], x=num_attributes['age']);

In [None]:
# Row counts per gender, split by the stroke target
plt.figure(figsize=(15, 5))
sns.countplot(data=cat_attributes, x='gender', hue=risk_avc_df['stroke'], palette='Set2');

**1)** Most people who have had a stroke work in the private sector, which is also where most people in the dataset are employed.

**2)** Next comes self-employed workers

In [None]:
# Row counts per work type, split by the stroke target
plt.figure(figsize=(15, 5))
sns.countplot(data=cat_attributes, x='work_type', hue=risk_avc_df['stroke'], palette='Set2');

In [None]:
# Row counts per residence type
plt.figure(figsize=(15, 5))
sns.countplot(data=cat_attributes, x='Residence_type', palette='Set2');

People in the dataset are distributed roughly equally between urban and rural areas.

In [None]:
# Row counts per marital status, split by the stroke target
plt.figure(figsize=(15, 5))
sns.countplot(data=cat_attributes, x='ever_married', hue=risk_avc_df['stroke'], palette='Set2');

Most people who have had a stroke are married. (A maioria das pessoas que tiveram AVC são casadas.)

In [None]:
# Row counts per smoking status, split by the stroke target
plt.figure(figsize=(15, 5))
sns.countplot(data=cat_attributes, x='smoking_status', hue=risk_avc_df['stroke'], palette='Set2');

In absolute counts, people who never smoked had more strokes than people who smoke or have smoked in the past. However, a large share of records have an unknown smoking status, which may or may not change this picture.

## Numerical Attributes


In [None]:
# Pearson correlation matrix of the numerical columns, drawn as an annotated heatmap
plt.figure(figsize=(10, 5))
correlation = num_attributes.corr(method='pearson')
sns.heatmap(data=correlation, annot=True);

## Categorical Attributes

In [None]:
# Pearson correlation is only defined for numeric data, so restrict the frame
# to its numeric columns first. Calling corr() directly on a frame that also
# holds string columns raises on pandas >= 2.0 (older versions silently
# dropped the non-numeric columns, which is the behavior reproduced here).
plt.figure(figsize=(10,5))
correlation = cat_attributes.select_dtypes(include='number').corr( method='pearson' )
sns.heatmap( correlation, annot=True );

# PRE-PROCESSING THE DATA

In [None]:
# Preview the first three rows before pre-processing
risk_avc_df.head(3)

## CREATING DUMMY VARIABLES

Dummy (one-hot) variables let the model capture the difference in expected value between categories; that is, each model coefficient (beta) represents the average effect associated with a given category.

In [None]:
# Preview the first two rows of the categorical columns
cat_attributes.head(2)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Learn the categories, then replace cat_attributes with the dense one-hot matrix.
# Because cat_attributes is overwritten, re-running this cell would encode the
# already-encoded matrix again.
onehotencoder = OneHotEncoder()
onehotencoder.fit(cat_attributes)
cat_attributes = onehotencoder.transform(cat_attributes).toarray()

In [None]:
# Wrap the encoded array in a DataFrame; its columns get default integer labels
cat_attributes = pd.DataFrame(data=cat_attributes)

In [None]:
# Preview the first three rows of cat_attributes
cat_attributes.head(3)

In [None]:
# Join the categorical and numerical columns side by side into one feature matrix.
x = pd.concat([cat_attributes, num_attributes], axis=1)
# The two inputs can carry different kinds of column labels (integer positions
# vs. string names). scikit-learn >= 1.2 refuses DataFrames whose column names
# mix types, so normalise every label to a string.
x.columns = x.columns.astype(str)

In [None]:
# Preview the first five rows of the combined feature matrix
x.head()

## PUTTING VALUES ON THE SAME SCALE

Scaling the features to a common range prevents the algorithm from being biased towards variables with a larger order of magnitude.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range; x becomes a NumPy array afterwards.
# NOTE(review): the scaler is fitted on every row of x. If x is split into
# train/test afterwards, the test rows influence the scaling - consider
# fitting on the training rows only.
scaler = MinMaxScaler()
scaler.fit(x)
x = scaler.transform(x)

In [None]:
# Display the scaled feature matrix
x

In [None]:
# Target vector: the stroke column of the cleaned DataFrame
y = risk_avc_df['stroke']
y

## SPLITTING THE DATA INTO TRAIN/TEST

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; stratify=y keeps the class proportions
# of y in both partitions. A fixed random_state makes the split - and every
# metric computed from it - reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.25, random_state=42
)

# TESTING UNDERSAMPLING

In [None]:
# Number of rows per value of the stroke target
plt.figure(figsize=(15, 5))
sns.countplot(data=risk_avc_df, x='stroke', palette='Set2');

## As we can see in the graph, the dataset is imbalanced

In [None]:
# Use the under-sampling technique on the training set only.
# A fixed random_state makes the sampled subset, and the model trained on it,
# reproducible across kernel restarts.
rus = RandomUnderSampler(random_state=42)
x_res, y_res = rus.fit_resample(x_train, y_train)

# see the balance of classes
print(pd.Series(y_res).value_counts())

# plot the new class distribution; the labels are passed as x= because
# current seaborn releases do not interpret a positional vector as x
sns.countplot(x=y_res);

# LOGISTIC REGRESSION AFTER UNDER-SAMPLING

In [None]:
# Logistic regression classifier with scikit-learn's default hyperparameters
model_res = LogisticRegression()

In [None]:
# Train on the under-sampled training data
model_res.fit(x_res, y_res)

In [None]:
# make predictions on top of test data
y_pred_res = model_res.predict(x_test)
y_proba_res = model_res.predict_proba(x_test)

# print model accuracy
print("Acurácia: {:.4f}\n".format(accuracy_score(y_test, y_pred_res)))

# plot the confusion matrix; fmt='d' renders the cell counts as plain integers
# instead of seaborn's default '.2g' format, which abbreviates larger counts
# to scientific notation (e.g. 1.2e+03)
cm = confusion_matrix(y_test, y_pred_res)
sns.heatmap(cm, annot=True, fmt='d');

# print classification report
print("Relatório de Classificação:\n", classification_report(y_test, y_pred_res, digits=4))

# LOGISTIC REGRESSION BEFORE UNDER-SAMPLING

In [None]:
# Logistic regression classifier with scikit-learn's default hyperparameters
lr = LogisticRegression()

In [None]:
# Train on the original training split (no resampling applied)
lr.fit(x_train, y_train)

In [None]:
# Predicted class labels for the test split
y_lr_predictic = lr.predict(x_test)

In [None]:
# Generating confusion matrix; fmt='d' renders the cell counts as plain
# integers instead of seaborn's default '.2g' format, which abbreviates
# larger counts to scientific notation (e.g. 1.2e+03)
cm = confusion_matrix(y_test, y_lr_predictic)
sns.heatmap(cm, annot=True, fmt='d');

In [None]:
# Per-class precision, recall, F1 and support for the model trained without resampling
print(classification_report(y_test, y_lr_predictic)); # View all the metrics together