In [None]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
# Silence every warning for the rest of the notebook. Note that this also hides
# deprecation warnings raised by the pandas / seaborn calls used further down.
warnings.filterwarnings("ignore")
# Apply seaborn's default theme, then switch to the colour-blind friendly palette.
sns.set()
sns.set_palette('colorblind')

In [None]:
# Read the stroke dataset from the CSV file into a DataFrame
data = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

# Work on a copy so that the frame read from disk (`data`) is left untouched
data_new = data.copy()

# data_new is already a DataFrame (read_csv returns one); this simply exposes it as df
df = pd.DataFrame(data_new)

## 1. Let's understand our data

In [None]:
# (number of rows, number of columns) of the loaded dataset
print(df.shape)

In [None]:
# Preview the first 15 rows
df.head(15)

In [None]:
# Summary statistics, transposed so that each variable is shown as one row
df.describe().transpose()

### Following are the insights gathered from the current dataframe

1. We have <b>12</b> variables and <b>5110</b> entries.
2. Average <b>Age</b> of an individual is <b>43 yrs</b>.
3. The mean <b>Average Glucose Level</b> in a person is recorded approximately as <b>106.15</b>.
4. The average <b>BMI</b> of a person is recorded approximately as <b>29</b>.

## 2. Data Categorization

1. We would categorize the existing variables of our existing dataframe into numerical and categorical variables.
2. So, let's analyse the datatypes of all the variables and then we would categorize them.

In [None]:
# Column dtypes and non-null counts, used below to split the variables by type
df.info()

In [None]:
# Split the dataframe by dtype: numeric columns on one side, text (object) columns on the other
df_num = df.select_dtypes(include = "number")
df_cat = df.select_dtypes(include = "object")

In [None]:
# First rows of the categorical (object-dtype) subset
df_cat.head()

In [None]:
# First rows of the numeric subset
df_num.head()

## 3. EDA(Exploratory Data Analysis)

1. We now begin our journey of EDA by first analysing the categorical variables.
2. First we would check if there are any null values present in any variable of the <b>df_cat</b> dataframe.

In [None]:
# Number of missing values in each categorical column
df_cat.isnull().sum()

## 3.1 Univariate Analysis

## 1) Analysis of unique values and their count for each variable of the df_cat dataframe.  

In [None]:
# Number of distinct gender values, followed by the values themselves
print('Unique values gender count: ', df_cat['gender'].nunique()) 
print('gender values: ', df_cat['gender'].unique())

In [None]:
# Number of distinct ever_married values, followed by the values themselves
print('Unique values ever_married count: ', df_cat['ever_married'].nunique()) 
print('ever_married values: ', df_cat['ever_married'].unique())

In [None]:
# Number of distinct work_type values, followed by the values themselves
print('Unique values work_type count: ', df_cat['work_type'].nunique())
print('work_type values: ', df_cat['work_type'].unique())

In [None]:
# Number of distinct Residence_type values, followed by the values themselves
print('Unique values Residence_type count: ', df_cat['Residence_type'].nunique())
print('Residence_type values: ', df_cat['Residence_type'].unique())

In [None]:
# Number of distinct smoking_status values, followed by the values themselves
print('Unique values smoking_status count: ', df_cat['smoking_status'].nunique())
print('smoking_status values: ', df_cat['smoking_status'].unique())

### a) Analysis of count of each category of the categorical variables of df_cat dataframe.

In [None]:
# Count of entries per gender, most frequent first.
# Uses the Series method because the top-level pd.value_counts function is deprecated.
df_cat['gender'].value_counts()

In [None]:
# Count of entries per ever_married value, most frequent first.
# Uses the Series method because the top-level pd.value_counts function is deprecated.
df_cat['ever_married'].value_counts()

In [None]:
# Count of entries per work_type, most frequent first.
# Uses the Series method because the top-level pd.value_counts function is deprecated.
df_cat['work_type'].value_counts()

In [None]:
# Count of entries per Residence_type, most frequent first.
# Uses the Series method because the top-level pd.value_counts function is deprecated.
df_cat['Residence_type'].value_counts()

In [None]:
# Count of entries per smoking_status, most frequent first.
# Uses the Series method because the top-level pd.value_counts function is deprecated.
df_cat['smoking_status'].value_counts()

### b) ever_married variable analysis to find out the percentage of married and unmarried people.

In [None]:
# Share of married vs. unmarried people.
# The slice labels are taken from the counts' own index so that each label is
# guaranteed to belong to its slice (a hand-written label list silently breaks
# if the frequency order ever changes).
married_counts = df_cat['ever_married'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(married_counts,
        labels = married_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,   # numeric: matplotlib documents rotation as a float (degrees)
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of people married', fontsize = 18, fontweight = 'bold')
plt.show()

### c) work_type variable analysis to find out the percentage of people employed in different sectors.

In [None]:
# Share of people per work_type.
# The slice labels are taken from the counts' own index so that each label is
# guaranteed to belong to its slice (a hand-written label list silently breaks
# if the frequency order ever changes).
work_type_counts = df_cat['work_type'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(work_type_counts,
        labels = work_type_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,   # numeric: matplotlib documents rotation as a float (degrees)
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of people working in different sectors', fontsize = 18, fontweight = 'bold')
plt.show()

### d) Residence_type variable analysis to find out the percentage of people staying in Urban and Rural areas.

In [None]:
# Share of people per Residence_type.
# The slice labels are taken from the counts' own index so that each label is
# guaranteed to belong to its slice (a hand-written label list silently breaks
# if the frequency order ever changes).
residence_counts = df_cat['Residence_type'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(residence_counts,
        labels = residence_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,   # numeric: matplotlib documents rotation as a float (degrees)
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of people staying in different areas', fontsize = 18, fontweight = 'bold')
plt.show()

### e) smoking_status variable analysis to find out the percentage of different smoking categories. 

In [None]:
# Share of people per smoking_status.
# The slice labels are taken from the counts' own index so that each label is
# guaranteed to belong to its slice (a hand-written label list silently breaks
# if the frequency order ever changes).
smoking_counts = df_cat['smoking_status'].value_counts()

plt.figure(figsize = (10, 8))
plt.pie(smoking_counts,
        labels = smoking_counts.index,
        autopct = '%.2f%%',
        textprops = {'size' : 'x-large',
                     'fontweight' : 'bold',
                     'rotation' : 30,   # numeric: matplotlib documents rotation as a float (degrees)
                     'color' : 'w'})

plt.legend()
plt.title('Percentage of people of different smoking categories', fontsize = 18, fontweight = 'bold')
plt.show()

### Following are the insights gathered from the df_cat dataframe

1. <b>Maximum entries</b> are of <b>females</b> as compared to <b>males</b>.
2. <b>65.62%</b> of the total population is <b>married</b> which amounts to a count of <b>3353</b>.
3. <b>34.38%</b> of the total population is <b>unmarried</b> which amounts to a count of <b>1757</b>.
4. <b>57.24%</b> people are <b>Private</b> sector employees.
5. <b>16.03%</b> people are <b>Self-employed</b>.
6. <b>13.44%</b> of the total population comprises of <b>children</b>.
7. <b>12.86%</b> people are <b>Government</b> job employees.
8. <b>0.43%</b> of the population have <b>never worked</b> at all.
9. People staying in <b>Urban</b> and <b>Rural</b> areas are <b>approximately same</b>.
10. <b>37.03%</b> people have <b>never smoked</b> in their life.
11. The <b>smoking status</b> of <b>30.22%</b> of the total population is <b>unknown</b>.
12. <b>17.32%</b> people had <b>smoked earlier</b> in their life but then quit it afterwards.
13. <b>15.44%</b> people are <b>currently smoking</b> at least one cigarette a day on average.

## 2)  Analysis of each category of the numerical variables of df_num dataframe.

1. First we would check if there are any null values present in any variable of the df_num dataframe.

In [None]:
# Number of missing values in each numeric column
df_num.isnull().sum()

### a) bmi variable analysis to find out null values.

1. We observe that the variable <b>bmi</b> has <b>201</b> missing values.
2. Let's first observe the distribution of <b>bmi</b> variable without the missing values.
3. Then we will find out if we can replace them either by mean, median or mode.

In [None]:
# Candidate replacement values for the missing bmi entries.
# Note: mode() returns a Series (there can be several modes), so the last line prints a Series.
print('Mean of bmi variable: ', df_num['bmi'].mean())
print('Median of bmi variable: ', df_num['bmi'].median())
print('Mode of bmi variable: ', df_num['bmi'].mode())

In [None]:
# Distribution (rows with missing values dropped) and box plot of bmi, side by side
df_num_nonull = df_num.dropna()

fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize = (16, 5))
sns.distplot(df_num_nonull['bmi'], ax = ax_dist)
df_num['bmi'].plot.box(ax = ax_box)
plt.show()

In [None]:
# Replace the missing values of bmi with its median.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# df_num['bmi'] selection acts on an intermediate object, which is not guaranteed
# to update df_num (and does not under pandas copy-on-write).
df_num['bmi'] = df_num['bmi'].fillna(df['bmi'].median())

# Distribution and box plot of bmi after imputation, side by side
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize = (16, 5))
sns.distplot(df_num['bmi'], ax = ax_dist)
df_num['bmi'].plot.box(ax = ax_box)
plt.show()

1. We chose to replace the missing values of the <b>bmi</b> variable by its median value <b>28.1</b> after analysis. 
2. Now let's observe the distribution and box plots of the other remaining variables of the df_num dataframe.

### b) age variable analysis.

In [None]:
# Distribution and box plot of age, side by side
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize = (16, 5))
sns.distplot(df_num['age'], ax = ax_dist)
df_num['age'].plot.box(ax = ax_box)
plt.show()

### c) avg_glucose_level variable analysis.

In [None]:
# Distribution and box plot of avg_glucose_level, side by side
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize = (16, 5))
sns.distplot(df_num['avg_glucose_level'], ax = ax_dist)
df_num['avg_glucose_level'].plot.box(ax = ax_box)
plt.show()

### d) Outlier treatment of bmi and avg_glucose_level variables

1. From the above box plot diagrams, we can figure out that the bmi and avg_glucose_level variables have outliers above the upper limits.
2. So let's find out the upper limit values for variables bmi and average_glucose_level.

### bmi

In [None]:
# Two candidate upper limits for bmi:
#   bmi_high1 - the 99.7th percentile
#   bmi_high2 - mean + 3 standard deviations (population std, ddof = 0)
bmi_high1 = np.percentile(df_num['bmi'], 99.7)
bmi_high2 = df_num['bmi'].mean() + 3 * df_num['bmi'].std(ddof = 0)
print(bmi_high1, bmi_high2, sep = '\n')

In [None]:
# How many bmi values lie above the percentile-based limit (True) vs. not (False)
df_num['bmi'].gt(bmi_high1).value_counts()

### avg_glucose_level

In [None]:
# Two candidate upper limits for avg_glucose_level:
#   avg_glucose_level_high1 - the 99.7th percentile
#   avg_glucose_level_high2 - mean + 3 standard deviations (population std, ddof = 0)
avg_glucose_level_high1 = np.percentile(df_num['avg_glucose_level'], 99.7)
avg_glucose_level_high2 = (df_num['avg_glucose_level'].mean()
                           + 3 * df_num['avg_glucose_level'].std(ddof = 0))
print(avg_glucose_level_high1, avg_glucose_level_high2, sep = '\n')

In [None]:
# How many avg_glucose_level values lie above the percentile-based limit (True) vs. not (False)
df_num['avg_glucose_level'].gt(avg_glucose_level_high1).value_counts()

1. Let's replace the outlier values of bmi and avg_glucose_level variables by the values obtained above.

In [None]:
# Cap both variables at their percentile-based upper limit:
# values above the limit become the limit, everything else is left as is.
df_num['bmi'] = df_num['bmi'].clip(upper = bmi_high1)
df_num['avg_glucose_level'] = df_num['avg_glucose_level'].clip(upper = avg_glucose_level_high1)

1. Let's observe the impact of the changes made in the bmi and avg_glucose_level variables.

### New bmi - Part 1

In [None]:
# Distribution and box plot of bmi after the first capping, side by side
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize = (16, 5))
sns.distplot(df_num['bmi'], ax = ax_dist)
df_num['bmi'].plot.box(ax = ax_box)
plt.show()

### New avg_glucose_level

In [None]:
# Distribution and box plot of avg_glucose_level after capping, side by side
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize = (16, 5))
sns.distplot(df_num['avg_glucose_level'], ax = ax_dist)
df_num['avg_glucose_level'].plot.box(ax = ax_box)
plt.show()

From the revised distribution plots of the bmi and avg_glucose_level variables, we observe that:
1. Variable bmi has achieved a normal distribution and we can further work on it to remove the outliers from this new distribution.
2. Variable avg_glucose_level cannot be worked out further because no changes are seen in it.

### e) Manipulation of New bmi variable obtained in Part 1.

In [None]:
# Recompute the two candidate upper limits on the already-capped bmi values:
#   new_bmi_high1 - the 99.7th percentile
#   new_bmi_high2 - mean + 3 standard deviations (population std, ddof = 0)
new_bmi_high1 = np.percentile(df_num['bmi'], 99.7)
new_bmi_high2 = df_num['bmi'].mean() + 3 * df_num['bmi'].std(ddof = 0)
print(new_bmi_high1, new_bmi_high2, sep = '\n')

In [None]:
# How many bmi values lie above the recomputed limit (True) vs. not (False)
df_num['bmi'].gt(new_bmi_high1).value_counts()

1. Let's replace the 16 outlier values of the new bmi distribution by the value obtained above.

In [None]:
# Cap bmi a second time, now at the recomputed upper limit
df_num['bmi'] = df_num['bmi'].clip(upper = new_bmi_high1)

1. Let's observe the impact of the changes made in the new bmi variable.

### New bmi - Part 2

In [None]:
# Distribution and box plot of bmi after the second capping, side by side
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize = (16, 5))
sns.distplot(df_num['bmi'], ax = ax_dist)
df_num['bmi'].plot.box(ax = ax_box)
plt.show()

### f) Concatenating df_cat and df_num dataframes

1. Now that we are done with the univariate analysis of all the variables of df_cat and df_num dataframes, let's combine the 2 dataframes into a single dataframe again named df_new. 
2. Then, we would restructure this new dataframe like the original one.

In [None]:
# Put the numeric and categorical halves back side by side, then restore the
# column order of the original dataframe.
df_new = pd.concat([df_num, df_cat], axis = 1).loc[:, [
    'id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
    'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke',
]]

df_new.head()

## 3.2) Bivariate Analysis

### a) Analysis of gender vs. hypertension

1. Let's analyse how many males and females are suffering from hypertension

In [None]:
# Number of entries per hypertension value, with one bar per gender
sns.set_style('whitegrid')
sns.countplot(data = df_new, x = 'hypertension', hue = 'gender')

### b) Analysis of gender vs. heart_disease

1. Let's analyse how many males and females are suffering from any heart disease

In [None]:
# Number of entries per heart_disease value, with one bar per gender
sns.set_style('whitegrid')
sns.countplot(data = df_new, x = 'heart_disease', hue = 'gender')

### c) Analysis of avg_glucose_level in different genders

1. Let's analyse the average glucose level in all genders.

In [None]:
# Mean avg_glucose_level per gender.
# A pie chart built with values='avg_glucose_level' adds the values up per gender,
# so its slices mostly reflect how many entries each gender has. To compare the
# average level, the per-gender mean is computed explicitly and shown as bars.
glucose_by_gender = df_new.groupby('gender', as_index = False)['avg_glucose_level'].mean()

fig = px.bar(glucose_by_gender,
             x = 'gender',
             y = 'avg_glucose_level',
             title = 'Average level of glucose in Males and Females',
             width = 800)
fig.show()

### d) Analysis of avg_glucose_level in different Residence_type

1. Let's analyse the average glucose level of different Residential areas.

In [None]:
# Mean avg_glucose_level per Residence_type.
# A pie chart built with values='avg_glucose_level' adds the values up per group,
# so its slices mostly reflect how many entries each group has. To compare the
# average level, the per-group mean is computed explicitly and shown as bars.
glucose_by_residence = df_new.groupby('Residence_type', as_index = False)['avg_glucose_level'].mean()

fig = px.bar(glucose_by_residence,
             x = 'Residence_type',
             y = 'avg_glucose_level',
             title = 'Average Glucose Level In Urban And Rural areas',
             width = 800)
fig.show()

### e) Analysis of bmi in different genders

1. Let's analyse the bmi of different genders.

In [None]:
# Mean bmi per gender.
# A pie chart built with values='bmi' adds the values up per gender, so its slices
# mostly reflect how many entries each gender has. To compare bmi between genders,
# the per-gender mean is computed explicitly and shown as bars.
bmi_by_gender = df_new.groupby('gender', as_index = False)['bmi'].mean()

fig = px.bar(bmi_by_gender,
             x = 'gender',
             y = 'bmi',
             title = 'BMI of Males and Females',
             width = 800)
fig.show()

### f) Analysis of bmi in different Residence_type

1. Let's analyse the bmi of different Residential areas.

In [None]:
# Mean bmi per Residence_type.
# A pie chart built with values='bmi' adds the values up per group, so its slices
# mostly reflect how many entries each group has. To compare bmi between areas,
# the per-group mean is computed explicitly and shown as bars.
bmi_by_residence = df_new.groupby('Residence_type', as_index = False)['bmi'].mean()

fig = px.bar(bmi_by_residence,
             x = 'Residence_type',
             y = 'bmi',
             title = 'BMI of Urban and Rural',
             width = 800)
fig.show()

### g) Analysis of bmi in different age groups

1. Let's analyse the bmi for different age groups

In [None]:
# Scatter of bmi against age, one point per entry
fig = px.scatter(df_new, x = 'age', y = 'bmi',
                 title = 'BMI Of Different Age Groups',
                 width = 1000, height = 800)
fig.show()

### h) Analysis of average glucose level in different age groups

1. Let's analyse the average glucose level for different age groups

In [None]:
# Scatter of avg_glucose_level against age, one point per entry
fig = px.scatter(df_new, x = 'age', y = 'avg_glucose_level',
                 title = 'Average Glucose Level Of Different Age Groups',
                 width = 1000, height = 800)
fig.show()

### Following are the insights gathered from the Bivariate Analysis

1. Approximately <b>250 Females</b> and <b>200 Males</b> suffer from <b>hypertension</b>.
2. Approximately <b>100 Females</b> and <b>150 Males</b> suffer from a <b>heart disease</b>.
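3. <b>Females</b> account for a larger share of the summed <b>Average Glucose Level</b> than <b>Males</b>; since the dataset has more female entries, this reflects group size and does not by itself show a higher per-person average.
4. <b>Urban</b> areas account for a larger share of the summed <b>Average Glucose Level</b> than <b>Rural</b> areas; this too follows the number of entries in each group.
5. <b>Females</b> account for a larger share of the summed <b>BMI</b> than <b>Males</b>, again in line with the larger number of female entries.
6. <b>Urban</b> areas account for a larger share of the summed <b>BMI</b> than <b>Rural</b> areas, again in line with the number of entries in each group.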

## 4) Feature Engineering

### a) Performing Label Encoding to convert the object variables of our dataframe into numeric labels

1. We would assign numeric labels to the following variables - gender, ever_married, work_type, Residence_type, smoking_status.
2. This would convert all the object data into numeric data.

In [None]:
# Replace every text column with integer labels; a fresh LabelEncoder is fitted per column
to_encode_var = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for column in to_encode_var:
    encoder = LabelEncoder()
    df_new[column] = encoder.fit_transform(df_new[column])

### b) Target variable separation from the dataframe df_new

In [None]:
# X holds the predictors (everything except the row id and the target); Y holds the target 'stroke'
X = df_new.drop(columns = ['id', 'stroke'])
Y = df_new['stroke']

### c) Using Chi-Square test to identify significant variables for our model

In [None]:
# Score every feature against the target with the chi-square statistic (k = 'all' keeps them all)
fs = SelectKBest(score_func = chi2, k = 'all')
fs.fit(X, Y)

# Each feature's score as a percentage of the total score, rounded to 3 decimals
total_score = sum(fs.scores_)
per = [round((score / total_score) * 100, 3) for score in fs.scores_]

# Table of chi-square scores and importance percentages, highest score first
features_data = pd.DataFrame({'Feature' : X.columns, 'Scores' : fs.scores_, 'Importance(%)' : per})
features_data = features_data.sort_values(by = ['Scores'], ascending = False)

plt.figure(figsize = (10, 8))
sns.barplot(data = features_data, x = 'Importance(%)', y = 'Feature', orient = 'h')

# Features with Importance % < 0.005 are collected as insignificant
insignificant = features_data.loc[features_data['Importance(%)'] < 0.005, 'Feature'].unique()

features_data = features_data.set_index('Feature')
features_data

### d) Dropping the insignificant variables

In [None]:
# Remove the columns flagged as insignificant by the chi-square step
X = X.drop(columns = insignificant)

### e) Performing Feature Scaling

In [None]:
# Standardise the features using statistics learned from the training rows only.
# Fitting the scaler on every row would let the test rows influence the mean and
# standard deviation used for scaling (data leakage).
# train_test_split shuffles by row count and random_state only, so calling it on a
# plain index with the same test_size / random_state as the actual split
# (test_size = 0.2, random_state = 100) yields exactly the rows that end up in X_train.
# NOTE(review): keep these two values in sync with the train/test split cell.
X_values = np.asarray(X, dtype = float)
train_rows, _ = train_test_split(np.arange(len(X_values)), test_size = 0.2, random_state = 100)

sc = StandardScaler()
sc.fit(X_values[train_rows])
X = sc.transform(X_values)

### f) Performing Train, Test & Split 

In [None]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): no stratify= argument is passed, so the share of each Y class in the
# two splits is left to the shuffle - confirm this is acceptable for this target.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 100)

## 5) Implementing Model Selection basis the GridSearchCV Process

1. We would first instantiate the Logistic Regression, Random Forest and SVM models, along with a grid search over the SVM hyper-parameters.

In [None]:
# Logistic Regression
log_reg = LogisticRegression()

# Random Forest - seeded so that the fitted forest, and therefore its accuracy,
# is the same on every run of the notebook
rand_for = RandomForestClassifier(random_state = 100)

# SVM with default hyper-parameters
svc = SVC()

# SVM tuned by cross-validated grid search over C and gamma (RBF kernel);
# refit = True retrains the best combination on the whole training set
param_grid = {'C' : [0.1, 1, 10, 100, 1000, 2000], 'gamma' : [1, 0.1, 0.01, 0.001, 0.0001], 'kernel' : ['rbf']}
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

print('Models Imported')

### a) Displaying Model Accuracy of different models

1. Let's print the accuracy of different models

In [None]:
models = [log_reg, rand_for, svc, grid]
model_acc = []

# Train each candidate on the training split and record its accuracy on the test split
for model in models:
    model.fit(X_train, Y_train)
    test_predictions = model.predict(X_test)
    model_acc.append(accuracy_score(Y_test, test_predictions))

# One row per model: the fitted estimator and its test accuracy
models = pd.DataFrame({'Models' : models, 'Accuracy Score' : model_acc})

In [None]:
# Fitted estimators alongside their test accuracy
models

In [None]:
# Pick the estimator with the highest test accuracy (the first one on ties)
# instead of always taking the first row of the table.
best = models.loc[models['Accuracy Score'].idxmax(), 'Models']

# Keep only the class name of each estimator for display.
# Note: after this line the 'Models' column holds strings, so `best` must be
# taken before it and this cell should not be re-run on its own.
models['Models'] = models['Models'].astype(str).str.split("(", n = 2, expand = True)[0]
models

## 6) Displaying the result of the best model i.e. Logistic Regression

In [None]:
# Report the model actually stored in `best`, so the printed name can never
# disagree with the estimator whose classification report is shown.
best_name = type(best).__name__
print(f'Hence the best model is {best_name}')
print('\nThe classification report is:')
print(classification_report(Y_test, best.predict(X_test)))