In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns #visualization of the variables

from scipy.stats import chi2_contingency, ttest_ind

from xgboost import XGBClassifier
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Stroke Prediction Research:

The main question is how the predictor variables can help estimate the probability of suffering a stroke. 

* Is there a relationship between stroke and variables other than age?
* Is having heart disease, a high BMI or a high glucose level related to a higher chance of suffering a stroke?

### Plans:
We should visualize a distribution of the target variable, which is the stroke, then a distribution of variables in respect to the target variable.
1. Split the features into categorical and numerical columns. - Done. Do one-hot encoding?
2. Call the distributions on an object based way e.g. fig, ax.
3. Continue building on the models. Next XGBoost.
4. Predict, predict, predict.
5. Draw final conclusions.
6. Add an index to notebook.
7. Add more distribution visualizations.

### Models: 
Logistic regression, random forest and xgboost.

### Exploratory Data Analysis (EDA)

EDA is a data exploration technique used to understand the various aspects of the data. The idea is to check for relationships between variables and to check their distributions.


* It follows a systematic set of steps to explore the data in the most efficient way possible

Steps:
1. Understand the Data

2. Clean up the Data
3. Analysis of Relationship between variables

In [None]:
#Import Dataset to Pandas Dataframe
# Locate the CSV explicitly instead of reusing the `dirname`/`filename` variables
# left over from the directory-listing loop: those hold whichever file os.walk
# happened to visit last, which is not guaranteed to be the dataset.
csv_paths = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith('.csv')
)
if not csv_paths:
    raise FileNotFoundError('No CSV file found under /kaggle/input')
# NOTE(review): assumes the stroke dataset is the only (or alphabetically first)
# CSV under /kaggle/input — confirm if more inputs are attached.
data = pd.read_csv(csv_paths[0])

### 1. Understanding the Data

In [None]:
data.head()

In [None]:
data.tail()

From this quick overview, it comes to mind that we can use different variables correlations. Like age against stroke. We see that there are many variables that can be related to having a stroke: age, being married, work, etc.

### Features or data points. 

To find out how many columns, how many entries and if there are some missing values. We can use dataframe.info()

In [None]:
data.columns

In [None]:
data.info()

In [None]:
#Categorical info
cat_feat = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [None]:
#remove categorial data from our set to create the model. Can be added encoded later in the process.
num_feat = data.drop(cat_feat, axis = 1)

Another way to get the number of rows and columns is the pandas `data.shape` attribute. It returns a tuple: the first element is the number of rows and the second is the number of columns.

In [None]:
data.shape
#12 variables and 5110 observations

### Describing the data statistically speaking function

The describe function allows us to have basic statistical information of the data. This is useful because it allows us to detect possible outliers or any strange data.

In [None]:
#data = data.drop('Id', axis = 1).describe()
data.describe()

We see that the average age is about 43 years, and that the mean BMI of the study population is about 28.

A healthy BMI range for a person is 18-25. 

BMI depends on different factors, such as height, muscle and body type.

Note on the average glucose level: below 140 is normal; between 140 and 199 is pre-diabetes. 


In [None]:
num_feat = num_feat.drop('id', axis = 1)

In [None]:
num_feat.describe()

In [None]:
# num_feat.groupby(num_feat['bmi'].isnull()).mean()
num_feat.isna().sum()

In [None]:
#97.6 BMI? That is odd. Let's find out how many 
num_feat[num_feat['bmi']==97.6]

In [None]:
data[data['bmi']==97.6]

This is very strange. What should we do with this row? It is the only entry with such a high BMI. The person has hypertension; he is a young male who works in the private sector and lives in a rural area, with a glucose level that seems normal, and he has not suffered a stroke.

We will probably need to impute his BMI based on the median BMI for his age and other related features.

But for now we are going to remove the row in order to fix the distribution.

In [None]:
# Drop the single row with the implausible BMI of 97.6. `.copy()` makes the result an
# independent frame, so later column assignments do not write into a slice of the
# original (which triggers pandas' SettingWithCopyWarning).
num_feat = num_feat[num_feat['bmi']!=97.6].copy()

In [None]:
#Checking again
num_feat[num_feat['bmi']>40] #sort_values('bmi')

In [None]:
num_feat.describe()

In [None]:
num_feat[num_feat.bmi > 40].describe()

In [None]:
num_feat.groupby('stroke').mean()

The average age of people who have suffered a stroke is about 67, with a BMI of 30 or over.

So, this shows there are more entries with an abnormal Body Mass Index. Let's check those with a BMI over 40 (obesity class 3).

In [None]:
num_feat.groupby(num_feat.bmi > 40)[['stroke', 'hypertension', 'heart_disease']].sum()

In [None]:
bmi_over_40 = num_feat[num_feat['bmi'] > 40 ]

In [None]:
# Build the mask from bmi_over_40 itself. The original mask came from num_feat, a
# different (larger) frame, so pandas had to reindex it to bmi_over_40's index.
bmi_over_40[bmi_over_40['stroke'] == 1].sort_values(by='age')

We see that suffering a stroke is correlated not just with age, but also with having a BMI over 40 and a higher glucose level. 

In [None]:
plt.figure(figsize = (9,7))
sns.scatterplot(x = 'bmi', y = 'avg_glucose_level', hue = 'stroke', data =bmi_over_40)
plt.show()

### Adult Body Mass Index (BMI)

BMI does not measure body fat directly, but research has shown that BMI is moderately correlated with more direct measures of body fat obtained from skinfold thickness measurements, bioelectrical impedance, underwater weighing, dual energy x-ray absorptiometry (DXA) and other methods 1,2,3. Furthermore, BMI appears to be strongly correlated with various adverse health outcomes consistent with these more direct measures of body fatness

### Check for unique values

In [None]:
#check for unique values
data.nunique()

It seems the majority of the variables are binary, which means they are categorical values, e.g. "yes" or "no", except for gender, which has 3 distinct values. We need to check whether that is due to a typo or blank entries. 

The categorial variables with more different values are the following in ascending order:
1. smoking_status          4
2. work_type               5


### Checking Specific Unique values

In [None]:
data.gender.unique()

### Distribution of gender


In [None]:
data.gender.value_counts()

Given that the 'Other' gender has only 1 entry, we can remove that data point from our study.

In [None]:
# Remove the single 'Other' gender row. `.copy()` makes `data` an independent frame,
# so the later in-place column updates (BMI fill, label encoding) do not write into
# a slice of the original frame.
data = data[data['gender']!='Other'].copy()

In [None]:
data.smoking_status.unique()

In [None]:
data.smoking_status.value_counts()

In [None]:
data[data['smoking_status'] == 'Unknown']

In [None]:
#smokers and goverment jobs
smokers = data[data['smoking_status']=='smokes']

smokers.work_type.value_counts(normalize=True)

### Age distribution of smokers


In [None]:
smokers['age'].groupby(smokers['age']).count()

Among the smokers, only a few are at young or late ages. So, let's see if we can slice the data from 35 to 65 years of age.

In [None]:
# Number of smokers at each age (the index is the age, the value is the count).
age_smokers = smokers['age'].groupby(smokers['age']).count()

# Select ages 35 through 65 by *label*. Plain `[35:65]` is ambiguous on a Series whose
# index is numeric (label-based or positional depending on index dtype and pandas
# version); `.loc` always slices by the age values, which is what the text describes.
age_smokers.loc[35:65].sort_values(ascending=False)

According to this result, the ages with more than 10 smokers fall mostly in the range of 35 through 63. The exceptions are ages 41, 37, 62 and 64, which have fewer smokers (only 9). 

In the visualization section we can plot this to see the histogram distribution. 

In [None]:
#smokers and goverment jobs
unkn_smokers = data[data['smoking_status']=='Unknown']

unkn_smokers.work_type.value_counts(normalize=True)

In [None]:
data.work_type.unique()

## Step 2: Cleaning the data

In [None]:
data.isnull().sum() 

Questions:
There are no missing values except for BMI. Should we fill in those missing values?

In [None]:
data['bmi'].isnull().sum()/len(data)*100 

We have 4% of BMI missing data. 

In [None]:
#handling missing values
data['bmi'] = data['bmi'].fillna(round (data['bmi'].median(), 2))
data.isnull().sum()

Checking for outliers:
an outlier is a data point that differs significantly from the other observations

### Relationship Analysis

In [None]:
data.columns

In [None]:
# Correlation matrix of the numeric columns only. `data` still contains string columns
# at this point (gender, work_type, ...); selecting numeric dtypes explicitly avoids
# depending on how DataFrame.corr() treats non-numeric columns in a given pandas version.
corelation = data.drop('id', axis = 1).select_dtypes(include='number').corr()

In [None]:
plt.figure(figsize=(7,7))
sns.heatmap(corelation, xticklabels =corelation.columns, yticklabels = corelation.columns, annot=True)
plt.show()

As we can see, the variable most correlated with stroke is age. We may consider building the model with only the relevant variables, for example removing id.

In [None]:
plt.figure(figsize=(7,7))
sns.heatmap(corelation, xticklabels =corelation.columns, yticklabels = corelation.columns,
            vmin=-1, vmax=1, center=0,annot=True)
plt.show()

## Data Visualizations

Checking the distribution of the target variable(stroke)

In [None]:
data.columns

In [None]:
sns.countplot(x = 'smoking_status', data = data)
plt.title("Count Plot for smoking status")
plt.show()

In [None]:
sns.countplot(x = 'work_type', data = data)
plt.title('Count Plot for Work Type')
plt.show()

In [None]:
num_data = num_feat

In [None]:
#Ploting the distribution of Stroke
sns.countplot(x='stroke', data=num_data)
plt.show()

In [None]:
# Rows per stroke class (index: 0 = no stroke, 1 = stroke).
x = pd.DataFrame(num_data.groupby(['stroke'])['stroke'].count())

# Class shares computed from the actual class counts. The previous formula used
# x.shape[0] (the number of classes, i.e. 2) as if it were the number of stroke cases,
# so the displayed percentages were only coincidentally close to the real ones.
stroke_count = x.stroke[1]
healthy_count = x.stroke[0]
total_count = stroke_count + healthy_count
stroke_pct = stroke_count / total_count * 100
healthy_pct = healthy_count / total_count * 100

# Shared text style. Matplotlib text properties are lower-case ('size', not 'Size').
label_font = {'family': 'serif', 'weight': 'bold', 'size': 16, 'style': 'normal'}

# plot
fig, ax = plt.subplots(figsize = (6,6), dpi = 70)
ax.barh([1], stroke_count, height = 0.7, color = 'red')
ax.barh([0], healthy_count, height = 0.7, color = 'green')

# Bar labels and percentages (positions are in data coordinates).
plt.text(-1150,-0.08, 'Healthy', {**label_font, 'color':'green'})
plt.text(5000,-0.08, f"{healthy_pct:.0f}%", {**label_font, 'color':'green'})
plt.text(-1000,1, 'Stroke', {**label_font, 'color':'red'})
plt.text(300,1, f"{stroke_pct:.0f}%", {**label_font, 'color':'red'})

fig.patch.set_facecolor('#f6f5f5')
ax.set_facecolor('#f6f5f5')

# Title, legend-like header and caption.
plt.text(-1150,1.77, 'Percentage of People Having Strokes', {'family': 'serif', 'size': 25, 'weight':'bold', 'color':'black'})
plt.text(4650,1.65, 'Stroke ', {**label_font, 'color':'red'})
plt.text(5650,1.65, '|', {'color':'black' , 'size': 16, 'weight': 'bold'})
plt.text(5750,1.65, 'Healthy', {**label_font, 'color':'green'})
plt.text(-1150,1.5, f'It is a highly unbalanced distribution,\nand clearly seen that {stroke_pct:.0f} in 100 people are susceptible \nto strokes.',
        {'family':'serif', 'size': 12.5,'color': 'black'})

# Hide axes and all spines except the left one.
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(True)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.show()

In [None]:
plt.figure(figsize = (16,11))
plt.subplot(2,3,1)
sns.countplot(x = 'gender', data = data)
plt.title('Countplot of Gender distribution')

plt.subplot(2,3,2)
sns.countplot(x = 'ever_married', data = data)
plt.title('Countplot of Married status distribution')

plt.subplot(2,3,3)
sns.countplot(x='work_type', data = data)
plt.title('Countplot of Work Type distribution')

plt.subplot(2,3,4)
sns.countplot(x = 'Residence_type', data = data)
plt.title('Countplot of Residence type distribution')

plt.subplot(2,3,5)
sns.countplot(x = 'smoking_status',data = data)
plt.title('Countplot of Smoking status distribution')

plt.subplot(2,3,6)
sns.countplot(x = 'heart_disease',data = data)
plt.title('Countplot of Heart Disease distribution')
plt.show()

### Distribution of BMI

Shape and the spread with histograms and box plots

In [None]:
# Work on an independent copy: `num_data = num_feat` only created a second name for the
# same object, so filling BMI here silently modified num_feat as well.
num_data = num_feat.copy()
#handling missing values
# Missing BMI values are replaced with the column median (rounded to 2 decimals).
num_data['bmi'] = num_data['bmi'].fillna(round (num_data['bmi'].median(), 2))

In [None]:
# Checking the distribution of the predictor variables. 
# Here, we will use both distplot and boxplot as shown below. 
# Let us plot each variable to show its distribution in the dataset.
# Explicit figure/axes: calling plt.title() before plt.subplot() attached the title to a
# throw-away axes rather than to the figure, so the title is set with fig.suptitle().
# The 97.6 BMI row has already been removed from num_data at this point.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
fig.suptitle('BMI Distribution after dropping the abnormal entry')
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — kept here to preserve the existing look.
sns.distplot(num_data['bmi'], ax=ax_hist)
num_data['bmi'].plot.box(ax=ax_box)
plt.show()

BMI distribution of people with a BMI over 40

In [None]:
# Histogram/KDE and box plot of BMI for the rows with BMI over 40.
# The title is set on the figure (suptitle); plt.title() before plt.subplot() put it on
# a throw-away axes. The plotted variable is BMI, so the title says so.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
fig.suptitle('BMI Distribution for people with BMI over 40')
sns.distplot(bmi_over_40['bmi'], ax=ax_hist)
bmi_over_40['bmi'].plot.box(ax=ax_box)
plt.show()


### Distribution of Age

In [None]:
# Histogram/KDE and box plot of age, drawn on explicit axes with a figure title so the
# chart can be read on its own.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
fig.suptitle('Distribution of Age')
sns.distplot(data['age'], ax=ax_hist)
data['age'].plot.box(ax=ax_box)
plt.show()

### Distribution of Heart Disease

In [None]:
# Count plot and box plot of the heart_disease flag.
# The column is passed by keyword (x=..., data=...): recent seaborn versions no longer
# accept the plotted variable as a bare positional argument.
fig, (ax_count, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
fig.suptitle('Distribution of Heart Disease')
sns.countplot(x='heart_disease', data=data, ax=ax_count)
data['heart_disease'].plot.box(ax=ax_box)
plt.show()

### Distribution of AVG Glucose Level

In [None]:
# Histogram/KDE and box plot of the average glucose level, drawn on explicit axes with
# a figure title so the chart can be read on its own.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
fig.suptitle('Distribution of AVG Glucose Level')
sns.distplot(data['avg_glucose_level'], ax=ax_hist)
data['avg_glucose_level'].plot.box(ax=ax_box)
plt.show()

### Plotting relationships in the dataset. 

There are different ways to display relationships in a dataset. You can use pair plots, joint plots, correlations, etc. We will use a pairplot to find relationships in the dataset.

In [None]:
# Pairwise relationships between the numeric columns, coloured by stroke outcome.
# `id` is a row identifier rather than a measurement, so it is left out of the grid.
# (Warnings are already silenced by the filterwarnings call in the first cell.)
sns.pairplot(data.drop('id', axis = 1), hue= 'stroke')
plt.show()

In [None]:
sns.relplot(x='stroke', y='age', hue='gender', data=data ) 
plt.show()

With this it seems that a confusion matrix and a logistic regression may show a better relationship, because this plot shows that there is no linear relationship. 

From this chart we can clearly see that the majority of strokes occurred in people over 40 years old. We have an uptick at age 40, then it drops until about age 55 through 65, drops again and goes all the way up at age 80.

In [None]:
# Scatter Plot
plt.figure(figsize = (9,7))
sns.scatterplot(x = 'bmi', y = 'avg_glucose_level', hue = 'stroke', data =bmi_over_40)
plt.title('Stroke cases - For those with a BMI over 40 ',y=1.05)

plt.xlabel('BMI Level')
plt.ylabel('Avg Glucose Level')
plt.show()

## Hypothesis Testing

##### Chi Square testing

In [None]:
def chi2_dependency(data_df, x,y):
    """Run a chi-squared test of independence on two columns and print the verdict.

    A contingency table of ``data_df[x]`` against ``data_df[y]`` is built with
    ``pd.crosstab`` and passed to ``scipy.stats.chi2_contingency``. The p-value is
    then compared against two significance levels (0.01 and 0.05) and exactly one
    conclusion is printed.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame containing both columns.
    x, y : str
        Names of the two columns to test.

    Returns
    -------
    None
        The result is only printed.
    """
    ctab = pd.crosstab(data_df[x], data_df[y])
    stat, p, dof, expected = chi2_contingency(ctab)
    alpha1 = 0.05
    alpha2 = 0.01
    print('--------------Chi Squared Hypothesis Test Results-------------------')
    print('Variable X: ',x)
    print('Variable Y: ',y)
    print('P-value: ',p)
    # One mutually exclusive chain. Previously two separate `if` statements were used,
    # so for 0.01 < p < 0.05 both the "reject" and the "fail to reject" messages were
    # printed, and p == 0.01 fell through to "fail to reject".
    if p < alpha2:
        print('We reject the NUll Hypothesis H0')
        print('There is substantial evidence to suggest that {} and {} are dependent'.format(x,y))
    elif p < alpha1:
        print('We reject the NUll Hypothesis H0')
        print('There is some evidence to suggest that {} and {} are dependent'.format(x,y))
    else:
        print('We fail to reject the NUll Hypothesis H0')
        # H0 is independence, so failing to reject it means there is no evidence of dependence.
        print('There is no evidence to suggest that {} and {} are dependent'.format(x,y))
        
    print()

In [None]:
chi2_dependency(data,'gender','stroke')
chi2_dependency(data,'ever_married','stroke')
chi2_dependency(data,'hypertension','stroke')
chi2_dependency(data,'heart_disease','stroke')
chi2_dependency(data,'work_type','stroke')
chi2_dependency(data,'Residence_type','stroke')
chi2_dependency(data,'smoking_status','stroke')

* Gender and Residential Type do not seem to have an impact on stroke
* Smoking Status, Work Type, Heart Disease, Hypertension and Married status have an impact on stroke

In [None]:
data.columns

In [None]:
ctab = pd.crosstab(data['smoking_status'], data['stroke'])


ctab.plot.bar(stacked = True, figsize = (8,5))
plt.xlabel('Smoking Status')
plt.ylabel('Stroke')
plt.title('Smoking Status and Stroke')
plt.show()

In [None]:
ctab = pd.crosstab(data['ever_married'], data['stroke'])

ctab.plot.bar(stacked = True, figsize = (8,5))
plt.xlabel('Ever Married')
plt.ylabel('Stroke')
plt.title('Married Status and Stroke')
plt.show()

# Stroke rate *within* each class: the share of rows in the class whose stroke flag is 1.
# (Previously the total number of stroke cases in the whole dataset was divided by the
# class size, which is not a within-class ratio.)
print('Ratio of stroke affected from ever_married class',
      (data.loc[data['ever_married']=='Yes', 'stroke'] == 1).mean())
      
print('Ratio of stroke affected from never married class',
      (data.loc[data['ever_married']=='No', 'stroke'] == 1).mean())

In [None]:
ctab = pd.crosstab(data['hypertension'], data['stroke'])

ctab.plot.bar(stacked = True, figsize = (8,5))
plt.xlabel('hypertension')
plt.ylabel('Stroke')
plt.title('hypertension Status and Stroke')
plt.show()

# Stroke rate *within* each class: the share of rows in the class whose stroke flag is 1.
# (Previously the total number of stroke cases in the whole dataset was divided by the
# class size, which is not a within-class ratio.)
print('Ratio of stroke affected from hypertension=1 class',
      (data.loc[data['hypertension']==1, 'stroke'] == 1).mean())
      
print('Ratio of stroke affected from no hypertension class',
      (data.loc[data['hypertension']==0, 'stroke'] == 1).mean())

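The chi-squared test above indicates that hypertension and stroke are dependent. Note that the share of hypertension samples that suffered a stroke must be computed within the class (stroke cases with hypertension / all samples with hypertension); dividing the total number of stroke cases by the number of hypertension samples gives almost 50%, which overstates that share.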

In [None]:
ctab = pd.crosstab(data['heart_disease'], data['stroke'])

ctab.plot.bar(stacked = True, figsize = (8,5))
plt.xlabel('heart_disease')
plt.ylabel('Stroke')
plt.title('heart_disease and Stroke')
plt.show()

# Stroke rate *within* each class: the share of rows in the class whose stroke flag is 1.
# (Previously the total number of stroke cases in the whole dataset was divided by the
# class size, which is not a within-class ratio.)
print('Ratio of stroke affected from heart_disease = 1 class',
      (data.loc[data['heart_disease']==1, 'stroke'] == 1).mean())
      
print('Ratio of stroke affected from no heart_disease class',
      (data.loc[data['heart_disease']==0, 'stroke'] == 1).mean())

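The chi-squared test above indicates that heart disease and stroke are dependent. Note that the share of heart-disease samples that suffered a stroke must be computed within the class (stroke cases with heart disease / all samples with heart disease); dividing the total number of stroke cases by the number of heart-disease samples gives almost 90%, which overstates that share.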

#### T-tests

We will perform 2 sample t-test on 'BMI' column to check if the mean BMI of stroke group is different from the non stroke group.
<br></br>
Before performing this test we will check ratio of variance of each group

In [None]:
data[data['stroke']==0]['bmi'].var()/data[data['stroke']==1]['bmi'].var()

Since the ratio of the variances is < 4, we will assume the two groups have equal variance

In [None]:
statistic, pval = ttest_ind(a=data[data['stroke']==0]['bmi']  , b = data[data['stroke']==1]['bmi'], equal_var=True)
pval

* Since pvalue < 0.01, we reject the Null Hypothesis H0
* We can conclude that the population BMI mean of stroke vs non-stroke groups are different



Now we can start creating our model and start our predictions. Also, we can include other features to see if there is any other related variable. 

### Data Transformation

In [None]:
target_col = ['stroke']
num_cols = ['id', 'age', 'avg_glucose_level', 'bmi']
cat_cols = [col for col in data.columns if col not in num_cols+target_col]

### Label encoding

Label encode the binary categorical columns containing strings

In [None]:
from sklearn import preprocessing

# Replace each two-valued string column with integer codes. The single encoder is
# re-fitted on every column, exactly as when the three columns are handled one by one.
label_encoder = preprocessing.LabelEncoder()
for binary_col in ('gender', 'ever_married', 'Residence_type'):
    data[binary_col] = label_encoder.fit_transform(data[binary_col])

One-hot encode the multi category columns

In [None]:
# One-hot encode both multi-category columns in a single call; the indicator columns
# are appended in the listed order (work_type first, then smoking_status).
multi_category_cols = ['work_type', 'smoking_status']
data = pd.get_dummies(data, prefix = multi_category_cols, columns = multi_category_cols)

## Training the Data

We will now split our dataset before we train it. X will contain all the Independent variables while y will have the Dependent variable ('stroke')

In [None]:
#Splitting the dataset
# Features: every column of num_data except the target. Target: the stroke column.
# NOTE(review): x and y are re-assigned from `data` in a later cell before any model is
# fitted, so this numeric-only split appears to be unused — confirm.
x = num_data.drop('stroke', axis=1)
y = num_data.stroke


After successfully separating the features from the target, let us divide the data into training and test sets using train_test_split.

In [None]:
from sklearn.model_selection import train_test_split

# train_size = 0.3 puts 30% of the rows in the training set and the remaining 70% in
# the test set; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): a 30/70 train/test split is unusual — confirm this was not meant to be
# test_size = 0.3.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, train_size = 0.3, random_state=1)

In [None]:
#include categorical values in the dataset
#Since we are using a Tree based model, One-Hot encoding is not an absolute necessity
#However, this dataset, train and test sets will be updated whenever one-hot encoding will be used
from sklearn.model_selection import train_test_split

#Splitting the dataset
# 'id' is a row identifier, not a predictor, so it is dropped together with the target;
# otherwise the models can split on (or measure distances in) an arbitrary number.
x = data.drop(['id', 'stroke'], axis=1)
y = data.stroke

# Hold out 30% for testing and train on the remaining 70% (the previous
# train_size = 0.3 trained on only 30% of the rows). `stratify=y` keeps the rare
# stroke class at the same proportion in both parts.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.3, random_state=1, stratify=y)

## Building the Models

We will use several models, i.e. Decision Tree, Random Forest, Gradient Boosting, XGBoost, Gaussian Naive Bayes and K Nearest Neighbours, to get the best accuracy score. The ‘Accuracy’ metric is used to evaluate the models. It is the number of correctly predicted instances in a dataset divided by the total number of instances in the dataset. We will proceed further to explore more metrics to determine the best model.

In [None]:
#Let explore with the Random Forests Algo

In [None]:
#Before proceeding for tree based models, lets check rank of feature importance on a decision tree
from sklearn.tree import DecisionTreeClassifier
# Fixed seed so the fitted tree, and therefore the importance ranking, is the same on
# every run.
dt = DecisionTreeClassifier(random_state=1)
dt.fit(xtrain,ytrain)

In [None]:
print(len(xtrain.columns.tolist()))
len(dt.feature_importances_)

In [None]:
# Horizontal bar chart of the decision tree's feature importances, one bar per training
# column, with a title and axis labels so the figure can be read on its own.
fig, ax = plt.subplots(figsize = (8,8))
sns.barplot(x = dt.feature_importances_, y = xtrain.columns.tolist(), ax = ax)
ax.set(title = 'Decision tree feature importances', xlabel = 'Importance', ylabel = 'Feature')
plt.show()

### Random Forests Classifier

In [None]:
#Building the model using RandomForest
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest and the scores reported below are reproducible across runs.
rfc = RandomForestClassifier(n_estimators=500, random_state=1)
rfc.fit(xtrain, ytrain)
preds = rfc.predict(xtest)

# Show the first 500 predicted labels.
print('Predictions',list(preds[0:500]))

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest, preds)

With stroke (1) as the positive class, scikit-learn's confusion matrix puts the true labels on the rows and the predicted labels on the columns. The true negatives are on the upper left and the true positives are on the bottom right. A true negative means that the sample was actually negative and the model predicted negative.

The false positives are the number on the upper right. 
The false negatives are the number on the bottom left. 

Here most samples are true negatives (0s) because we don't have many cases of strokes: 3,388 people who did not have a stroke were correctly predicted as such.

Conversely, the true positives (1s) are those who suffered a stroke and were predicted as such.

The false positives are those who were predicted as 1 but were actually 0s: there are 13 of them.

In [None]:
#To find the False Positives: actual class 0 (no stroke) but predicted 1 (stroke).
# Element-wise `&` is required here: Python's `and` asks for the truth value of a whole
# Series/array and raises "The truth value of a Series is ambiguous".
xp = (ytest == 0) & (preds == 1)

## Accuracy Score

The accuracy score for this Random forest classifier

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest,preds)

In [None]:
from sklearn.metrics import f1_score
# F1 of the positive (stroke) class, the same setting used for every other model in this
# notebook. With average='micro' the score of a single-label problem equals plain
# accuracy, which hides how poorly the rare class is predicted.
f1_score(ytest, preds)

With stroke (1) as the positive class, the true negatives are on the upper left of scikit-learn's confusion matrix and the true positives are on the bottom right. A true negative means that the sample was actually negative and the model predicted negative.

The false positives are the number on the upper right. 
The false negatives are the number on the bottom left. 

### Gradient Boost Classifier

Include categorical values in the dataset.
Since we are using a Tree based model, One-Hot encoding is not an absolute necessity

However, this dataset and the train and test sets will be updated whenever one-hot encoding is used

In [None]:
#Splitting the data set
# 'id' is a row identifier, not a predictor, so it is dropped together with the target.
x = data.drop(['id', 'stroke'], axis = 1)
y = data.stroke

# Hold out 30% for testing and train on the remaining 70% (the previous
# train_size = 0.3 trained on only 30% of the rows). `stratify=y` keeps the rare
# stroke class at the same proportion in both parts.
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size = 0.3, random_state =1, stratify = y)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(random_state = 123, n_estimators = 500)
gbc.fit(xtrain, ytrain)

In [None]:
preds = gbc.predict(xtest)

print(preds)
accuracy_score(ytest,preds)

In [None]:
print(confusion_matrix(ytest, preds))
print(classification_report(ytest, preds))

The f1-score is very good for unbalanced data. 

### Gradient Boost Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(random_state = 123, n_estimators = 500)
gbc.fit(xtrain,ytrain)

In [None]:
# Evaluate the gradient boosting model on the held-out test set.
preds = gbc.predict(xtest)
print(confusion_matrix(ytest, preds))
# Print the formatted text report (as in the earlier gradient boosting cell) rather than
# the raw nested dict that output_dict=True returns.
print(classification_report(ytest, preds))
print('Accuracy Score: ',accuracy_score(ytest, preds))
print('F1 Score: ',f1_score(ytest,preds))

#### Using SMOTE
<br>
SMOTE is a technique to artificially oversample the minority class by creating synthetic samples. These synthetic samples are created by finding the intermediate values between neighbouring samples of minority class<br>
<br>
SMOTE is applied ONLY on the training set and not on the test set to avoid biased results

In [None]:
from imblearn.over_sampling import SMOTE
# Resample the training split only; xtest/ytest are not passed in, so the evaluation
# data stays untouched. random_state fixes the sampler's seed.
sm = SMOTE(random_state = 2)
# xtrain_mod / ytrain_mod are the resampled training features and labels used by the
# "SMOTE dataset" model cells.
xtrain_mod, ytrain_mod = sm.fit_resample(xtrain, ytrain)

In [None]:
gbc2 = GradientBoostingClassifier(random_state = 123, n_estimators = 30, max_depth = 2)
gbc2.fit(xtrain_mod,ytrain_mod)
preds = gbc2.predict(xtest)
print(confusion_matrix(ytest, preds))
#print(classification_report(ytest, preds, output_dict = True))
print('Accuracy Score: ',accuracy_score(ytest, preds))
print('F1 Score: ',f1_score(ytest,preds))

### XGBoost Classifier

In [None]:
#Fitting model on non-SMOTE dataset
xgb1 = XGBClassifier(n_estimators = 250)
xgb1.fit(xtrain, ytrain)
preds = xgb1.predict(xtest)
print(confusion_matrix(ytest, preds))

train_preds = xgb1.predict(xtrain)
print('Train Accuracy Score: ',accuracy_score(ytrain, train_preds))
print('Train F1 Score: ',f1_score(ytrain, train_preds))

print('Test Accuracy Score: ',accuracy_score(ytest, preds))
print('F1 Score: ',f1_score(ytest,preds))

In [None]:
#Fitting model on SMOTE dataset
xgb = XGBClassifier(n_estimators = 255, reg_alpha=0.5, reg_lambda = 0.4, max_depth= 1)
xgb.fit(xtrain_mod, ytrain_mod)
preds = xgb.predict(xtest)
print(confusion_matrix(ytest, preds))
#print(classification_report(ytest, preds, output_dict = True))

train_preds = xgb.predict(xtrain_mod)
print('Train Accuracy Score: ',accuracy_score(ytrain_mod, train_preds))
print('Train F1 Score: ',f1_score(ytrain_mod, train_preds))

print('Accuracy Score: ',accuracy_score(ytest, preds))
print('F1 Score: ',f1_score(ytest,preds))

### Gaussian Naive Bayes

In [None]:
#Applying on non-SMOTE dataset
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(xtrain, ytrain)

preds = gnb.predict(xtest)
print(confusion_matrix(ytest, preds))

train_preds = gnb.predict(xtrain)
print('Train Accuracy Score: ',accuracy_score(ytrain, train_preds))
print('Train F1 Score: ',f1_score(ytrain, train_preds))

print('Accuracy Score: ',accuracy_score(ytest, preds))
print('F1 Score: ',f1_score(ytest,preds))

In [None]:
#Applying on SMOTE dataset
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(xtrain_mod, ytrain_mod)

preds = gnb.predict(xtest)
print(confusion_matrix(ytest, preds))

train_preds = gnb.predict(xtrain_mod)
print('Train Accuracy Score: ',accuracy_score(ytrain_mod, train_preds))
print('Train F1 Score: ',f1_score(ytrain_mod, train_preds))

print('Accuracy Score: ',accuracy_score(ytest, preds))
print('F1 Score: ',f1_score(ytest,preds))

We observe that there is a significant increase in True negatives, but there are also a significant increase in false negatives

Earlier, we had noticed that Gender and Residence_type do not have an impact on stroke. 
<br></br>
To strengthen our classifier, they can be dropped from train and test X dataset

In [None]:
#Removing non-impactful features
xtrain_mod_dropped = xtrain_mod.drop(['gender', 'Residence_type'], axis = 1)
xtest_dropped = xtest.drop(['gender', 'Residence_type'], axis = 1)

gnb = GaussianNB()
gnb.fit(xtrain_mod_dropped, ytrain_mod)

preds = gnb.predict(xtest_dropped)
print(confusion_matrix(ytest, preds))

train_preds = gnb.predict(xtrain_mod_dropped)
print('Train Accuracy Score: ',accuracy_score(ytrain_mod, train_preds))
print('Train F1 Score: ',f1_score(ytrain_mod, train_preds))

print('Accuracy Score: ',accuracy_score(ytest, preds))
print('F1 Score: ',f1_score(ytest,preds))

### K Nearest Neighbour Classifier

In [None]:
#Applying on non-SMOTE dataset (the model is fitted on xtrain/ytrain, not on the resampled xtrain_mod/ytrain_mod)
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): KNN is distance-based and no feature scaling is applied before fitting —
# confirm whether scaling was intended.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(xtrain, ytrain)

# Predictions and confusion matrix on the held-out test set.
preds = knn.predict(xtest)
print(confusion_matrix(ytest, preds))

# Scores on the training data, for comparison with the test scores below.
train_preds = knn.predict(xtrain)
print('Train Accuracy Score: ',accuracy_score(ytrain, train_preds))
print('Train F1 Score: ',f1_score(ytrain, train_preds))

print('Accuracy Score: ',accuracy_score(ytest, preds))
print('F1 Score: ',f1_score(ytest,preds))

In [None]:
#Applying on SMOTE dataset

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2, p=1)

knn.fit(xtrain_mod, ytrain_mod)

preds = knn.predict(xtest)
print(confusion_matrix(ytest, preds))

train_preds = knn.predict(xtrain_mod)
print('Train Accuracy Score: ',accuracy_score(ytrain_mod, train_preds))
print('Train F1 Score: ',f1_score(ytrain_mod, train_preds))

print('Accuracy Score: ',accuracy_score(ytest, preds))
print('F1 Score: ',f1_score(ytest,preds))

## References