Mounting the Google Drive

In [2]:
# Mount Google Drive inside the Colab runtime so that files stored on the
# drive can be read from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Data Cleaning

In [1]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the stroke dataset from the mounted Google Drive into a DataFrame.
# NOTE(review): the path is hard-coded to one user's Drive layout.
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/stroke.csv')

In [4]:
# Display the first 10 rows of the DataFrame to get a first look at the
# columns and to spot missing values.
df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [None]:
# Print the dtype and non-null count of every column; a non-null count
# below the number of rows reveals how many values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [None]:
# Fill the missing BMI values with the mean of the column.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: that chained in-place
# call operates on an intermediate object, triggers a FutureWarning in
# recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [None]:
# Display the first 10 rows again to check that the missing BMI values
# have been replaced.
df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,28.893237,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [None]:
# Treat the "Unknown" smoking status as a missing value: those entries are
# replaced with NaN so that they can be imputed afterwards.
is_unknown = df['smoking_status'] == 'Unknown'
df['smoking_status'] = df['smoking_status'].mask(is_unknown, np.nan)

In [None]:
# Count how many smoking_status entries are now missing.
df['smoking_status'].isna().sum()

1544

In [None]:
# Build an imputer that replaces missing values (NaN) with the most
# frequent value of the column, and fit it on smoking_status.
from sklearn.impute import SimpleImputer
smoke_status = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
smoke_status.fit(df[['smoking_status']])

In [None]:
# Apply the fitted imputer and write the filled column back into df.
df[['smoking_status']] = smoke_status.transform(df[['smoking_status']])

In [None]:
# Confirm that no missing smoking_status values remain after imputation.
df['smoking_status'].isna().sum()

0

In [None]:
# Display the first 10 rows to inspect the imputed smoking_status column.
df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,28.893237,never smoked,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,never smoked,1


In [None]:
# Encode gender numerically: 'Male' becomes 0, every other value becomes 1.
# NOTE(review): any value that is not exactly 'Male' is coded as 1, and
# re-running this cell on the already encoded column sets every row to 1.
df["gender"] = df["gender"].ne('Male').astype('int64')
df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,0,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,1,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,0,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,1,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,1,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,0,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,0,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,1,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,1,59.0,0,0,Yes,Private,Rural,76.15,28.893237,never smoked,1
9,60491,1,78.0,0,0,Yes,Private,Urban,58.57,24.2,never smoked,1


In [None]:
# Encode smoking_status as integer category codes (in the displayed output:
# formerly smoked -> 0, never smoked -> 1, smokes -> 2).
df['smoking_status'] = df['smoking_status'].astype('category').cat.codes
df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,0,67.0,0,1,Yes,Private,Urban,228.69,36.6,0,1
1,51676,1,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,1,1
2,31112,0,80.0,0,1,Yes,Private,Rural,105.92,32.5,1,1
3,60182,1,49.0,0,0,Yes,Private,Urban,171.23,34.4,2,1
4,1665,1,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,1,1
5,56669,0,81.0,0,0,Yes,Private,Urban,186.21,29.0,0,1
6,53882,0,74.0,1,1,Yes,Private,Rural,70.09,27.4,1,1
7,10434,1,69.0,0,0,No,Private,Urban,94.39,22.8,1,1
8,27419,1,59.0,0,0,Yes,Private,Rural,76.15,28.893237,1,1
9,60491,1,78.0,0,0,Yes,Private,Urban,58.57,24.2,1,1


In [None]:
# Separate the data into the feature matrix x (the selected predictor
# columns) and the target vector y (the stroke label), both as NumPy arrays.
columns = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
]
x = df[columns].to_numpy()
y = df['stroke'].to_numpy()

In [None]:
# Split the data into training (80%) and test (20%) sets; the fixed
# random_state makes the shuffle reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=0,
)

In [None]:
# Standardise every feature to zero mean and unit variance. The scaler is
# fitted on the training split only and then applied to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Linear Regression

In [None]:
# Linear regression fitted on the scaled training features.
# The model has to be fitted on the same 2-D feature matrix layout it is
# later asked to predict/score on. Fitting on `df.age` (a 1-D Series taken
# from the full, unscaled DataFrame) raises a ValueError in scikit-learn,
# would not match the 7-column X_test_scaled, and would let test rows leak
# into training.
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression().fit(X_train_scaled, y_train)
linear_reg.predict(X_test_scaled)

array([ 0.2057962 , -0.0210482 ,  0.07126049, ...,  0.03935853,
        0.15313588,  0.03668612])

In [None]:
# Score (R^2 for a regressor) of linear_reg on the scaled training split.
linear_reg.score(X_train_scaled, y_train)

0.0764302330512645

In [None]:
# Score (R^2 for a regressor) of linear_reg on the scaled test split.
linear_reg.score(X_test_scaled, y_test)

0.0669823494466969

Evaluation: Because our dataset poses a binary classification problem, linear regression can still be applied to it, but it is not as accurate and does not give results as useful as Logistic Regression and tree-based models.

Singular LR

In [None]:
# Linear regression for the "Singular LR" section, fitted on the scaled
# training split and used to predict the scaled test split.
# NOTE(review): despite the section title, the model is fitted on every
# column of X_train_scaled rather than on a single feature.
linear_reg = LinearRegression()
linear_reg.fit(X_train_scaled, y_train)
linear_reg.predict(X_test_scaled)

array([ 0.2057962 , -0.0210482 ,  0.07126049, ...,  0.03935853,
        0.15313588,  0.03668612])

In [None]:
# Score (R^2 for a regressor) of linear_reg on the scaled training split.
linear_reg.score(X_train_scaled, y_train)

0.0764302330512645

In [None]:
# Score (R^2 for a regressor) of linear_reg on the scaled test split.
linear_reg.score(X_test_scaled, y_test)

0.0669823494466969

Evaluation: Just like the result above, regression models of this kind are designed for continuous numerical targets, whereas the Stroke Prediction Dataset has a categorical/binary outcome. As a result, the model gave us pretty low scores on both the training and test sets.

Multiple LR

In [None]:
# Multiple linear regression: fit one linear model on all scaled training
# features at once.
m_linear_reg = LinearRegression().fit(X_train_scaled, y_train)

In [None]:
# Show the fitted parameters: one coefficient per feature column of
# X_train_scaled, plus the intercept.
print("Coefficients:", m_linear_reg.coef_)
print("Intercept:", m_linear_reg.intercept_)

Coefficients: [-0.00108272  0.0477177   0.00780058  0.0167788   0.01125943 -0.01265398
 -0.00308585]
Intercept: 0.04770058708414866


In [None]:
# Predictions of the multiple linear regression for the scaled test split.
m_linear_reg.predict(X_test_scaled)

array([ 0.2057962 , -0.0210482 ,  0.07126049, ...,  0.03935853,
        0.15313588,  0.03668612])

In [None]:
# Score (R^2 for a regressor) of m_linear_reg on the scaled training split.
m_linear_reg.score(X_train_scaled, y_train)

0.0764302330512645

In [None]:
# Score (R^2 for a regressor) of m_linear_reg on the scaled test split.
m_linear_reg.score(X_test_scaled, y_test)

0.0669823494466969

Evaluation: Regression models of this kind are designed for continuous numerical targets, whereas the Stroke Prediction Dataset has a categorical/binary outcome. As a result, the model gave us pretty low scores on both the training and test sets.


Polynomial LR

In [None]:
#model for polynomial linear regression
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Polynomial regression: expand the scaled training features with all
# degree-2 polynomial terms, then fit a linear model on the expanded matrix.
X_train_poly = PolynomialFeatures(degree=2).fit_transform(X_train_scaled)
poly_reg = LinearRegression().fit(X_train_poly, y_train)

In [None]:
# Predict on the test split after expanding it with degree-2 polynomial
# features (a new PolynomialFeatures transformer is created for this call).
poly_reg.predict(PolynomialFeatures(degree=2).fit_transform(X_test_scaled))

array([ 0.38643646, -0.00704193,  0.05145264, ...,  0.01137543,
        0.18302155,  0.00415039])

In [None]:
# Score (R^2 for a regressor) of poly_reg on the degree-2 expanded training split.
poly_reg.score(PolynomialFeatures(degree=2).fit_transform(X_train_scaled), y_train)

0.10532030945314774

In [None]:
# Score (R^2 for a regressor) of poly_reg on the degree-2 expanded test split.
poly_reg.score(PolynomialFeatures(degree=2).fit_transform(X_test_scaled), y_test)

0.07466041955257685

Evaluation: Regression models of this kind are designed for continuous numerical targets, whereas the Stroke Prediction Dataset has a categorical/binary outcome. Even with polynomial features, the model still gave us pretty low scores on both the training and test sets.

Logistic Regression

In [None]:
# Logistic regression classifier: fit on the scaled training split, then
# predict the class labels of the scaled test split.
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
log.fit(X_train_scaled, y_train)
log.predict(X_test_scaled)

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Accuracy of the logistic regression on the scaled training split.
log.score(X_train_scaled, y_train)

0.9522994129158513

In [None]:
# Accuracy of the logistic regression on the scaled test split.
log.score(X_test_scaled, y_test)

0.9471624266144814

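Evaluation: Following our earlier evaluations of the linear regression models, here we can clearly see that Logistic Regression is directly applicable to our chosen dataset (binary classification). Our results gave us a **training score of 95%** and a **test score of 95%** as well, which at first glance suggests that the model performs well at distinguishing between people who are prone to strokes and those who are not. Note, however, that only about 5% of the records are stroke cases, so a model that always predicts "no stroke" reaches roughly the same accuracy; accuracy alone therefore does not show that the model can actually identify stroke cases.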

Decision Tree

In [None]:
# Decision tree classifier: fit on the scaled training split, then predict
# the class labels of the scaled test split.
# random_state is fixed (matching the value used for train_test_split) so
# that the fitted tree and its scores are reproducible between runs.
from sklearn import tree
clf = tree.DecisionTreeClassifier(random_state=0).fit(X_train_scaled, y_train)
clf.predict(X_test_scaled)

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Accuracy of the decision tree on the scaled training split.
clf.score(X_train_scaled, y_train)

1.0

In [None]:
# Accuracy of the decision tree on the scaled test split.
clf.score(X_test_scaled, y_test)

0.9099804305283757

Evaluation: The model's **training accuracy** is 1.0, which means the tree fit the training data perfectly and got every training prediction correct. The **test accuracy**, on the other hand, is 91%, which is still relatively high: on new, unseen data the model was still correct most of the time. The gap between the two scores indicates that the tree overfits the training data.

Random Forest

In [None]:
# Random forest classifier: fit on the scaled training split, then predict
# the class labels of the scaled test split.
# random_state is fixed (matching the value used for train_test_split) so
# that the bootstrap samples, the fitted forest and its scores are
# reproducible between runs.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0).fit(X_train_scaled, y_train)
rf.predict(X_test_scaled)

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Accuracy of the random forest on the scaled training split.
rf.score(X_train_scaled, y_train)

1.0

In [None]:
# Accuracy of the random forest on the scaled test split.
rf.score(X_test_scaled, y_test)

0.9471624266144814

Evaluation: After running the random forest model, we noticed that its training accuracy (1.0) is the same as the decision tree's, and its test accuracy (about 94.7%) is the same as that of logistic regression.