# Import libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import StandardScaler

# Data Collection and Analysis

In [57]:
# Loading the cancer patient data sets to a pandas DataFrame
strokes= pd.read_csv(r'F:\Machine Learning Project\healthcare-dataset-stroke-data.csv')

In [58]:
# Print the first 5 rows of the dataset 
strokes.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [59]:
# number of rows and columns in this dataset
strokes.shape

(5110, 12)

In [60]:
# Getting some informations about the dataset
strokes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [61]:
# Checking for missing values
strokes.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [62]:
# Statistical measure about the data
strokes.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [63]:
strokes['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [64]:
# mean value of bmi column
strokes['bmi'].mean()

28.893236911794666

In [92]:
# filling the missing value in bmi column with Mean value 
strokes['bmi'].fillna(strokes['bmi'].mean() ,inplace = True)

In [66]:
# Checking for missing values 
strokes.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [26]:
le = LabelEncoder()

In [67]:
strokes['gender']=le.fit_transform(strokes['gender'])

strokes['ever_married']=le.fit_transform(strokes['ever_married'])

strokes['work_type']=le.fit_transform(strokes['work_type'])

strokes['Residence_type']=le.fit_transform(strokes['Residence_type'])

strokes['smoking_status']=le.fit_transform(strokes['smoking_status'])


In [68]:
strokes.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,51676,0,61.0,0,0,1,3,0,202.21,28.893237,2,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1


In [69]:
# separating the data and labes 
X = strokes.drop(columns = 'stroke',axis = 1)
Y = strokes['stroke']

In [70]:
print(X)

         id  gender   age  hypertension  heart_disease  ever_married  \
0      9046       1  67.0             0              1             1   
1     51676       0  61.0             0              0             1   
2     31112       1  80.0             0              1             1   
3     60182       0  49.0             0              0             1   
4      1665       0  79.0             1              0             1   
...     ...     ...   ...           ...            ...           ...   
5105  18234       0  80.0             1              0             1   
5106  44873       0  81.0             0              0             1   
5107  19723       0  35.0             0              0             1   
5108  37544       1  51.0             0              0             1   
5109  44679       0  44.0             0              0             1   

      work_type  Residence_type  avg_glucose_level        bmi  smoking_status  
0             2               1             228.69  36.

In [71]:
print(Y)


0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 5110, dtype: int64


In [72]:
scaler = StandardScaler()
scaler.fit(X)
standardized_data = scaler.transform(X)

In [73]:
print(standardized_data)

[[-1.29831203  1.18807255  1.05143428 ...  2.70637544  1.00123401
  -0.35178071]
 [ 0.71637149 -0.840344    0.78607007 ...  2.12155854  0.
   0.58155233]
 [-0.25547819  1.18807255  1.62639008 ... -0.0050283   0.46857725
   0.58155233]
 ...
 [-0.79371959 -0.840344   -0.36384151 ... -0.51144264  0.22173632
   0.58155233]
 [ 0.04849658  1.18807255  0.34379639 ...  1.32825706 -0.4278451
  -0.35178071]
 [ 0.38569496 -0.840344    0.03420481 ... -0.46086746 -0.34989533
  -1.28511375]]


In [74]:
X = standardized_data
Y = strokes['stroke']

# Train Test Split 

In [75]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, stratify = Y, random_state = 2)

In [76]:
print(X.shape, X_train.shape, X_test.shape)

(5110, 11) (3577, 11) (1533, 11)


# Support Vector Machine

In [77]:
classifier = svm.SVC(kernel = 'linear')

In [78]:
# Training the support vector Machine Classifier 
classifier.fit(X_train , Y_train)

SVC(kernel='linear')

In [79]:
# accuracy score on the training data 
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction , Y_train)

In [80]:
print('Accuracy score of the training data : ',training_data_accuracy)

Accuracy score of the training data :  0.9513558848196813


In [81]:
# accuracy score on the test data 
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction , Y_test)

In [82]:
print('Accuracy score of the test data : ',test_data_accuracy)

Accuracy score of the test data :  0.9510763209393346


F1 SCORE

In [83]:
# F1_Score for training data 
f1_train = f1_score(Y_train , predict_X_train)
print('Training data f1 : ',f1_train)

Training data f1 :  1.0


In [84]:
 # F1_Score for testing data 
f1_test = f1_score(Y_test , predict_X_test)
print('Testing data f1 : ',f1_test)

Testing data f1 :  0.0


# Random Forests

In [85]:
model = RandomForestClassifier()

In [86]:
model.fit(X_train , Y_train)

RandomForestClassifier()

In [87]:
# Accuracy score on the training data 
predict_X_train = model.predict(X_train )
training_accuracy = accuracy_score(predict_X_train , Y_train)

In [45]:
print('Accuracy score on training data : ',training_accuracy)

Accuracy score on training data :  1.0


In [88]:
# Accuracy score on the test data
predict_X_test = model.predict(X_test)
test_accuracy = accuracy_score(predict_X_test , Y_test)

In [89]:
print('Accuracy score on test data : ',test_accuracy)

Accuracy score on test data :  0.9510763209393346


In [90]:
# F1_Score for training data 
f1_train = f1_score(Y_train , predict_X_train)
print('Training data f1 : ',f1_train)

Training data f1 :  1.0


In [91]:
 # F1_Score for testing data 
f1_test = f1_score(Y_test , predict_X_test)
print('Testing data f1 : ',f1_test)

Testing data f1 :  0.0
