In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
# loading the stroke dataset to a pandas DataFrame
df = pd.read_csv(r"healthcare-dataset-stroke-data.csv")


In [3]:
#display the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   a                  5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [4]:
df.tail()

Unnamed: 0,a,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


In [5]:
df.shape

(5110, 12)

In [6]:
df.isnull().sum()

a                      0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
df['age'] = df['age'].astype(int)

# Display the DataFrame after converting the 'age' column
print(df)

          a  gender  age  hypertension  heart_disease ever_married  \
0      9046    Male   67             0              1          Yes   
1     51676  Female   61             0              0          Yes   
2     31112    Male   80             0              1          Yes   
3     60182  Female   49             0              0          Yes   
4      1665  Female   79             1              0          Yes   
...     ...     ...  ...           ...            ...          ...   
5105  18234  Female   80             1              0          Yes   
5106  44873  Female   81             0              0          Yes   
5107  19723  Female   35             0              0          Yes   
5108  37544    Male   51             0              0          Yes   
5109  44679  Female   44             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0           Private          Urban             228.69  36.6  formerly smoked   

In [8]:
df.head(10)

Unnamed: 0,a,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [9]:
df.work_type.value_counts()

Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64

In [10]:
df.drop("a",axis=1,inplace=True)

In [11]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the "Gender" column
df["gender"] = label_encoder.fit_transform(df["gender"])

df["ever_married"] = label_encoder.fit_transform(df["ever_married"])

df["work_type"] = label_encoder.fit_transform(df["work_type"])

df["Residence_type"] = label_encoder.fit_transform(df["Residence_type"])


df["smoking_status"] = label_encoder.fit_transform(df["smoking_status"])

In [12]:
df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67,0,1,1,2,1,228.69,36.6,1,1
1,0,61,0,0,1,3,0,202.21,,2,1
2,1,80,0,1,1,2,0,105.92,32.5,2,1
3,0,49,0,0,1,2,1,171.23,34.4,3,1
4,0,79,1,0,1,3,0,174.12,24.0,2,1
5,1,81,0,0,1,2,1,186.21,29.0,1,1
6,1,74,1,1,1,2,0,70.09,27.4,2,1
7,0,69,0,0,0,2,1,94.39,22.8,2,1
8,0,59,0,0,1,2,0,76.15,,0,1
9,0,78,0,0,1,2,1,58.57,24.2,0,1


In [13]:
# Display the resulting DataFramea
print(df)
print(df["ever_married"])
df.columns

bmi_mean = df['bmi'].mean()

# Replace NaN values with the mean
df['bmi'].fillna(bmi_mean, inplace=True)


      gender  age  hypertension  heart_disease  ever_married  work_type  \
0          1   67             0              1             1          2   
1          0   61             0              0             1          3   
2          1   80             0              1             1          2   
3          0   49             0              0             1          2   
4          0   79             1              0             1          3   
...      ...  ...           ...            ...           ...        ...   
5105       0   80             1              0             1          2   
5106       0   81             0              0             1          3   
5107       0   35             0              0             1          3   
5108       1   51             0              0             1          2   
5109       0   44             0              0             1          0   

      Residence_type  avg_glucose_level   bmi  smoking_status  stroke  
0                  1       

In [14]:
df.dtypes

gender                 int64
age                    int64
hypertension           int64
heart_disease          int64
ever_married           int64
work_type              int64
Residence_type         int64
avg_glucose_level    float64
bmi                  float64
smoking_status         int64
stroke                 int64
dtype: object

In [15]:
df.head(4)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67,0,1,1,2,1,228.69,36.6,1,1
1,0,61,0,0,1,3,0,202.21,28.893237,2,1
2,1,80,0,1,1,2,0,105.92,32.5,2,1
3,0,49,0,0,1,2,1,171.23,34.4,3,1


In [16]:
df.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,0.414286,43.215264,0.097456,0.054012,0.656164,2.16771,0.508023,106.147677,28.893237,1.376908,0.048728
std,0.493044,22.633866,0.296607,0.226063,0.475034,1.090293,0.499985,45.28356,7.698018,1.071534,0.21532
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.12,10.3,0.0,0.0
25%,0.0,25.0,0.0,0.0,0.0,2.0,0.0,77.245,23.8,0.0,0.0
50%,0.0,45.0,0.0,0.0,1.0,2.0,1.0,91.885,28.4,2.0,0.0
75%,1.0,61.0,0.0,0.0,1.0,3.0,1.0,114.09,32.8,2.0,0.0
max,2.0,82.0,1.0,1.0,1.0,4.0,1.0,271.74,97.6,3.0,1.0


In [17]:
df['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [18]:
X = df.drop(columns='stroke', axis=1)
Y = df['stroke']

In [19]:
print(X)

      gender  age  hypertension  heart_disease  ever_married  work_type  \
0          1   67             0              1             1          2   
1          0   61             0              0             1          3   
2          1   80             0              1             1          2   
3          0   49             0              0             1          2   
4          0   79             1              0             1          3   
...      ...  ...           ...            ...           ...        ...   
5105       0   80             1              0             1          2   
5106       0   81             0              0             1          3   
5107       0   35             0              0             1          3   
5108       1   51             0              0             1          2   
5109       0   44             0              0             1          0   

      Residence_type  avg_glucose_level        bmi  smoking_status  
0                  1          

In [20]:
print(Y)

0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 5110, dtype: int64


# Train-Test-Split

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [22]:
print(X.shape, X_train.shape, X_test.shape)

(5110, 10) (4088, 10) (1022, 10)


# LogisticRegression

In [23]:
model = LogisticRegression()

In [24]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [26]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9510763209393346


In [27]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [28]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.9510763209393346


In [29]:
df.head(1)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67,0,1,1,2,1,228.69,36.6,1,1


In [30]:
input_data = (	1,	67,	0,	1,	1,	2	,1,	228.69,	36.6,	1)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not having stroke')
else:
  print('The Person has got a stroke')

[0]
The Person does not having stroke




# DecisionTreeClassifier

In [31]:
from sklearn.tree import DecisionTreeClassifier

In [32]:
# Create and fit the Decision Tree model
dtree = DecisionTreeClassifier()
dtree.fit(X_train, Y_train)

In [33]:
# Make predictions on the test set
y_pred = dtree.predict(X_test)

In [34]:
# Calculate accuracy of the model
accuracy = accuracy_score(Y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9129158512720157


In [35]:
input_data = (	1,	67,	0,	1,	1,	2	,1,	228.69,	36.6,	1)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = dtree.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not having stroke')
else:
  print('The Person has got a stroke')

[1]
The Person has got a stroke




# KNeighborsClassifier

In [36]:
# Create and fit the KNN model with k=3 (you can adjust k as needed)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)

In [37]:
# Make predictions on the test set
y_pred = knn.predict(X_test)

In [38]:
# Calculate accuracy of the model
accuracy = accuracy_score(Y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9373776908023483


In [39]:
input_data = (	1,	67,	0,	1,	1,	2	,1,	228.69,	36.6,	1)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = knn.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not having stroke')
else:
  print('The Person has got a stroke')

[0]
The Person does not having stroke




# svm.SVC

In [40]:
classifier = svm.SVC(kernel='linear')

In [41]:
#training the support vector Machine Classifier
classifier.fit(X_train, Y_train)

In [42]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

print('Accuracy score of the training data : ', training_data_accuracy)

# accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

print('Accuracy score of the test data : ', test_data_accuracy)



Accuracy score of the training data :  0.951320939334638
Accuracy score of the test data :  0.9510763209393346


In [43]:
input_data = (	1,	67,	0,	1,	1,	2	,1,	228.69,	36.6,	1)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = classifier.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not having stroke')
else:
  print('The Person has got a stroke')

[0]
The Person does not having stroke




In [44]:
df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [45]:
df.tail()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,0,80,1,0,1,2,1,83.75,28.893237,2,0
5106,0,81,0,0,1,3,1,125.2,40.0,2,0
5107,0,35,0,0,1,3,0,82.99,30.6,2,0
5108,1,51,0,0,1,2,0,166.29,25.6,1,0
5109,0,44,0,0,1,0,1,85.28,26.2,0,0
