<a href="https://colab.research.google.com/github/sujalbindra1012/MULTIPLE_DISEASE_PREDICTION_MODELS/blob/Stroke_Disease_Prediction/Stroke_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMPORTING THE DEPENDENCIES

In [161]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

DATA COLLECTION AND PREPROCESSING

In [162]:
#LOADING CSV DATA TO PANDAS DATA FRAME TO STRUCTURE TABLE
data = pd.read_csv('/content/cleaned_healthcare_dataset.csv')

In [163]:
# Handle missing values in 'bmi' by imputing the mean value
imputer = SimpleImputer(strategy='mean')
data['bmi'] = imputer.fit_transform(data[['bmi']])

convert categorical columns to numeric using label encodeing


In [164]:
label_encoders = {}
for column in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])


In [165]:
# PRINT THE FIRST 5 ROWS OF DATASET
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
2,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
3,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1
4,56669,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [166]:
#print the below 5 rows of dataset
data.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
4904,14180,0,13.0,0,0,0,4,0,103.08,18.6,0,0
4905,44873,0,81.0,0,0,1,3,1,125.2,40.0,2,0
4906,19723,0,35.0,0,0,1,3,0,82.99,30.6,2,0
4907,37544,1,51.0,0,0,1,2,0,166.29,25.6,1,0
4908,44679,0,44.0,0,0,1,0,1,85.28,26.2,0,0


In [167]:
#number of rows and columns
data.shape

(4909, 12)

In [168]:
#getting some info about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4909 entries, 0 to 4908
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4909 non-null   int64  
 1   gender             4909 non-null   int64  
 2   age                4909 non-null   float64
 3   hypertension       4909 non-null   int64  
 4   heart_disease      4909 non-null   int64  
 5   ever_married       4909 non-null   int64  
 6   work_type          4909 non-null   int64  
 7   Residence_type     4909 non-null   int64  
 8   avg_glucose_level  4909 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     4909 non-null   int64  
 11  stroke             4909 non-null   int64  
dtypes: float64(3), int64(9)
memory usage: 460.3 KB


In [169]:
#checking for missing values
data.isnull().sum()

Unnamed: 0,0
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,0


In [170]:
#statistical measures about the data
data.describe()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0
mean,37064.313506,0.410063,42.865374,0.091872,0.049501,0.652679,2.170096,0.507232,105.30515,28.893237,1.375433,0.042575
std,20995.098457,0.492309,22.555115,0.288875,0.216934,0.476167,1.092593,0.499999,44.424341,7.854067,1.067322,0.201917
min,77.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,55.12,10.3,0.0,0.0
25%,18605.0,0.0,25.0,0.0,0.0,0.0,2.0,0.0,77.07,23.5,0.0,0.0
50%,37608.0,0.0,44.0,0.0,0.0,1.0,2.0,1.0,91.68,28.1,2.0,0.0
75%,55220.0,1.0,60.0,0.0,0.0,1.0,3.0,1.0,113.57,33.1,2.0,0.0
max,72940.0,2.0,82.0,1.0,1.0,1.0,4.0,1.0,271.74,97.6,3.0,1.0


In [171]:
#checking the distribution of target variable
data['stroke'].value_counts()

Unnamed: 0_level_0,count
stroke,Unnamed: 1_level_1
0,4700
1,209


1--> STROKE DISEASE
0--> NO STROKE DISEASE

In [172]:
#SPLITTING FEATURES AND TARGET
X=data.drop(columns='stroke',axis=1)
Y=data['stroke']

In [173]:
print(X)

         id  gender   age  hypertension  heart_disease  ever_married  \
0      9046       1  67.0             0              1             1   
1     31112       1  80.0             0              1             1   
2     60182       0  49.0             0              0             1   
3      1665       0  79.0             1              0             1   
4     56669       1  81.0             0              0             1   
...     ...     ...   ...           ...            ...           ...   
4904  14180       0  13.0             0              0             0   
4905  44873       0  81.0             0              0             1   
4906  19723       0  35.0             0              0             1   
4907  37544       1  51.0             0              0             1   
4908  44679       0  44.0             0              0             1   

      work_type  Residence_type  avg_glucose_level   bmi  smoking_status  
0             2               1             228.69  36.6    

In [174]:
print(Y)

0       1
1       1
2       1
3       1
4       1
       ..
4904    0
4905    0
4906    0
4907    0
4908    0
Name: stroke, Length: 4909, dtype: int64


In [175]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [176]:
print(X.shape,X_train.shape,X_test.shape)

(4909, 11) (3927, 11) (982, 11)


  MODEL TRAINING

  LOGISTIC REGRESSION MODEL

In [177]:
model = RandomForestClassifier(class_weight='balanced', random_state=42)

In [178]:
#training the logistic regression model with training data
model.fit(X_train,Y_train)

MODEL EVALUATION ACCURACY SCORE

In [179]:
#accuracy on training data
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)

In [180]:
print('accuracy on training data : ', training_data_accuracy)

accuracy on training data :  1.0


In [181]:
#accuracy on testing data
X_test_prediction=model.predict(X_test)
test_data_accuracy=accuracy_score(X_test_prediction,Y_test)

In [182]:
print('accuracy on test data : ', test_data_accuracy)

accuracy on test data :  0.9572301425661914


In [183]:
input_data = (56543,'Female',70,0,0,'Yes','Private','Rural',69.04,35.9,'formerly smoked')

# Convert the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data, dtype=object)

# Use the LabelEncoders that were used during model training
input_data_as_numpy_array[1] = label_encoders['gender'].transform([input_data[1]])[0]
input_data_as_numpy_array[5] = label_encoders['ever_married'].transform([input_data[5]])[0]
input_data_as_numpy_array[6] = label_encoders['work_type'].transform([input_data[6]])[0]
input_data_as_numpy_array[7] = label_encoders['Residence_type'].transform([input_data[7]])[0]
input_data_as_numpy_array[10] = label_encoders['smoking_status'].transform([input_data[10]])[0]

# Handle missing value for 'bmi' (index 9) using the mean value or the strategy you used during training
input_data_as_numpy_array[9] = imputer.transform([[input_data_as_numpy_array[9]]])[0][0]

# Convert the entire array to float
input_data_as_numpy_array = input_data_as_numpy_array.astype(float)

# Reshape the numpy array as we are predicting for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Make the prediction
prediction = model.predict(input_data_reshaped)

# Output the result
if prediction[0] == 1:
    print('The person has stroke disease')
else:
    print('The person does not have stroke disease')

The person does not have stroke disease


