Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Processing

In [2]:
# loading the csv data to a Pandas DataFrame
cerebral_data = pd.read_csv("dataset.csv")

In [3]:
# print first 5 rows of the data
cerebral_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [4]:
# print last 5 rows of the data
cerebral_data.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
43395,56196,Female,10.0,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,5450,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,28375,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,27973,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0
43399,36271,Female,82.0,0,0,Yes,Private,Urban,79.48,20.6,never smoked,0


In [5]:
# number of rows and columns in the dataset
cerebral_data.shape

(43400, 12)

In [6]:
# getting some info about the data
cerebral_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 43400 non-null  int64  
 1   gender             43400 non-null  object 
 2   age                43400 non-null  float64
 3   hypertension       43400 non-null  int64  
 4   heart_disease      43400 non-null  int64  
 5   ever_married       43400 non-null  object 
 6   work_type          43400 non-null  object 
 7   Residence_type     43400 non-null  object 
 8   avg_glucose_level  43400 non-null  float64
 9   bmi                41938 non-null  float64
 10  smoking_status     30108 non-null  object 
 11  stroke             43400 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 4.0+ MB


In [7]:
# checking for missing values
cerebral_data.isnull().sum()

id                       0
gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64

In [8]:
# handling missing values
cerebral_data = cerebral_data.dropna()
cerebral_data.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [9]:
cerebral_data = cerebral_data[['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']]
cerebral_data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke
1,Male,58.0,1,0,Yes,Private,87.96,39.2,never smoked,0
3,Female,70.0,0,0,Yes,Private,69.04,35.9,formerly smoked,0
6,Female,52.0,0,0,Yes,Private,77.59,17.7,formerly smoked,0
7,Female,75.0,0,1,Yes,Self-employed,243.53,27.0,never smoked,0
8,Female,32.0,0,0,Yes,Private,77.67,32.3,smokes,0


In [10]:
# checking variations of data
cerebral_data['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [11]:
# labeling string type data as processable 
le_gender = LabelEncoder()
cerebral_data['gender'] = le_gender.fit_transform(cerebral_data['gender'])
cerebral_data['gender'].unique()

array([1, 0, 2])

In [12]:
# checking variations of data
cerebral_data['ever_married'].unique()

array(['Yes', 'No'], dtype=object)

In [13]:
# labeling string type data as processable 
le_marriage = LabelEncoder()
cerebral_data['ever_married'] = le_marriage.fit_transform(cerebral_data['ever_married'])
cerebral_data['ever_married'].unique()

array([1, 0])

In [14]:
# checking variations of data
cerebral_data['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [15]:
# labeling string type data as processable 
le_work = LabelEncoder()
cerebral_data['work_type'] = le_work.fit_transform(cerebral_data['work_type'])
cerebral_data['work_type'].unique()

array([2, 3, 0, 4, 1])

In [16]:
# checking variations of data
cerebral_data['work_type'].unique()

array([2, 3, 0, 4, 1])

In [17]:
# labeling string type data as processable 
le_smoking = LabelEncoder()
cerebral_data['smoking_status'] = le_smoking.fit_transform(cerebral_data['smoking_status'])
cerebral_data['smoking_status'].unique()

array([1, 0, 2])

In [18]:
# statistical measures about the data
cerebral_data.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke
count,29072.0,29072.0,29072.0,29072.0,29072.0,29072.0,29072.0,29072.0,29072.0,29072.0
mean,0.386179,47.671746,0.111482,0.052146,0.746079,1.929313,106.403225,30.054166,0.969971,0.01885
std,0.487375,18.73449,0.314733,0.222326,0.435261,0.916367,45.268512,7.193908,0.676357,0.135997
min,0.0,10.0,0.0,0.0,0.0,0.0,55.01,10.1,0.0,0.0
25%,0.0,32.0,0.0,0.0,0.0,2.0,77.6275,25.0,1.0,0.0
50%,0.0,48.0,0.0,0.0,1.0,2.0,92.13,28.9,1.0,0.0
75%,1.0,62.0,0.0,0.0,1.0,2.0,113.91,33.9,1.0,0.0
max,2.0,82.0,1.0,1.0,1.0,4.0,291.05,92.0,2.0,1.0


In [19]:
cerebral_data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke
1,1,58.0,1,0,1,2,87.96,39.2,1,0
3,0,70.0,0,0,1,2,69.04,35.9,0,0
6,0,52.0,0,0,1,2,77.59,17.7,0,0
7,0,75.0,0,1,1,3,243.53,27.0,1,0
8,0,32.0,0,0,1,2,77.67,32.3,2,0


In [20]:
# checking the distribution of target variable
cerebral_data['stroke'].value_counts()

0    28524
1      548
Name: stroke, dtype: int64

1 --> Cerebral Stroke
0 --> Healthy

Splitting the features & Target

In [21]:
X = cerebral_data.drop(columns='stroke', axis=1)
Y = cerebral_data['stroke']

In [22]:
print(X)

       gender   age  hypertension  heart_disease  ever_married  work_type  \
1           1  58.0             1              0             1          2   
3           0  70.0             0              0             1          2   
6           0  52.0             0              0             1          2   
7           0  75.0             0              1             1          3   
8           0  32.0             0              0             1          2   
...       ...   ...           ...            ...           ...        ...   
43395       0  10.0             0              0             0          4   
43396       0  56.0             0              0             1          0   
43397       0  82.0             1              0             1          2   
43398       1  40.0             0              0             1          2   
43399       0  82.0             0              0             1          2   

       avg_glucose_level   bmi  smoking_status  
1                  87.96  

In [23]:
print(Y)

1        0
3        0
6        0
7        0
8        0
        ..
43395    0
43396    0
43397    0
43398    0
43399    0
Name: stroke, Length: 29072, dtype: int64


Splitting the data into Training & Testing

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, stratify=Y, random_state=2)

In [25]:
print(X.shape, X_train.shape, X_test.shape)

(29072, 9) (23257, 9) (5815, 9)


Model Training

Logistic Regression

In [26]:
model = LogisticRegression()

In [27]:
# training the LogisticRegression model with training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

Model Evaluation

Accuracy Score

In [28]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [29]:
print(f'Accuracy on Training Data: {round(100 * training_data_accuracy, 2)}%')

Accuracy on Training Data: 98.12%


In [30]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [31]:
print(f'Accuracy on Test Data: {round(100 * test_data_accuracy, 2)}%')

Accuracy on Test Data: 98.11%


Buidling a Predicting System

In [32]:
input_data = (1, 58.0, 1, 0, 1, 2, 87.96, 39.2, 1)

# change the input data to a numpy array

input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we're predicting for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

prediction = model.predict(input_data_reshaped)

if prediction == 1:
    print('In Danger of Cerebral Stroke')
else:
    print('You are okay')

You are okay
