# HCV data Machine Learning Analysing

In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import plotly.express as px
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

from functions import *

## Cleaning Data

Import dataset:

In [5]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv")

In [6]:
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


Check how many NA valuse do we have in the data:

In [7]:
data.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

because the number of NAs were not many, so we droped all the rows containing NA value:

In [8]:
data = data.dropna()
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [9]:
data.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

Check if the data type of all columns are correct:

In [10]:
data.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

Transform the "AgeGroup" column into differernt bins and make it numerical:

In [11]:
data = clean_age_column(data)

data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['AgeGroup'] = pd.cut(data['age'], bins=[0, 18, 35, 50, 65, 100], labels=[0, 1, 2, 3, 4])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['AgeGroup'] = data['AgeGroup'].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=["age"], inplace=True)


Unnamed: 0,id,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup
0,9046,Male,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,4
2,31112,Male,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,4
3,60182,Female,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,2
4,1665,Female,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,4
5,56669,Male,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1,4


In [12]:
data['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

Transform the "gender" column from categorical to numerical:

In [13]:
# Drop rows where 'gender' is 'Other'
data = data[data['gender'] != 'Other']

In [14]:
data['gender'] = data['gender'].map({'Male': 0,'Female': 1})
data['gender'].astype(int)
data.head()

Unnamed: 0,id,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup
0,9046,0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,4
2,31112,0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,4
3,60182,1,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,2
4,1665,1,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,4
5,56669,0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1,4


The "Category" column is our Target column. so we want to make sure how to categorize it in the best form.

In [15]:
data["stroke"].unique()

array([1, 0])

In [16]:
data["Residence_type"].value_counts()

Urban    2490
Rural    2418
Name: Residence_type, dtype: int64

In [17]:
data['Residence_type'] = data['Residence_type'].map({'Urban': 0,'Rural': 1})
data.head()

Unnamed: 0,id,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup
0,9046,0,0,1,Yes,Private,0,228.69,36.6,formerly smoked,1,4
2,31112,0,0,1,Yes,Private,1,105.92,32.5,never smoked,1,4
3,60182,1,0,0,Yes,Private,0,171.23,34.4,smokes,1,2
4,1665,1,1,0,Yes,Self-employed,1,174.12,24.0,never smoked,1,4
5,56669,0,0,0,Yes,Private,0,186.21,29.0,formerly smoked,1,4


In [18]:
data["work_type"].value_counts()

Private          2810
Self-employed     775
children          671
Govt_job          630
Never_worked       22
Name: work_type, dtype: int64

In [19]:
data['work_type'] = data['work_type'].map({'Never_worked': 0,'Private': 1,'Self-employed': 2,'children': 3,'Govt_job': 4})
data['work_type'].astype(int)
data.head()

Unnamed: 0,id,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup
0,9046,0,0,1,Yes,1,0,228.69,36.6,formerly smoked,1,4
2,31112,0,0,1,Yes,1,1,105.92,32.5,never smoked,1,4
3,60182,1,0,0,Yes,1,0,171.23,34.4,smokes,1,2
4,1665,1,1,0,Yes,2,1,174.12,24.0,never smoked,1,4
5,56669,0,0,0,Yes,1,0,186.21,29.0,formerly smoked,1,4


In [20]:
data.drop(columns="id", inplace=True)
data.head()

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup
0,0,0,1,Yes,1,0,228.69,36.6,formerly smoked,1,4
2,0,0,1,Yes,1,1,105.92,32.5,never smoked,1,4
3,1,0,0,Yes,1,0,171.23,34.4,smokes,1,2
4,1,1,0,Yes,2,1,174.12,24.0,never smoked,1,4
5,0,0,0,Yes,1,0,186.21,29.0,formerly smoked,1,4


In [21]:
data["smoking_status"].value_counts()

never smoked       1852
Unknown            1483
formerly smoked     836
smokes              737
Name: smoking_status, dtype: int64

In [22]:
data['smoking_status'] = data['smoking_status'].map({'never smoked': 0,'Unknown': 1,'formerly smoked': 2,'smokes': 3})
data['smoking_status'].astype(int)
data.head()

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup
0,0,0,1,Yes,1,0,228.69,36.6,2,1,4
2,0,0,1,Yes,1,1,105.92,32.5,0,1,4
3,1,0,0,Yes,1,0,171.23,34.4,3,1,2
4,1,1,0,Yes,2,1,174.12,24.0,0,1,4
5,0,0,0,Yes,1,0,186.21,29.0,2,1,4


In [23]:
data["ever_married"].value_counts()

Yes    3204
No     1704
Name: ever_married, dtype: int64

In [24]:
data['ever_married'] = data['ever_married'].map({'Yes': 0,'No': 1})
data['ever_married'].astype(int)
data.head()

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,AgeGroup
0,0,0,1,0,1,0,228.69,36.6,2,1,4
2,0,0,1,0,1,1,105.92,32.5,0,1,4
3,1,0,0,0,1,0,171.23,34.4,3,1,2
4,1,1,0,0,2,1,174.12,24.0,0,1,4
5,0,0,0,0,1,0,186.21,29.0,2,1,4


## Feature Selection

In [25]:
# Calculate the correlation matrix
correlation_matrix = np.abs(data.corr())

# Create the heatmap using Plotly Express
fig = px.imshow(correlation_matrix,
                x=correlation_matrix.columns,
                y=correlation_matrix.columns,
                color_continuous_scale='RdBu_r',  # Red-Blue diverging color scale
                zmin=-1,
                zmax=1,
                aspect="auto",
                title='Correlation Heatmap of Numerical Variables')

# Update the layout for better readability
fig.update_layout(
    xaxis_title="",
    yaxis_title="",
    xaxis={'side': 'top'},  # Move x-axis labels to the top
    width=800,
    height=700
)

# Add correlation values as text annotations
for i, row in enumerate(correlation_matrix.values):
    for j, value in enumerate(row):
        fig.add_annotation(
            x=correlation_matrix.columns[j],
            y=correlation_matrix.columns[i],
            text=f"{value:.2f}",
            showarrow=False,
            font=dict(size=8)
        )

# Show the plot
fig.show()

## Split Data into Train and Test

In [26]:
features = data.drop(columns = ["stroke"])
target = data["stroke"]

In [27]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)

### Standardize The Data

In [28]:
# Standardize the features
X_train_scaled, X_test_scaled = Standardizer(X_train, X_test)

### Normalize The Data

In [29]:
# Normalize the features
X_train_norm, X_test_norm = Normalizer(X_train, X_test)

Based on the result of Heatmap we can drop these two not important columns : 

In [30]:
X_train_norm = pd.DataFrame(X_train_norm, columns = X_train.columns)
# X_train_reduced = X_train_norm.drop(columns = ["gender","Residence_type"])

X_test_norm = pd.DataFrame(X_test_norm, columns = X_test.columns)
# X_test_reduced = X_test_norm.drop(columns = ["gender","Residence_type"])

In [31]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)

X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)

# Decision Tree Model

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


### Original Train and Test Data

In [39]:
# Train a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

accuracy, report

(0.9114052953156823,
 {'0': {'precision': 0.9561027837259101,
   'recall': 0.9510117145899893,
   'f1-score': 0.9535504538174052,
   'support': 939.0},
  '1': {'precision': 0.041666666666666664,
   'recall': 0.046511627906976744,
   'f1-score': 0.04395604395604396,
   'support': 43.0},
  'accuracy': 0.9114052953156823,
  'macro avg': {'precision': 0.49888472519628835,
   'recall': 0.49876167124848303,
   'f1-score': 0.49875324888672457,
   'support': 982.0},
  'weighted avg': {'precision': 0.9160612836917477,
   'recall': 0.9114052953156823,
   'f1-score': 0.9137209633652275,
   'support': 982.0}})

In [41]:
pd.DataFrame(report).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.956103,0.951012,0.95355,939.0
1,0.041667,0.046512,0.043956,43.0
accuracy,0.911405,0.911405,0.911405,0.911405
macro avg,0.498885,0.498762,0.498753,982.0
weighted avg,0.916061,0.911405,0.913721,982.0


### Normalized Train and Test Data

In [42]:
# Train a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_norm, y_train)

# Make predictions
y_pred = clf.predict(X_test_norm)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

accuracy, report

(0.9124236252545825,
 {'0': {'precision': 0.9561497326203209,
   'recall': 0.952076677316294,
   'f1-score': 0.9541088580576307,
   'support': 939.0},
  '1': {'precision': 0.0425531914893617,
   'recall': 0.046511627906976744,
   'f1-score': 0.044444444444444446,
   'support': 43.0},
  'accuracy': 0.9124236252545825,
  'macro avg': {'precision': 0.49935146205484127,
   'recall': 0.49929415261163534,
   'f1-score': 0.49927665125103754,
   'support': 982.0},
  'weighted avg': {'precision': 0.9161449960942198,
   'recall': 0.9124236252545825,
   'f1-score': 0.9142763022680513,
   'support': 982.0}})