In [48]:
import pandas as pd
from sklearn.compose import make_column_transformer
import requests
import zipfile
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

## Import the data

In [49]:
url = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/dzz48mvjht-1.zip"

request = requests.get(url)
with open("../data/raw/Cardiovascular_Disease_Dataset_original.zip", 'wb') as f:
    f.write(request.content)

with zipfile.ZipFile("../data/raw/Cardiovascular_Disease_Dataset_original.zip", 'r') as zip_ref:
    zip_ref.extractall("../data/raw")


In [50]:
df = pd.read_csv('../data/raw/Cardiovascular_Disease_Dataset/Cardiovascular_Disease_Dataset.csv')
df.head()

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,103368,53,1,2,171,0,0,1,147,0,5.3,3,3,1
1,119250,40,1,0,94,229,0,1,115,0,3.7,1,1,0
2,119372,49,1,2,133,142,0,0,202,1,5.0,1,0,0
3,132514,43,1,0,138,295,1,1,153,0,3.2,2,2,1
4,146211,31,1,1,199,0,0,2,136,0,5.3,3,2,1


## Split the data

In [51]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size = 0.3,random_state=123)

X_train = train_df.drop(columns = ['target'])
y_train = train_df['target']
X_test = test_df.drop(columns = ['target'])
y_test = test_df['target']

X_train.to_csv('../data/processed/train_data.csv')
X_test.to_csv('../data/processed/test_data.csv')

## Define the column preprocessor

In [52]:
binary = ['gender','fastingbloodsugar','exerciseangia']
ohe = ['chestpain','restingrelectro']
numerical = ['age','restingBP','serumcholestrol','maxheartrate','oldpeak','noofmajorvessels']
ordinal = ['slope']
drop = ['patientid']

preprocessor = make_column_transformer(
 (StandardScaler(), numerical),
 (OneHotEncoder(), ohe),
 (OrdinalEncoder(), ordinal),
 ('passthrough', binary),
 ('drop', drop)
)

## Viewing the preprocessed columns

In [53]:
X_train_preprocessed = preprocessor.fit_transform(X_train)
column_names = (
 numerical
 + ordinal
 + binary
 + preprocessor.named_transformers_['onehotencoder'].get_feature_names_out(ohe).tolist())
X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns = column_names)
X_train_preprocessed

Unnamed: 0,age,restingBP,serumcholestrol,maxheartrate,oldpeak,noofmajorvessels,slope,gender,fastingbloodsugar,exerciseangia,chestpain_0,chestpain_1,chestpain_2,chestpain_3,restingrelectro_0,restingrelectro_1,restingrelectro_2
0,1.524759,-0.326142,-0.415573,-0.295436,-1.128842,-1.236907,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,-1.336118,-1.567080,0.162547,0.589566,-0.138557,0.783711,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2,-0.045918,-0.829225,-1.259932,-1.722858,-0.196809,-1.236907,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,-1.616596,1.149569,0.360325,1.446020,0.036199,0.783711,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,1.0,0.0
4,1.356472,1.585574,1.546993,-0.124145,-1.245346,-1.236907,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1.412568,-0.527375,-0.248222,-0.666566,-0.837581,-1.236907,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
696,1.300377,1.417880,-0.035231,0.332630,1.783761,-0.226598,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0
697,0.739420,-0.728608,-0.484034,-1.066244,-0.895834,0.783711,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
698,0.571134,1.384341,1.151437,-0.181242,-1.361850,0.783711,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0


In [54]:
X_train_preprocessed.to_csv('../data/processed/train_preprocessed.csv')

X_test_preprocessed = preprocessor.transform(X_test)
column_names = (
 numerical
 + ordinal
 + binary
 + preprocessor.named_transformers_['onehotencoder'].get_feature_names_out(ohe).tolist())
X_test_preprocessed = pd.DataFrame(X_test_preprocessed, columns = column_names)
X_test_preprocessed.to_csv('../data/processed/test_preprocessed.csv')

## Testing preprocessed output with a classifier

In [55]:
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

In [56]:
dt = DecisionTreeClassifier()
pipe = make_pipeline(preprocessor, DecisionTreeClassifier(max_depth=5))

In [57]:
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('columntransformer', ...), ('decisiontreeclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('standardscaler', ...), ('onehotencoder', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [58]:
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

0.9666666666666667