In [12]:
import requests
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline  
import matplotlib.pyplot as plt
import numpy as np
from ucimlrepo import fetch_ucirepo

In [13]:
# Pull the HCV dataset (UCI ML Repository id 571) through ucimlrepo
hcv_data = fetch_ucirepo(id=571)

# Unpack the feature table and the target table (both pandas DataFrames)
X, y = hcv_data.data.features, hcv_data.data.targets

# Print the dataset-level metadata
print(hcv_data.metadata)

{'uci_id': 571, 'name': 'HCV data', 'repository_url': 'https://archive.ics.uci.edu/dataset/571/hcv+data', 'data_url': 'https://archive.ics.uci.edu/static/public/571/data.csv', 'abstract': 'The data set contains laboratory values of blood donors and Hepatitis C patients and demographic values like age.', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 615, 'num_features': 12, 'feature_types': ['Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['Category'], 'index_col': ['ID'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2020, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5D612', 'creators': ['Ralf Lichtinghagen', 'Frank Klawonn', 'Georg Hoffmann'], 'intro_paper': {'title': 'Using machine learning techniques to generate laboratory diagnostic pathways—a case study', 'authors': 'Georg F. Hoffmann, A. Bietenbeck, R. Lichtinghagen, F. Kla

In [14]:
# Variable information: print the per-variable table that ships with the
# dataset (columns: name, role, type, demographic, description, units,
# missing_values).
print(hcv_data.variables)

        name     role         type demographic  \
0         ID       ID      Integer        None   
1        Age  Feature      Integer         Age   
2        Sex  Feature       Binary         Sex   
3        ALB  Feature   Continuous        None   
4        ALP  Feature   Continuous        None   
5        AST  Feature   Continuous        None   
6        BIL  Feature   Continuous        None   
7        CHE  Feature   Continuous        None   
8       CHOL  Feature   Continuous        None   
9       CREA  Feature   Continuous        None   
10       CGT  Feature   Continuous        None   
11      PROT  Feature   Continuous        None   
12  Category   Target  Categorical        None   
13       ALT  Feature   Continuous        None   

                                          description  units missing_values  
0                                          Patient ID   None             no  
1                                                None  years             no  
2              

In [15]:
# Split the feature columns by kind.
# 'Sex' is the only non-numeric feature in this dataset; every remaining column
# of X is treated as numeric. 'ID' is excluded too in case it is present.
# Note: pandas' Index.difference returns the remaining names in sorted order by
# default, so numerical_cols is alphabetical rather than in X's column order.
categorical_cols = ['Sex']
numerical_cols = X.columns.difference(['ID', *categorical_cols])

In [16]:
 # Define the preprocessing pipelines for both numerical and categorical data
numerical_pipeline = Pipeline(steps=[
    # Fill missing values with the column mean, then standardize
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

categorical_pipeline = Pipeline(steps=[
    # Fill missing values with the most frequent category, then one-hot encode;
    # drop='if_binary' keeps a single column for a two-category feature
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='if_binary')),
])

In [17]:
# Combine preprocessing pipelines: numeric columns are imputed and scaled,
# categorical columns are imputed and one-hot encoded. Output columns follow
# the order of the transformer list (numeric first, then categorical).
preprocessor = ColumnTransformer(
    [
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols),
    ],
    # OneHotEncoder emits a sparse matrix by default, and ColumnTransformer
    # returns a sparse result whenever the overall density falls below its
    # threshold (0.3 by default). A threshold of 0 forces a dense array, so the
    # result can always be wrapped in a DataFrame no matter how many one-hot
    # columns are produced.
    sparse_threshold=0,
)

In [18]:
# Preprocess the features.
# fit_transform learns the preprocessing parameters (imputation values, scaling
# statistics, encoder categories) from X and applies them to X in one step.
# NOTE(review): the fit uses every row of X; if a train/test split is added
# later, fit on the training rows only so test statistics do not leak in.
X_preprocessed = preprocessor.fit_transform(X)

In [19]:
# Wrap the transformed array in a DataFrame so it is easy to inspect and save.
# Column labels mirror the transformer order: the numeric column names first,
# followed by the names the fitted one-hot encoder generated for the
# categorical columns. The original row index of X is kept.
X_preprocessed_df = pd.DataFrame(
    X_preprocessed,
    index=X.index,
    columns=[
        *numerical_cols,
        *preprocessor.named_transformers_['cat']
            .named_steps['encoder']
            .get_feature_names_out(categorical_cols),
    ],
)

In [20]:
# Save the preprocessed features to a new CSV file (path is relative to the
# current working directory). index=False omits the row index from the file.
# NOTE(review): only the feature matrix is written; the targets in `y` are not
# saved by any cell visible here.
X_preprocessed_df.to_csv('preprocessed_hcv_data.csv', index=False)