In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler, Normalizer, MaxAbsScaler, StandardScaler, OneHotEncoder

# Load the CSV file
file_path = './rawdata/1_1.csv'  # Adjust the file path as needed
data = pd.read_csv(file_path)

# Removing the potential index column 'Unnamed: 0' for clarity
if 'Unnamed: 0' in data.columns:
    data = data.drop(columns=['Unnamed: 0'])

# Identifying types of variables
continuous_vars = []
categorical_vars = []
binary_vars = []

for column in data.columns:
    unique_values = data[column].nunique()
    if unique_values == 2:
        binary_vars.append(column)
    elif unique_values <= 10:
        categorical_vars.append(column)
    else:
        continuous_vars.append(column)

# Print the identified variables
print("Continuous Variables:", continuous_vars)
print("Categorical Variables:", categorical_vars)
print("Binary Variables:", binary_vars)

# One-hot encode categorical variables
data = pd.get_dummies(data, columns=categorical_vars, drop_first=True)

# Function to apply different scalers and save the transformed data
def apply_and_save_scaler(scaler, scaler_name, data, continuous_vars):
    data_copy = data.copy()
    data_copy[continuous_vars] = scaler.fit_transform(data_copy[continuous_vars])
    output_path = f'./preprocessed_data/3-{scaler_name}_onehot_encoding_1_1.csv'
    data_copy.to_csv(output_path, index=False)
    print(f"Data transformed using {scaler_name} and saved to {output_path}")

# Apply MinMaxScaler
apply_and_save_scaler(MinMaxScaler(), "minmax", data, continuous_vars)

# Apply RobustScaler
apply_and_save_scaler(RobustScaler(), "robust", data, continuous_vars)

# Apply Log Transformation
log_data = data.copy()
log_data[continuous_vars] = log_data[continuous_vars].applymap(lambda x: np.log(x + 1) if x >= 0 else np.nan)
log_data.dropna(inplace=True)  # Drop rows with NaN values resulting from negative inputs
log_output_path = './preprocessed_data/3-log_onehot_encoding_1_1.csv'
log_data.to_csv(log_output_path, index=False)
print(f"Data transformed using Log Transformation and saved to {log_output_path}")

# Apply Normalizer
apply_and_save_scaler(Normalizer(), "normalize", data, continuous_vars)

# Apply MaxAbsScaler
apply_and_save_scaler(MaxAbsScaler(), "maxabs", data, continuous_vars)

# Apply StandardScaler (Z-score normalization)
apply_and_save_scaler(StandardScaler(), "zscore", data, continuous_vars)

# Final check for the transformed data shapes and contents
print("Transformation complete for all specified scalers.")


Continuous Variables: ['AGE', 'S_AD_ORIT', 'K_BLOOD', 'NA_BLOOD', 'ALT_BLOOD', 'AST_BLOOD', 'L_BLOOD', 'ROE']
Categorical Variables: ['INF_ANAM', 'STENOK_AN', 'IBS_POST', 'GB', 'ZSN_A', 'ant_im', 'lat_im', 'inf_im', 'post_im', 'TIME_B_S', 'R_AB_1_n', 'R_AB_2_n', 'R_AB_3_n', 'NA_R_1_n', 'NA_R_2_n', 'NA_R_3_n', 'NOT_NA_1_n', 'NOT_NA_2_n', 'NOT_NA_3_n']
Binary Variables: ['SEX', 'SIM_GIPERT', 'nr_11', 'nr_02', 'nr_03', 'nr_04', 'endocr_01', 'endocr_02', 'zab_leg_01', 'zab_leg_02', 'zab_leg_03', 'zab_leg_06', 'O_L_POST', 'K_SH_POST', 'MP_TP_POST', 'IM_PG_P', 'ritm_ecg_p_01', 'ritm_ecg_p_04', 'ritm_ecg_p_08', 'n_r_ecg_p_01', 'n_r_ecg_p_03', 'n_r_ecg_p_04', 'n_r_ecg_p_05', 'n_r_ecg_p_06', 'n_p_ecg_p_03', 'n_p_ecg_p_06', 'n_p_ecg_p_07', 'n_p_ecg_p_10', 'n_p_ecg_p_11', 'n_p_ecg_p_12', 'fibr_ter_03', 'GIPO_K', 'GIPER_NA', 'NITR_S', 'LID_S_n', 'B_BLOK_S_n', 'ANT_CA_S_n', 'GEPAR_S_n', 'ASP_S_n', 'TIKL_S_n', 'TRENT_S_n']
Data transformed using minmax and saved to ./preprocessed_data/3-minmax_oneho

  log_data[continuous_vars] = log_data[continuous_vars].applymap(lambda x: np.log(x + 1) if x >= 0 else np.nan)
