# minMax + one-hot

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Load the CSV file
file_path = './rawdata/1_1.csv'  # 파일 경로를 적절히 수정하세요
data = pd.read_csv(file_path)

# Removing the potential index column 'Unnamed: 0' for clarity
if 'Unnamed: 0' in data.columns:
    data = data.drop(columns=['Unnamed: 0'])

# Identifying types of variables
continuous_vars = []
categorical_vars = []
binary_vars = []

for column in data.columns:
    unique_values = data[column].nunique()
    if unique_values == 2:
        binary_vars.append(column)
    elif unique_values <= 10:
        categorical_vars.append(column)
    else:
        continuous_vars.append(column)

# Print the identified variables
print("Continuous Variables:", continuous_vars)
print("Categorical Variables:", categorical_vars)
print("Binary Variables:", binary_vars)

# Apply MinMaxScaler to continuous variables
scaler = MinMaxScaler()
data[continuous_vars] = scaler.fit_transform(data[continuous_vars])

# Apply OneHotEncoder to categorical variables
encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid multicollinearity
encoded_categorical_data = encoder.fit_transform(data[categorical_vars])
encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoder.get_feature_names_out(categorical_vars))

# Drop original categorical columns and concatenate encoded columns
data = data.drop(columns=categorical_vars)
data = pd.concat([data, encoded_categorical_df], axis=1)

# check after transformed data shape
print("Transformed Data Shape:", data.shape)

# If you want to save the transformed dataframe to a new CSV file
output_path = './2-minmax_onehot_1_5.csv'  # 파일 저장 경로를 적절히 수정하세요
data.to_csv(output_path, index=False)


Continuous Variables: ['Unnamed: 0.1', 'AGE', 'S_AD_ORIT', 'D_AD_ORIT', 'K_BLOOD', 'NA_BLOOD', 'ALT_BLOOD', 'AST_BLOOD', 'L_BLOOD', 'ROE']
Categorical Variables: ['INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST', 'GB', 'DLIT_AG', 'ZSN_A', 'ant_im', 'lat_im', 'inf_im', 'post_im', 'TIME_B_S', 'R_AB_1_n', 'R_AB_2_n', 'R_AB_3_n', 'NA_R_1_n', 'NA_R_2_n', 'NA_R_3_n', 'NOT_NA_1_n', 'NOT_NA_2_n', 'NOT_NA_3_n']
Binary Variables: ['SEX', 'SIM_GIPERT', 'nr_11', 'nr_02', 'nr_03', 'nr_04', 'endocr_01', 'endocr_02', 'zab_leg_01', 'zab_leg_02', 'zab_leg_03', 'zab_leg_06', 'O_L_POST', 'K_SH_POST', 'MP_TP_POST', 'IM_PG_P', 'ritm_ecg_p_01', 'ritm_ecg_p_02', 'ritm_ecg_p_04', 'ritm_ecg_p_07', 'ritm_ecg_p_08', 'n_r_ecg_p_01', 'n_r_ecg_p_03', 'n_r_ecg_p_04', 'n_r_ecg_p_05', 'n_r_ecg_p_06', 'n_p_ecg_p_03', 'n_p_ecg_p_06', 'n_p_ecg_p_07', 'n_p_ecg_p_10', 'n_p_ecg_p_11', 'n_p_ecg_p_12', 'fibr_ter_03', 'GIPO_K', 'GIPER_NA', 'NITR_S', 'LID_S_n', 'B_BLOK_S_n', 'ANT_CA_S_n', 'GEPAR_S_n', 'ASP_S_n', 'TIKL_S_n', 'TR

# minMax + label encoding

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Load the CSV file
file_path = './rawdata/1_1.csv'  # Adjust the file path as needed
data = pd.read_csv(file_path)

# Removing the potential index column 'Unnamed: 0' for clarity
if 'Unnamed: 0' in data.columns:
    data = data.drop(columns=['Unnamed: 0'])

# Identifying types of variables
continuous_vars = []
categorical_vars = []
binary_vars = []

for column in data.columns:
    unique_values = data[column].nunique()
    if unique_values == 2:
        binary_vars.append(column)
    elif unique_values <= 10:
        categorical_vars.append(column)
    else:
        continuous_vars.append(column)

# Print the identified variables
print("Continuous Variables:", continuous_vars)
print("Categorical Variables:", categorical_vars)
print("Binary Variables:", binary_vars)

# Apply MinMaxScaler to continuous variables
scaler = MinMaxScaler()
data[continuous_vars] = scaler.fit_transform(data[continuous_vars])

# Apply LabelEncoder to categorical variables
label_encoders = {}
for column in categorical_vars:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le


# Check the shape of the transformed dataframe
print("Transformed Data Shape:", data.shape)

# Verify label encoding by checking unique values in one of the categorical columns
for column in categorical_vars:
    print(f"Unique values in {column} after label encoding: {data[column].unique()}")

# If you want to save the transformed dataframe to a new CSV file
output_path = './preprocessed_data/2-minmax_label_1_1.csv'  # Adjust the file save path as needed
data.to_csv(output_path, index=False)


Continuous Variables: ['AGE', 'S_AD_ORIT', 'D_AD_ORIT', 'K_BLOOD', 'NA_BLOOD', 'ALT_BLOOD', 'AST_BLOOD', 'L_BLOOD', 'ROE']
Categorical Variables: ['INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST', 'GB', 'DLIT_AG', 'ZSN_A', 'ant_im', 'lat_im', 'inf_im', 'post_im', 'TIME_B_S', 'R_AB_1_n', 'R_AB_2_n', 'R_AB_3_n', 'NA_R_1_n', 'NA_R_2_n', 'NA_R_3_n', 'NOT_NA_1_n', 'NOT_NA_2_n', 'NOT_NA_3_n']
Binary Variables: ['SEX', 'SIM_GIPERT', 'nr_11', 'nr_02', 'nr_03', 'nr_04', 'endocr_01', 'endocr_02', 'zab_leg_01', 'zab_leg_02', 'zab_leg_03', 'zab_leg_06', 'O_L_POST', 'K_SH_POST', 'MP_TP_POST', 'IM_PG_P', 'ritm_ecg_p_01', 'ritm_ecg_p_02', 'ritm_ecg_p_04', 'ritm_ecg_p_07', 'ritm_ecg_p_08', 'n_r_ecg_p_01', 'n_r_ecg_p_03', 'n_r_ecg_p_04', 'n_r_ecg_p_05', 'n_r_ecg_p_06', 'n_p_ecg_p_03', 'n_p_ecg_p_06', 'n_p_ecg_p_07', 'n_p_ecg_p_10', 'n_p_ecg_p_11', 'n_p_ecg_p_12', 'fibr_ter_03', 'GIPO_K', 'GIPER_NA', 'NITR_S', 'LID_S_n', 'B_BLOK_S_n', 'ANT_CA_S_n', 'GEPAR_S_n', 'ASP_S_n', 'TIKL_S_n', 'TRENT_S_n']
Transf

# only minMax, not (categorical preprocesseing) included

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the CSV file
file_path = './rawdata/1_1.csv'  # Adjust the file path as needed
data = pd.read_csv(file_path)

# Removing the potential index column 'Unnamed: 0' for clarity
if 'Unnamed: 0' in data.columns:
    data = data.drop(columns=['Unnamed: 0'])

# Identifying types of variables
continuous_vars = []
categorical_vars = []
binary_vars = []

for column in data.columns:
    unique_values = data[column].nunique()
    if unique_values == 2:
        binary_vars.append(column)
    elif unique_values <= 10:
        categorical_vars.append(column)
    else:
        continuous_vars.append(column)

# Print the identified variables
print("Continuous Variables:", continuous_vars)
print("Categorical Variables:", categorical_vars)
print("Binary Variables:", binary_vars)

# Apply MinMaxScaler to continuous variables
scaler = MinMaxScaler()
data[continuous_vars] = scaler.fit_transform(data[continuous_vars])

# Categorical variables are left in their original form
# Concatenate the data to ensure we have the continuous and original categorical variables

# Drop the original categorical columns if needed to avoid redundancy
# In this case, it's not necessary since we're not encoding them

# Ensure binary variables remain unchanged
# They should already be in an appropriate form (0 and 1)

# Final check for the transformed data shape and contents
print("Transformed Data Shape:", data.shape)

# If you want to save the transformed dataframe to a new CSV file
output_path = './preprocessed_data/2-minmax_no_encoding_1_1.csv'  # Adjust the file save path as needed
data.to_csv(output_path, index=False)


Continuous Variables: ['AGE', 'S_AD_ORIT', 'D_AD_ORIT', 'K_BLOOD', 'NA_BLOOD', 'ALT_BLOOD', 'AST_BLOOD', 'L_BLOOD', 'ROE']
Categorical Variables: ['INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST', 'GB', 'DLIT_AG', 'ZSN_A', 'ant_im', 'lat_im', 'inf_im', 'post_im', 'TIME_B_S', 'R_AB_1_n', 'R_AB_2_n', 'R_AB_3_n', 'NA_R_1_n', 'NA_R_2_n', 'NA_R_3_n', 'NOT_NA_1_n', 'NOT_NA_2_n', 'NOT_NA_3_n']
Binary Variables: ['SEX', 'SIM_GIPERT', 'nr_11', 'nr_02', 'nr_03', 'nr_04', 'endocr_01', 'endocr_02', 'zab_leg_01', 'zab_leg_02', 'zab_leg_03', 'zab_leg_06', 'O_L_POST', 'K_SH_POST', 'MP_TP_POST', 'IM_PG_P', 'ritm_ecg_p_01', 'ritm_ecg_p_02', 'ritm_ecg_p_04', 'ritm_ecg_p_07', 'ritm_ecg_p_08', 'n_r_ecg_p_01', 'n_r_ecg_p_03', 'n_r_ecg_p_04', 'n_r_ecg_p_05', 'n_r_ecg_p_06', 'n_p_ecg_p_03', 'n_p_ecg_p_06', 'n_p_ecg_p_07', 'n_p_ecg_p_10', 'n_p_ecg_p_11', 'n_p_ecg_p_12', 'fibr_ter_03', 'GIPO_K', 'GIPER_NA', 'NITR_S', 'LID_S_n', 'B_BLOK_S_n', 'ANT_CA_S_n', 'GEPAR_S_n', 'ASP_S_n', 'TIKL_S_n', 'TRENT_S_n']
Transf