In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, OneHotEncoder

# Read the input dataset from disk
file_path = './1_5.csv'  # adjust the file path as needed
data = pd.read_csv(file_path)

# Show which columns were loaded, before any of them are removed
print("Columns before dropping:", data.columns.tolist())

# Collect every column whose name contains 'Unnamed', or whose values are
# all null, or all empty strings, then remove them in a single drop call
columns_to_drop = []
for col in data.columns:
    values = data[col]
    if 'Unnamed' in col or values.isnull().all() or values.eq('').all():
        columns_to_drop.append(col)
data = data.drop(columns=columns_to_drop)

# Sort each column into one of three groups based on its number of distinct
# non-null values:
#   exactly 2        -> binary
#   more than 10     -> continuous
#   anything else    -> categorical (i.e. 10 or fewer, but not 2)
continuous_vars, categorical_vars, binary_vars = [], [], []

# DataFrame.nunique() yields one distinct-value count per column, in column order
for column, unique_values in data.nunique().items():
    if unique_values > 10:
        continuous_vars.append(column)
    elif unique_values == 2:
        binary_vars.append(column)
    else:
        categorical_vars.append(column)

# Report the resulting grouping
print("Continuous Variables:", continuous_vars)
print("Categorical Variables:", categorical_vars)
print("Binary Variables:", binary_vars)

# Apply RobustScaler to continuous variables.
# The scaler object is always created so the name remains available afterwards,
# but it is only fitted when at least one continuous column was identified:
# scikit-learn raises ValueError when asked to fit on zero feature columns.
# NOTE(review): columns land in continuous_vars purely by distinct-value count,
# so a non-numeric column with more than 10 distinct values would still reach
# the scaler and fail there — confirm the input is fully numeric.
scaler = RobustScaler()
if continuous_vars:
    data[continuous_vars] = scaler.fit_transform(data[continuous_vars])

# Apply OneHotEncoder to categorical variables.
# The encoder object is always created so the name remains available afterwards.
encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid multicollinearity

# Only encode when there is something to encode: fitting the encoder on a
# selection with zero columns raises ValueError in scikit-learn.
if categorical_vars:
    encoded_categorical_data = encoder.fit_transform(data[categorical_vars])

    # Reuse data's index for the encoded frame. pd.concat(axis=1) aligns rows by
    # index label, so a fresh 0..n-1 index would misalign rows (and pad with NaN)
    # whenever data does not carry a default RangeIndex.
    encoded_categorical_df = pd.DataFrame(
        encoded_categorical_data,
        columns=encoder.get_feature_names_out(categorical_vars),
        index=data.index,
    )

    # Drop original categorical columns and concatenate encoded columns
    data = data.drop(columns=categorical_vars)
    data = pd.concat([data, encoded_categorical_df], axis=1)

# Destination for the transformed dataset
output_path = './2-robust_onehot_1_5.csv'  # adjust the output path as needed

# Report the final (rows, columns) shape, then write the frame to CSV
# without emitting the index as an extra column
print("Shape after transformation:", data.shape)
data.to_csv(path_or_buf=output_path, index=False)


Columns before dropping: ['Unnamed: 0.1', 'Unnamed: 0', 'AGE', 'SEX', 'INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST', 'GB', 'SIM_GIPERT', 'DLIT_AG', 'ZSN_A', 'nr_11', 'nr_02', 'nr_03', 'nr_04', 'endocr_01', 'endocr_02', 'zab_leg_01', 'zab_leg_02', 'zab_leg_03', 'zab_leg_06', 'S_AD_ORIT', 'D_AD_ORIT', 'O_L_POST', 'K_SH_POST', 'MP_TP_POST', 'ant_im', 'lat_im', 'inf_im', 'post_im', 'IM_PG_P', 'ritm_ecg_p_01', 'ritm_ecg_p_02', 'ritm_ecg_p_04', 'ritm_ecg_p_07', 'ritm_ecg_p_08', 'n_r_ecg_p_01', 'n_r_ecg_p_03', 'n_r_ecg_p_04', 'n_r_ecg_p_05', 'n_r_ecg_p_06', 'n_p_ecg_p_03', 'n_p_ecg_p_06', 'n_p_ecg_p_07', 'n_p_ecg_p_10', 'n_p_ecg_p_11', 'n_p_ecg_p_12', 'fibr_ter_03', 'GIPO_K', 'K_BLOOD', 'GIPER_NA', 'NA_BLOOD', 'ALT_BLOOD', 'AST_BLOOD', 'L_BLOOD', 'ROE', 'TIME_B_S', 'R_AB_1_n', 'R_AB_2_n', 'R_AB_3_n', 'NITR_S', 'NA_R_1_n', 'NA_R_2_n', 'NA_R_3_n', 'NOT_NA_1_n', 'NOT_NA_2_n', 'NOT_NA_3_n', 'LID_S_n', 'B_BLOK_S_n', 'ANT_CA_S_n', 'GEPAR_S_n', 'ASP_S_n', 'TIKL_S_n', 'TRENT_S_n']
Continuous Varia