In [1]:
import pandas as pd
from sklearn.preprocessing import Normalizer, OneHotEncoder

# Load the CSV file
file_path = './1_5.csv'  # adjust the input path as needed
data = pd.read_csv(file_path)

# Drop columns that carry no information: anything whose name contains
# 'Unnamed' (typically a stray index column written by a previous to_csv),
# plus columns that are entirely null or entirely empty strings.
columns_to_drop = [
    col for col in data.columns
    if 'Unnamed' in col or data[col].isnull().all() or data[col].eq('').all()
]
data = data.drop(columns=columns_to_drop)

# Bucket every remaining column by its number of distinct non-null values:
#   exactly 2 -> binary (left untouched below)
#   <= 10     -> categorical (one-hot encoded below)
#   otherwise -> continuous (normalized below)
# NOTE(review): the split looks only at cardinality, not dtype, so a text
# column with more than 10 distinct values would be sent to Normalizer and
# make it fail -- confirm the input has no such columns.
continuous_vars = []
categorical_vars = []
binary_vars = []

for column in data.columns:
    unique_values = data[column].nunique()
    if unique_values == 2:
        binary_vars.append(column)
    elif unique_values <= 10:
        categorical_vars.append(column)
    else:
        continuous_vars.append(column)

# Apply Normalizer to continuous variables.
# NOTE(review): Normalizer rescales each ROW (sample) to unit norm across the
# selected columns; it does not scale each column independently -- confirm
# that this is the intended transformation.
scaler = Normalizer()
if continuous_vars:  # Normalizer rejects an input with zero features
    data[continuous_vars] = scaler.fit_transform(data[continuous_vars])

# Apply OneHotEncoder to categorical variables. drop='first' removes one
# indicator per feature; sparse_output=False yields a dense array.
encoder = OneHotEncoder(sparse_output=False, drop='first')
if categorical_vars:  # the encoder cannot be fitted on zero columns
    encoded_categorical_data = encoder.fit_transform(data[categorical_vars])
    encoded_categorical_df = pd.DataFrame(
        encoded_categorical_data,
        columns=encoder.get_feature_names_out(categorical_vars),
        # Reuse the source index so the axis=1 concat below lines rows up by
        # label even if `data` ever stops having a default RangeIndex.
        index=data.index,
    )

    # Drop original categorical columns and concatenate encoded columns
    data = pd.concat(
        [data.drop(columns=categorical_vars), encoded_categorical_df],
        axis=1,
    )
else:
    # Nothing to encode; keep the name defined for any later use.
    encoded_categorical_df = pd.DataFrame(index=data.index)

# Check the shape of the transformed dataframe
print("Transformed Data Shape (Normalizer + OneHotEncoder):", data.shape)

# Save the transformed dataframe to a new CSV file
output_path = './2-normalized_onehot_1_5.csv'  # adjust the output path as needed
data.to_csv(output_path, index=False)


Transformed Data Shape (Normalizer + OneHotEncoder): (3400, 132)


In [2]:
import pandas as pd
from sklearn.preprocessing import Normalizer, LabelEncoder

# Load the CSV file
file_path = './1_5.csv'  # adjust the input path as needed
data = pd.read_csv(file_path)

# Drop columns that carry no information: anything whose name contains
# 'Unnamed' (typically a stray index column written by a previous to_csv),
# plus columns that are entirely null or entirely empty strings.
columns_to_drop = [
    col for col in data.columns
    if 'Unnamed' in col or data[col].isnull().all() or data[col].eq('').all()
]
data = data.drop(columns=columns_to_drop)

# Bucket every remaining column by its number of distinct non-null values:
#   exactly 2 -> binary (left untouched below)
#   <= 10     -> categorical (label encoded below)
#   otherwise -> continuous (normalized below)
# NOTE(review): the split looks only at cardinality, not dtype, so a text
# column with more than 10 distinct values would be sent to Normalizer and
# make it fail -- confirm the input has no such columns.
continuous_vars = []
categorical_vars = []
binary_vars = []

for column in data.columns:
    unique_values = data[column].nunique()
    if unique_values == 2:
        binary_vars.append(column)
    elif unique_values <= 10:
        categorical_vars.append(column)
    else:
        continuous_vars.append(column)

# Apply Normalizer to continuous variables.
# NOTE(review): Normalizer rescales each ROW (sample) to unit norm across the
# selected columns; it does not scale each column independently -- confirm
# that this is the intended transformation.
scaler = Normalizer()
if continuous_vars:  # Normalizer rejects an input with zero features
    data[continuous_vars] = scaler.fit_transform(data[continuous_vars])

# Apply LabelEncoder to categorical variables, one encoder per column, and
# keep each fitted encoder so the integer codes can be mapped back later.
# NOTE(review): scikit-learn documents LabelEncoder for target labels;
# OrdinalEncoder is the feature-oriented equivalent. Kept as-is because
# `label_encoders` may be consumed elsewhere in the notebook.
label_encoders = {}
for column in categorical_vars:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Check the shape of the transformed dataframe
print("Transformed Data Shape (Normalizer + LabelEncoder):", data.shape)

# Save the transformed dataframe to a new CSV file
output_path = './2-normalized_label_1_5.csv'  # adjust the output path as needed
data.to_csv(output_path, index=False)


Transformed Data Shape (Normalizer + LabelEncoder): (3400, 73)
