### 결측치 처리
- Education_Level
    - 학력을 기입하지 않은 범주로 분류
- Marital_Status
    - 혼인 상태를 기입하지 않은 범주로 분류
- Income_Category
    - 나이·카드 등급 조합별 평균 소득값으로 채우고, 해당 조합이 없으면 연령대(10세 단위) 평균값으로 채우기

In [1]:
import pandas as pd

# Column-name constants shared by the missing-value handling below.
# Source columns read from the CSV:
F_Customer_Age = "Customer_Age"
F_Card_Category = "Card_Category"
F_Income_Category = "Income_Category"
F_Education_Level = "Education_Level"
F_Marital_Status = "Marital_Status"
# Derived columns created by this cell:
F_Converted_Age = "Converted_Age"
F_Converted_Income_Category = "Converted_Income_Category"
F_Converted_Education_Level = "Converted_Education_Level"
F_Converted_Marital_Status = "Converted_Marital_Status"

# Load the raw table and discard its last two columns.
# NOTE(review): presumably those are auxiliary columns shipped with the
# dataset that must not be used as features - confirm against the CSV header.
data = pd.read_csv("../../data/BankChurners.csv").iloc[:, :-2]

def convert_value(income_val: str) -> int:
    if income_val == "Less than $40K":
        return 20_000
    elif income_val == "$40K - $60K":
        return 50_000
    elif income_val == "$60K - $80K":
        return 70_000
    elif "$80K - $120K":
        return 100_000
    else:
        return income_val

# Convert every income-bracket label and attach the result to a copy of the
# table as a new column named F_Converted_Income_Category.
converted_income_data = data[F_Income_Category].apply(convert_value)
converted_income_data.name = F_Converted_Income_Category
data_set_income = pd.concat([data, converted_income_data], axis="columns")

# Boolean mask of the rows whose income bracket was actually reported;
# only these rows are used to compute the reference means.
valid_income_indices = data_set_income[F_Income_Category] != "Unknown"

# Take an explicit copy of the reported rows. A derived column is added to
# this frame later, and writing into a chained-indexing slice of
# data_set_income would trigger pandas' SettingWithCopyWarning.
selected_other_data = data_set_income.loc[
    valid_income_indices,
    [F_Customer_Age, F_Card_Category, F_Converted_Income_Category],
].copy()

# Force the converted column to a numeric dtype: any label that
# convert_value handed back unchanged becomes NaN and is then ignored by
# the mean aggregations instead of making them fail.
selected_other_data[F_Converted_Income_Category] = pd.to_numeric(
    selected_other_data[F_Converted_Income_Category], errors="coerce"
)

# Mean converted income for every (exact age, card category) pair found among
# the rows with a reported income, truncated to a whole number and indexed by
# that pair so it can be looked up with a (age, card_category) tuple.
age_card_indexed_data = selected_other_data.groupby(
    [F_Customer_Age, F_Card_Category]
).mean()
age_card_indexed_data[F_Converted_Income_Category] = age_card_indexed_data[
    F_Converted_Income_Category
].apply(int)

# The same table in flat form: the two grouping keys become ordinary columns.
grouped_age_card_mean_data = age_card_indexed_data.reset_index()

# Decade bucket of each customer's age (integer division by 10, e.g. 47 -> 4),
# stored next to the other columns of the reported-income rows.
serise_means_of_age = selected_other_data[F_Customer_Age] // 10
selected_other_data[F_Converted_Age] = serise_means_of_age

# Mean converted income per decade bucket, truncated to a whole number.
# The result is indexed by the bucket and has the converted income as its
# only column.
grouped_age_mean_data = selected_other_data.groupby(F_Converted_Age)[
    [F_Converted_Income_Category]
].mean()
grouped_age_mean_data[F_Converted_Income_Category] = grouped_age_mean_data[
    F_Converted_Income_Category
].apply(int)

def get_income(age: int, card_category: str) -> int | None:
    """Look up the reference income for a customer's age and card category.

    Args:
        age: Customer age, used as-is for the first lookup and integer-divided
            by 10 for the fallback lookup.
        card_category: Card category label of the customer.

    Returns:
        The entry of ``age_card_indexed_data`` for ``(age, card_category)``
        when that pair exists; otherwise the entry of
        ``grouped_age_mean_data`` for ``age // 10``, which is ``None`` when
        that bucket is missing as well.
    """
    exact_match = age_card_indexed_data[F_Converted_Income_Category].get(
        (age, card_category), None
    )
    if exact_match is None:
        # No mean for this exact pair: fall back to the coarser per-decade mean.
        return grouped_age_mean_data[F_Converted_Income_Category].get(age // 10)
    return exact_match

def replace_income(idx: int) -> int:
    """Return the converted income for the row of ``data`` labelled ``idx``.

    Args:
        idx: Index label of the row in ``data``.

    Returns:
        The row's own entry of ``converted_income_data`` when its income
        category is anything other than ``"Unknown"``; otherwise the value
        ``get_income`` derives from the row's age and card category.
    """
    income_is_missing = data.loc[idx, F_Income_Category] == "Unknown"
    if income_is_missing:
        customer_age = data.loc[idx, F_Customer_Age]
        card_tier = data.loc[idx, F_Card_Category]
        return get_income(customer_age, card_tier)
    return converted_income_data[idx]

# Resolve the converted income row by row: replace_income receives each index
# label and the results are aligned back onto the table by that label.
data[F_Converted_Income_Category] = data.index.to_series().map(replace_income)

# For education level and marital status, copy the column and relabel
# "Unknown" as its own explicit "Unanswered" category.
for source_column, target_column in (
    (F_Education_Level, F_Converted_Education_Level),
    (F_Marital_Status, F_Converted_Marital_Status),
):
    answers = data[source_column]
    data[target_column] = answers.where(answers != "Unknown", "Unanswered")

# Persist the processed table; to_csv is called with its defaults.
data.to_csv("missing_value_processing_data_yc.csv")

### 인코딩 처리
- Attrition_Flag: 이진 값 (Existing Customer = 0, 그 외 = 1)
- Gender: One-hot
- Education_Level: One-hot
- Marital_Status: One-hot
- Income_Category: value
- Card_Category: 순서형 정수 값 (Blue = 1, Silver = 2, Gold = 3, 그 외 = 4)

In [3]:
import pandas as pd
from sklearn.calibration import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# The missing-value step saved this file with DataFrame.to_csv defaults, so
# its first column is the saved row index. Reading it back as the index keeps
# it from showing up as a spurious "Unnamed: 0" feature column.
data = pd.read_csv("missing_value_processing_data_yc.csv", index_col=0)

F_Attrition_Flag = "Attrition_Flag"
# Binary target: 0 for "Existing Customer", 1 for every other value.
data[F_Attrition_Flag] = data[F_Attrition_Flag].apply(lambda x: 0 if x == "Existing Customer" else 1)

def one_hot_encode(data: list[str]) -> list[list[int]]:
    """One-hot encode a sequence of category labels.

    The labels are first turned into integer codes with ``LabelEncoder`` and
    the codes are then expanded with ``OneHotEncoder``.

    Args:
        data: The category labels to encode.

    Returns:
        The dense result of the encoder's ``.toarray()``: one row per input
        value.
        NOTE(review): the annotation promises nested lists, but ``.toarray()``
        yields an array object - confirm what callers expect.
    """
    # OneHotEncoder is fed a single column, hence the reshape to (-1, 1).
    integer_codes = LabelEncoder().fit_transform(data).reshape(-1, 1)
    return OneHotEncoder().fit_transform(integer_codes).toarray()
    
F_Gender = "Gender"
F_Education_Level = "Education_Level"
F_Marital_Status = "Marital_Status"
F_Card_Category = "Card_Category"

# A one-hot encoding has one indicator column per category, so it cannot be
# stored in a single DataFrame column: assigning a 2-D array with more than
# one column to `data[column]` raises ValueError. Each categorical column is
# therefore replaced by a set of `<column>_<category>` 0/1 indicator columns.
data = pd.get_dummies(
    data,
    columns=[F_Gender, F_Education_Level, F_Marital_Status],
    dtype=int,
)

def convert_card_category(str):
    """Map a card category label to an integer rank.

    Args:
        str: Card category label. (The parameter name shadows the builtin
            ``str``; it is kept because it is part of the call signature.)

    Returns:
        1 for ``"Blue"``, 2 for ``"Silver"``, 3 for ``"Gold"`` and 4 for any
        other value.
    """
    ranked_labels = ("Blue", "Silver", "Gold")
    for rank, label in enumerate(ranked_labels, start=1):
        if str == label:
            return rank
    # Anything that is not one of the three labels above gets the top rank.
    return 4

# Replace each card category label with the integer rank that
# convert_card_category assigns to it.
card_labels = data[F_Card_Category]
data[F_Card_Category] = card_labels.map(convert_card_category)

### 정규화 처리
- ML 진행 전 처리