### 결측치 처리
- Education_Level
    - 학력을 기입하지 않은 범주로 분류
- Marital_Status
    - 혼인 상태를 기입하지 않은 범주로 분류
- Income_Category
    - 연령대 별 카드 등급에 따른 평균값으로 채우기

In [7]:
import pandas as pd

data = pd.read_csv("../../data/BankChurners.csv")
data = data.iloc[:, :-2]
F_Income_Category = "Income_Category"
F_Converted_Income_Category = "Converted_Income_Category"
F_Customer_Age = "Customer_Age"
F_Card_Category = "Card_Category"
F_Converted_Age = "Converted_Age"
F_Education_Level = "Education_Level"
F_Converted_Education_Level = "Converted_Education_Level"
F_Marital_Status = "Marital_Status"
F_Converted_Marital_Status = "Converted_Marital_Status"

def convert_value(income_val: str) -> int:
    if income_val == "Less than $40K":
        return 20_000
    elif income_val == "$40K - $60K":
        return 50_000
    elif income_val == "$60K - $80K":
        return 70_000
    elif "$80K - $120K":
        return 100_000
    else:
        return income_val

converted_income_data = data[F_Income_Category].apply(convert_value)
converted_income_data.name = F_Converted_Income_Category
data_set_income = pd.concat([data, converted_income_data], axis="columns")
valid_income_indices = data_set_income[F_Income_Category] != "Unknown"
selected_other_data = data_set_income[valid_income_indices][[F_Customer_Age, F_Card_Category, F_Converted_Income_Category]]

grouped_age_card_mean_data = selected_other_data.groupby([F_Customer_Age, F_Card_Category]).agg("mean")
grouped_age_card_mean_data[F_Converted_Income_Category] = grouped_age_card_mean_data[F_Converted_Income_Category].apply(lambda x: int(x))
grouped_age_card_mean_data = grouped_age_card_mean_data.reset_index()

age_card_indexed_data = grouped_age_card_mean_data.set_index(keys=[F_Customer_Age, F_Card_Category], drop=True)

serise_means_of_age = selected_other_data[F_Customer_Age].apply(lambda x: x // 10)
selected_other_data[F_Converted_Age] = serise_means_of_age
grouped_age_mean_data = selected_other_data[[F_Converted_Age, F_Converted_Income_Category]].groupby(F_Converted_Age).agg("mean")
grouped_age_mean_data[F_Converted_Income_Category] = grouped_age_mean_data[F_Converted_Income_Category].apply(lambda x: int(x))

def get_income(age: int, card_category: str) -> int | None :
    value = age_card_indexed_data[F_Converted_Income_Category].get((age, card_category), None)
    if value is not None:
        return value
    else:
        return grouped_age_mean_data[F_Converted_Income_Category].get(age // 10)

def replace_income(idx: int) -> int:
    if data.loc[idx, F_Income_Category] != "Unknown":
        return converted_income_data[idx]
    else:
        return get_income(data.loc[idx, F_Customer_Age], data.loc[idx, F_Card_Category])

data[F_Converted_Income_Category] = data[:].index.to_series().map(replace_income)
data[F_Converted_Education_Level] = data[F_Education_Level].apply(lambda x: x if x != "Unknown" else "Unanswered")
data[F_Converted_Marital_Status] = data[F_Marital_Status].apply(lambda x: x if x != "Unknown" else "Unanswered")

data.to_csv("missing_value_processing_data_yc.csv")

data

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Converted_Income_Category,Converted_Education_Level,Converted_Marital_Status
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,777,11914.0,1.335,1144,42,1.625,0.061,70000,High School,Married
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,864,7392.0,1.541,1291,33,3.714,0.105,20000,Graduate,Single
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,0,3418.0,2.594,1887,20,2.333,0.000,100000,Graduate,Married
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,2517,796.0,1.405,1171,20,2.333,0.760,20000,High School,Unanswered
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,0,4716.0,2.175,816,28,2.500,0.000,70000,Uneducated,Married
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,772366833,Existing Customer,50,M,2,Graduate,Single,$40K - $60K,Blue,40,...,1851,2152.0,0.703,15476,117,0.857,0.462,50000,Graduate,Single
10123,710638233,Attrited Customer,41,M,2,Unknown,Divorced,$40K - $60K,Blue,25,...,2186,2091.0,0.804,8764,69,0.683,0.511,50000,Unanswered,Divorced
10124,716506083,Attrited Customer,44,F,1,High School,Married,Less than $40K,Blue,36,...,0,5409.0,0.819,10291,60,0.818,0.000,20000,High School,Married
10125,717406983,Attrited Customer,30,M,2,Graduate,Unknown,$40K - $60K,Blue,36,...,0,5281.0,0.535,8395,62,0.722,0.000,50000,Graduate,Unanswered


### 인코딩 처리
- Attrition_Flag: One-hot
- Gender: One-hot
- Education_Level: One-hot
- Marital_Status: One-hot
- Income_Category: value
- Card_Category: value

In [26]:
import pandas as pd
from sklearn.calibration import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv("missing_value_processing_data_yc.csv")
data.drop(columns=["CLIENTNUM", "Avg_Open_To_Buy"], inplace=True)
F_Attrition_Flag = "Attrition_Flag"
data[F_Attrition_Flag] = data[F_Attrition_Flag].apply(lambda x: 0 if x == "Existing Customer" else 1)

def one_hot_encode(data: list[str]) -> list[list[int]]:
    encoder = LabelEncoder()
    encoder.fit(data)
    labels = encoder.transform(data)
    labels = labels.reshape(-1, 1)
    oh_encoder = OneHotEncoder()
    oh_encoder.fit(labels)
    return oh_encoder.transform(labels).toarray()
    
F_Gender = "Gender"
data[F_Gender] = one_hot_encode(data[F_Gender])


F_Education_Level = "Education_Level"
data[F_Education_Level] = data["Converted_Education_Level"]
data.drop(columns="Converted_Education_Level", inplace=True)
data[F_Education_Level] = one_hot_encode(data[F_Education_Level])

F_Marital_Status = "Marital_Status"
data[F_Marital_Status] = data["Converted_Marital_Status"]
data.drop(columns="Converted_Marital_Status", inplace=True)
data[F_Marital_Status] = one_hot_encode(data[F_Marital_Status])

F_Card_Category = "Card_Category"

def convert_card_category(str):
    if str == "Blue":
        return 1
    elif str == "Silver":
        return 2
    elif str == "Gold":
        return 3
    else:
        return 4

data[F_Card_Category] = data[F_Card_Category].apply(convert_card_category)

data["Income_Category"] = data["Converted_Income_Category"]
data.drop(columns="Converted_Income_Category", inplace=True)

data.drop(columns="Unnamed: 0", axis=1, inplace=True)

data

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0,45,0.0,3,0.0,0.0,70000,1,39,5,1,3,12691.0,777,1.335,1144,42,1.625,0.061
1,0,49,1.0,5,0.0,0.0,20000,1,44,6,1,2,8256.0,864,1.541,1291,33,3.714,0.105
2,0,51,0.0,3,0.0,0.0,100000,1,36,4,1,0,3418.0,0,2.594,1887,20,2.333,0.000
3,0,40,1.0,4,0.0,0.0,20000,1,34,3,4,1,3313.0,2517,1.405,1171,20,2.333,0.760
4,0,40,0.0,3,0.0,0.0,70000,1,21,5,1,0,4716.0,0,2.175,816,28,2.500,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,0,50,0.0,2,0.0,0.0,50000,1,40,3,2,3,4003.0,1851,0.703,15476,117,0.857,0.462
10123,1,41,0.0,2,0.0,1.0,50000,1,25,4,2,3,4277.0,2186,0.804,8764,69,0.683,0.511
10124,1,44,1.0,1,0.0,0.0,20000,1,36,5,3,4,5409.0,0,0.819,10291,60,0.818,0.000
10125,1,30,0.0,2,0.0,0.0,50000,1,36,4,3,3,5281.0,0,0.535,8395,62,0.722,0.000


### 정규화 처리
- ML 진행 전 처리