In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler

# Load the raw dataset. The path is relative, so it is resolved against the
# current working directory of the process running this cell.
df = pd.read_csv("../data/raw_data.csv")

In [None]:
# ----------------------------
# STEP 1. Drop Unused Columns
# ----------------------------
def drop_unused_columns(df):
    """Return a new frame without the ``user_id`` column.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with ``user_id`` removed. If the column is absent
        the remaining columns are returned unchanged, because missing
        labels are ignored rather than raising.
    """
    unused = ["user_id"]
    trimmed = df.drop(columns=unused, errors="ignore")
    return trimmed


# ----------------------------
# STEP 2. Missing Value Handling
# ----------------------------
def handle_missing_values(df):
    """Fill missing listening metrics with each column's median.

    ``listening_time`` and ``songs_played_per_day`` have their NaN entries
    replaced by the median of the non-missing values in the same column.

    Args:
        df: Input DataFrame containing both columns. It is left untouched;
            the work is done on a copy so callers that keep a reference to
            the raw frame do not see it change underneath them.

    Returns:
        A new DataFrame with the two columns imputed.

    Raises:
        KeyError: If either column is missing from ``df``.
    """
    filled = df.copy()
    for col in ("listening_time", "songs_played_per_day"):
        # The median is computed before filling, so it reflects only the
        # originally observed values.
        filled[col] = filled[col].fillna(filled[col].median())
    return filled


# ----------------------------
# STEP 3. Outlier Handling
# ----------------------------
def handle_outliers(df):
    """Cap extreme values in ``skip_rate`` and ``ads_listened_per_week``.

    * ``skip_rate`` is capped from above at its 99th percentile.
    * ``ads_listened_per_week`` is winsorized to its [1st, 99th] percentile
      range.

    Args:
        df: Input DataFrame containing both columns. It is not modified;
            the capping is applied to a copy.

    Returns:
        A new DataFrame with the two columns capped.

    Raises:
        KeyError: If either column is missing from ``df``.
    """
    capped = df.copy()

    # Upper cap only: values above the 99th percentile are pulled down to it.
    # NaN compares False, so missing entries pass through unchanged.
    skip_cap = capped["skip_rate"].quantile(0.99)
    capped["skip_rate"] = np.where(
        capped["skip_rate"] > skip_cap, skip_cap, capped["skip_rate"]
    )

    # Two-sided winsorizing for the ad counts.
    ads = capped["ads_listened_per_week"]
    lower = ads.quantile(0.01)
    upper = ads.quantile(0.99)
    capped["ads_listened_per_week"] = ads.clip(lower, upper)

    return capped


# ----------------------------
# STEP 4. Categorical Encoding
# ----------------------------
def encode_categorical(df):
    """One-hot encode the categorical columns and drop the originals.

    The encoded columns are ``subscription_type``, ``device_type``,
    ``gender`` and the country column. A top-5-plus-"other" grouping of
    countries was considered but left out because, per the original
    author's note, the data has only 8 distinct countries. The raw
    ``country`` column is therefore encoded directly; a pre-built
    ``country_grouped`` column is still honoured if the caller supplies one.

    Args:
        df: Input DataFrame containing the categorical columns. It is not
            modified.

    Returns:
        A new DataFrame in which the categorical columns are replaced by
        one indicator column per category, named ``<column>_<category>``.

    Raises:
        KeyError: If any of the categorical columns is missing from ``df``.
    """
    # The grouping step that would create "country_grouped" is disabled, so
    # fall back to the raw "country" column unless the grouped one exists.
    country_col = "country_grouped" if "country_grouped" in df.columns else "country"
    cat_cols = ["subscription_type", "device_type", "gender", country_col]

    # The dense-output keyword was renamed from `sparse` to `sparse_output`
    # across scikit-learn releases. Using the default (sparse) output and
    # densifying it here works with either spelling.
    encoder = OneHotEncoder(handle_unknown="ignore")
    encoded = encoder.fit_transform(df[cat_cols]).toarray()

    # Reuse df's index so the column-wise concat lines rows up by position
    # even when df has a non-default index (e.g. after filtering rows).
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(cat_cols),
        index=df.index,
    )

    return pd.concat([df.drop(columns=cat_cols), encoded_df], axis=1)


# ----------------------------
# STEP 5. Numeric Scaling
# ----------------------------
def scale_numeric(df, method="standard"):
    """Scale the numeric feature columns.

    Args:
        df: Input DataFrame containing ``age``, ``listening_time``,
            ``songs_played_per_day``, ``skip_rate``,
            ``ads_listened_per_week`` and ``offline_listening``. It is not
            modified when scaling is applied.
        method: ``"standard"`` for ``StandardScaler``, ``"minmax"`` for
            ``MinMaxScaler``, or ``None`` to skip scaling.

    Returns:
        A new DataFrame with the numeric columns scaled, or ``df`` itself
        when ``method`` is ``None``.

    Raises:
        ValueError: If ``method`` is not one of the supported values. An
            unrecognised name (e.g. a typo) would otherwise hand back
            unscaled data without any signal.
        KeyError: If any of the numeric columns is missing from ``df``.
    """
    numeric_cols = [
        "age", "listening_time", "songs_played_per_day", "skip_rate",
        "ads_listened_per_week", "offline_listening"
    ]

    if method is None:
        return df

    # Validate before touching the data so a bad name fails fast.
    if method not in ("standard", "minmax"):
        raise ValueError(
            f"Unknown scaling method {method!r}; expected 'standard', 'minmax' or None."
        )

    scaler = StandardScaler() if method == "standard" else MinMaxScaler()

    scaled = df.copy()
    scaled[numeric_cols] = scaler.fit_transform(scaled[numeric_cols])
    return scaled


# ----------------------------
# STEP 6. Feature Engineering
# ----------------------------
def feature_engineering(df):
    """Add derived features to the frame.

    New columns:
        * ``engagement_score``: ``0.4 * listening_time +
          0.3 * songs_played_per_day + 0.3 * (1 - skip_rate)``.
        * ``listening_time_bin``: quartile bucket of ``listening_time``
          labelled ``low`` / ``mid_low`` / ``mid_high`` / ``high``.
        * ``skip_rate_cap``: a copy of ``skip_rate``.
        * ``ads_listened_log``: ``log1p`` of ``ads_listened_per_week``.

    Args:
        df: Input DataFrame containing ``listening_time``,
            ``songs_played_per_day``, ``skip_rate`` and
            ``ads_listened_per_week``. It is not modified; the new columns
            are added to a copy.

    Returns:
        A new DataFrame with the four derived columns appended.

    Raises:
        KeyError: If a required column is missing from ``df``.
        ValueError: From ``pd.qcut`` when the quartile edges of
            ``listening_time`` are not unique.
    """
    enriched = df.copy()

    enriched["engagement_score"] = (
        enriched["listening_time"] * 0.4 +
        enriched["songs_played_per_day"] * 0.3 +
        (1 - enriched["skip_rate"]) * 0.3
    )

    enriched["listening_time_bin"] = pd.qcut(
        enriched["listening_time"],
        q=4,
        labels=["low", "mid_low", "mid_high", "high"],
    )

    enriched["skip_rate_cap"] = enriched["skip_rate"]

    # log1p is only defined for inputs greater than -1. If this column has
    # already been standardised, values at or below -1 become NaN or -inf.
    enriched["ads_listened_log"] = np.log1p(enriched["ads_listened_per_week"])

    return enriched


# ----------------------------
# MASTER PIPELINE
# ----------------------------
def run_preprocessing(df, scale_type="standard", feature_eng=True):
    """Run the full preprocessing pipeline on ``df``.

    Steps, in order: drop unused columns, impute missing values, cap
    outliers, one-hot encode categoricals, then optionally scale the
    numeric columns and optionally add engineered features.

    Args:
        df: Raw input DataFrame.
        scale_type: Passed to ``scale_numeric`` as ``method``; ``None``
            skips the scaling step entirely.
        feature_eng: When true, ``feature_engineering`` is applied last.

    Returns:
        The preprocessed DataFrame.

    NOTE(review): feature engineering runs after scaling, so it operates on
    scaled values. With ``scale_type="standard"`` the standardised
    ``ads_listened_per_week`` can be below -1, for which ``np.log1p`` yields
    NaN. Confirm whether the step order is intended.
    """
    always_on = (
        drop_unused_columns,
        handle_missing_values,
        handle_outliers,
        encode_categorical,
    )
    for step in always_on:
        df = step(df)

    if scale_type is not None:
        df = scale_numeric(df, method=scale_type)

    return feature_engineering(df) if feature_eng else df