In [77]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

## 컬럼 타입 최종 점검

In [56]:
def check_dtypes(df):
    """Print how many columns use each dtype, then show the dtype of every column.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. The dtype counts are printed to stdout and the per-column table
        is rendered through the notebook's ``display``.
    """
    column_dtypes = df.dtypes

    print("=== dtype 분포 ===")
    print(column_dtypes.value_counts())

    print("\n=== 컬럼별 dtype ===")
    # One row per column: the column label moves out of the index into "column".
    per_column = column_dtypes.to_frame("dtype").reset_index()
    per_column = per_column.rename(columns={"index": "column"})
    display(per_column)

In [57]:
# Load the three preprocessed inputs. Each frame carries an `msno` column,
# which is the key the merge cell joins on.
members_train = pd.read_parquet("data/processed/members_with_label_base_optimized.parquet")
user_logs_agg = pd.read_parquet("data/processed/user_logs_aggregated_optimized.parquet")
transactions_agg = pd.read_parquet("data/processed/transactions_aggregated_optimized.parquet")

In [58]:
check_dtypes(members_train)

=== dtype 분포 ===
int8              2
string[python]    1
int16             1
category          1
datetime64[ns]    1
float32           1
period[M]         1
Int8              1
Name: count, dtype: int64

=== 컬럼별 dtype ===


Unnamed: 0,column,dtype
0,msno,string[python]
1,city,int8
2,bd,int16
3,gender,category
4,registered_via,int8
5,registration_init_time,datetime64[ns]
6,bd_clean,float32
7,registration_month,period[M]
8,is_churn,Int8


In [59]:
check_dtypes(user_logs_agg)

=== dtype 분포 ===
float32           42
Int16             20
Int8               4
string[python]     1
Name: count, dtype: int64

=== 컬럼별 dtype ===


Unnamed: 0,column,dtype
0,msno,string[python]
1,num_days_active_w7,Int8
2,total_secs_w7,float32
3,avg_secs_per_day_w7,float32
4,std_secs_w7,float32
...,...,...
62,songs_trend_w14_w30,float32
63,skip_trend_w7_w30,float32
64,completion_trend_w7_w30,float32
65,recency_secs_ratio,float32


In [60]:
check_dtypes(transactions_agg)

=== dtype 분포 ===
UInt8             9
UInt16            4
float32           2
string[python]    1
Name: count, dtype: int64

=== 컬럼별 dtype ===


Unnamed: 0,column,dtype
0,msno,string[python]
1,days_since_last_payment,UInt16
2,has_ever_paid,UInt8
3,days_since_last_cancel,UInt16
4,has_ever_cancelled,UInt8
5,is_auto_renew_last,UInt8
6,last_plan_days,UInt16
7,last_payment_method,UInt8
8,is_free_user,UInt8
9,total_payment_count,UInt8


In [61]:
def find_categorical_candidates(df, max_unique=50):
    """List low-cardinality integer / text columns that may be categorical features.

    A column qualifies when its dtype is integer (NumPy or pandas nullable),
    ``object``, or a pandas string dtype, and it has at most ``max_unique``
    distinct non-missing values. Candidates are printed in ascending order of
    cardinality.

    Args:
        df: DataFrame to scan.
        max_unique: Largest number of distinct non-missing values a column may
            have and still be reported.

    Returns:
        Candidate column names in the order they appear in ``df.columns``
        (not in the printed, cardinality-sorted order).
    """
    dtype_checks = pd.api.types
    candidates = []

    for col in df.columns:
        dtype = df[col].dtype
        # `is_object_dtype` alone does not match the dedicated pandas string
        # dtype (e.g. ``string[python]``), so text columns are tested with both.
        is_text = dtype_checks.is_object_dtype(dtype) or dtype_checks.is_string_dtype(dtype)
        if not (dtype_checks.is_integer_dtype(dtype) or is_text):
            continue

        nunique = df[col].nunique(dropna=True)
        if nunique <= max_unique:
            candidates.append((col, nunique))

    print("=== 범주형 후보 컬럼 ===")
    for col, n in sorted(candidates, key=lambda item: item[1]):
        print(f"{col:30s} | unique={n}")

    return [col for col, _ in candidates]

In [62]:
cat_candidates1 = find_categorical_candidates(members_train)

=== 범주형 후보 컬럼 ===
is_churn                       | unique=2
registered_via                 | unique=5
city                           | unique=21


In [63]:
cat_candidates2 = find_categorical_candidates(user_logs_agg)

=== 범주형 후보 컬럼 ===
num_days_active_w7             | unique=8
num_days_active_w14            | unique=15
num_days_active_w21            | unique=22
num_days_active_w30            | unique=31


In [64]:
cat_candidates3 = find_categorical_candidates(transactions_agg)

=== 범주형 후보 컬럼 ===
has_ever_paid                  | unique=2
has_ever_cancelled             | unique=2
is_auto_renew_last             | unique=2
is_free_user                   | unique=2
unique_plan_count              | unique=5
payment_count_last_30d         | unique=16
payment_count_last_90d         | unique=26
last_plan_days                 | unique=31
last_payment_method            | unique=35


In [65]:
TARGET = "is_churn"
ID_COL = "msno"

In [66]:
# Attach the per-user log and transaction aggregates to the labelled member
# table. `validate="many_to_one"` makes pandas raise a MergeError when either
# aggregate contains a duplicated `msno`, instead of silently multiplying
# member rows in the result.
df = (
    members_train
        .merge(user_logs_agg, on="msno", how="left", validate="many_to_one")
        .merge(transactions_agg, on="msno", how="left", validate="many_to_one")
)

In [70]:
df.shape

(860967, 90)

In [67]:
assert len(df) == len(members_train)

In [68]:
print(df["is_churn"].value_counts())
df["is_churn"].value_counts(normalize=True)

is_churn
0    779519
1     81448
Name: count, dtype: Int64


is_churn
0    0.905399
1    0.094601
Name: proportion, dtype: Float64

In [69]:
df.isna().mean().sort_values(ascending=False).head(30)

bd_clean                   0.551209
num_100_w21                0.123623
short_play_ratio_w21       0.123623
avg_songs_per_day_w30      0.123623
num_songs_w30              0.123623
std_secs_w30               0.123623
avg_secs_per_day_w30       0.123623
total_secs_w30             0.123623
num_days_active_w30        0.123623
variety_ratio_w21          0.123623
completion_ratio_w21       0.123623
num_25_w30                 0.123623
skip_ratio_w21             0.123623
short_play_w21             0.123623
num_25_w21                 0.123623
num_unq_w21                0.123623
avg_songs_per_day_w21      0.123623
num_songs_w21              0.123623
num_unq_w30                0.123623
num_100_w30                0.123623
total_secs_w21             0.123623
days_trend_w7_w30          0.123623
recency_songs_ratio        0.123623
recency_secs_ratio         0.123623
completion_trend_w7_w30    0.123623
skip_trend_w7_w30          0.123623
songs_trend_w14_w30        0.123623
songs_trend_w7_w30         0

## 결측 처리
1) user_logs 집계 피처들 : 0으로 처리. (활동이 없었으니까.)
2) bd_clean : 사용할 수 없는 나이 값은 전처리 단계에서 일부러 np.nan으로 채워 두었음 (LightGBM은 결측을 자체적으로 처리하므로 유리한 방식). Logistic Regression을 사용할 때는 NaN을 0으로 채워서 사용할 것.

In [71]:
# Columns are selected purely by name: a window suffix, or "trend" / "ratio"
# anywhere in the column name.
window_suffixes = ("_w7", "_w14", "_w21", "_w30")
log_cols = [
    name
    for name in df.columns
    if name.endswith(window_suffixes) or "trend" in name or "ratio" in name
]

# A missing aggregate is treated as zero activity.
df[log_cols] = df[log_cols].fillna(0)

In [72]:
df.isna().mean().sort_values(ascending=False).head(30)

bd_clean                   0.551209
payment_count_last_90d     0.041348
is_free_user               0.041348
days_since_last_payment    0.041348
days_since_last_cancel     0.041348
has_ever_cancelled         0.041348
is_auto_renew_last         0.041348
last_plan_days             0.041348
last_payment_method        0.041348
has_ever_paid              0.041348
total_payment_count        0.041348
avg_amount_per_payment     0.041348
unique_plan_count          0.041348
subscription_months_est    0.041348
payment_count_last_30d     0.041348
total_amount_paid          0.041348
std_secs_w30               0.000000
num_100_w30                0.000000
num_25_w30                 0.000000
num_unq_w30                0.000000
avg_songs_per_day_w30      0.000000
num_songs_w30              0.000000
num_days_active_w30        0.000000
avg_secs_per_day_w30       0.000000
total_secs_w30             0.000000
variety_ratio_w21          0.000000
short_play_ratio_w21       0.000000
completion_ratio_w21       0

3) transactions 파생 피처 결측 4.13% : 결제 이력이 아예 없는 유저 = free user

- 3-1. count / amount 계열 → 0

In [74]:
# Count / amount features: a missing value is replaced with zero.
payment_zero_cols = [
    "payment_count_last_30d",
    "payment_count_last_90d",
    "total_payment_count",
    "total_amount_paid",
    "avg_amount_per_payment",
    "unique_plan_count",
    "subscription_months_est",
]

for col in payment_zero_cols:
    df[col] = df[col].fillna(0)

- 3-2. boolean / flag 계열 → 명시적으로 지정

In [75]:
# Value each flag takes when it is missing. Only `is_free_user` defaults to 1:
# a free user has no payment history.
flag_defaults = {
    "has_ever_paid": 0,
    "has_ever_cancelled": 0,
    "is_auto_renew_last": 0,
    "is_free_user": 1,
}

for flag_col, default in flag_defaults.items():
    df[flag_col] = df[flag_col].fillna(default).astype("int8")

- 3-3. “마지막 결제 기준” 날짜 계열

    - “결제한 적 없음”은
        → ‘오래됨’이 아니라 ‘존재하지 않음’ 이기 때문
    - LightGBM은 이 NaN을 자체적으로 분기해서 처리함

In [78]:
# "Last payment" features stay missing on purpose for users without any
# transaction row: the value does not exist, which is not the same as
# "a long time ago".
#
# Nothing is assigned here. `df[col] = df[col].fillna(np.nan)` would be a
# no-op, because filling a missing value with NaN leaves it missing (the
# missing-rate check after this cell reports the same share for these
# columns). If a consumer needs real float NaN instead of pandas NA, cast
# explicitly, e.g. `df[last_payment_na_cols].astype("float32")`.
last_payment_na_cols = [
    "days_since_last_payment",
    "days_since_last_cancel",
    "last_plan_days",
    "last_payment_method",
]

# Fail fast with a KeyError if an expected column is absent from the merge.
absent_cols = [col for col in last_payment_na_cols if col not in df.columns]
if absent_cols:
    raise KeyError(f"missing expected transaction columns: {absent_cols}")

In [None]:
df.isna().mean().sort_values(ascending=False).head(10)

bd_clean                   0.551209
days_since_last_cancel     0.041348
last_payment_method        0.041348
last_plan_days             0.041348
days_since_last_payment    0.041348
msno                       0.000000
variety_ratio_w30          0.000000
short_play_ratio_w30       0.000000
completion_ratio_w30       0.000000
skip_ratio_w30             0.000000
dtype: float64

In [80]:
df.head(10)

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,bd_clean,registration_month,is_churn,num_days_active_w7,total_secs_w7,avg_secs_per_day_w7,std_secs_w7,num_songs_w7,avg_songs_per_day_w7,num_unq_w7,num_25_w7,num_100_w7,short_play_w7,skip_ratio_w7,completion_ratio_w7,short_play_ratio_w7,variety_ratio_w7,num_days_active_w14,total_secs_w14,avg_secs_per_day_w14,std_secs_w14,num_songs_w14,avg_songs_per_day_w14,num_unq_w14,num_25_w14,num_100_w14,short_play_w14,skip_ratio_w14,completion_ratio_w14,short_play_ratio_w14,variety_ratio_w14,num_days_active_w21,total_secs_w21,avg_secs_per_day_w21,std_secs_w21,num_songs_w21,avg_songs_per_day_w21,num_unq_w21,num_25_w21,num_100_w21,short_play_w21,skip_ratio_w21,completion_ratio_w21,short_play_ratio_w21,variety_ratio_w21,num_days_active_w30,total_secs_w30,avg_secs_per_day_w30,std_secs_w30,num_songs_w30,avg_songs_per_day_w30,num_unq_w30,num_25_w30,num_100_w30,short_play_w30,skip_ratio_w30,completion_ratio_w30,short_play_ratio_w30,variety_ratio_w30,secs_trend_w7_w30,secs_trend_w14_w30,days_trend_w7_w14,days_trend_w7_w30,songs_trend_w7_w30,songs_trend_w14_w30,skip_trend_w7_w30,completion_trend_w7_w30,recency_secs_ratio,recency_songs_ratio,days_since_last_payment,has_ever_paid,days_since_last_cancel,has_ever_cancelled,is_auto_renew_last,last_plan_days,last_payment_method,is_free_user,total_payment_count,total_amount_paid,avg_amount_per_payment,unique_plan_count,subscription_months_est,payment_count_last_30d,payment_count_last_90d
0,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,unknown,7,2011-09-14,,2011-09,0,7,75448.625,10778.375,9128.514648,338,48.285713,159,39,271,54,0.115385,0.801775,0.159763,0.470414,14,177639.296875,12688.521484,10458.754883,842,60.142857,480,127,641,157,0.150831,0.761283,0.186461,0.570071,20,238367.4,11918.371094,9021.441406,1156,57.799999,663,170,863,220,0.147059,0.74654,0.190311,0.573529,30,358554.0,11951.799805,7876.637695,1776,59.200001,1040,277,1296,355,0.155968,0.72973,0.199887,0.585586,0.210425,0.495432,0.5,0.233333,0.190315,0.474099,-0.040584,0.072045,0.210425,0.190315,5,1,999,0,1,30,41,0,1,129,129.0,1,1.0,1,1
1,yLkV2gbZ4GLFwqTOXLVHz0VGrMYcgBGgKZ3kj9RiYu8=,4,30,male,9,2011-09-16,30.0,2011-09,0,6,123668.695312,20611.449219,9505.349609,557,92.833336,67,14,518,22,0.025135,0.929982,0.039497,0.120287,6,123668.695312,20611.449219,9505.349609,557,92.833336,67,14,518,22,0.025135,0.929982,0.039497,0.120287,6,123668.7,20611.449219,9505.349609,557,92.833336,67,14,518,22,0.025135,0.929982,0.039497,0.120287,6,123668.7,20611.449219,9505.349609,557,92.833336,67,14,518,22,0.025135,0.929982,0.039497,0.120287,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1,1,999,0,1,30,39,0,2,298,149.0,1,2.0,1,2
2,I0yFvqMoNkM8ZNHb617e1RBzIS/YRKemHO7Wj13EtA0=,13,63,male,9,2011-09-18,63.0,2011-09,0,3,16989.527344,5663.175781,1434.409424,70,23.333334,65,2,65,3,0.028571,0.928571,0.042857,0.928571,10,50269.140625,5026.914062,3101.173584,249,24.9,182,43,195,47,0.172691,0.783133,0.188755,0.730924,15,63667.99,4244.532715,2992.634277,352,23.466667,273,77,206,107,0.21875,0.585227,0.303977,0.775568,18,80453.32,4469.628906,2823.026123,416,23.111111,337,77,269,108,0.185096,0.646635,0.259615,0.810096,0.211172,0.624824,0.3,0.166667,0.168269,0.598558,-0.156525,0.281937,0.211172,0.168269,5,1,999,0,1,30,40,0,1,149,149.0,1,1.0,1,1
3,OoDwiKZM+ZGr9P3fRivavgOtglTEaNfWJO4KaJcTTts=,1,0,unknown,7,2011-09-18,,2011-09,1,1,6168.049805,6168.049805,0.0,23,23.0,23,0,22,0,0.0,0.956522,0.0,1.0,2,8142.378906,4071.189453,2965.408447,35,17.5,34,2,30,4,0.057143,0.857143,0.114286,0.971429,2,8142.379,4071.189453,2965.408447,35,17.5,34,2,30,4,0.057143,0.857143,0.114286,0.971429,3,8613.392,2871.130615,2952.498535,38,12.666667,37,3,31,5,0.078947,0.815789,0.131579,0.973684,0.7161,0.945316,0.5,0.333333,0.605263,0.921053,-0.078947,0.140732,0.7161,0.605263,6,1,999,0,1,30,41,0,1,149,149.0,1,1.0,1,1
4,4De1jAxNRABoyRBDZ82U0yEmzYkqeOugRGVNIf92Xb8=,4,28,female,9,2011-09-20,28.0,2011-09,0,2,5703.128906,2851.564453,2644.321289,29,14.5,24,5,24,5,0.172414,0.827586,0.172414,0.827586,5,15160.677734,3032.135498,1988.283691,90,18.0,42,6,82,6,0.066667,0.911111,0.066667,0.466667,8,19365.29,2420.661865,1723.983154,118,14.75,55,9,105,10,0.076271,0.889831,0.084746,0.466102,10,22494.76,2249.476318,1725.134766,134,13.4,68,12,117,13,0.089552,0.873134,0.097015,0.507463,0.253531,0.673965,0.4,0.2,0.216418,0.671642,0.082862,-0.045548,0.253531,0.216418,29,1,999,0,1,30,36,0,1,180,180.0,1,1.0,1,1
5,GqYHRxlZChiZvB1uzR410wcQzuxqZNZci4AzOTzkAao=,5,27,male,9,2011-09-27,27.0,2011-09,0,5,38556.851562,7711.370117,9204.537109,274,54.799999,161,96,135,129,0.350365,0.492701,0.470803,0.587591,12,96963.203125,8080.26709,6853.640137,585,48.75,385,147,368,192,0.251282,0.62906,0.328205,0.65812,17,139694.8,8217.337891,7103.669434,806,47.411766,550,176,538,232,0.218362,0.667494,0.287841,0.682382,23,164500.2,7152.18457,6489.455078,970,42.173912,642,231,629,293,0.238144,0.648454,0.302062,0.661856,0.234388,0.589441,0.416667,0.217391,0.282474,0.603093,0.112221,-0.155753,0.234388,0.282474,1,1,999,0,1,30,39,0,2,298,149.0,1,2.0,1,2
6,Z6WIOK9vXy+e2XDBiioNAxuZ0ScXSU/Ebq4tUwqVSrE=,22,38,female,9,2011-09-29,38.0,2011-09,0,2,5300.563965,2650.281982,1935.042969,25,12.5,16,4,20,4,0.16,0.8,0.16,0.64,4,7571.279785,1892.819946,1430.834473,37,9.25,27,6,29,6,0.162162,0.783784,0.162162,0.72973,5,8060.28,1612.05603,1389.103149,39,7.8,29,6,31,6,0.153846,0.794872,0.153846,0.74359,6,9109.104,1518.18396,1263.549683,49,8.166667,35,10,33,12,0.204082,0.673469,0.244898,0.714286,0.581897,0.831177,0.5,0.333333,0.510204,0.755102,-0.044082,0.126531,0.581897,0.510204,21,1,999,0,1,30,40,0,1,149,149.0,1,1.0,1,1
7,den0Kb2s4BV47zV+tSC1u0W07M7BOMq+fnrGj+9ax0I=,14,26,female,9,2011-10-05,26.0,2011-10,0,7,84208.726562,12029.817383,6750.530273,350,50.0,241,20,304,28,0.057143,0.868571,0.08,0.688571,14,111287.726562,7949.123047,6742.541992,465,33.214287,326,26,401,38,0.055914,0.862366,0.08172,0.701075,21,144255.5,6869.308594,5890.024902,607,28.904762,441,33,522,51,0.054366,0.859967,0.08402,0.726524,31,183071.3,5905.524902,5199.90625,764,24.645161,590,42,668,61,0.054974,0.874346,0.079843,0.772251,0.459978,0.607893,0.5,0.225806,0.458115,0.608639,0.002169,-0.005774,0.459978,0.458115,28,1,999,0,1,30,37,0,1,149,149.0,1,1.0,1,1
8,i4kmzPli+nl4XagzznO+oCcPeXfjcLSKIn8xE9oGU5E=,4,58,male,9,2011-10-06,58.0,2011-10,0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,999,0,1,30,34,0,1,149,149.0,1,1.0,1,1
9,XmV2kHCnqnQf4oTJ4LCS7F02cpxnzqGIDDRGHe4dw8c=,22,31,female,9,2011-10-06,31.0,2011-10,1,7,379026.1875,54146.597656,16587.246094,1438,205.428574,358,16,1401,24,0.011127,0.97427,0.01669,0.248957,14,738704.75,52764.625,14518.09082,2762,197.285721,622,28,2701,40,0.010138,0.977915,0.014482,0.225199,21,1175617.0,55981.753906,14334.924805,4328,206.095245,834,33,4250,51,0.007625,0.981978,0.011784,0.192699,30,1747704.0,58256.8125,15002.376953,6361,212.03334,1245,42,6261,61,0.006603,0.984279,0.00959,0.195724,0.216871,0.422671,0.5,0.233333,0.226065,0.434208,0.004524,-0.010009,0.216871,0.226065,18,1,18,1,0,195,32,0,1,894,894.0,1,6.5,1,1


## 프레임 누수 방지
### 1) “미래 정보” 가능성이 높은 컬럼 목록 점검

In [81]:
# Columns to review by hand for possible look-ahead ("future information")
# leakage. This list only names the columns; it does not test them.
leak_check_cols = [
    "registration_init_time",
    "days_since_last_payment",
    "days_since_last_cancel",
    "is_auto_renew_last",
    "last_plan_days",
    "last_payment_method",
    "recency_secs_ratio",
    "recency_songs_ratio",
    "secs_trend_w7_w30",
    "secs_trend_w14_w30",
    "days_trend_w7_w14",
    "days_trend_w7_w30",
    "songs_trend_w7_w30",
    "songs_trend_w14_w30",
    "skip_trend_w7_w30",
    "completion_trend_w7_w30",
]

# Show which of the listed columns are actually present in the merged frame.
[c for c in leak_check_cols if c in df.columns]

['registration_init_time',
 'days_since_last_payment',
 'days_since_last_cancel',
 'is_auto_renew_last',
 'last_plan_days',
 'last_payment_method',
 'recency_secs_ratio',
 'recency_songs_ratio',
 'secs_trend_w7_w30',
 'secs_trend_w14_w30',
 'days_trend_w7_w14',
 'days_trend_w7_w30',
 'songs_trend_w7_w30',
 'songs_trend_w14_w30',
 'skip_trend_w7_w30',
 'completion_trend_w7_w30']

### 2) 절대 있으면 안 되는 “시간 역전(미래)” 체크

#### 2-1) days_since_*가 음수인지 확인 (가장 강력)

In [82]:
# Time-reversal check: a negative "days since" value would mean the event lies
# after the reference date.
for c in ["days_since_last_payment", "days_since_last_cancel"]:
    if c in df.columns:
        observed = df[c].dropna()
        neg = (observed < 0).sum()
        print(c, "negative_count =", int(neg))

        # An unsigned dtype cannot hold a negative value, so a zero count
        # proves nothing for such a column. Report the dtype and the maximum
        # so implausibly large values can be spotted instead.
        if pd.api.types.is_unsigned_integer_dtype(observed.dtype):
            print(
                f"  note: {c} is {observed.dtype} (unsigned), "
                f"so the negative check is vacuous; max = {observed.max()}"
            )

days_since_last_payment negative_count = 0
days_since_last_cancel negative_count = 0


#### 2-2) last_* 값 범위 sanity check

In [83]:
# Print summary statistics for each "last_*" column that exists in df.
for c in ("last_plan_days", "last_payment_method"):
    if c not in df.columns:
        continue
    # Column name, its describe() table, then one blank separator line.
    print(c, df[c].describe(), "", sep="\n")

last_plan_days
count     825368.0
mean      34.39902
std      35.347833
min            0.0
25%           30.0
50%           30.0
75%           30.0
max          450.0
Name: last_plan_days, dtype: Float64

last_payment_method
count    825368.0
mean     38.73239
std       3.85895
min           3.0
25%          38.0
50%          41.0
75%          41.0
max          41.0
Name: last_payment_method, dtype: Float64



### 3) 누수 “가능성” 스크리닝: 타겟과 상관이 과도하게 큰지

In [84]:
def corr_screen(df, target="is_churn", top_n=30):
    """Rank numeric columns by the absolute value of their correlation with ``target``.

    Args:
        df: DataFrame holding the features and the target column.
        target: Name of the target column; it must be numeric.
        top_n: Number of leading entries to return.

    Returns:
        Series of signed correlation coefficients indexed by column name,
        ordered from largest to smallest absolute value and cut to ``top_n``.
    """
    feature_cols = [
        name
        for name in df.select_dtypes(include=[np.number]).columns
        if name != target
    ]

    # Full correlation matrix over the numeric features plus the target;
    # only the target's column is kept.
    corr_matrix = df[feature_cols + [target]].corr()
    target_corr = corr_matrix[target].drop(target)

    ranked = target_corr.sort_values(key=lambda values: values.abs(), ascending=False)
    return ranked.head(top_n)

corr_screen(df, TARGET, top_n=30)

has_ever_paid             -0.495495
is_free_user               0.495495
is_auto_renew_last        -0.480758
days_since_last_cancel    -0.480258
last_plan_days             0.465601
unique_plan_count         -0.367527
has_ever_cancelled         0.360411
payment_count_last_30d    -0.312786
subscription_months_est    0.281851
avg_amount_per_payment     0.270815
total_amount_paid          0.251481
days_since_last_payment    0.224264
last_payment_method       -0.213020
completion_trend_w7_w30   -0.166595
days_trend_w7_w14         -0.157480
variety_ratio_w7          -0.151710
completion_ratio_w7       -0.151195
songs_trend_w14_w30       -0.147870
secs_trend_w14_w30        -0.146499
payment_count_last_90d    -0.138593
days_trend_w7_w30         -0.136139
completion_ratio_w14      -0.132074
variety_ratio_w14         -0.128529
num_days_active_w7        -0.124411
recency_songs_ratio       -0.120895
songs_trend_w7_w30        -0.120895
recency_secs_ratio        -0.119362
secs_trend_w7_w30         -0

### 4) “의심 컬럼”을 타겟별 분포로 직접 확인

In [85]:
df.groupby(TARGET)["days_since_last_payment"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,771591.0,14.257464,20.310349,1.0,5.0,14.0,23.0,999.0
1,53777.0,56.970229,162.132166,1.0,10.0,19.0,28.0,999.0


In [86]:
pd.crosstab(df["is_auto_renew_last"], df[TARGET], normalize="columns")

is_churn,0,1
is_auto_renew_last,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.084008,0.649764
1,0.915992,0.350236


In [87]:
# A notebook cell renders only its last expression, so each table is passed
# to display() explicitly. Without it the last_plan_days table is computed
# and then discarded.
display(pd.crosstab(df["last_plan_days"], df[TARGET], normalize="columns").head(20))
display(pd.crosstab(df["last_payment_method"], df[TARGET], normalize="columns").head(20))

is_churn,0,1
last_payment_method,Unnamed: 1_level_1,Unnamed: 2_level_1
3,0.0,0.00013
6,0.0,9.3e-05
8,0.0,0.000167
10,4.4e-05,7.4e-05
11,8.6e-05,3.7e-05
12,9e-06,0.003831
13,1e-06,0.005634
14,0.000582,0.000167
15,0.000223,0.037934
16,0.000892,0.00212


In [88]:
# Work on a copy so that parsing the registration date leaves df untouched.
tmp = df.copy()
tmp["registration_init_time"] = pd.to_datetime(
    tmp["registration_init_time"], errors="coerce"
)

# Observed date range, and the share of values that are NaT after coercion.
reg_time = tmp["registration_init_time"]
print(reg_time.min(), reg_time.max())
print(reg_time.isna().mean())

2004-03-26 00:00:00 2017-04-24 00:00:00
0.0


In [89]:
# Cutoff date T: registrations strictly after it are counted here.
T_date = pd.Timestamp("2017-04-01")

registered_after_T = tmp["registration_init_time"] > T_date
too_new = registered_after_T.sum()
print("가입일이 T 이후인 유저 수:", int(too_new))
print("비율:", float(too_new / len(tmp)))

가입일이 T 이후인 유저 수: 1
비율: 1.1614847026657236e-06


### 5) T 이후의 가입자 제거

In [90]:
# Keep only users registered on or before the cutoff T. reset_index returns an
# independent frame with a contiguous RangeIndex, so later column assignments
# do not act on a filtered slice and positional and label indexing agree again.
df = df.loc[df["registration_init_time"] <= T_date].reset_index(drop=True)

In [92]:
# Write the final feature table to parquet, without the index.
out_path = "data/processed/kkbox_train_feature_v1.parquet"
parquet_options = {
    "engine": "pyarrow",
    "compression": "snappy",
    "index": False,
}

df.to_parquet(out_path, **parquet_options)

print("saved:", out_path)
print("shape:", df.shape)

saved: data/processed/kkbox_train_feature_v1.parquet
shape: (860966, 90)


In [93]:
# Round-trip check: read the file back and compare it with the in-memory frame.
df_chk = pd.read_parquet(out_path)

# Shape alone cannot detect renamed or reordered columns, so the column labels
# are compared as well.
assert df_chk.shape == df.shape, f"shape changed on reload: {df_chk.shape} != {df.shape}"
assert list(df_chk.columns) == list(df.columns), "column names or order changed on reload"
assert df_chk["is_churn"].isna().mean() == 0.0, "is_churn has missing labels after reload"

print("reload ok")

reload ok


In [94]:
df_chk.shape

(860966, 90)