In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the per-user aggregated transaction features for a first look.
aggregated_path = "../data/processed/transactions_aggregated.parquet"
df = pd.read_parquet(aggregated_path)

In [3]:
# Preview the first 30 rows to eyeball the columns and typical values.
df.head(n=30)

Unnamed: 0,msno,days_since_last_payment,has_ever_paid,days_since_last_cancel,has_ever_cancelled,is_auto_renew_last,last_plan_days,last_payment_method,is_free_user,total_payment_count,total_amount_paid,avg_amount_per_payment,unique_plan_count,subscription_months_est,payment_count_last_30d,payment_count_last_90d
0,+++IZseRRiQS9aaSkH6cMYU6bGDcxUieAi/tH67sC5s=,160,1,160,1,0,395,22,0,1,1599,1599.0,1,13.166667,0,0
1,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,17,1,999,0,1,30,41,0,1,99,99.0,1,1.0,1,1
2,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,1,1,999,0,1,30,39,0,2,298,149.0,1,2.0,1,2
3,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,6,1,999,0,1,30,41,0,1,149,149.0,1,1.0,1,1
4,++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=,17,1,999,0,1,30,41,0,1,149,149.0,1,1.0,1,1
5,++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,1,1,999,0,1,30,39,0,2,298,149.0,1,2.0,1,2
6,++/ZHqwUNa7U21Qz+zqteiXlZapxey86l6eEorrak/g=,1,1,999,0,1,30,14,0,2,298,149.0,1,2.0,1,2
7,++0+IdHga8fCSioOVpU8K7y4Asw8AveIApVH2r9q9yY=,271,1,271,1,0,410,32,0,1,1788,1788.0,1,13.666667,0,0
8,++0/NopttBsaAn6qHZA2AWWrDg7Me7UOMs1vsyo4tSI=,12,1,999,0,1,30,41,0,1,149,149.0,1,1.0,1,1
9,++0BJXY8tpirgIhJR14LDM1pnaRosjD1mdO1mIKxlJA=,5,1,5,1,0,30,38,0,1,149,149.0,1,1.0,1,1


In [4]:
import numpy as np
import pandas as pd

# Source file and the destination for the memory-optimized copy.
in_path = "../data/processed/transactions_aggregated.parquet"
out_path = "../data/processed/transactions_aggregated_optimized.parquet"

tx = pd.read_parquet(in_path)

# Report the memory footprint before any dtype changes, for comparison.
print("Before")
tx.info(memory_usage="deep")

# msno -> pandas string dtype. The pyarrow-backed variant is tried first;
# if that cast fails for any reason, fall back to the default string storage.
if "msno" in tx.columns:
    msno_values = tx["msno"]
    try:
        tx["msno"] = msno_values.astype("string[pyarrow]")
    except Exception:
        tx["msno"] = msno_values.astype("string")

def to_uint_best(s: pd.Series, *, max_bits_hint=32):
    """
    Downcast a numeric-like Series to the narrowest dtype that holds its values.

    The input is parsed with ``pd.to_numeric(errors="coerce")``, so entries
    that cannot be parsed as numbers become missing values.

    Parameters
    ----------
    s : pd.Series
        Column to shrink.
    max_bits_hint : int, keyword-only
        Not consulted by the implementation; retained so that existing callers
        passing it keep working.

    Returns
    -------
    - Every value missing: a nullable ``UInt16`` array.
    - Any non-finite or non-whole value: a ``float32`` Series.
    - Whole numbers with a negative minimum: a Series with the smallest signed
      integer dtype (numpy ``intN`` when nothing is missing, nullable ``IntN``
      when missing values are present).
    - Otherwise: a nullable ``UInt8`` / ``UInt16`` / ``UInt32`` / ``UInt64``
      array chosen from the maximum value.
    """
    s_num = pd.to_numeric(s, errors="coerce")
    non_na = s_num.dropna()
    if len(non_na) == 0:
        # Nothing to measure a range from; default to nullable UInt16.
        return pd.array(s_num, dtype="UInt16")

    if pd.api.types.is_integer_dtype(s_num.dtype):
        # Already integral: no rounding needed, and min/max stay exact.
        whole = s_num
        lo, hi = int(non_na.min()), int(non_na.max())
    else:
        vals = non_na.to_numpy(dtype="float64")
        nearest = np.round(vals)
        # Absolute tolerance only: np.isclose's default *relative* tolerance
        # (1e-5) would accept e.g. 100000.4 as "an integer" and the value would
        # then be silently rounded. Non-finite values can never be integers.
        looks_whole = (
            np.isfinite(vals).all()
            and np.isclose(vals, nearest, rtol=0.0, atol=1e-9).all()
        )
        if not looks_whole:
            # Cannot be represented as integers -> float32.
            return s_num.astype(np.float32)
        whole = np.round(s_num)
        # Take the range from the rounded values, i.e. from what will actually
        # be stored (255.9999999999 must be sized as 256, not truncated to 255).
        lo, hi = int(nearest.min()), int(nearest.max())

    if lo < 0:
        # Negative values present -> signed integers.
        if s_num.isna().any():
            # downcast="integer" cannot shrink a column that contains NaN, so
            # pick the smallest nullable signed dtype explicitly.
            for name in ("Int8", "Int16", "Int32", "Int64"):
                bounds = np.iinfo(name.lower())
                if bounds.min <= lo and hi <= bounds.max:
                    return whole.astype(name)
        return pd.to_numeric(whole, errors="coerce", downcast="integer")

    # Non-negative: smallest nullable unsigned dtype that fits the maximum.
    for name in ("UInt8", "UInt16", "UInt32"):
        if hi <= np.iinfo(name.lower()).max:
            return pd.array(whole, dtype=name)
    return pd.array(whole, dtype="UInt64")

# Whole-number columns, all narrowed by to_uint_best from their observed range:
#   - days_since_*  : elapsed days (may include a sentinel such as 999)
#   - 0/1 flags
#   - plan / payment-method codes and payment counters
#   - cumulative amount paid (can grow large, so its width is range-driven)
integer_like_columns = (
    "days_since_last_payment",
    "days_since_last_cancel",
    "has_ever_paid",
    "has_ever_cancelled",
    "is_auto_renew_last",
    "is_free_user",
    "last_plan_days",
    "last_payment_method",
    "total_payment_count",
    "unique_plan_count",
    "payment_count_last_30d",
    "payment_count_last_90d",
    "total_amount_paid",
)
for column in integer_like_columns:
    if column in tx.columns:
        tx[column] = to_uint_best(tx[column])

# Fractional columns are stored as float32.
for column in ("avg_amount_per_payment", "subscription_months_est"):
    if column in tx.columns:
        as_number = pd.to_numeric(tx[column], errors="coerce")
        tx[column] = as_number.astype(np.float32)

# Report the memory footprint again so it can be compared with "Before".
print("\nAfter")
tx.info(memory_usage="deep")

# Persist the optimized frame.
parquet_options = {"index": False, "engine": "pyarrow", "compression": "zstd"}
tx.to_parquet(out_path, **parquet_options)

print(f"\nSaved: {out_path}")

Before
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197050 entries, 0 to 1197049
Data columns (total 16 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   msno                     1197050 non-null  object 
 1   days_since_last_payment  1197050 non-null  int64  
 2   has_ever_paid            1197050 non-null  int64  
 3   days_since_last_cancel   1197050 non-null  int64  
 4   has_ever_cancelled       1197050 non-null  int64  
 5   is_auto_renew_last       1197050 non-null  int64  
 6   last_plan_days           1197050 non-null  int64  
 7   last_payment_method      1197050 non-null  object 
 8   is_free_user             1197050 non-null  int64  
 9   total_payment_count      1197050 non-null  int64  
 10  total_amount_paid        1197050 non-null  int64  
 11  avg_amount_per_payment   1197050 non-null  float64
 12  unique_plan_count        1197050 non-null  int64  
 13  subscription_months_est  1197050 no