In [1]:
import pandas as pd
import numpy as np

In [2]:
mda_pf = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/media_performance_classification.csv")
click = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/유저테이블.csv")
ads_pool = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/ads_pool.csv")

In [3]:
time_df = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/수정_시간별적립보고서(최종).csv")

In [5]:
pd.set_option('display.max_columns', None)

In [6]:
def cosine_vec(a, B):
    a = a.reshape(1, -1)
    num = (B * a).sum(axis=1)
    den = np.linalg.norm(B, axis=1) * (np.linalg.norm(a) + 1e-12)
    return num / (den + 1e-12)

In [7]:
def analyze_ads_performance(ads_idx, click_data, media_portfolio=None):
    """
    특정 광고의 매체별 성과를 분석하는 함수
    """
    
    # 1. 해당 광고의 데이터가 있는지 확인
    ads_data = click_data[click_data['ads_idx'] == ads_idx]
    if len(ads_data) == 0:
        print(f"광고 {ads_idx}에 대한 데이터가 없습니다.")
        return pd.DataFrame()
    
    # 2. 기본 성과 데이터 추출
    ads_performance = ads_data.groupby(['ads_idx', 'mda_idx']).agg({
        'click_key': 'count',
        'conversion': 'sum',
        'contract_price': 'first',
        'media_price': 'first',
        'domain': 'first',
        'ads_category': 'first'
    }).reset_index()
    
    # 컬럼명 변경
    ads_performance.columns = ['ads_idx', 'mda_idx', 'total_clicks', 'total_conversions', 
                              'contract_price', 'media_price', 'domain', 'ads_category']
    
    # 전환율 및 수익 계산
    ads_performance['cvr'] = (
        ads_performance['total_conversions'] / ads_performance['total_clicks']
    ).round(4)
    
    ads_performance['profit_per_conversion'] = (
        ads_performance['contract_price'] - ads_performance['media_price']
    )
    ads_performance['total_profit'] = (
        ads_performance['total_conversions'] * ads_performance['profit_per_conversion']
    )
    
    # 3. 날짜 범위 및 활동일 계산
    click_data_copy = click_data.copy()
    if not pd.api.types.is_datetime64_any_dtype(click_data_copy['click_date']):
        click_data_copy['click_date'] = pd.to_datetime(click_data_copy['click_date'])
    
    ads_activity = (
        click_data_copy.loc[click_data_copy['ads_idx'] == ads_idx]
                      .groupby('mda_idx')['click_date']
                      .agg(first_click='min', last_click='max')
                      .reset_index()
    )
    
    ads_activity['days_active_calc'] = (
        (ads_activity['last_click'] - ads_activity['first_click']).dt.days + 1
    )
    
    # 4. 데이터 병합
    merged = ads_performance.merge(
        ads_activity[['mda_idx', 'first_click', 'last_click', 'days_active_calc']],
        on='mda_idx', how='left'
    )
    
    # 5. 일평균 지표 계산
    merged['daily_clicks'] = merged['total_clicks'] / merged['days_active_calc']
    merged['daily_conversions'] = merged['total_conversions'] / merged['days_active_calc']
    merged['daily_profit'] = merged['total_profit'] / merged['days_active_calc']
    
    # 6. 배분 그룹 분류 (데이터가 충분한 경우에만)
    if len(merged) > 1:  # 최소 2개 이상의 매체가 있어야 중앙값 계산이 의미있음
        profit_median = merged['daily_profit'].median()
        conv_median = merged['daily_conversions'].median()
        
        merged['배분그룹'] = np.where(
            (merged['daily_profit'] >= profit_median) & (merged['daily_conversions'] >= conv_median),
            '잘 배분',
            '잘못 배분'
        )
        # 결과 정렬
        result = merged.sort_values(['배분그룹', 'daily_profit'], ascending=[True, False]).reset_index(drop=True)
    else:
        merged['배분그룹'] = '분류불가'
        result = merged.reset_index(drop=True)
    
    return result

# 73878

In [8]:
ads_73878_pf = analyze_ads_performance(73878, click)

In [9]:
ads_73878_pf.head()

Unnamed: 0,ads_idx,mda_idx,total_clicks,total_conversions,contract_price,media_price,domain,ads_category,cvr,profit_per_conversion,total_profit,first_click,last_click,days_active_calc,daily_clicks,daily_conversions,daily_profit,배분그룹
0,73878,14,1883,108,4000,3200,미디어/컨텐츠,8,0.0574,800,86400,2025-07-26 01:45:27,2025-08-25 10:34:32,31,60.741935,3.483871,2787.096774,잘 배분
1,73878,654,13309,40,4000,3200,미디어/컨텐츠,8,0.003,800,32000,2025-07-26 00:03:38,2025-08-25 11:02:42,31,429.322581,1.290323,1032.258065,잘 배분
2,73878,56,418,20,4000,3200,미디어/컨텐츠,8,0.0478,800,16000,2025-07-26 00:04:02,2025-08-25 09:04:07,31,13.483871,0.645161,516.129032,잘 배분
3,73878,58,779,20,4000,3200,미디어/컨텐츠,8,0.0257,800,16000,2025-07-26 02:19:02,2025-08-25 11:21:00,31,25.129032,0.645161,516.129032,잘 배분
4,73878,667,370,19,4000,3200,미디어/컨텐츠,8,0.0514,800,15200,2025-07-26 17:19:57,2025-08-25 10:08:01,30,12.333333,0.633333,506.666667,잘 배분


In [10]:
print(mda_pf.head())
print(mda_pf.columns)

   Unnamed: 0.1  Unnamed: 0  mda_idx  user_count  total_clicks  \
0             0           0       12       75700        232438   
1             1           1       14       31834         84763   
2             2           2       18        2825          5961   
3             3           3       22       37668         83355   
4             4           4       26         402           678   

   total_conversions          first_click           last_click  days_active  \
0              82938  2025-07-26 00:00:02  2025-08-25 11:22:16           31   
1              22983  2025-07-26 00:01:22  2025-08-25 11:22:40           31   
2                250  2025-07-26 00:00:49  2025-08-25 11:13:19           31   
3              35232  2025-07-26 00:00:31  2025-08-25 11:14:50           31   
4                 70  2025-07-26 01:45:31  2025-08-25 10:22:36           31   

   daily_avg_conversions  LARGE  MEDIUM  MEGA  SMALL  total_ads  MEGA_ratio  \
0            2675.419355      7       0    17    

In [44]:
# 유사도

import numpy as np
import pandas as pd

# --- 유틸: 코사인 유사도 (sklearn 없이) ---
def _cosine(a, B):
    a = a.reshape(1, -1)
    num = (B * a).sum(axis=1)
    den = (np.sqrt((B**2).sum(axis=1)) * np.sqrt((a**2).sum()))
    den = np.where(den == 0, 1e-12, den)
    return (num / den).ravel()

# --- mda_pf에서 유사도 계산에 쓸 피처 뽑기 ---
def pick_feature_cols(mda_pf):
    cols = []
    cols += [c for c in mda_pf.columns if c.startswith('domain_') and c.endswith('_pct')]
    cols += [c for c in mda_pf.columns if c.startswith('ads_os_type_') and c.endswith('_pct')]
    cols += [c for c in ['MEGA_ratio','LARGE_ratio','MEDIUM_ratio','SMALL_ratio'] if c in mda_pf.columns]
    # 스케일 편향 줄이려면 전역 성과( expected_total_profit 등)는 제외하고,
    # 필요한 경우 'conversion_rate' 정도만 보조로 넣기
    if 'conversion_rate' in mda_pf.columns:
        cols.append('conversion_rate')
    return cols

# --- 표준화(+그룹 가중치) ---
def build_profile_matrix(mda_pf, feature_cols, group_weights=None):
    X = mda_pf.set_index('mda_idx')[feature_cols].astype(float).fillna(0.0)

    # 그룹 가중치(선택): 도메인/OS/포맷 비중을 더 세게 보려면 가중치 부여
    if group_weights:
        w = np.ones(len(feature_cols), dtype=float)
        for pat, gw in group_weights.items():
            for i,c in enumerate(feature_cols):
                if pat(c):
                    w[i] = gw
        X = X * w

    # 표준화(z-score) → 스케일 다른 열들 균형
    X = (X - X.mean()) / (X.std() + 1e-9)
    return X

# --- 추천 함수 ---
def recommend_similar_media(
    ad_df,                # 특정 광고의 매체 수행 테이블 (ads_XXXX_pf)
    mda_pf,               # 전체 매체 프로필 테이블
    top_anchor_by='total_conversions',  # anchor 선정 기준
    n_anchor=3,           # 상위 anchor 개수 (2~3 추천)
    topN=15,              # 추천 결과 개수
    exclude_classes=('계약종료형','품질관리형'),  # 제외할 매체 유형
    min_days_active=7,    # 최소 활동일
    group_weights=None    # {'도메인':2.0, '포맷':1.5} 같은 식으로도 쓸 수 있음
):
    feature_cols = pick_feature_cols(mda_pf)
    X = build_profile_matrix(mda_pf, feature_cols, group_weights=group_weights)

    # 이 광고에서 이미 쓰는 매체는 제외
    used = set(ad_df['mda_idx'].astype(int))

    # anchor (이 광고에서 상위 성과 매체)
    anchors = (
        ad_df.sort_values(top_anchor_by, ascending=False)
             .drop_duplicates('mda_idx')
             .head(n_anchor)['mda_idx'].astype(int).tolist()
    )
    anchors = [m for m in anchors if m in X.index]
    if not anchors:
        raise ValueError('anchor 매체가 프로필 매트릭스에 없습니다.')

    centroid = X.loc[anchors].mean(axis=0).values  # anchor 평균 프로필

    # 후보 풀: 안 쓰는 매체 + 기본 필터(유형, 활동일)
    cand = mda_pf[~mda_pf['mda_idx'].isin(used)].copy()
    if 'basic_classification' in cand.columns and exclude_classes:
        cand = cand[~cand['basic_classification'].isin(exclude_classes)]
    if 'days_active' in cand.columns:
        cand = cand[cand['days_active'] >= min_days_active]
    if cand.empty:
        return pd.DataFrame(columns=['mda_idx','similarity'])  # 후보 없음

    # 유사도 계산 (코사인, anchor centroid 대비)
    Xc = X.loc[cand['mda_idx'].values]
    sims = _cosine(centroid, Xc.values)

    out = cand.copy()
    out['similarity'] = sims

    # 리포트용 보조 컬럼 몇 개
    keep_cols = ['mda_idx','similarity','basic_classification','days_active',
                 'conversion_rate','expected_total_profit','total_ads']
    keep_cols = [c for c in keep_cols if c in out.columns]
    out = out[keep_cols].sort_values('similarity', ascending=False).head(topN).reset_index(drop=True)
    return out, anchors, feature_cols


In [45]:
# 특정 광고 테이블(예: ads_73878_pf) 과 전체 매체 프로필(mda_pf) 준비돼 있다고 가정
recs, anchors, used_feats = recommend_similar_media(
    ad_df=ads_73878_pf, 
    mda_pf=mda_pf,
    top_anchor_by='total_conversions',
    n_anchor=3,
    topN=20,
    exclude_classes=('계약종료형','품질관리형'),
    group_weights={
        # 도메인 가중 2.0, 포맷비중 1.5 (OS는 1.0)
        (lambda c: c.startswith('domain_') and c.endswith('_pct')): 2.0,
        (lambda c: c.endswith('_ratio')): 1.5,
    }
)
print("anchor 매체:", anchors)
recs


anchor 매체: [14, 654, 56]


Unnamed: 0,mda_idx,similarity,basic_classification,days_active,conversion_rate,expected_total_profit,total_ads
0,384,0.671987,특화전문형,31,0.215892,568970,4
1,356,0.633956,안정공급형(후보),31,0.162064,99460,10
2,32,0.619627,관리 필요,30,0.103558,191080,7
3,686,0.619341,안정공급형(후보),26,0.37931,1510,23
4,540,0.614091,안정공급형(후보),31,0.348201,403320,46
5,401,0.596112,안정공급형(후보),31,0.086053,5921,56
6,785,0.463261,안정공급형(후보),29,0.28125,3967,27
7,583,0.456529,안정공급형(후보),31,0.044158,122510,41
8,772,0.443814,안정공급형(후보),30,0.46063,13330,56
9,294,0.434449,안정공급형(후보),31,0.202488,127250,49


In [46]:
ads_73878_pf['mda_idx'].unique()

array([ 14, 654,  56,  58, 667, 792,  18, 761,  54, 270,  30, 281, 371,
        26, 108, 246, 337, 480, 481, 621, 674, 769, 790, 805, 818, 854,
       978])

In [14]:
print(click.head())
print(click.columns)

   Unnamed: 0                                 click_key  ads_idx   dvc_idx  \
0           0  000000d54b9faad47ee99d6cd3cf53894dd4baa5   313780  61906528   
1           1  000002b4d92f7648b455877c2676452efcd22a09   412426  34422806   
2           2  0000057e97361ff3d0263aaecee34cfaa3ba30fb   443660  38366075   
3           3  00000607f60139015da3ee1dd5499db3faa100dc   360192  61894110   
4           4  0000066bc25d4a6d147c27326cf972a4de88024e   372307  61956954   

   mda_idx  contract_price  media_price   click_day  click_time  \
0      539            6000         4500  2025-08-17          21   
1       58             180          170  2025-07-26           2   
2      808             170          120  2025-08-12          18   
3      539            6000         4500  2025-08-17           3   
4      539           15600        11700  2025-08-18           8   

            click_date     exp_day network          user_ip      rwd_idx  \
0  2025-08-17 21:07:37  2025-09-16       0    16.184

In [15]:
import pandas as pd
import numpy as np
import re

# 컬럼 이름에서 공백/특수문자 → '_' 로 바꾸는 헬퍼
def _slug(s):
    return re.sub(r'[^0-9A-Za-z가-힣]+', '_', str(s)).strip('_')

def add_cat_domain_to_mda_pf(
    mda_pf: pd.DataFrame,
    clicks_df: pd.DataFrame,
    conv_col: str = "conversion",
    cat_col: str = "ads_category",
    dom_col: str = "domain",
    add_within_cat: bool = False,      # mda×카테고리 내부 도메인 구성비 추가 여부
    add_within_dom: bool = False       # mda×도메인 내부 카테고리 구성비 추가 여부
):
    """
    반환: (enriched_mda_pf, new_columns)
    - conv_cat{카테고리}_{도메인} : 해당 mda의 (카테고리×도메인) 전환수
    - share_cat{카테고리}_{도메인}: 해당 mda 전체 전환 대비 구성비(0~1)
    - (옵션) shareWithinCat_*, shareWithinDomain_* 도 함께 추가 가능
    """
    df = clicks_df.copy()

    # 전환수 정리 (0/1이 아니면 그대로 합산, 0/1이면 1 합산)
    df[conv_col] = pd.to_numeric(df[conv_col], errors="coerce").fillna(0)
    if df[conv_col].max() <= 1:
        df["conv"] = (df[conv_col] > 0).astype(int)
    else:
        df["conv"] = df[conv_col]

    # 전환 있는 행만
    conv = df[df["conv"] > 0].copy()
    if conv.empty:
        enriched = mda_pf.copy()
        return enriched, []

    # mda × category × domain 전환수 집계
    g = (conv.groupby(["mda_idx", cat_col, dom_col], as_index=False)["conv"]
              .sum())

    # mda 전체 전환 합 → mda 대비 구성비
    total_mda = (g.groupby("mda_idx", as_index=False)["conv"]
                   .sum()
                   .rename(columns={"conv":"total_mda"}))
    g = g.merge(total_mda, on="mda_idx", how="left")
    g["share_mda"] = g["conv"] / g["total_mda"].replace(0, np.nan)
    g["share_mda"] = g["share_mda"].fillna(0.0)

    # ---- 피벗: 전환수 / mda-구성비
    piv_cnt = (g.pivot(index="mda_idx",
                       columns=[cat_col, dom_col],
                       values="conv")
                 .fillna(0))
    piv_shr = (g.pivot(index="mda_idx",
                       columns=[cat_col, dom_col],
                       values="share_mda")
                 .fillna(0.0))

    # 컬럼 평탄화
    piv_cnt.columns = [f"conv_cat{c}_{_slug(d)}" for c, d in piv_cnt.columns]
    piv_shr.columns = [f"share_cat{c}_{_slug(d)}" for c, d in piv_shr.columns]

    out = (mda_pf.merge(piv_cnt, on="mda_idx", how="left")
                 .merge(piv_shr, on="mda_idx", how="left"))

    new_cols = list(piv_cnt.columns) + list(piv_shr.columns)
    out[new_cols] = out[new_cols].fillna(0)

    # ---- (옵션) mda×카테고리 내부 도메인 구성비
    if add_within_cat:
        tot_cat = (g.groupby(["mda_idx", cat_col], as_index=False)["conv"]
                     .sum()
                     .rename(columns={"conv":"_tot_cat"}))
        g2 = g.merge(tot_cat, on=["mda_idx", cat_col], how="left")
        g2["share_within_cat"] = g2["conv"] / g2["_tot_cat"].replace(0, np.nan)
        piv_wc = (g2.pivot(index="mda_idx",
                           columns=[cat_col, dom_col],
                           values="share_within_cat")
                    .fillna(0.0))
        piv_wc.columns = [f"shareWithinCat_cat{c}_{_slug(d)}" for c, d in piv_wc.columns]
        out = out.merge(piv_wc, on="mda_idx", how="left")
        out[piv_wc.columns] = out[piv_wc.columns].fillna(0.0)
        new_cols += list(piv_wc.columns)

    # ---- (옵션) mda×도메인 내부 카테고리 구성비
    if add_within_dom:
        tot_dom = (g.groupby(["mda_idx", dom_col], as_index=False)["conv"]
                     .sum()
                     .rename(columns={"conv":"_tot_dom"}))
        g3 = g.merge(tot_dom, on=["mda_idx", dom_col], how="left")
        g3["share_within_domain"] = g3["conv"] / g3["_tot_dom"].replace(0, np.nan)
        piv_wd = (g3.pivot(index="mda_idx",
                           columns=[cat_col, dom_col],
                           values="share_within_domain")
                    .fillna(0.0))
        piv_wd.columns = [f"shareWithinDomain_cat{c}_{_slug(d)}" for c, d in piv_wd.columns]
        out = out.merge(piv_wd, on="mda_idx", how="left")
        out[piv_wd.columns] = out[piv_wd.columns].fillna(0.0)
        new_cols += list(piv_wd.columns)

    return out, new_cols


In [16]:
# clicks_df: 원본 클릭/전환 테이블 (mda_idx, ads_category, domain, conversion 포함)
# mda_pf: 매체 프로필 테이블 (mda_idx 기준)

mda_pf_enriched, added_cols = add_cat_domain_to_mda_pf(
    mda_pf, click,
    add_within_cat=False,     # 필요하면 True
    add_within_dom=False      # 필요하면 True
)

print(f"추가된 컬럼 수: {len(added_cols)}")
# mda_pf_enriched.head()


추가된 컬럼 수: 168


In [17]:
print(mda_pf_enriched.head())
print(mda_pf_enriched.columns)

   Unnamed: 0.1  Unnamed: 0  mda_idx  user_count  total_clicks  \
0             0           0       12       75700        232438   
1             1           1       14       31834         84763   
2             2           2       18        2825          5961   
3             3           3       22       37668         83355   
4             4           4       26         402           678   

   total_conversions          first_click           last_click  days_active  \
0              82938  2025-07-26 00:00:02  2025-08-25 11:22:16           31   
1              22983  2025-07-26 00:01:22  2025-08-25 11:22:40           31   
2                250  2025-07-26 00:00:49  2025-08-25 11:13:19           31   
3              35232  2025-07-26 00:00:31  2025-08-25 11:14:50           31   
4                 70  2025-07-26 01:45:31  2025-08-25 10:22:36           31   

   daily_avg_conversions  LARGE  MEDIUM  MEGA  SMALL  total_ads  MEGA_ratio  \
0            2675.419355      7       0    17    

In [18]:
import numpy as np
import pandas as pd
import re

# --- 유틸 ---
def _slug(s): 
    return re.sub(r'[^0-9A-Za-z가-힣]+', '_', str(s)).strip('_')

def _cosine(a, B):
    a = a.reshape(1, -1)
    num = (B * a).sum(axis=1)
    den = (np.sqrt((B**2).sum(axis=1)) * np.sqrt((a**2).sum()))
    den = np.where(den == 0, 1e-12, den)
    return (num / den).ravel()

# --- 광고 데이터로 (카테고리×도메인) 가중치 만들기 ---
def make_ad_pair_weights_from_ad_df(ad_df, cat_col='ads_category', dom_col='domain',
                                    conv_col='total_conversions', power=1.0, min_frac=0.0):
    """
    ad_df에서 (카테고리×도메인)별 전환 비중 → share_cat{c}_{domain} 컬럼의 가중치 dict 반환
    - power: 비중에 지수 가중 (1.0=그대로, 0.5=루트, 2.0=제곱)
    - min_frac: 너무 작은 비중 컷(0~1)
    """
    t = ad_df.copy()
    t[conv_col] = pd.to_numeric(t[conv_col], errors='coerce').fillna(0.0)
    g = (t.groupby([cat_col, dom_col])[conv_col].sum()
           .rename('conv').reset_index())
    tot = g['conv'].sum()
    if tot <= 0:
        return {}
    g['frac'] = g['conv'] / tot
    if min_frac > 0:
        g = g[g['frac'] >= min_frac].copy()
    g['w'] = (g['frac'] ** power)
    # 정규화(합=1)
    s = g['w'].sum()
    if s > 0:
        g['w'] = g['w'] / s
    # share_cat{c}_{slug(domain)} 키로 변환
    weights = { f"share_cat{int(c)}_{_slug(d)}": float(w) for c,d,w in g[[cat_col, dom_col, 'w']].itertuples(index=False) }
    return weights

# --- 피처 행렬 만들기 (z-score + 컬럼별 가중적용) ---
def build_feature_matrix(mda_pf, feature_cols, col_weights=None, zscore=True):
    X = mda_pf.set_index('mda_idx')[feature_cols].astype(float).fillna(0.0)
    if col_weights:
        w = np.array([col_weights.get(c, 1.0) for c in feature_cols], dtype=float)
        X = X * w  # 가중치 적용 (열 스케일)
    if zscore:
        X = (X - X.mean()) / (X.std() + 1e-9)
    return X

# --- 추천 메인 ---
def recommend_with_weighted_similarity(
    ad_df,               # 특정 광고의 매체 성과 테이블(ads_XXXX_pf)
    mda_pf,              # (enriched) 전체 매체 프로필 (share_cat* 들어있는 테이블)
    top_anchor_by='total_conversions',
    n_anchor=3,
    topN=20,
    weight_power=1.0,    # 광고 전환 분포 가중치 지수
    min_pair_frac=0.0,   # 광고 전환 분포에서 너무 작은 비중 컷
    top_weight_feats=None,  # 상위 몇 개 가중치 feature만 사용할지 (None이면 전체)
    exclude_classes=('계약종료형','품질관리형'),  # 운영상 제외할 타입
    min_days_active=7,
    blend_pred_table=None,   # pred_table(ads_idx,mda_idx,pred_turn)이 있으면 넣기
    blend_ad_id=None,        # blend할 광고 ID
    blend_alpha=0.7          # 최종점수 = alpha*similarity + (1-alpha)*pred_norm
):
    # 1) share_cat* 피처만 사용 (전환 '비중' 기반 유사도에 초점)
    share_cols = [c for c in mda_pf.columns if c.startswith('share_cat')]
    if not share_cols:
        raise ValueError("mda_pf에 share_cat* 컬럼이 없습니다. 먼저 enrichment를 수행하세요.")

    # 2) 광고 전환 분포 기반 컬럼 가중치(없으면 균등 가중)
    col_w = make_ad_pair_weights_from_ad_df(ad_df, power=weight_power, min_frac=min_pair_frac)
    if top_weight_feats:
        # 광고에서 의미 있는 상위 조합만 남기고 나머지는 0으로 눌러서 노이즈 감소
        top_keys = set(pd.Series(col_w).sort_values(ascending=False).head(top_weight_feats).index)
        col_w = {k: (v if k in top_keys else 0.0) for k,v in col_w.items()}

    # 3) 피처 행렬(z-score + 가중치)
    X = build_feature_matrix(mda_pf, share_cols, col_weights=col_w, zscore=True)

    # 4) 앵커(이 광고에서 상위 성과 매체)
    used = set(ad_df['mda_idx'].astype(int))
    anchors = (ad_df.sort_values(top_anchor_by, ascending=False)
                   .drop_duplicates('mda_idx')
                   .head(n_anchor)['mda_idx']
                   .astype(int).tolist())
    anchors = [m for m in anchors if m in X.index]
    if not anchors:
        raise ValueError("anchor가 없습니다. ad_df에 상위 매체가 있는지 확인하세요.")

    centroid = X.loc[anchors].mean(axis=0).values

    # 5) 후보: 미사용 매체 + 운영 필터
    cand = mda_pf[~mda_pf['mda_idx'].isin(used)].copy()
    if 'basic_classification' in cand.columns and exclude_classes:
        cand = cand[~cand['basic_classification'].isin(exclude_classes)]
    if 'days_active' in cand.columns:
        cand = cand[cand['days_active'] >= min_days_active]
    if cand.empty:
        return pd.DataFrame(columns=['mda_idx','similarity']), anchors, share_cols, col_w

    # 6) 유사도 계산 (가중 코사인)
    B = X.loc[cand['mda_idx']].values
    sims = cosine_vec(centroid, B)
    cand['similarity'] = sims

    # 7) (선택) 예측 전환수와 블렌딩
    if blend_pred_table is not None and blend_ad_id is not None:
        pt = blend_pred_table[blend_pred_table['ads_idx']==blend_ad_id][['mda_idx','pred_turn']].copy()
        cand = cand.merge(pt, on='mda_idx', how='left')
        cand['pred_turn'] = cand['pred_turn'].fillna(0.0)
        # 간단 정규화
        maxv = cand['pred_turn'].max()
        cand['pred_norm'] = cand['pred_turn'] / (maxv + 1e-9)
        cand['final_score'] = blend_alpha*cand['similarity'] + (1.0-blend_alpha)*cand['pred_norm']
        sort_key = 'final_score'
    else:
        sort_key = 'similarity'

    # 8) 보기 좋게 컬럼 추리기 + 정렬
    keep = [c for c in ['mda_idx','similarity','final_score','pred_turn','basic_classification',
                        'days_active','conversion_rate','expected_total_profit','total_ads'] if c in cand.columns]
    out = cand[keep].sort_values(sort_key, ascending=False).head(topN).reset_index(drop=True)
    return out, anchors, share_cols, col_w


In [19]:
# ad_df = ads_73878_pf     # 이 광고의 매체별 성과 테이블
# mda_pf = mda_pf_enriched # share_cat* 컬럼 붙인 테이블

topN = 20
recs, anchors, used_feats, weight_map = recommend_with_weighted_similarity(
    ad_df=ads_73878_pf,
    mda_pf=mda_pf_enriched,                    # 위에서 이미 share_cat*가 들어간 버전
    top_anchor_by='total_conversions',
    n_anchor=3,
    topN=topN,
    weight_power=1.0,                 # 광고 전환 분포를 그대로 사용(필요 시 1.5~2.0으로 강화)
    min_pair_frac=0.0,                # 아주 작은 비중도 반영(잡음 줄이려면 0.01 같이 줘도 됨)
    top_weight_feats=40,              # 광고에서 중요한 조합 상위 40개만 유의하게 사용 (노이즈 컷)
    exclude_classes=('계약종료형','품질관리형'),  # 리스크 제외
    min_days_active=7,
    # 있으면 켜기: 예측 전환수와 블렌딩
    blend_pred_table=(pred_table if 'pred_table' in globals() else None),
    blend_ad_id=73878 if 'pred_table' in globals() else None,
    blend_alpha=0.7
)

print("anchors:", anchors)       # 이 광고에서 기준으로 삼은 상위 매체
recs.head(topN)                  # 최종 추천


anchors: [14, 654, 56]


Unnamed: 0,mda_idx,similarity,basic_classification,days_active,conversion_rate,expected_total_profit,total_ads
0,711,0.577354,안정공급형(후보),28,0.358333,16540,54
1,356,0.316496,안정공급형(후보),31,0.162064,99460,10
2,32,0.263135,관리 필요,30,0.103558,191080,7
3,87,0.211624,안정공급형(후보),29,0.329034,640407,6
4,785,0.183777,안정공급형(후보),29,0.28125,3967,27
5,397,0.18255,특화전문형,31,0.524409,550380,4
6,12,0.174407,특화전문형,31,0.356818,3003668,24
7,398,0.1681,안정공급형(후보),31,0.170626,185580,27
8,401,0.160045,안정공급형(후보),31,0.086053,5921,56
9,540,0.143652,안정공급형(후보),31,0.348201,403320,46


In [20]:
ads_9935_pf = analyze_ads_performance(9935, click)

In [21]:
# ad_df = ads_73878_pf     # 이 광고의 매체별 성과 테이블
# mda_pf = mda_pf_enriched # share_cat* 컬럼 붙인 테이블
ads_9935_pf = analyze_ads_performance(9935, click)

topN = 20
recs, anchors, used_feats, weight_map = recommend_with_weighted_similarity(
    ad_df=ads_9935_pf,
    mda_pf=mda_pf_enriched,                    # 위에서 이미 share_cat*가 들어간 버전
    top_anchor_by='total_conversions',
    n_anchor=3,
    topN=topN,
    weight_power=1.0,                 # 광고 전환 분포를 그대로 사용(필요 시 1.5~2.0으로 강화)
    min_pair_frac=0.0,                # 아주 작은 비중도 반영(잡음 줄이려면 0.01 같이 줘도 됨)
    top_weight_feats=40,              # 광고에서 중요한 조합 상위 40개만 유의하게 사용 (노이즈 컷)
    exclude_classes=('계약종료형','품질관리형'),  # 리스크 제외
    min_days_active=7,
    # 있으면 켜기: 예측 전환수와 블렌딩
    blend_pred_table=(pred_table if 'pred_table' in globals() else None),
    blend_ad_id=73878 if 'pred_table' in globals() else None,
    blend_alpha=0.7
)

print("anchors:", anchors)       # 이 광고에서 기준으로 삼은 상위 매체
recs.head(topN)    

anchors: [667, 539, 442]


Unnamed: 0,mda_idx,similarity,basic_classification,days_active,conversion_rate,expected_total_profit,total_ads
0,87,0.32565,안정공급형(후보),29,0.329034,640407,6
1,568,0.291257,안정공급형(후보),29,0.306122,393,67
2,12,0.227919,특화전문형,31,0.356818,3003668,24
3,371,0.222589,안정공급형,31,0.432518,172278,973
4,634,0.208212,안정공급형(후보),31,0.630051,41638,2451
5,58,0.204022,안정공급형(후보),31,0.322761,3884970,26
6,492,0.184191,안정공급형(후보),30,0.510896,2643,646
7,375,0.168891,안정공급형(후보),31,0.427252,17591,103
8,562,0.162105,안정공급형,31,0.57318,395692,2686
9,56,0.161033,안정공급형(후보),31,0.23635,580630,18


In [33]:
import numpy as np
import pandas as pd
import re

# --- 유틸 ---
def _slug(s): 
    return re.sub(r'[^0-9A-Za-z가-힣]+', '_', str(s)).strip('_')

def _cosine(a, B):
    a = a.reshape(1, -1)
    num = (B * a).sum(axis=1)
    den = (np.sqrt((B**2).sum(axis=1)) * np.sqrt((a**2).sum()))
    den = np.where(den == 0, 1e-12, den)
    return (num / den).ravel()

# 기존 recommend 코드가 쓰는 이름과 맞추기 위한 래퍼
def cosine_vec(a, B):
    return _cosine(np.asarray(a, dtype=float), np.asarray(B, dtype=float))

# --- 광고 데이터로 (카테고리×도메인) 가중치 만들기 ---
def make_ad_pair_weights_from_ad_df(ad_df, cat_col='ads_category', dom_col='domain',
                                    conv_col='total_conversions', power=1.0, min_frac=0.0):
    """
    ad_df에서 (카테고리×도메인)별 전환 비중 → share_cat{c}_{domain} 컬럼의 가중치 dict 반환
    - power: 비중에 지수 가중 (1.0=그대로, 0.5=루트, 2.0=제곱)
    - min_frac: 너무 작은 비중 컷(0~1)
    """
    t = ad_df.copy()
    t[conv_col] = pd.to_numeric(t[conv_col], errors='coerce').fillna(0.0)
    g = (t.groupby([cat_col, dom_col])[conv_col].sum()
           .rename('conv').reset_index())
    tot = g['conv'].sum()
    if tot <= 0:
        return {}
    g['frac'] = g['conv'] / tot
    if min_frac > 0:
        g = g[g['frac'] >= min_frac].copy()
    g['w'] = (g['frac'] ** power)
    s = g['w'].sum()
    if s > 0:
        g['w'] = g['w'] / s
    weights = { f"share_cat{int(c)}_{_slug(d)}": float(w)
               for c,d,w in g[[cat_col, dom_col, 'w']].itertuples(index=False) }
    return weights

# --- 구성비 CLR 변환 유틸 ---
def _clr_block(df_block, eps=1e-6):
    Z = df_block.clip(lower=eps)
    g = np.exp(np.log(Z).mean(axis=1))
    return np.log(Z.div(g, axis=0))

# --- 피처 행렬 만들기 (추가 피처 + 스케일링 + 컬럼 가중) ---
def build_feature_matrix_plus(
    mda_pf,
    share_cols,                 # 반드시 포함: share_cat*
    volume_cols=None,           # 볼륨형( log1p + z )
    size_ratio_cols=None,       # 사이즈 분포 MEGA/LARGE/MEDIUM/... (CLR or drop-1 + z)
    os_ratio_cols=None,         # OS 분포 (CLR or drop-1 + z)
    use_clr=True,              # True면 ratio 블록들을 CLR, False면 이미 drop-1 가정
    col_weights=None,           # 보통 share_cols에만 가중 들어옴
    zscore=True
):
    volume_cols = list(volume_cols or [])
    size_ratio_cols = list(size_ratio_cols or [])
    os_ratio_cols   = list(os_ratio_cols or [])
    all_cols = list(share_cols) + volume_cols + size_ratio_cols + os_ratio_cols

    X = mda_pf.set_index('mda_idx')[all_cols].astype(float).copy()

    # 결측 처리
    X[volume_cols] = X[volume_cols].fillna(0.0)
    X[size_ratio_cols + os_ratio_cols + share_cols] = X[size_ratio_cols + os_ratio_cols + share_cols].fillna(0.0)

    # 변환: 볼륨형 안정화
    if volume_cols:
        X[volume_cols] = np.log1p(X[volume_cols])

    # 변환: 구성비 블록 CLR (옵션)
    if use_clr:
        if size_ratio_cols:
            X[size_ratio_cols] = _clr_block(X[size_ratio_cols])
        if os_ratio_cols:
            X[os_ratio_cols] = _clr_block(X[os_ratio_cols])

    # (선) 가중치 적용: 열 스케일
    if col_weights:
        w = pd.Series({c: col_weights.get(c, 1.0) for c in all_cols}, index=all_cols, dtype=float)
        X = X.mul(w, axis=1)

    # 표준화
    if zscore:
        X = (X - X.mean()) / (X.std() + 1e-9)

    return X, all_cols

# --- 추천 메인 (추가 피처 주입 가능하도록 확장) ---
def recommend_with_weighted_similarity(
    ad_df,               # 특정 광고의 매체 성과 테이블(ads_XXXX_pf)
    mda_pf,              # (enriched) 전체 매체 프로필 (share_cat* 들어있는 테이블)
    top_anchor_by='total_conversions',
    n_anchor=3,
    topN=20,
    weight_power=1.0,    # 광고 전환 분포 가중치 지수
    min_pair_frac=0.0,   # 광고 전환 분포에서 너무 작은 비중 컷
    top_weight_feats=None,  # 상위 몇 개 가중치 feature만 사용할지 (None이면 전체)
    exclude_classes=('계약종료형','품질관리형'),
    min_days_active=7,
    blend_pred_table=None,
    blend_ad_id=None,
    blend_alpha=0.7,

    # ====== 새로 추가된 선택 파라미터들 ======
    volume_cols=("user_count","total_clicks","total_conversions",
                 "daily_avg_conversions","total_ads"),
    size_ratio_cols=("MEGA_ratio","LARGE_ratio","MEDIUM_ratio","SMALL_ratio"),  # SMALL은 드롭 가정
    os_ratio_cols=("ads_os_type_1_pct","ads_os_type_2_pct","ads_os_type_3_pct","ads_os_type_7_pct"),  # 7 드롭
    use_clr=True,  # True면 ratio 전부 넣고 CLR 쓰는 구성으로 바꿔도 됨
    extra_col_weights=None  # share 외 추가 피처에 별도 가중 주고 싶으면 dict로 전달
):
    # 1) share_cat* 피처
    share_cols = [c for c in mda_pf.columns if c.startswith('share_cat')]
    if not share_cols:
        raise ValueError("mda_pf에 share_cat* 컬럼이 없습니다. 먼저 enrichment를 수행하세요.")

    # 2) 광고 전환 분포 기반 share 가중치
    col_w = make_ad_pair_weights_from_ad_df(ad_df, power=weight_power, min_frac=min_pair_frac)
    if top_weight_feats:
        top_keys = set(pd.Series(col_w).sort_values(ascending=False).head(top_weight_feats).index)
        col_w = {k: (v if k in top_keys else 0.0) for k,v in col_w.items()}

    # (선택) 추가 피처에 대한 가중치 병합
    if extra_col_weights:
        col_w.update(extra_col_weights)

    # 3) 피처 행렬 생성 (share + 추가 피처)
    X, all_feat_cols = build_feature_matrix_plus(
        mda_pf,
        share_cols=share_cols,
        volume_cols=list(volume_cols) if volume_cols else [],
        size_ratio_cols=list(size_ratio_cols) if size_ratio_cols else [],
        os_ratio_cols=list(os_ratio_cols) if os_ratio_cols else [],
        use_clr=use_clr,
        col_weights=col_w,
        zscore=True
    )

    # 4) 앵커(이 광고에서 상위 성과 매체)
    used = set(ad_df['mda_idx'].astype(int))
    anchors = (ad_df.sort_values(top_anchor_by, ascending=False)
                   .drop_duplicates('mda_idx')
                   .head(n_anchor)['mda_idx']
                   .astype(int).tolist())
    anchors = [m for m in anchors if m in X.index]
    if not anchors:
        raise ValueError("anchor가 없습니다. ad_df에 상위 매체가 있는지 확인하세요.")

    centroid = X.loc[anchors].mean(axis=0).values

    # 5) 후보: 미사용 매체 + 운영 필터
    cand = mda_pf[~mda_pf['mda_idx'].isin(used)].copy()
    if 'basic_classification' in cand.columns and exclude_classes:
        cand = cand[ ~cand['basic_classification'].isin(exclude_classes) ]
    if 'days_active' in cand.columns:
        cand = cand[ cand['days_active'] >= min_days_active ]
    if cand.empty:
        return pd.DataFrame(columns=['mda_idx','similarity']), anchors, all_feat_cols, col_w

    # 6) 유사도 계산 (가중 코사인)
    B = X.loc[cand['mda_idx']].values
    sims = cosine_vec(centroid, B)
    cand['similarity'] = sims

    # 7) (선택) 예측 전환수와 블렌딩
    if blend_pred_table is not None and blend_ad_id is not None:
        pt = blend_pred_table[blend_pred_table['ads_idx']==blend_ad_id][['mda_idx','pred_turn']].copy()
        cand = cand.merge(pt, on='mda_idx', how='left')
        cand['pred_turn'] = cand['pred_turn'].fillna(0.0)
        maxv = cand['pred_turn'].max()
        cand['pred_norm'] = cand['pred_turn'] / (maxv + 1e-9)
        cand['final_score'] = blend_alpha*cand['similarity'] + (1.0-blend_alpha)*cand['pred_norm']
        sort_key = 'final_score'
    else:
        sort_key = 'similarity'

    # 8) 출력 정리
    keep = [c for c in ['mda_idx','similarity','final_score','pred_turn','basic_classification',
                        'days_active','conversion_rate','expected_total_profit','total_ads'] if c in cand.columns]
    out = cand[keep].sort_values(sort_key, ascending=False).head(topN).reset_index(drop=True)
    return out, anchors, all_feat_cols, col_w


In [34]:
out, anchors, feats, w = recommend_with_weighted_similarity(
    ads_73878_pf, mda_pf_enriched,
    use_clr=True,
    size_ratio_cols=("MEGA_ratio","LARGE_ratio","MEDIUM_ratio","SMALL_ratio"),
    os_ratio_cols=("ads_os_type_1_pct","ads_os_type_2_pct","ads_os_type_3_pct","ads_os_type_7_pct")
)


In [35]:
# 0) 결과 요약
print(f"anchors (mda_idx): {anchors}")
print(f"피처 개수: {len(feats)}  (예: {feats[:5]} …)")

# 1) 추천 결과 미리보기
out.head(20)   # 노트북이면 display(out.head(20)) 도 OK

cols_for_label = [c for c in ["mda_idx","mda_name","basic_classification"] if c in mda_pf_enriched.columns]
display(
    out.head(20)
       .merge(mda_pf_enriched[cols_for_label], on="mda_idx", how="left")
       .sort_values(out.columns[1], ascending=False)  # similarity 또는 final_score
)

# col_w는 recommend 함수가 반환한 네 번째 값
w_ser = pd.Series(w).sort_values(ascending=False)
display(w_ser.head(20))
print("nonzero weights:", (w_ser>0).sum())



anchors (mda_idx): [14, 654, 56]
피처 개수: 97  (예: ['share_cat0_게임', 'share_cat0_금융', 'share_cat1_게임', 'share_cat1_생활', 'share_cat1_커머스'] …)


Unnamed: 0,mda_idx,similarity,basic_classification_x,days_active,conversion_rate,expected_total_profit,total_ads,basic_classification_y
0,356,0.484878,안정공급형(후보),31,0.162064,99460,10,안정공급형(후보)
1,397,0.38448,특화전문형,31,0.524409,550380,4,특화전문형
2,12,0.284132,특화전문형,31,0.356818,3003668,24,특화전문형
3,87,0.258709,안정공급형(후보),29,0.329034,640407,6,안정공급형(후보)
4,343,0.255459,안정공급형(후보),31,0.497797,3931080,17,안정공급형(후보)
5,711,0.25143,안정공급형(후보),28,0.358333,16540,54,안정공급형(후보)
6,32,0.225478,관리 필요,30,0.103558,191080,7,관리 필요
7,540,0.204923,안정공급형(후보),31,0.348201,403320,46,안정공급형(후보)
8,385,0.203838,특화전문형,31,0.206443,117170,5,특화전문형
9,344,0.200067,안정공급형(후보),31,0.572359,1737623,16,안정공급형(후보)


share_cat8_미디어_컨텐츠    1.0
dtype: float64

nonzero weights: 1


In [47]:
import numpy as np
import pandas as pd
import re

# --- 유틸 ---
def _slug(s): 
    return re.sub(r'[^0-9A-Za-z가-힣]+', '_', str(s)).strip('_')

def _cosine(a, B):
    a = a.reshape(1, -1)
    num = (B * a).sum(axis=1)
    den = (np.sqrt((B**2).sum(axis=1)) * np.sqrt((a**2).sum()))
    den = np.where(den == 0, 1e-12, den)
    return (num / den).ravel()

# 기존 recommend 코드가 쓰는 이름과 맞추기 위한 래퍼
def cosine_vec(a, B):
    return _cosine(np.asarray(a, dtype=float), np.asarray(B, dtype=float))

# --- 광고 데이터로 (카테고리×도메인) 가중치 만들기 + prior 스무딩 ---
def make_ad_pair_weights_from_ad_df(
    ad_df, cat_col='ads_category', dom_col='domain',
    conv_col='total_conversions', power=1.0, min_frac=0.0,
    prior_mix=0.0,            # 0이면 스무딩 없음, 0.1~0.3 권장
    prior_bg=None             # {"share_cat{c}_{slug}": prob} 형태(없으면 균등)
):
    """
    ad_df에서 (카테고리×도메인)별 전환 비중 → share_cat{c}_{domain} 가중치 dict.
    - power: 비중에 지수 가중(1.0=그대로, 0.5=루트, 2.0=제곱)
    - min_frac: 너무 작은 비중 컷
    - prior_mix: 라플라스/디리클레 느낌의 배경분포 혼합(0~1)
    - prior_bg: 배경분포 dict(없으면 관측 키들만 균등)
    """
    t = ad_df.copy()
    t[conv_col] = pd.to_numeric(t[conv_col], errors='coerce').fillna(0.0)
    g = (t.groupby([cat_col, dom_col])[conv_col].sum()
           .rename('conv').reset_index())
    tot = g['conv'].sum()
    if tot <= 0:
        return {}
    g['frac'] = g['conv'] / tot
    if min_frac > 0:
        g = g[g['frac'] >= min_frac].copy()
    g['w'] = (g['frac'] ** power)
    s = g['w'].sum()
    if s > 0:
        g['w'] = g['w'] / s
    g['key'] = [f"share_cat{int(c)}_{_slug(d)}" for c,d in g[[cat_col, dom_col]].itertuples(index=False)]
    w = dict(zip(g['key'], g['w']))

    # --- prior 스무딩 ---
    if prior_mix and prior_mix > 0:
        if (prior_bg is None) or (len(prior_bg) == 0):
            # 관측된 키들만 균등 prior
            prior_bg = {k: 1.0/len(w) for k in w.keys()}
        keys = set(w) | set(prior_bg)
        out = {}
        for k in keys:
            pw = w.get(k, 0.0)
            q  = prior_bg.get(k, 0.0)
            out[k] = (1.0 - prior_mix) * pw + prior_mix * q
        Z = sum(out.values()) or 1.0
        w = {k: v/Z for k,v in out.items()}
    return w

# --- 구성비 CLR 변환 유틸 ---
def _clr_block(df_block, eps=1e-6):
    Z = df_block.clip(lower=eps)
    g = np.exp(np.log(Z).mean(axis=1))
    return np.log(Z.div(g, axis=0))

# --- 피처 행렬 만들기 (추가 피처 + 스케일링 + 컬럼 가중) ---
def build_feature_matrix_plus(
    mda_pf,
    share_cols,                 # 반드시 포함: share_cat*
    volume_cols=None,           # 볼륨형( log1p + z )
    size_ratio_cols=None,       # 사이즈 분포 (CLR or drop-1 + z)
    os_ratio_cols=None,         # OS 분포 (CLR or drop-1 + z)
    use_clr=True,               # True면 ratio 블록들을 CLR
    col_weights=None,           # 보통 share_cols에만 가중 들어옴
    zscore=True
):
    volume_cols = list(volume_cols or [])
    size_ratio_cols = list(size_ratio_cols or [])
    os_ratio_cols   = list(os_ratio_cols or [])
    all_cols = list(share_cols) + volume_cols + size_ratio_cols + os_ratio_cols

    X = mda_pf.set_index('mda_idx')[all_cols].astype(float).copy()

    # 결측 처리
    X[volume_cols] = X[volume_cols].fillna(0.0)
    X[size_ratio_cols + os_ratio_cols + share_cols] = X[size_ratio_cols + os_ratio_cols + share_cols].fillna(0.0)

    # 변환: 볼륨형 안정화
    if volume_cols:
        X[volume_cols] = np.log1p(X[volume_cols])

    # 변환: 구성비 블록 CLR
    if use_clr:
        if size_ratio_cols:
            X[size_ratio_cols] = _clr_block(X[size_ratio_cols])
        if os_ratio_cols:
            X[os_ratio_cols] = _clr_block(X[os_ratio_cols])

    # (선) 가중치 적용: 열 스케일
    if col_weights:
        w = pd.Series({c: col_weights.get(c, 1.0) for c in all_cols}, index=all_cols, dtype=float)
        X = X.mul(w, axis=1)

    # 표준화
    if zscore:
        X = (X - X.mean()) / (X.std() + 1e-9)

    return X, all_cols

# --- 추천 메인 (안정화 옵션들 포함) ---
def recommend_with_weighted_similarity(
    ad_df,               # 특정 광고의 매체 성과 테이블(ads_XXXX_pf)
    mda_pf,              # (enriched) 전체 매체 프로필 (share_cat* 들어있는 테이블)
    top_anchor_by='total_conversions',
    n_anchor=3,          # 앵커 확대(기본 5 권장)
    topN=20,
    weight_power=0.5,    # 루트가중으로 집중 완화(기본 0.5 권장)
    min_pair_frac=0.0,   # 광고 전환 분포에서 너무 작은 비중 컷
    top_weight_feats=None,   # 상위 몇 개 가중치 feature만 사용할지 (None이면 전체)
    exclude_classes=('계약종료형','품질관리형'),
    min_days_active=7,
    blend_pred_table=None,
    blend_ad_id=None,
    blend_alpha=0.7,

    # NEW: 정렬 기준 선택
    sort_by="final",     # "final" | "pred" | "sim"

    # ====== 새로 추가된 선택 파라미터들 ======
    volume_cols=("user_count","total_clicks","total_conversions",
                 "daily_avg_conversions","total_ads"),
    size_ratio_cols=("MEGA_ratio","LARGE_ratio","MEDIUM_ratio","SMALL_ratio"),
    os_ratio_cols=("ads_os_type_1_pct","ads_os_type_2_pct","ads_os_type_3_pct","ads_os_type_7_pct"),
    use_clr=True,  # 전 축 사용하므로 CLR 권장
    extra_col_weights=None,  # share 외 추가 피처 가중 dict

    # NEW: prior 스무딩 옵션
    prior_mix=0.2,              # 0~1, 0.2 권장
    prior_from="mda_mean",      # "mda_mean" | "uniform" | "none"
    prior_bg_dict=None,         # 직접 전달 시 우선

    # NEW: share 컬럼 IDF 보정
    use_idf=False,              # 희소/상용 보정(기본 끔)
    idf_smooth=1.0,             # log((N+1)/(df+smooth))
    min_similarity=None         # 유사도 최소치 필터(예: 0.1)
):
    # 1) share_cat* 피처
    share_cols = [c for c in mda_pf.columns if c.startswith('share_cat')]
    if not share_cols:
        raise ValueError("mda_pf에 share_cat* 컬럼이 없습니다. 먼저 enrichment를 수행하세요.")

    # 1-1) prior 배경 분포 준비
    prior_bg = None
    if prior_bg_dict is not None:
        prior_bg = dict(prior_bg_dict)
    elif prior_from == "mda_mean":
        # mda_pf의 share 평균을 배경분포로 사용
        avg = mda_pf[share_cols].fillna(0.0).mean(axis=0)
        s = avg.sum()
        if s > 0:
            prior_bg = (avg / s).to_dict()
    elif prior_from == "uniform":
        prior_bg = {c: 1.0/len(share_cols) for c in share_cols}
    # "none"이면 prior_bg=None (make_* 내부에서 관측 키 균등)

    # 2) 광고 전환 분포 기반 share 가중치 (+ prior 스무딩 + power 적용)
    col_w = make_ad_pair_weights_from_ad_df(
        ad_df, power=weight_power, min_frac=min_pair_frac,
        prior_mix=prior_mix if prior_mix else 0.0,
        prior_bg=prior_bg
    )
    if top_weight_feats:
        top_keys = set(pd.Series(col_w).sort_values(ascending=False).head(top_weight_feats).index)
        col_w = {k: (v if k in top_keys else 0.0) for k,v in col_w.items()}

    # 2-1) share 컬럼 IDF 보정(옵션)
    if use_idf:
        df_share = (mda_pf[share_cols].fillna(0) != 0).sum(axis=0)
        N = len(mda_pf)
        idf = np.log( (N + 1.0) / (df_share + idf_smooth) )
        # 평균 1로 정규화(스케일 안정)
        idf = idf / (idf.mean() + 1e-12)
        for k in list(col_w.keys()):
            if k in idf.index:
                col_w[k] *= float(idf[k])

    # (선택) 추가 피처에 대한 가중치 병합
    if extra_col_weights:
        col_w.update(extra_col_weights)

    # 3) 피처 행렬 생성 (share + 추가 피처)
    X, all_feat_cols = build_feature_matrix_plus(
        mda_pf,
        share_cols=share_cols,
        volume_cols=list(volume_cols) if volume_cols else [],
        size_ratio_cols=list(size_ratio_cols) if size_ratio_cols else [],
        os_ratio_cols=list(os_ratio_cols) if os_ratio_cols else [],
        use_clr=use_clr,
        col_weights=col_w,
        zscore=True
    )

    # 4) 앵커(이 광고에서 상위 성과 매체)
    used = set(ad_df['mda_idx'].astype(int))
    anchors = (ad_df.sort_values(top_anchor_by, ascending=False)
                   .drop_duplicates('mda_idx')
                   .head(n_anchor)['mda_idx']
                   .astype(int).tolist())
    anchors = [m for m in anchors if m in X.index]
    if not anchors:
        raise ValueError("anchor가 없습니다. ad_df에 상위 매체가 있는지 확인하세요.")

    centroid = X.loc[anchors].mean(axis=0).values

    # 5) 후보: 미사용 매체 + 운영 필터
    cand = mda_pf[~mda_pf['mda_idx'].isin(used)].copy()
    if 'basic_classification' in cand.columns and exclude_classes:
        cand = cand[ ~cand['basic_classification'].isin(exclude_classes) ]
    if 'days_active' in cand.columns:
        cand = cand[ cand['days_active'] >= min_days_active ]
    if cand.empty:
        return pd.DataFrame(columns=['mda_idx','similarity']), anchors, all_feat_cols, col_w

    # 6) 유사도 계산 (가중 코사인)
    B = X.loc[cand['mda_idx']].values
    sims = cosine_vec(centroid, B)
    cand['similarity'] = sims

    # (옵션) 최소 유사도 필터
    if (min_similarity is not None):
        cand = cand[cand['similarity'] >= float(min_similarity)]
        if cand.empty:
            return pd.DataFrame(columns=['mda_idx','similarity']), anchors, all_feat_cols, col_w

    # 7) (선택) 예측 전환수와 블렌딩
    has_pred = (blend_pred_table is not None) and (blend_ad_id is not None)
    if has_pred:
        pt = blend_pred_table[blend_pred_table['ads_idx']==blend_ad_id][['mda_idx','pred_turn']].copy()
        cand = cand.merge(pt, on='mda_idx', how='left')
        cand['pred_turn'] = cand['pred_turn'].fillna(0.0)
        maxv = cand['pred_turn'].max()
        cand['pred_norm'] = cand['pred_turn'] / (maxv + 1e-9)
        cand['final_score'] = blend_alpha*cand['similarity'] + (1.0-blend_alpha)*cand['pred_norm']

    # 8) 정렬 기준 결정
    if sort_by == "pred" and has_pred:
        sort_key = "pred_turn"
    elif sort_by == "sim":
        sort_key = "similarity"
    else:
        sort_key = "final_score" if has_pred else "similarity"

    # 9) 출력 정리
    keep = [c for c in [
        'mda_idx','similarity','final_score','pred_turn','pred_norm',
        'basic_classification','days_active','conversion_rate',
        'expected_total_profit','total_ads'
    ] if c in cand.columns]
    out = cand[keep].sort_values(sort_key, ascending=False).head(topN).reset_index(drop=True)
    return out, anchors, all_feat_cols, col_w


In [48]:
out, anchors, feats, w = recommend_with_weighted_similarity(
    ad_df=ads_73878_pf,
    mda_pf=mda_pf_enriched,
    # 안정화 옵션들:
    weight_power=0.5,       # 루트가중(집중 완화)
    prior_mix=0.2,          # 광고 분포에 mda 평균을 20% 섞기
    prior_from="mda_mean",  # 배경분포: mda_pf의 share 평균
    use_idf=True,           # share 컬럼 IDF 보정
    n_anchor=3,             # 앵커 확대
    # 예측과 블렌딩/정렬:
    # blend_pred_table=pred_table,  # ['ads_idx','mda_idx','pred_turn']
    # blend_ad_id=73878,
    # blend_alpha=0.6,
    sort_by="final",        # "pred"로 바꾸면 예측전환수 우선
    # 필터(선택):
    min_similarity=0.1
)

display(out.head(20))
print(anchors)


Unnamed: 0,mda_idx,similarity,basic_classification,days_active,conversion_rate,expected_total_profit,total_ads
0,356,0.483327,안정공급형(후보),31,0.162064,99460,10
1,397,0.382861,특화전문형,31,0.524409,550380,4
2,540,0.305158,안정공급형(후보),31,0.348201,403320,46
3,12,0.282366,특화전문형,31,0.356818,3003668,24
4,87,0.257572,안정공급형(후보),29,0.329034,640407,6
5,343,0.253733,안정공급형(후보),31,0.497797,3931080,17
6,711,0.249602,안정공급형(후보),28,0.358333,16540,54
7,32,0.224923,관리 필요,30,0.103558,191080,7
8,385,0.202963,특화전문형,31,0.206443,117170,5
9,344,0.198404,안정공급형(후보),31,0.572359,1737623,16


[14, 654, 56]


In [54]:
mda_pf_enriched[mda_pf_enriched['mda_idx'].isin([14,654,56,356,384,397])]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,mda_idx,user_count,total_clicks,total_conversions,first_click,last_click,days_active,daily_avg_conversions,LARGE,MEDIUM,MEGA,SMALL,total_ads,MEGA_ratio,LARGE_ratio,MEDIUM_ratio,SMALL_ratio,ads_category_0_pct,ads_category_1_pct,ads_category_2_pct,ads_category_3_pct,ads_category_4_pct,ads_category_5_pct,ads_category_6_pct,ads_category_7_pct,ads_category_8_pct,ads_category_10_pct,ads_category_11_pct,ads_category_13_pct,ads_os_type_1_pct,ads_os_type_2_pct,ads_os_type_3_pct,ads_os_type_7_pct,domain_게임_pct,domain_교육_pct,domain_금융_pct,domain_기타_pct,domain_미디어/컨텐츠_pct,domain_뷰티_pct,domain_비영리/공공_pct,domain_생활_pct,domain_식당/카페_pct,domain_식음료_pct,domain_운동/스포츠_pct,domain_운세_pct,domain_의료/건강_pct,domain_채용_pct,domain_커머스_pct,expected_total_profit,conversion_rate,last_click_dt,classification,basic_classification,conv_cat0_게임,conv_cat0_금융,conv_cat1_게임,conv_cat1_생활,conv_cat1_커머스,conv_cat2_게임,conv_cat2_비영리_공공,conv_cat3_비영리_공공,conv_cat3_식음료,conv_cat8_미디어_컨텐츠,conv_cat8_의료_건강,conv_cat8_커머스,conv_cat1_금융,conv_cat1_기타,conv_cat1_미디어_컨텐츠,conv_cat1_뷰티,conv_cat1_의료_건강,conv_cat1_채용,conv_cat2_기타,conv_cat2_커머스,conv_cat3_미디어_컨텐츠,conv_cat3_뷰티,conv_cat3_생활,conv_cat5_게임,conv_cat8_게임,conv_cat8_금융,conv_cat8_생활,conv_cat10_미디어_컨텐츠,conv_cat0_미디어_컨텐츠,conv_cat0_생활,conv_cat0_의료_건강,conv_cat0_커머스,conv_cat2_식당_카페,conv_cat2_채용,conv_cat5_미디어_컨텐츠,conv_cat7_금융,conv_cat8_뷰티,conv_cat8_식음료,conv_cat8_운동_스포츠,conv_cat10_금융,conv_cat10_생활,conv_cat10_의료_건강,conv_cat3_채용,conv_cat8_비영리_공공,conv_cat4_식당_카페,conv_cat4_식음료,conv_cat4_운동_스포츠,conv_cat4_커머스,conv_cat8_교육,conv_cat10_커머스,conv_cat11_뷰티,conv_cat11_식음료,conv_cat11_운동_스포츠,conv_cat11_의료_건강,conv_cat11_커머스,conv_cat13_금융,conv_cat13_운세,conv_cat8_운세,conv_cat8_기타,conv_cat8_채용,conv_cat13_커머스,conv_cat3_금융,conv_cat4_게임,conv_cat4_교육,conv_cat4_금융,conv_cat4_기타,conv_cat4_미디어_컨텐츠,conv_cat4_뷰티,conv_cat4_생활,conv_cat4_운세,conv_cat4_의료_건강,conv_cat5_식음료,conv_cat5_운동_스포츠,conv_cat6_게임,conv_cat6_미디어_컨텐츠,conv_cat6_식당_카페,conv_cat10_뷰티,conv_cat10_운세,conv_cat13_생활,conv_cat4_비영리_공공,conv_cat5_금융,conv_cat4_채용,conv_cat1_교육,conv_cat11_미디어_컨텐츠,share_cat0_게임,share_cat0_금융,share_cat1_게임,share_cat1_생활,share_cat1_커머스,share_cat2_게임,share_cat2_비영리_공공,share_cat3_비영리_공공,share_cat3_식음료,share_cat8_미디어_컨텐츠,share_cat8_의료_건강,share_cat8_커머스,share_cat1_금융,share_cat1_기타,share_cat1_미디어_컨텐츠,share_cat1_뷰티,share_cat1_의료_건강,share_cat1_채용,share_cat2_기타,share_cat2_커머스,share_cat3_미디어_컨텐츠,share_cat3_뷰티,share_cat3_생활,share_cat5_게임,share_cat8_게임,share_cat8_금융,share_cat8_생활,share_cat10_미디어_컨텐츠,share_cat0_미디어_컨텐츠,share_cat0_생활,share_cat0_의료_건강,share_cat0_커머스,share_cat2_식당_카페,share_cat2_채용,share_cat5_미디어_컨텐츠,share_cat7_금융,share_cat8_뷰티,share_cat8_식음료,share_cat8_운동_스포츠,share_cat10_금융,share_cat10_생활,share_cat10_의료_건강,share_cat3_채용,share_cat8_비영리_공공,share_cat4_식당_카페,share_cat4_식음료,share_cat4_운동_스포츠,share_cat4_커머스,share_cat8_교육,share_cat10_커머스,share_cat11_뷰티,share_cat11_식음료,share_cat11_운동_스포츠,share_cat11_의료_건강,share_cat11_커머스,share_cat13_금융,share_cat13_운세,share_cat8_운세,share_cat8_기타,share_cat8_채용,share_cat13_커머스,share_cat3_금융,share_cat4_게임,share_cat4_교육,share_cat4_금융,share_cat4_기타,share_cat4_미디어_컨텐츠,share_cat4_뷰티,share_cat4_생활,share_cat4_운세,share_cat4_의료_건강,share_cat5_식음료,share_cat5_운동_스포츠,share_cat6_게임,share_cat6_미디어_컨텐츠,share_cat6_식당_카페,share_cat10_뷰티,share_cat10_운세,share_cat13_생활,share_cat4_비영리_공공,share_cat5_금융,share_cat4_채용,share_cat1_교육,share_cat11_미디어_컨텐츠
1,1,1,14,31834,84763,22983,2025-07-26 00:01:22,2025-08-25 11:22:40,31,741.387097,31,5,22,1,59,37.3,52.5,8.5,1.7,0.0,20.338983,18.644068,8.474576,0.0,25.423729,0.0,3.389831,22.033898,1.694915,0.0,0.0,42.372881,38.983051,0.0,18.644068,44.067797,0.0,11.864407,3.389831,8.474576,5.084746,0.0,13.559322,0.0,3.389831,0.0,0.0,3.389831,1.694915,5.084746,2733547,0.271144,2025-08-25 11:22:40,안정공급형,안정공급형,0.0,0.0,2534.0,1722.0,0.0,9523.0,0.0,0.0,1783.0,419.0,736.0,245.0,600.0,38.0,408.0,695.0,818.0,592.0,1.0,332.0,457.0,1576.0,6.0,268.0,3.0,197.0,22.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.110255,0.074925,0.0,0.41435,0.0,0.0,0.077579,0.018231,0.032024,0.01066,0.026106,0.001653,0.017752,0.03024,0.035592,0.025758,4.4e-05,0.014445,0.019884,0.068572,0.000261,0.011661,0.000131,0.008572,0.000957,0.000348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,8,56,17274,40127,9484,2025-07-26 00:04:02,2025-08-25 11:18:47,31,305.935484,13,0,5,0,18,27.8,72.2,0.0,0.0,0.0,11.111111,11.111111,0.0,0.0,5.555556,0.0,5.555556,22.222222,44.444444,0.0,0.0,33.333333,0.0,0.0,66.666667,22.222222,0.0,38.888889,0.0,5.555556,0.0,0.0,11.111111,0.0,0.0,0.0,11.111111,5.555556,0.0,5.555556,580630,0.23635,2025-08-25 11:18:47,안정공급형(후보),안정공급형(후보),0.0,0.0,834.0,84.0,0.0,8231.0,0.0,0.0,0.0,20.0,0.0,170.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,0.0,89.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087938,0.008857,0.0,0.867883,0.0,0.0,0.0,0.002109,0.0,0.017925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002847,0.0,0.009384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001476,0.0,0.0,0.0,0.000527,0.000105,0.000105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27,27,27,356,780,1919,311,2025-07-26 01:53:27,2025-08-25 08:53:06,31,10.032258,7,0,3,0,10,30.0,70.0,0.0,0.0,0.0,10.0,20.0,0.0,0.0,10.0,0.0,10.0,40.0,10.0,0.0,0.0,70.0,0.0,0.0,30.0,30.0,0.0,10.0,0.0,10.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,10.0,0.0,10.0,99460,0.162064,2025-08-25 08:53:06,안정공급형(후보),안정공급형(후보),0.0,0.0,0.0,4.0,0.0,172.0,0.0,0.0,0.0,0.0,25.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012862,0.0,0.553055,0.0,0.0,0.0,0.0,0.080386,0.270096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03537,0.0,0.0,0.006431,0.006431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,31,31,384,7335,20186,4358,2025-07-26 00:30:17,2025-08-25 11:21:21,31,140.580645,2,0,2,0,4,50.0,50.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,75.0,0.0,25.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,25.0,568970,0.215892,2025-08-25 11:21:21,특화전문형_금융특화,특화전문형,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,360.0,393.0,3605.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.082607,0.090179,0.827214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,35,35,397,12815,24294,12740,2025-07-26 00:16:22,2025-08-25 10:33:22,31,410.967742,2,0,2,0,4,50.0,50.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,25.0,0.0,0.0,25.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,75.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,550380,0.524409,2025-08-25 10:33:22,특화전문형_게임특화,특화전문형,0.0,0.0,0.0,0.0,0.0,12501.0,0.0,0.0,0.0,0.0,0.0,178.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98124,0.0,0.0,0.0,0.0,0.0,0.013972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68,68,68,654,1,24370,1106,2025-07-26 00:03:38,2025-08-25 11:02:42,31,35.677419,1,0,2,0,3,66.7,33.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66.666667,33.333333,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,33.333333,0.0,33.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.333333,0.0,0.0,1100000,0.045384,2025-08-25 11:02:42,안정공급형(후보),안정공급형(후보),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,1064.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036166,0.962025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# 유사도

import numpy as np
import pandas as pd

# --- 유틸: 코사인 유사도 (sklearn 없이) ---
def _cosine(a, B):
    a = a.reshape(1, -1)
    num = (B * a).sum(axis=1)
    den = (np.sqrt((B**2).sum(axis=1)) * np.sqrt((a**2).sum()))
    den = np.where(den == 0, 1e-12, den)
    return (num / den).ravel()

# --- mda_pf에서 유사도 계산에 쓸 피처 뽑기 ---
def pick_feature_cols(mda_pf):
    cols = []
    cols += [c for c in mda_pf.columns if c.startswith('domain_') and c.endswith('_pct')]
    cols += [c for c in mda_pf.columns if c.startswith('ads_os_type_') and c.endswith('_pct')]
    cols += [c for c in ['MEGA_ratio','LARGE_ratio','MEDIUM_ratio','SMALL_ratio'] if c in mda_pf.columns]
    # 스케일 편향 줄이려면 전역 성과( expected_total_profit 등)는 제외하고,
    # 필요한 경우 'conversion_rate' 정도만 보조로 넣기
    if 'conversion_rate' in mda_pf.columns:
        cols.append('conversion_rate')
    return cols

# --- 표준화(+그룹 가중치) ---
def build_profile_matrix(mda_pf, feature_cols, group_weights=None):
    X = mda_pf.set_index('mda_idx')[feature_cols].astype(float).fillna(0.0)

    # 그룹 가중치(선택): 도메인/OS/포맷 비중을 더 세게 보려면 가중치 부여
    if group_weights:
        w = np.ones(len(feature_cols), dtype=float)
        for pat, gw in group_weights.items():
            for i,c in enumerate(feature_cols):
                if pat(c):
                    w[i] = gw
        X = X * w

    # 표준화(z-score) → 스케일 다른 열들 균형
    X = (X - X.mean()) / (X.std() + 1e-9)
    return X

# --- 추천 함수 ---
def recommend_similar_media(
    ad_df,                # 특정 광고의 매체 수행 테이블 (ads_XXXX_pf)
    mda_pf,               # 전체 매체 프로필 테이블
    top_anchor_by='total_conversions',  # anchor 선정 기준
    n_anchor=3,           # 상위 anchor 개수 (2~3 추천)
    topN=15,              # 추천 결과 개수
    exclude_classes=('계약종료형','품질관리형'),  # 제외할 매체 유형
    min_days_active=7,    # 최소 활동일
    group_weights=None    # {'도메인':2.0, '포맷':1.5} 같은 식으로도 쓸 수 있음
):
    feature_cols = pick_feature_cols(mda_pf)
    X = build_profile_matrix(mda_pf, feature_cols, group_weights=group_weights)

    # 이 광고에서 이미 쓰는 매체는 제외
    used = set(ad_df['mda_idx'].astype(int))

    # anchor (이 광고에서 상위 성과 매체)
    anchors = (
        ad_df.sort_values(top_anchor_by, ascending=False)
             .drop_duplicates('mda_idx')
             .head(n_anchor)['mda_idx'].astype(int).tolist()
    )
    anchors = [m for m in anchors if m in X.index]
    if not anchors:
        raise ValueError('anchor 매체가 프로필 매트릭스에 없습니다.')

    centroid = X.loc[anchors].mean(axis=0).values  # anchor 평균 프로필

    # 후보 풀: 안 쓰는 매체 + 기본 필터(유형, 활동일)
    cand = mda_pf[~mda_pf['mda_idx'].isin(used)].copy()
    if 'basic_classification' in cand.columns and exclude_classes:
        cand = cand[~cand['basic_classification'].isin(exclude_classes)]
    if 'days_active' in cand.columns:
        cand = cand[cand['days_active'] >= min_days_active]
    if cand.empty:
        return pd.DataFrame(columns=['mda_idx','similarity'])  # 후보 없음

    # 유사도 계산 (코사인, anchor centroid 대비)
    Xc = X.loc[cand['mda_idx'].values]
    sims = _cosine(centroid, Xc.values)

    out = cand.copy()
    out['similarity'] = sims

    # 리포트용 보조 컬럼 몇 개
    keep_cols = ['mda_idx','similarity','basic_classification','days_active',
                 'conversion_rate','expected_total_profit','total_ads']
    keep_cols = [c for c in keep_cols if c in out.columns]
    out = out[keep_cols].sort_values('similarity', ascending=False).head(topN).reset_index(drop=True)
    return out, anchors, feature_cols


In [52]:
# 특정 광고 테이블(예: ads_73878_pf) 과 전체 매체 프로필(mda_pf) 준비돼 있다고 가정
recs, anchors, used_feats = recommend_similar_media(
    ad_df=ads_73878_pf, 
    mda_pf=mda_pf,
    top_anchor_by='total_conversions',
    n_anchor=3,
    topN=20,
    exclude_classes=('계약종료형','품질관리형'),
    group_weights={
        # 도메인 가중 2.0, 포맷비중 1.5 (OS는 1.0)
        (lambda c: c.startswith('domain_') and c.endswith('_pct')): 2.0,
        (lambda c: c.endswith('_ratio')): 1.5,
    }
)
print("anchor 매체:", anchors)
recs

anchor 매체: [14, 654, 56]


Unnamed: 0,mda_idx,similarity,basic_classification,days_active,conversion_rate,expected_total_profit,total_ads
0,384,0.671987,특화전문형,31,0.215892,568970,4
1,356,0.633956,안정공급형(후보),31,0.162064,99460,10
2,32,0.619627,관리 필요,30,0.103558,191080,7
3,686,0.619341,안정공급형(후보),26,0.37931,1510,23
4,540,0.614091,안정공급형(후보),31,0.348201,403320,46
5,401,0.596112,안정공급형(후보),31,0.086053,5921,56
6,785,0.463261,안정공급형(후보),29,0.28125,3967,27
7,583,0.456529,안정공급형(후보),31,0.044158,122510,41
8,772,0.443814,안정공급형(후보),30,0.46063,13330,56
9,294,0.434449,안정공급형(후보),31,0.202488,127250,49


In [55]:
def build_feature_matrix_plus(
    mda_pf,
    share_cols,                 # 반드시 포함: share_cat*
    volume_cols=None,           # 볼륨형( log1p + z )
    size_ratio_cols=None,       # 사이즈 분포 (CLR or drop-1 + z)
    os_ratio_cols=None,         # OS 분포 (CLR or drop-1 + z)
    category_ratio_cols=None,   # ★ 광고 카테고리 비중 ads_category_*_pct
    domain_ratio_cols=None,     # ★ 도메인 비중 domain_*_pct
    use_clr=True,               # True면 ratio 블록들을 CLR
    col_weights=None,           # 보통 share_cols에만 가중 들어옴
    zscore=True
):
    volume_cols        = list(volume_cols or [])
    size_ratio_cols    = list(size_ratio_cols or [])
    os_ratio_cols      = list(os_ratio_cols or [])
    category_ratio_cols= list(category_ratio_cols or [])
    domain_ratio_cols  = list(domain_ratio_cols or [])

    all_cols = (list(share_cols) + volume_cols + size_ratio_cols +
                os_ratio_cols + category_ratio_cols + domain_ratio_cols)

    X = mda_pf.set_index('mda_idx')[all_cols].astype(float).copy()

    # 결측 처리
    X[volume_cols] = X[volume_cols].fillna(0.0)
    X[size_ratio_cols + os_ratio_cols + category_ratio_cols + domain_ratio_cols + share_cols] = \
        X[size_ratio_cols + os_ratio_cols + category_ratio_cols + domain_ratio_cols + share_cols].fillna(0.0)

    # 변환: 볼륨형 안정화
    if volume_cols:
        X[volume_cols] = np.log1p(X[volume_cols])

    # 변환: 구성비 블록 CLR
    if use_clr:
        if size_ratio_cols:
            X[size_ratio_cols] = _clr_block(X[size_ratio_cols])
        if os_ratio_cols:
            X[os_ratio_cols] = _clr_block(X[os_ratio_cols])
        if category_ratio_cols:
            X[category_ratio_cols] = _clr_block(X[category_ratio_cols])
        if domain_ratio_cols:
            X[domain_ratio_cols] = _clr_block(X[domain_ratio_cols])

    # (선) 가중치 적용: 열 스케일
    if col_weights:
        w = pd.Series({c: col_weights.get(c, 1.0) for c in all_cols}, index=all_cols, dtype=float)
        X = X.mul(w, axis=1)

    # 표준화
    if zscore:
        X = (X - X.mean()) / (X.std() + 1e-9)

    return X, all_cols


In [56]:
def recommend_with_weighted_similarity(
    ad_df,
    mda_pf,
    top_anchor_by='total_conversions',
    n_anchor=5,
    topN=20,
    weight_power=0.5,
    min_pair_frac=0.0,
    top_weight_feats=None,
    exclude_classes=('계약종료형','품질관리형'),
    min_days_active=7,
    blend_pred_table=None,
    blend_ad_id=None,
    blend_alpha=0.7,
    sort_by="final",

    # ====== 기존/추가 피처 ======
    volume_cols=("user_count","total_clicks","total_conversions",
                 "daily_avg_conversions","total_ads"),
    size_ratio_cols=("MEGA_ratio","LARGE_ratio","MEDIUM_ratio","SMALL_ratio"),
    os_ratio_cols=("ads_os_type_1_pct","ads_os_type_2_pct","ads_os_type_3_pct","ads_os_type_7_pct"),

    # ★ 여기 기본값에 너가 준 컬럼 전부 넣어둠 (없으면 자동으로 무시됨)
    category_ratio_cols=("ads_category_0_pct","ads_category_1_pct","ads_category_2_pct","ads_category_3_pct",
                         "ads_category_4_pct","ads_category_5_pct","ads_category_6_pct","ads_category_7_pct",
                         "ads_category_8_pct","ads_category_10_pct","ads_category_11_pct","ads_category_13_pct"),
    domain_ratio_cols=("domain_게임_pct","domain_교육_pct","domain_금융_pct","domain_기타_pct","domain_미디어/컨텐츠_pct",
                       "domain_뷰티_pct","domain_비영리/공공_pct","domain_생활_pct","domain_식당/카페_pct","domain_식음료_pct",
                       "domain_운동/스포츠_pct","domain_운세_pct","domain_의료/건강_pct","domain_채용_pct","domain_커머스_pct"),

    use_clr=True,  # 전 축 사용하므로 CLR 권장
    extra_col_weights=None,

    # ====== 안정화 옵션 ======
    prior_mix=0.2,
    prior_from="mda_mean",
    prior_bg_dict=None,
    use_idf=False,
    idf_smooth=1.0,
    min_similarity=None
):
    # 1) share_cat* 피처
    share_cols = [c for c in mda_pf.columns if c.startswith('share_cat')]
    if not share_cols:
        raise ValueError("mda_pf에 share_cat* 컬럼이 없습니다. 먼저 enrichment를 수행하세요.")

    # (존재하는 컬럼만 남기기: NotFound 방지)
    def _keep_exist(cols): return [c for c in cols if c in mda_pf.columns]

    volume_cols         = _keep_exist(volume_cols)
    size_ratio_cols     = _keep_exist(size_ratio_cols)
    os_ratio_cols       = _keep_exist(os_ratio_cols)
    category_ratio_cols = _keep_exist(category_ratio_cols)
    domain_ratio_cols   = _keep_exist(domain_ratio_cols)

    # 1-1) prior 배경 분포 준비
    prior_bg = None
    if prior_bg_dict is not None:
        prior_bg = dict(prior_bg_dict)
    elif prior_from == "mda_mean":
        avg = mda_pf[share_cols].fillna(0.0).mean(axis=0)
        s = avg.sum()
        if s > 0:
            prior_bg = (avg / s).to_dict()
    elif prior_from == "uniform":
        prior_bg = {c: 1.0/len(share_cols) for c in share_cols}

    # 2) 광고 전환 분포 기반 share 가중치 (+ prior/power)
    col_w = make_ad_pair_weights_from_ad_df(
        ad_df, power=weight_power, min_frac=min_pair_frac,
        prior_mix=prior_mix if prior_mix else 0.0,
        prior_bg=prior_bg
    )
    if top_weight_feats:
        top_keys = set(pd.Series(col_w).sort_values(ascending=False).head(top_weight_feats).index)
        col_w = {k: (v if k in top_keys else 0.0) for k,v in col_w.items()}

    # 2-1) IDF 보정(옵션)
    if use_idf:
        df_share = (mda_pf[share_cols].fillna(0) != 0).sum(axis=0)
        N = len(mda_pf)
        idf = np.log((N + 1.0) / (df_share + idf_smooth))
        idf = idf / (idf.mean() + 1e-12)
        for k in list(col_w.keys()):
            if k in idf.index:
                col_w[k] *= float(idf[k])

    if extra_col_weights:
        col_w.update(extra_col_weights)

    # 3) 피처 행렬 생성 (share + 추가 피처)
    X, all_feat_cols = build_feature_matrix_plus(
        mda_pf,
        share_cols=share_cols,
        volume_cols=volume_cols,
        size_ratio_cols=size_ratio_cols,
        os_ratio_cols=os_ratio_cols,
        category_ratio_cols=category_ratio_cols,   # ★ 전달
        domain_ratio_cols=domain_ratio_cols,       # ★ 전달
        use_clr=use_clr,
        col_weights=col_w,
        zscore=True
    )

    # 4) 앵커/센트로이드
    used = set(ad_df['mda_idx'].astype(int))
    anchors = (ad_df.sort_values(top_anchor_by, ascending=False)
                   .drop_duplicates('mda_idx')
                   .head(n_anchor)['mda_idx']
                   .astype(int).tolist())
    anchors = [m for m in anchors if m in X.index]
    if not anchors:
        raise ValueError("anchor가 없습니다. ad_df에 상위 매체가 있는지 확인하세요.")

    centroid = X.loc[anchors].mean(axis=0).values

    # 5) 후보 필터
    cand = mda_pf[~mda_pf['mda_idx'].isin(used)].copy()
    if 'basic_classification' in cand.columns and exclude_classes:
        cand = cand[~cand['basic_classification'].isin(exclude_classes)]
    if 'days_active' in cand.columns:
        cand = cand[cand['days_active'] >= min_days_active]
    if cand.empty:
        return pd.DataFrame(columns=['mda_idx','similarity']), anchors, all_feat_cols, col_w

    # 6) 유사도
    B = X.loc[cand['mda_idx']].values
    cand['similarity'] = cosine_vec(centroid, B)

    if (min_similarity is not None):
        cand = cand[cand['similarity'] >= float(min_similarity)]
        if cand.empty:
            return pd.DataFrame(columns=['mda_idx','similarity']), anchors, all_feat_cols, col_w

    # 7) 예측 블렌딩(옵션)
    has_pred = (blend_pred_table is not None) and (blend_ad_id is not None)
    if has_pred:
        pt = blend_pred_table[blend_pred_table['ads_idx']==blend_ad_id][['mda_idx','pred_turn']].copy()
        cand = cand.merge(pt, on='mda_idx', how='left')
        cand['pred_turn'] = cand['pred_turn'].fillna(0.0)
        maxv = cand['pred_turn'].max()
        cand['pred_norm'] = cand['pred_turn'] / (maxv + 1e-9)
        cand['final_score'] = blend_alpha*cand['similarity'] + (1.0-blend_alpha)*cand['pred_norm']

    # 8) 정렬
    if sort_by == "pred" and has_pred:
        sort_key = "pred_turn"
    elif sort_by == "sim":
        sort_key = "similarity"
    else:
        sort_key = "final_score" if has_pred else "similarity"

    keep = [c for c in [
        'mda_idx','similarity','final_score','pred_turn','pred_norm',
        'basic_classification','days_active','conversion_rate',
        'expected_total_profit','total_ads'
    ] if c in cand.columns]
    out = cand[keep].sort_values(sort_key, ascending=False).head(topN).reset_index(drop=True)
    return out, anchors, all_feat_cols, col_w


In [61]:
out, anchors, feats, w = recommend_with_weighted_similarity(
    ad_df=ads_73878_pf,
    mda_pf=mda_pf_enriched,
    use_clr=True,          # 비율 전부 CLR
    weight_power=0.5,      # 루트 가중
    prior_mix=0.2,         # 배경 분포 섞기
    prior_from="mda_mean", # mda 평균 분포
    n_anchor=5
)
display(out.head(20))


Unnamed: 0,mda_idx,similarity,basic_classification,days_active,conversion_rate,expected_total_profit,total_ads
0,356,0.553925,안정공급형(후보),31,0.162064,99460,10
1,540,0.372773,안정공급형(후보),31,0.348201,403320,46
2,397,0.357038,특화전문형,31,0.524409,550380,4
3,87,0.354406,안정공급형(후보),29,0.329034,640407,6
4,12,0.340885,특화전문형,31,0.356818,3003668,24
5,343,0.300522,안정공급형(후보),31,0.497797,3931080,17
6,342,0.284074,특화전문형,31,0.538223,6449810,17
7,398,0.282827,안정공급형(후보),31,0.170626,185580,27
8,344,0.265168,안정공급형(후보),31,0.572359,1737623,16
9,32,0.245073,관리 필요,30,0.103558,191080,7


In [63]:
out, anchors, feats, w = recommend_with_weighted_similarity(
    ad_df=ads_9935_pf,
    mda_pf=mda_pf_enriched,
    use_clr=True,          # 비율 전부 CLR
    weight_power=0.5,      # 루트 가중
    prior_mix=0.2,         # 배경 분포 섞기
    prior_from="mda_mean", # mda 평균 분포
    n_anchor=5
)
display(out.head(20))

Unnamed: 0,mda_idx,similarity,basic_classification,days_active,conversion_rate,expected_total_profit,total_ads
0,817,0.329564,안정공급형(후보),31,0.631402,8946,209
1,563,0.312263,대량처리형,31,0.57488,4216079,6563
2,371,0.306462,안정공급형,31,0.432518,172278,973
3,58,0.305951,안정공급형(후보),31,0.322761,3884970,26
4,564,0.29395,안정공급형(후보),31,0.350742,62829,73
5,337,0.281521,안정공급형(후보),31,0.152083,164850,382
6,22,0.27231,대량처리형,31,0.422674,2541770,166
7,398,0.253981,안정공급형(후보),31,0.170626,185580,27
8,496,0.248369,안정공급형,31,0.206431,1639389,54
9,87,0.239956,안정공급형(후보),29,0.329034,640407,6


# 매체사 유사도

In [64]:
# === 통합 셀: 유사도 추천기 (CLR + prior + power + IDF + 추가 비율 피처) ===
import numpy as np
import pandas as pd
import re

# --- 유틸 ---
def _slug(s): 
    return re.sub(r'[^0-9A-Za-z가-힣]+', '_', str(s)).strip('_')

def _cosine(a, B):
    a = a.reshape(1, -1)
    num = (B * a).sum(axis=1)
    den = (np.sqrt((B**2).sum(axis=1)) * np.sqrt((a**2).sum()))
    den = np.where(den == 0, 1e-12, den)
    return (num / den).ravel()

def cosine_vec(a, B):
    return _cosine(np.asarray(a, dtype=float), np.asarray(B, dtype=float))

# --- 광고 데이터로 (카테고리×도메인) 가중치 + prior 스무딩 ---
def make_ad_pair_weights_from_ad_df(
    ad_df, cat_col='ads_category', dom_col='domain',
    conv_col='total_conversions', power=1.0, min_frac=0.0,
    prior_mix=0.0, prior_bg=None
):
    """
    (카테×도메인) 전환 분포 -> share_cat{c}_{slug} 가중치 dict
    power<1: 퍼짐, >1: 집중 / prior_mix: 배경분포 섞기
    """
    t = ad_df.copy()
    t[conv_col] = pd.to_numeric(t[conv_col], errors='coerce').fillna(0.0)
    g = (t.groupby([cat_col, dom_col])[conv_col].sum()
           .rename('conv').reset_index())
    tot = g['conv'].sum()
    if tot <= 0:
        return {}
    g['frac'] = g['conv'] / tot
    if min_frac > 0:
        g = g[g['frac'] >= min_frac].copy()
    g['w'] = (g['frac'] ** power)
    s = g['w'].sum()
    if s > 0:
        g['w'] = g['w'] / s
    g['key'] = [f"share_cat{int(c)}_{_slug(d)}" for c,d in g[[cat_col, dom_col]].itertuples(index=False)]
    w = dict(zip(g['key'], g['w']))

    # prior 스무딩
    if prior_mix and prior_mix > 0:
        if (prior_bg is None) or (len(prior_bg) == 0):
            prior_bg = {k: 1.0/len(w) for k in w.keys()}
        keys = set(w) | set(prior_bg)
        out = {}
        for k in keys:
            pw = w.get(k, 0.0)
            q  = prior_bg.get(k, 0.0)
            out[k] = (1.0 - prior_mix) * pw + prior_mix * q
        Z = sum(out.values()) or 1.0
        w = {k: v/Z for k,v in out.items()}
    return w

# --- 구성비 CLR 변환 ---
def _clr_block(df_block, eps=1e-6):
    Z = df_block.clip(lower=eps)
    g = np.exp(np.log(Z).mean(axis=1))
    return np.log(Z.div(g, axis=0))

# --- 피처 행렬 (share + 볼륨 + 각종 비율 + CLR + 가중 + z-score) ---
def build_feature_matrix_plus(
    mda_pf,
    share_cols,                 
    volume_cols=None,           
    size_ratio_cols=None,       
    os_ratio_cols=None,         
    category_ratio_cols=None,   
    domain_ratio_cols=None,     
    use_clr=True,               
    col_weights=None,           
    zscore=True
):
    volume_cols         = list(volume_cols or [])
    size_ratio_cols     = list(size_ratio_cols or [])
    os_ratio_cols       = list(os_ratio_cols or [])
    category_ratio_cols = list(category_ratio_cols or [])
    domain_ratio_cols   = list(domain_ratio_cols or [])

    all_cols = (list(share_cols) + volume_cols + size_ratio_cols +
                os_ratio_cols + category_ratio_cols + domain_ratio_cols)

    X = mda_pf.set_index('mda_idx')[all_cols].astype(float).copy()

    # 결측
    X[volume_cols] = X[volume_cols].fillna(0.0)
    X[size_ratio_cols + os_ratio_cols + category_ratio_cols + domain_ratio_cols + share_cols] = \
        X[size_ratio_cols + os_ratio_cols + category_ratio_cols + domain_ratio_cols + share_cols].fillna(0.0)

    # 볼륨: log1p
    if volume_cols:
        X[volume_cols] = np.log1p(X[volume_cols])

    # 비율: CLR
    if use_clr:
        if size_ratio_cols:
            X[size_ratio_cols] = _clr_block(X[size_ratio_cols])
        if os_ratio_cols:
            X[os_ratio_cols] = _clr_block(X[os_ratio_cols])
        if category_ratio_cols:
            X[category_ratio_cols] = _clr_block(X[category_ratio_cols])
        if domain_ratio_cols:
            X[domain_ratio_cols] = _clr_block(X[domain_ratio_cols])

    # 열 가중
    if col_weights:
        w = pd.Series({c: col_weights.get(c, 1.0) for c in all_cols}, index=all_cols, dtype=float)
        X = X.mul(w, axis=1)

    # 표준화
    if zscore:
        X = (X - X.mean()) / (X.std() + 1e-9)

    return X, all_cols

# --- 메인 추천 ---
def recommend_with_weighted_similarity(
    ad_df,
    mda_pf,
    top_anchor_by='total_conversions',
    n_anchor=5,
    topN=20,
    weight_power=0.5,
    min_pair_frac=0.0,
    top_weight_feats=None,
    exclude_classes=('계약종료형','품질관리형'),
    min_days_active=7,
    blend_pred_table=None,
    blend_ad_id=None,
    blend_alpha=0.7,
    sort_by="final",

    # 피처 세트(있으면 자동 사용)
    volume_cols=("user_count","total_clicks","total_conversions","daily_avg_conversions","total_ads"),
    size_ratio_cols=("MEGA_ratio","LARGE_ratio","MEDIUM_ratio","SMALL_ratio"),
    os_ratio_cols=("ads_os_type_1_pct","ads_os_type_2_pct","ads_os_type_3_pct","ads_os_type_7_pct"),
    category_ratio_cols=("ads_category_0_pct","ads_category_1_pct","ads_category_2_pct","ads_category_3_pct",
                         "ads_category_4_pct","ads_category_5_pct","ads_category_6_pct","ads_category_7_pct",
                         "ads_category_8_pct","ads_category_10_pct","ads_category_11_pct","ads_category_13_pct"),
    domain_ratio_cols=("domain_게임_pct","domain_교육_pct","domain_금융_pct","domain_기타_pct","domain_미디어/컨텐츠_pct",
                       "domain_뷰티_pct","domain_비영리/공공_pct","domain_생활_pct","domain_식당/카페_pct","domain_식음료_pct",
                       "domain_운동/스포츠_pct","domain_운세_pct","domain_의료/건강_pct","domain_채용_pct","domain_커머스_pct"),

    use_clr=True,
    extra_col_weights=None,

    # 안정화 옵션
    prior_mix=0.2,
    prior_from="mda_mean",   # "mda_mean" | "uniform" | "none"
    prior_bg_dict=None,
    use_idf=False,
    idf_smooth=1.0,
    min_similarity=None
):
    # share 피처
    share_cols = [c for c in mda_pf.columns if c.startswith('share_cat')]
    if not share_cols:
        raise ValueError("mda_pf에 share_cat* 컬럼이 없습니다. 먼저 enrichment를 수행하세요.")

    # 존재하는 컬럼만 사용
    def _keep_exist(cols): return [c for c in cols if c in mda_pf.columns]
    volume_cols         = _keep_exist(volume_cols)
    size_ratio_cols     = _keep_exist(size_ratio_cols)
    os_ratio_cols       = _keep_exist(os_ratio_cols)
    category_ratio_cols = _keep_exist(category_ratio_cols)
    domain_ratio_cols   = _keep_exist(domain_ratio_cols)

    # prior 배경 분포
    prior_bg = None
    if prior_bg_dict is not None:
        prior_bg = dict(prior_bg_dict)
    elif prior_from == "mda_mean":
        avg = mda_pf[share_cols].fillna(0.0).mean(axis=0)
        s = avg.sum()
        if s > 0:
            prior_bg = (avg / s).to_dict()
    elif prior_from == "uniform":
        prior_bg = {c: 1.0/len(share_cols) for c in share_cols}

    # 가중치(광고 분포) 생성
    col_w = make_ad_pair_weights_from_ad_df(
        ad_df, power=weight_power, min_frac=min_pair_frac,
        prior_mix=prior_mix if prior_mix else 0.0,
        prior_bg=prior_bg
    )
    if top_weight_feats:
        top_keys = set(pd.Series(col_w).sort_values(ascending=False).head(top_weight_feats).index)
        col_w = {k: (v if k in top_keys else 0.0) for k,v in col_w.items()}

    # IDF 보정(옵션)
    if use_idf:
        df_share = (mda_pf[share_cols].fillna(0) != 0).sum(axis=0)
        N = len(mda_pf)
        idf = np.log((N + 1.0) / (df_share + idf_smooth))
        idf = idf / (idf.mean() + 1e-12)
        for k in list(col_w.keys()):
            if k in idf.index:
                col_w[k] *= float(idf[k])

    if extra_col_weights:
        col_w.update(extra_col_weights)

    # 피처 행렬
    X, all_feat_cols = build_feature_matrix_plus(
        mda_pf,
        share_cols=share_cols,
        volume_cols=volume_cols,
        size_ratio_cols=size_ratio_cols,
        os_ratio_cols=os_ratio_cols,
        category_ratio_cols=category_ratio_cols,
        domain_ratio_cols=domain_ratio_cols,
        use_clr=use_clr,
        col_weights=col_w,
        zscore=True
    )

    # 앵커/센트로이드
    used = set(ad_df['mda_idx'].astype(int))
    anchors = (ad_df.sort_values(top_anchor_by, ascending=False)
                   .drop_duplicates('mda_idx')
                   .head(n_anchor)['mda_idx']
                   .astype(int).tolist())
    anchors = [m for m in anchors if m in X.index]
    if not anchors:
        raise ValueError("anchor가 없습니다. ad_df에 상위 매체가 있는지 확인하세요.")
    centroid = X.loc[anchors].mean(axis=0).values

    # 후보 & 필터
    cand = mda_pf[~mda_pf['mda_idx'].isin(used)].copy()
    if 'basic_classification' in cand.columns and exclude_classes:
        cand = cand[~cand['basic_classification'].isin(exclude_classes)]
    if 'days_active' in cand.columns:
        cand = cand[cand['days_active'] >= min_days_active]
    if cand.empty:
        return pd.DataFrame(columns=['mda_idx','similarity']), anchors, all_feat_cols, col_w

    # 유사도
    B = X.loc[cand['mda_idx']].values
    cand['similarity'] = cosine_vec(centroid, B)
    if (min_similarity is not None):
        cand = cand[cand['similarity'] >= float(min_similarity)]
        if cand.empty:
            return pd.DataFrame(columns=['mda_idx','similarity']), anchors, all_feat_cols, col_w

    # 예측 블렌딩(옵션)
    has_pred = (blend_pred_table is not None) and (blend_ad_id is not None)
    if has_pred:
        pt = blend_pred_table[blend_pred_table['ads_idx']==blend_ad_id][['mda_idx','pred_turn']].copy()
        cand = cand.merge(pt, on='mda_idx', how='left')
        cand['pred_turn'] = cand['pred_turn'].fillna(0.0)
        maxv = cand['pred_turn'].max()
        cand['pred_norm'] = cand['pred_turn'] / (maxv + 1e-9)
        cand['final_score'] = blend_alpha*cand['similarity'] + (1.0-blend_alpha)*cand['pred_norm']

    # 정렬
    if sort_by == "pred" and has_pred:
        sort_key = "pred_turn"
    elif sort_by == "sim":
        sort_key = "similarity"
    else:
        sort_key = "final_score" if has_pred else "similarity"

    keep = [c for c in [
        'mda_idx','similarity','final_score','pred_turn','pred_norm',
        'basic_classification','days_active','conversion_rate',
        'expected_total_profit','total_ads'
    ] if c in cand.columns]
    out = cand[keep].sort_values(sort_key, ascending=False).head(topN).reset_index(drop=True)
    return out, anchors, all_feat_cols, col_w
# === /통합 셀 끝 ===


In [68]:
out, anchors, feats, w = recommend_with_weighted_similarity(
    ad_df=ads_9935_pf,
    mda_pf=mda_pf_enriched,
    use_clr=True,          # 비율 전부 CLR
    weight_power=0.5,      # 루트 가중
    prior_mix=0.2,         # 배경 분포 섞기
    prior_from="mda_mean", # mda 평균 분포
    n_anchor=5
)
display(out.head(20))

Unnamed: 0,mda_idx,similarity,basic_classification,days_active,conversion_rate,expected_total_profit,total_ads
0,817,0.329564,안정공급형(후보),31,0.631402,8946,209
1,563,0.312263,대량처리형,31,0.57488,4216079,6563
2,371,0.306462,안정공급형,31,0.432518,172278,973
3,58,0.305951,안정공급형(후보),31,0.322761,3884970,26
4,564,0.29395,안정공급형(후보),31,0.350742,62829,73
5,337,0.281521,안정공급형(후보),31,0.152083,164850,382
6,22,0.27231,대량처리형,31,0.422674,2541770,166
7,398,0.253981,안정공급형(후보),31,0.170626,185580,27
8,496,0.248369,안정공급형,31,0.206431,1639389,54
9,87,0.239956,안정공급형(후보),29,0.329034,640407,6


In [83]:
ads_pool[(ads_pool['ads_size']=='LARGE') & (ads_pool['media_count']>5)].head(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ads_idx,media_count,user_count,total_clicks,total_conversions,ads_category,domain,ads_os_type,ads_order,ctit_mean,ctit_median,ads_rejoin_type,contract_price,media_price,first_click,last_click,ads_name,ads_sdate,expire,days_active,daily_avg_conversions,cvr,margin,roi,total_net_return,ads_level_detailed,daily_clicks,daily_users,scale_level,performance_level,ads_level,media_score,conv_score,clicks_score,stability_score,cvr_score,total_score,ads_size,cluster,mda_idx_arr,M,A
5,5,5,9935,10,524,889,24,8,금융,7,2592600,145.083333,140.0,NONE,1500,1200,2025-07-26 00:39:57,2025-08-25 10:37:05,숨어있는 휴면포인트 조회,2021-02-01 10:00:00,0,31,0.774194,0.0,300,0.2,7200,MEGA,28.677419,16.903226,MEGA,LOW,MEGA_LOW,3,0,2,4,0,9,LARGE,0,,0.0,1.0
26,29,29,13209,14,758,2162,169,8,금융,2,2496600,3529.295858,3527.0,NONE,1900,1425,2025-07-26 00:57:18,2025-08-25 10:06:34,아이부자,2021-10-01 17:00:00,0,31,5.451613,0.1,475,0.3,80275,MEGA,69.741935,24.451613,MEGA,LOW,MEGA_LOW,4,1,3,4,0,12,LARGE,0,22.0,1.0,1.0
31,34,34,14074,15,186,558,33,8,금융,2,2642800,1361.515152,1334.0,NONE,500,300,2025-07-26 00:49:28,2025-08-25 07:45:58,오토링 광고 1회 참여,2021-11-19 00:00:00,0,31,1.064516,0.1,200,0.7,6600,MEGA,18.0,6.0,LARGE,LOW,LARGE_LOW,4,1,2,4,0,11,LARGE,0,,0.0,1.0
34,37,37,14405,9,317,505,40,10,미디어/컨텐츠,7,1639412,158.825,152.5,NONE,2600,1950,2025-07-26 00:26:00,2025-08-25 07:07:33,미노벨 노벨패스 7일 무료체험 이벤트,2021-12-07 17:00:00,0,31,1.290323,0.1,650,0.3,26000,LARGE,16.290323,10.225806,LARGE,LOW,LARGE_LOW,3,1,2,4,0,10,LARGE,0,,0.0,1.0
40,43,43,16523,6,659,1453,57,8,미디어/컨텐츠,7,2495400,4684.807018,238.0,NONE,1500,1000,2025-07-26 00:25:12,2025-08-25 06:24:46,탑툰,2022-04-14 14:00:00,0,31,1.83871,0.0,500,0.5,28500,LARGE,46.870968,21.258065,MEGA,LOW,MEGA_LOW,3,1,2,4,0,10,LARGE,0,,0.0,1.0
42,45,45,16528,11,412,613,30,8,생활,7,1651118,889.666667,100.0,NONE,2000,1600,2025-07-26 02:01:55,2025-08-25 10:25:46,안심보호,2022-04-14 14:00:00,0,31,0.967742,0.0,400,0.2,12000,MEGA,19.774194,13.290323,LARGE,LOW,LARGE_LOW,3,0,2,4,0,9,LARGE,0,,0.0,1.0
47,50,50,16895,6,365,1429,5,10,생활,7,2573800,64.2,60.0,NONE,10000,8000,2025-07-26 00:23:38,2025-08-25 07:36:40,전국 최대 부동산 경매! 한톡경매 (직과금),2022-05-02 17:00:00,0,31,0.16129,0.0,2000,0.2,10000,LARGE,46.096774,11.774194,MEGA,LOW,MEGA_LOW,3,0,2,4,0,9,LARGE,0,,0.0,1.0
67,71,71,20256,12,121,209,3,10,금융,7,2591600,54.666667,43.0,NONE,9000,6000,2025-07-26 08:53:42,2025-08-24 23:27:41,AT스탁플러스,2022-11-08 18:00:00,0,30,0.1,0.0,3000,0.5,9000,MEGA,6.966667,4.033333,LARGE,LOW,LARGE_LOW,4,0,1,3,0,8,LARGE,0,,0.0,1.0
69,73,73,20258,12,49,88,1,10,금융,7,2592400,53.0,53.0,NONE,2000,1400,2025-07-26 00:59:06,2025-08-22 12:10:16,전화번호 안심로그인,2022-11-08 18:00:00,0,28,0.035714,0.0,600,0.4,600,MEGA,3.142857,1.75,LARGE,LOW,LARGE_LOW,4,0,1,3,0,8,LARGE,0,,0.0,1.0
129,135,135,22232,7,614,1324,305,8,금융,2,2294400,338.95082,114.0,NONE,800,640,2025-07-26 00:46:23,2025-08-25 10:05:25,라이프플러스 트라이브,2023-06-05 14:00:00,0,31,9.83871,0.2,160,0.2,48800,LARGE,42.709677,19.806452,MEGA,LOW,MEGA_LOW,3,2,2,4,0,11,LARGE,0,,0.0,1.0


In [72]:
ads_9982_pf = analyze_ads_performance(9982, click)

In [76]:
out, anchors, feats, w = recommend_with_weighted_similarity(
    ad_df=ads_9982_pf,
    mda_pf=mda_pf_enriched,
    use_clr=True,          # 비율 전부 CLR
    weight_power=0.5,      # 루트 가중
    prior_mix=0.2,         # 배경 분포 섞기
    prior_from="mda_mean", # mda 평균 분포
    n_anchor=3
)
display(out.head(20))
print(anchors)

Unnamed: 0,mda_idx,similarity,basic_classification,days_active,conversion_rate,expected_total_profit,total_ads
0,801,0.949231,특화전문형,31,0.654837,792815,27
1,582,0.893589,특화전문형,31,0.422859,343925,26
2,702,0.890391,특화전문형,31,0.628446,487270,26
3,700,0.885683,특화전문형,31,0.414816,390390,26
4,703,0.878876,특화전문형,31,0.523336,255950,26
5,638,0.876381,특화전문형,31,0.434734,219740,26
6,637,0.862473,특화전문형,31,0.379377,555460,26
7,701,0.861711,특화전문형,31,0.382332,717450,25
8,688,0.856407,특화전문형,31,0.384638,307090,26
9,807,0.849017,특화전문형,31,0.475589,414850,26


[1020, 1047, 845]


In [77]:
mda_pf_enriched[mda_pf_enriched['mda_idx'].isin([1020,1047,845,801])]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,mda_idx,user_count,total_clicks,total_conversions,first_click,last_click,days_active,daily_avg_conversions,LARGE,MEDIUM,MEGA,SMALL,total_ads,MEGA_ratio,LARGE_ratio,MEDIUM_ratio,SMALL_ratio,ads_category_0_pct,ads_category_1_pct,ads_category_2_pct,ads_category_3_pct,ads_category_4_pct,ads_category_5_pct,ads_category_6_pct,ads_category_7_pct,ads_category_8_pct,ads_category_10_pct,ads_category_11_pct,ads_category_13_pct,ads_os_type_1_pct,ads_os_type_2_pct,ads_os_type_3_pct,ads_os_type_7_pct,domain_게임_pct,domain_교육_pct,domain_금융_pct,domain_기타_pct,domain_미디어/컨텐츠_pct,domain_뷰티_pct,domain_비영리/공공_pct,domain_생활_pct,domain_식당/카페_pct,domain_식음료_pct,domain_운동/스포츠_pct,domain_운세_pct,domain_의료/건강_pct,domain_채용_pct,domain_커머스_pct,expected_total_profit,conversion_rate,last_click_dt,classification,basic_classification,conv_cat0_게임,conv_cat0_금융,conv_cat1_게임,conv_cat1_생활,conv_cat1_커머스,conv_cat2_게임,conv_cat2_비영리_공공,conv_cat3_비영리_공공,conv_cat3_식음료,conv_cat8_미디어_컨텐츠,conv_cat8_의료_건강,conv_cat8_커머스,conv_cat1_금융,conv_cat1_기타,conv_cat1_미디어_컨텐츠,conv_cat1_뷰티,conv_cat1_의료_건강,conv_cat1_채용,conv_cat2_기타,conv_cat2_커머스,conv_cat3_미디어_컨텐츠,conv_cat3_뷰티,conv_cat3_생활,conv_cat5_게임,conv_cat8_게임,conv_cat8_금융,conv_cat8_생활,conv_cat10_미디어_컨텐츠,conv_cat0_미디어_컨텐츠,conv_cat0_생활,conv_cat0_의료_건강,conv_cat0_커머스,conv_cat2_식당_카페,conv_cat2_채용,conv_cat5_미디어_컨텐츠,conv_cat7_금융,conv_cat8_뷰티,conv_cat8_식음료,conv_cat8_운동_스포츠,conv_cat10_금융,conv_cat10_생활,conv_cat10_의료_건강,conv_cat3_채용,conv_cat8_비영리_공공,conv_cat4_식당_카페,conv_cat4_식음료,conv_cat4_운동_스포츠,conv_cat4_커머스,conv_cat8_교육,conv_cat10_커머스,conv_cat11_뷰티,conv_cat11_식음료,conv_cat11_운동_스포츠,conv_cat11_의료_건강,conv_cat11_커머스,conv_cat13_금융,conv_cat13_운세,conv_cat8_운세,conv_cat8_기타,conv_cat8_채용,conv_cat13_커머스,conv_cat3_금융,conv_cat4_게임,conv_cat4_교육,conv_cat4_금융,conv_cat4_기타,conv_cat4_미디어_컨텐츠,conv_cat4_뷰티,conv_cat4_생활,conv_cat4_운세,conv_cat4_의료_건강,conv_cat5_식음료,conv_cat5_운동_스포츠,conv_cat6_게임,conv_cat6_미디어_컨텐츠,conv_cat6_식당_카페,conv_cat10_뷰티,conv_cat10_운세,conv_cat13_생활,conv_cat4_비영리_공공,conv_cat5_금융,conv_cat4_채용,conv_cat1_교육,conv_cat11_미디어_컨텐츠,share_cat0_게임,share_cat0_금융,share_cat1_게임,share_cat1_생활,share_cat1_커머스,share_cat2_게임,share_cat2_비영리_공공,share_cat3_비영리_공공,share_cat3_식음료,share_cat8_미디어_컨텐츠,share_cat8_의료_건강,share_cat8_커머스,share_cat1_금융,share_cat1_기타,share_cat1_미디어_컨텐츠,share_cat1_뷰티,share_cat1_의료_건강,share_cat1_채용,share_cat2_기타,share_cat2_커머스,share_cat3_미디어_컨텐츠,share_cat3_뷰티,share_cat3_생활,share_cat5_게임,share_cat8_게임,share_cat8_금융,share_cat8_생활,share_cat10_미디어_컨텐츠,share_cat0_미디어_컨텐츠,share_cat0_생활,share_cat0_의료_건강,share_cat0_커머스,share_cat2_식당_카페,share_cat2_채용,share_cat5_미디어_컨텐츠,share_cat7_금융,share_cat8_뷰티,share_cat8_식음료,share_cat8_운동_스포츠,share_cat10_금융,share_cat10_생활,share_cat10_의료_건강,share_cat3_채용,share_cat8_비영리_공공,share_cat4_식당_카페,share_cat4_식음료,share_cat4_운동_스포츠,share_cat4_커머스,share_cat8_교육,share_cat10_커머스,share_cat11_뷰티,share_cat11_식음료,share_cat11_운동_스포츠,share_cat11_의료_건강,share_cat11_커머스,share_cat13_금융,share_cat13_운세,share_cat8_운세,share_cat8_기타,share_cat8_채용,share_cat13_커머스,share_cat3_금융,share_cat4_게임,share_cat4_교육,share_cat4_금융,share_cat4_기타,share_cat4_미디어_컨텐츠,share_cat4_뷰티,share_cat4_생활,share_cat4_운세,share_cat4_의료_건강,share_cat5_식음료,share_cat5_운동_스포츠,share_cat6_게임,share_cat6_미디어_컨텐츠,share_cat6_식당_카페,share_cat10_뷰티,share_cat10_운세,share_cat13_생활,share_cat4_비영리_공공,share_cat5_금융,share_cat4_채용,share_cat1_교육,share_cat11_미디어_컨텐츠
121,121,121,801,6588,12032,7879,2025-07-26 00:00:47,2025-08-25 11:23:48,31,254.16129,2,0,25,0,27,92.6,7.4,0.0,0.0,14.814815,25.925926,44.444444,3.703704,0.0,0.0,0.0,0.0,11.111111,0.0,0.0,0.0,0.0,88.888889,0.0,11.111111,48.148148,0.0,7.407407,0.0,11.111111,3.703704,0.0,7.407407,3.703704,0.0,0.0,0.0,0.0,11.111111,7.407407,792815,0.654837,2025-08-25 11:23:48,특화전문형_MEGA특화,특화전문형,293.0,0.0,0.0,83.0,69.0,3622.0,0.0,0.0,0.0,0.0,0.0,0.0,1733.0,0.0,0.0,63.0,0.0,456.0,0.0,0.0,264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,975.0,0.0,0.0,307.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037187,0.0,0.0,0.010534,0.008757,0.459703,0.0,0.0,0.0,0.0,0.0,0.0,0.219952,0.0,0.0,0.007996,0.0,0.057875,0.0,0.0,0.033507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.123747,0.0,0.0,0.038964,0.001777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
137,137,137,845,6781,15643,6695,2025-07-26 00:17:19,2025-08-25 11:23:37,31,215.967742,2,0,26,0,28,92.9,7.1,0.0,0.0,14.285714,25.0,42.857143,7.142857,0.0,0.0,0.0,0.0,10.714286,0.0,0.0,0.0,0.0,85.714286,0.0,14.285714,46.428571,0.0,7.142857,0.0,14.285714,3.571429,0.0,7.142857,3.571429,0.0,0.0,0.0,0.0,10.714286,7.142857,638950,0.427987,2025-08-25 11:23:37,특화전문형_MEGA특화,특화전문형,317.0,0.0,0.0,65.0,69.0,2406.0,0.0,0.0,0.0,0.0,0.0,0.0,657.0,0.0,0.0,32.0,0.0,445.0,0.0,0.0,1537.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,788.0,0.0,0.0,365.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047349,0.0,0.0,0.009709,0.010306,0.359373,0.0,0.0,0.0,0.0,0.0,0.0,0.098133,0.0,0.0,0.00478,0.0,0.066468,0.0,0.0,0.229574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1177,0.0,0.0,0.054518,0.002091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
158,158,159,1020,7630,16765,10345,2025-07-26 00:09:26,2025-08-25 11:21:22,31,333.709677,3,0,27,0,30,90.0,10.0,0.0,0.0,13.333333,23.333333,40.0,10.0,0.0,0.0,0.0,0.0,13.333333,0.0,0.0,0.0,0.0,80.0,0.0,20.0,43.333333,0.0,6.666667,0.0,13.333333,6.666667,0.0,6.666667,3.333333,0.0,0.0,0.0,0.0,13.333333,6.666667,974620,0.617059,2025-08-25 11:21:22,특화전문형_MEGA특화,특화전문형,287.0,0.0,0.0,89.0,286.0,3782.0,0.0,0.0,0.0,0.0,0.0,0.0,1831.0,0.0,0.0,59.0,0.0,920.0,0.0,0.0,1798.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,954.0,0.0,0.0,320.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027743,0.0,0.0,0.008603,0.027646,0.365587,0.0,0.0,0.0,0.0,0.0,0.0,0.176994,0.0,0.0,0.005703,0.0,0.088932,0.0,0.0,0.173804,0.00029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092218,0.0,0.0,0.030933,0.001547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180,180,181,1047,6472,11725,5947,2025-07-26 00:00:37,2025-08-25 11:21:15,31,191.83871,2,0,23,0,25,92.0,8.0,0.0,0.0,16.0,20.0,48.0,8.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,88.0,0.0,12.0,52.0,0.0,8.0,0.0,12.0,4.0,0.0,8.0,4.0,0.0,0.0,0.0,0.0,4.0,8.0,563930,0.507207,2025-08-25 11:21:15,특화전문형_게임특화,특화전문형,214.0,0.0,0.0,63.0,214.0,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,810.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,1577.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,710.0,0.0,0.0,289.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035985,0.0,0.0,0.010594,0.035985,0.338994,0.0,0.0,0.0,0.0,0.0,0.0,0.136203,0.0,0.0,0.006726,0.0,0.0,0.0,0.0,0.265176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119388,0.0,0.0,0.048596,0.002354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
ads_pool[ads_pool['ads_idx']==22232]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ads_idx,media_count,user_count,total_clicks,total_conversions,ads_category,domain,ads_os_type,ads_order,ctit_mean,ctit_median,ads_rejoin_type,contract_price,media_price,first_click,last_click,ads_name,ads_sdate,expire,days_active,daily_avg_conversions,cvr,margin,roi,total_net_return,ads_level_detailed,daily_clicks,daily_users,scale_level,performance_level,ads_level,media_score,conv_score,clicks_score,stability_score,cvr_score,total_score,ads_size,cluster,mda_idx_arr,M,A
129,135,135,22232,7,614,1324,305,8,금융,2,2294400,338.95082,114.0,NONE,800,640,2025-07-26 00:46:23,2025-08-25 10:05:25,라이프플러스 트라이브,2023-06-05 14:00:00,0,31,9.83871,0.2,160,0.2,48800,LARGE,42.709677,19.806452,MEGA,LOW,MEGA_LOW,3,2,2,4,0,11,LARGE,0,,0.0,1.0


In [188]:
ads_443285_pf = analyze_ads_performance(443285, click)

In [189]:
out, anchors, feats, w = recommend_with_weighted_similarity(
    ad_df=ads_443285_pf,
    mda_pf=mda_pf_enriched,
    use_clr=True,          # 비율 전부 CLR
    weight_power=0.5,      # 루트 가중
    prior_mix=0.2,         # 배경 분포 섞기
    prior_from="mda_mean", # mda 평균 분포
    n_anchor=3
)
display(out.head(20))
print(anchors)

Unnamed: 0,mda_idx,similarity,basic_classification,days_active,conversion_rate,expected_total_profit,total_ads
0,398,0.376745,안정공급형(후보),31,0.170626,185580,27
1,371,0.324957,안정공급형,31,0.432518,172278,973
2,12,0.303178,특화전문형,31,0.356818,3003668,24
3,563,0.281716,대량처리형,31,0.57488,4216079,6563
4,337,0.262346,안정공급형(후보),31,0.152083,164850,382
5,58,0.258833,안정공급형(후보),31,0.322761,3884970,26
6,56,0.230483,안정공급형(후보),31,0.23635,580630,18
7,790,0.227398,안정공급형(후보),31,0.512785,254068,158
8,817,0.226567,안정공급형(후보),31,0.631402,8946,209
9,30,0.216324,안정공급형(후보),31,0.075775,22520,60


[496, 22, 761]


In [85]:
ads_22232_pf = analyze_ads_performance(22232, click)

In [86]:
out, anchors, feats, w = recommend_with_weighted_similarity(
    ad_df=ads_22232_pf,
    mda_pf=mda_pf_enriched,
    use_clr=True,          # 비율 전부 CLR
    weight_power=0.5,      # 루트 가중
    prior_mix=0.2,         # 배경 분포 섞기
    prior_from="mda_mean", # mda 평균 분포
    n_anchor=3
)
display(out.head(20))
print(anchors)

Unnamed: 0,mda_idx,similarity,basic_classification,days_active,conversion_rate,expected_total_profit,total_ads
0,22,0.627467,대량처리형,31,0.422674,2541770,166
1,458,0.361075,안정공급형(후보),31,0.216386,89500,9
2,375,0.338044,안정공급형(후보),31,0.427252,17591,103
3,294,0.304335,안정공급형(후보),31,0.202488,127250,49
4,337,0.300539,안정공급형(후보),31,0.152083,164850,382
5,817,0.300348,안정공급형(후보),31,0.631402,8946,209
6,645,0.293787,안정공급형(후보),31,0.597143,46075,178
7,246,0.291548,안정공급형(후보),31,0.610977,77757,121
8,401,0.274462,안정공급형(후보),31,0.086053,5921,56
9,12,0.271883,특화전문형,31,0.356818,3003668,24


[398, 496, 54]


In [87]:
mda_pf_enriched[mda_pf_enriched['mda_idx'].isin([398,496,54,22])]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,mda_idx,user_count,total_clicks,total_conversions,first_click,last_click,days_active,daily_avg_conversions,LARGE,MEDIUM,MEGA,SMALL,total_ads,MEGA_ratio,LARGE_ratio,MEDIUM_ratio,SMALL_ratio,ads_category_0_pct,ads_category_1_pct,ads_category_2_pct,ads_category_3_pct,ads_category_4_pct,ads_category_5_pct,ads_category_6_pct,ads_category_7_pct,ads_category_8_pct,ads_category_10_pct,ads_category_11_pct,ads_category_13_pct,ads_os_type_1_pct,ads_os_type_2_pct,ads_os_type_3_pct,ads_os_type_7_pct,domain_게임_pct,domain_교육_pct,domain_금융_pct,domain_기타_pct,domain_미디어/컨텐츠_pct,domain_뷰티_pct,domain_비영리/공공_pct,domain_생활_pct,domain_식당/카페_pct,domain_식음료_pct,domain_운동/스포츠_pct,domain_운세_pct,domain_의료/건강_pct,domain_채용_pct,domain_커머스_pct,expected_total_profit,conversion_rate,last_click_dt,classification,basic_classification,conv_cat0_게임,conv_cat0_금융,conv_cat1_게임,conv_cat1_생활,conv_cat1_커머스,conv_cat2_게임,conv_cat2_비영리_공공,conv_cat3_비영리_공공,conv_cat3_식음료,conv_cat8_미디어_컨텐츠,conv_cat8_의료_건강,conv_cat8_커머스,conv_cat1_금융,conv_cat1_기타,conv_cat1_미디어_컨텐츠,conv_cat1_뷰티,conv_cat1_의료_건강,conv_cat1_채용,conv_cat2_기타,conv_cat2_커머스,conv_cat3_미디어_컨텐츠,conv_cat3_뷰티,conv_cat3_생활,conv_cat5_게임,conv_cat8_게임,conv_cat8_금융,conv_cat8_생활,conv_cat10_미디어_컨텐츠,conv_cat0_미디어_컨텐츠,conv_cat0_생활,conv_cat0_의료_건강,conv_cat0_커머스,conv_cat2_식당_카페,conv_cat2_채용,conv_cat5_미디어_컨텐츠,conv_cat7_금융,conv_cat8_뷰티,conv_cat8_식음료,conv_cat8_운동_스포츠,conv_cat10_금융,conv_cat10_생활,conv_cat10_의료_건강,conv_cat3_채용,conv_cat8_비영리_공공,conv_cat4_식당_카페,conv_cat4_식음료,conv_cat4_운동_스포츠,conv_cat4_커머스,conv_cat8_교육,conv_cat10_커머스,conv_cat11_뷰티,conv_cat11_식음료,conv_cat11_운동_스포츠,conv_cat11_의료_건강,conv_cat11_커머스,conv_cat13_금융,conv_cat13_운세,conv_cat8_운세,conv_cat8_기타,conv_cat8_채용,conv_cat13_커머스,conv_cat3_금융,conv_cat4_게임,conv_cat4_교육,conv_cat4_금융,conv_cat4_기타,conv_cat4_미디어_컨텐츠,conv_cat4_뷰티,conv_cat4_생활,conv_cat4_운세,conv_cat4_의료_건강,conv_cat5_식음료,conv_cat5_운동_스포츠,conv_cat6_게임,conv_cat6_미디어_컨텐츠,conv_cat6_식당_카페,conv_cat10_뷰티,conv_cat10_운세,conv_cat13_생활,conv_cat4_비영리_공공,conv_cat5_금융,conv_cat4_채용,conv_cat1_교육,conv_cat11_미디어_컨텐츠,share_cat0_게임,share_cat0_금융,share_cat1_게임,share_cat1_생활,share_cat1_커머스,share_cat2_게임,share_cat2_비영리_공공,share_cat3_비영리_공공,share_cat3_식음료,share_cat8_미디어_컨텐츠,share_cat8_의료_건강,share_cat8_커머스,share_cat1_금융,share_cat1_기타,share_cat1_미디어_컨텐츠,share_cat1_뷰티,share_cat1_의료_건강,share_cat1_채용,share_cat2_기타,share_cat2_커머스,share_cat3_미디어_컨텐츠,share_cat3_뷰티,share_cat3_생활,share_cat5_게임,share_cat8_게임,share_cat8_금융,share_cat8_생활,share_cat10_미디어_컨텐츠,share_cat0_미디어_컨텐츠,share_cat0_생활,share_cat0_의료_건강,share_cat0_커머스,share_cat2_식당_카페,share_cat2_채용,share_cat5_미디어_컨텐츠,share_cat7_금융,share_cat8_뷰티,share_cat8_식음료,share_cat8_운동_스포츠,share_cat10_금융,share_cat10_생활,share_cat10_의료_건강,share_cat3_채용,share_cat8_비영리_공공,share_cat4_식당_카페,share_cat4_식음료,share_cat4_운동_스포츠,share_cat4_커머스,share_cat8_교육,share_cat10_커머스,share_cat11_뷰티,share_cat11_식음료,share_cat11_운동_스포츠,share_cat11_의료_건강,share_cat11_커머스,share_cat13_금융,share_cat13_운세,share_cat8_운세,share_cat8_기타,share_cat8_채용,share_cat13_커머스,share_cat3_금융,share_cat4_게임,share_cat4_교육,share_cat4_금융,share_cat4_기타,share_cat4_미디어_컨텐츠,share_cat4_뷰티,share_cat4_생활,share_cat4_운세,share_cat4_의료_건강,share_cat5_식음료,share_cat5_운동_스포츠,share_cat6_게임,share_cat6_미디어_컨텐츠,share_cat6_식당_카페,share_cat10_뷰티,share_cat10_운세,share_cat13_생활,share_cat4_비영리_공공,share_cat5_금융,share_cat4_채용,share_cat1_교육,share_cat11_미디어_컨텐츠
3,3,3,22,37668,83355,35232,2025-07-26 00:00:31,2025-08-25 11:14:50,31,1136.516129,67,58,33,8,166,19.9,40.4,34.9,4.8,8.433735,6.626506,10.843373,5.421687,0.0,10.240964,0.0,6.626506,26.506024,25.301205,0.0,0.0,0.0,40.361446,3.012048,56.626506,19.277108,0.60241,26.506024,0.0,10.240964,1.807229,3.012048,11.445783,0.60241,4.216867,1.204819,7.831325,4.819277,1.204819,7.228916,2541770,0.422674,2025-08-25 11:14:50,대량처리형,대량처리형,1205.0,2969.0,394.0,92.0,657.0,22660.0,581.0,771.0,177.0,291.0,143.0,115.0,675.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,967.0,0.0,0.0,2.0,0.0,796.0,54.0,0.0,1.0,719.0,813.0,564.0,9.0,324.0,12.0,19.0,182.0,4.0,26.0,2.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034202,0.08427,0.011183,0.002611,0.018648,0.643165,0.016491,0.021884,0.005024,0.00826,0.004059,0.003264,0.019159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027447,0.0,0.0,5.7e-05,0.0,0.022593,0.001533,0.0,2.8e-05,0.020408,0.023076,0.016008,0.000255,0.009196,0.000341,0.000539,0.005166,0.000114,0.000738,5.7e-05,2.8e-05,0.000199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,7,54,6395,12838,4299,2025-07-26 00:12:15,2025-08-25 11:11:45,31,138.677419,69,172,29,61,331,8.8,20.8,52.0,18.4,2.416918,4.833837,4.229607,3.021148,4.229607,12.084592,0.0,3.021148,7.854985,12.084592,45.015106,1.208459,11.178248,18.429003,0.302115,70.090634,17.824773,0.302115,12.688822,0.0,3.625378,10.574018,1.208459,4.229607,1.208459,7.250755,1.510574,4.229607,9.667674,0.906344,24.773414,309633,0.334865,2025-08-25 11:11:45,안정공급형(후보),안정공급형(후보),23.0,6.0,109.0,17.0,65.0,344.0,20.0,345.0,26.0,40.0,3.0,14.0,29.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,67.0,0.0,6.0,0.0,122.0,50.0,94.0,0.0,11.0,0.0,39.0,0.0,0.0,0.0,2.0,1.0,0.0,6.0,0.0,2.0,1.0,3.0,2.0,2.0,41.0,1.0,5.0,1.0,1.0,5.0,2.0,2788.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00535,0.001396,0.025355,0.003954,0.01512,0.080019,0.004652,0.080251,0.006048,0.009304,0.000698,0.003257,0.006746,0.0,0.0,0.0,0.0,0.000698,0.0,0.0,0.0,0.0,0.000233,0.000465,0.0,0.015585,0.0,0.001396,0.0,0.028379,0.011631,0.021866,0.0,0.002559,0.0,0.009072,0.0,0.0,0.0,0.000465,0.000233,0.0,0.001396,0.0,0.000465,0.000233,0.000698,0.000465,0.000465,0.009537,0.000233,0.001163,0.000233,0.000233,0.001163,0.000465,0.648523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,36,36,398,3267,6945,1185,2025-07-26 00:17:33,2025-08-25 09:57:19,31,38.225806,7,6,14,0,27,51.9,25.9,22.2,0.0,0.0,11.111111,18.518519,18.518519,0.0,3.703704,0.0,18.518519,22.222222,7.407407,0.0,0.0,0.0,48.148148,3.703704,48.148148,25.925926,3.703704,33.333333,0.0,7.407407,0.0,11.111111,3.703704,0.0,7.407407,0.0,0.0,3.703704,0.0,3.703704,185580,0.170626,2025-08-25 09:57:19,안정공급형(후보),안정공급형(후보),0.0,0.0,16.0,0.0,0.0,365.0,0.0,445.0,87.0,63.0,59.0,8.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,130.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013502,0.0,0.0,0.308017,0.0,0.375527,0.073418,0.053165,0.049789,0.006751,0.006751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002532,0.0,0.109705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46,46,46,496,80048,127975,26418,2025-07-26 00:06:05,2025-08-25 11:24:01,31,852.193548,24,8,22,0,54,40.7,44.4,14.8,0.0,16.666667,11.111111,20.37037,11.111111,0.0,3.703704,0.0,9.259259,25.925926,1.851852,0.0,0.0,0.0,66.666667,7.407407,25.925926,25.925926,1.851852,25.925926,1.851852,7.407407,1.851852,5.555556,9.259259,0.0,5.555556,0.0,1.851852,3.703704,1.851852,7.407407,1639389,0.206431,2025-08-25 11:24:01,안정공급형,안정공급형,728.0,1076.0,0.0,711.0,398.0,3989.0,382.0,445.0,43.0,148.0,3.0,130.0,9885.0,0.0,0.0,46.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,17.0,0.0,130.0,2.0,0.0,10.0,5410.0,934.0,0.0,0.0,1858.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027557,0.04073,0.0,0.026913,0.015065,0.150996,0.01446,0.016845,0.001628,0.005602,0.000114,0.004921,0.374177,0.0,0.0,0.001741,0.0,0.0,0.0,0.0,0.000492,0.0,0.0,0.000644,0.0,0.004921,7.6e-05,0.0,0.000379,0.204785,0.035355,0.0,0.0,0.070331,0.0,0.000114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000151,0.002006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 전환수 예측 모델

In [88]:
ads_pool

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ads_idx,media_count,user_count,total_clicks,total_conversions,ads_category,domain,ads_os_type,ads_order,ctit_mean,ctit_median,ads_rejoin_type,contract_price,media_price,first_click,last_click,ads_name,ads_sdate,expire,days_active,daily_avg_conversions,cvr,margin,roi,total_net_return,ads_level_detailed,daily_clicks,daily_users,scale_level,performance_level,ads_level,media_score,conv_score,clicks_score,stability_score,cvr_score,total_score,ads_size,cluster,mda_idx_arr,M,A
0,0,0,6508,1,1,2,0,10,금융,7,2381500,,,NONE,1400,1100,2025-07-29 08:54:30,2025-07-29 08:55:55,스마트피싱보호,2020-04-23 17:00:00,0,1,0.0,0.0,300,0.3,0,MEDIUM,2.000000,1.000000,SMALL,LOW,SMALL_LOW,1,0,0,0,0,1,SMALL,0,,0.0,1.0
1,1,1,6985,2,30,46,0,7,금융,7,1201938,,,NONE,2700,2300,2025-07-26 11:02:18,2025-08-24 05:39:13,DB손해보험 다이렉트,2020-06-30 15:00:00,0,29,0.0,0.0,400,0.2,0,MEDIUM,1.586207,1.034483,MEDIUM,LOW,MEDIUM_LOW,2,0,0,3,0,5,MEDIUM,0,,0.0,0.0
2,2,2,8327,1,6,11,0,10,미디어/컨텐츠,7,1673458,,,NONE,3200,2420,2025-08-02 08:30:52,2025-08-18 22:32:25,파일썬,2020-11-10 11:21:00,0,17,0.0,0.0,780,0.3,0,MEDIUM,0.647059,0.352941,SMALL,LOW,SMALL_LOW,1,0,0,3,0,4,SMALL,0,,0.0,1.0
3,3,3,9264,1,1,1,0,10,금융,7,2372800,,,NONE,11000,8250,2025-08-14 13:54:12,2025-08-14 13:54:12,하나 가득담은 운전자보험가입,2020-11-30 00:00:00,0,1,0.0,0.0,2750,0.3,0,LARGE,1.000000,1.000000,SMALL,LOW,SMALL_LOW,1,0,0,0,0,1,SMALL,0,,0.0,1.0
4,4,4,9716,2,4,6,1,8,생활,2,2699900,74.000000,74.0,NONE,600,336,2025-07-28 01:22:36,2025-07-29 07:14:36,베이비러브 참여 신청,2021-01-11 16:00:00,0,2,0.5,0.2,264,0.8,264,MEDIUM,3.000000,2.000000,MEDIUM,LOW,MEDIUM_LOW,2,0,1,1,0,4,SMALL,0,,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4709,4766,9512,446899,2,52,63,49,4,커머스,7,1369916,42.102041,39.0,ADS_CODE_DAILY_UPDATE,18,14,2025-08-25 09:40:45,2025-08-25 09:58:56,원스토몰 180212,2025-08-25 09:37:27,0,1,49.0,0.8,4,0.3,196,SMALL,63.000000,52.000000,LARGE,HIGH,LARGE_HIGH,2,3,2,0,2,9,LARGE,0,,0.0,0.0
4710,4767,9513,446900,2,44,59,41,4,식당/카페,7,1369915,48.512195,47.0,ADS_CODE_DAILY_UPDATE,18,14,2025-08-25 09:45:41,2025-08-25 09:56:38,서빙카트 식당 업소용 주방 서빙카 다용도 서빙카트 소형 2단 기본고무바퀴 180213,2025-08-25 09:42:08,0,1,41.0,0.7,4,0.3,164,SMALL,59.000000,44.000000,LARGE,HIGH,LARGE_HIGH,2,3,2,0,1,8,LARGE,0,,0.0,0.0
4711,4768,9514,446902,1,2,2,0,8,미디어/컨텐츠,7,2735900,,,NONE,200,105,2025-08-25 10:37:49,2025-08-25 11:11:29,[네이버카페가입인사] 대장TV,2025-08-25 10:00:00,0,1,0.0,0.0,95,0.9,0,MEDIUM,2.000000,2.000000,SMALL,LOW,SMALL_LOW,1,0,0,0,0,1,SMALL,0,,0.0,0.0
4712,4769,9515,446908,1,1,1,0,0,금융,2,1682806,,,NONE,230,180,2025-08-25 10:47:48,2025-08-25 10:47:48,NOL,2025-08-25 10:37:11,0,1,0.0,0.0,50,0.3,0,MEDIUM,1.000000,1.000000,SMALL,LOW,SMALL_LOW,1,0,0,0,0,1,SMALL,0,,0.0,0.0


In [89]:
ads_pool

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ads_idx,media_count,user_count,total_clicks,total_conversions,ads_category,domain,ads_os_type,ads_order,ctit_mean,ctit_median,ads_rejoin_type,contract_price,media_price,first_click,last_click,ads_name,ads_sdate,expire,days_active,daily_avg_conversions,cvr,margin,roi,total_net_return,ads_level_detailed,daily_clicks,daily_users,scale_level,performance_level,ads_level,media_score,conv_score,clicks_score,stability_score,cvr_score,total_score,ads_size,cluster,mda_idx_arr,M,A
0,0,0,6508,1,1,2,0,10,금융,7,2381500,,,NONE,1400,1100,2025-07-29 08:54:30,2025-07-29 08:55:55,스마트피싱보호,2020-04-23 17:00:00,0,1,0.0,0.0,300,0.3,0,MEDIUM,2.000000,1.000000,SMALL,LOW,SMALL_LOW,1,0,0,0,0,1,SMALL,0,,0.0,1.0
1,1,1,6985,2,30,46,0,7,금융,7,1201938,,,NONE,2700,2300,2025-07-26 11:02:18,2025-08-24 05:39:13,DB손해보험 다이렉트,2020-06-30 15:00:00,0,29,0.0,0.0,400,0.2,0,MEDIUM,1.586207,1.034483,MEDIUM,LOW,MEDIUM_LOW,2,0,0,3,0,5,MEDIUM,0,,0.0,0.0
2,2,2,8327,1,6,11,0,10,미디어/컨텐츠,7,1673458,,,NONE,3200,2420,2025-08-02 08:30:52,2025-08-18 22:32:25,파일썬,2020-11-10 11:21:00,0,17,0.0,0.0,780,0.3,0,MEDIUM,0.647059,0.352941,SMALL,LOW,SMALL_LOW,1,0,0,3,0,4,SMALL,0,,0.0,1.0
3,3,3,9264,1,1,1,0,10,금융,7,2372800,,,NONE,11000,8250,2025-08-14 13:54:12,2025-08-14 13:54:12,하나 가득담은 운전자보험가입,2020-11-30 00:00:00,0,1,0.0,0.0,2750,0.3,0,LARGE,1.000000,1.000000,SMALL,LOW,SMALL_LOW,1,0,0,0,0,1,SMALL,0,,0.0,1.0
4,4,4,9716,2,4,6,1,8,생활,2,2699900,74.000000,74.0,NONE,600,336,2025-07-28 01:22:36,2025-07-29 07:14:36,베이비러브 참여 신청,2021-01-11 16:00:00,0,2,0.5,0.2,264,0.8,264,MEDIUM,3.000000,2.000000,MEDIUM,LOW,MEDIUM_LOW,2,0,1,1,0,4,SMALL,0,,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4709,4766,9512,446899,2,52,63,49,4,커머스,7,1369916,42.102041,39.0,ADS_CODE_DAILY_UPDATE,18,14,2025-08-25 09:40:45,2025-08-25 09:58:56,원스토몰 180212,2025-08-25 09:37:27,0,1,49.0,0.8,4,0.3,196,SMALL,63.000000,52.000000,LARGE,HIGH,LARGE_HIGH,2,3,2,0,2,9,LARGE,0,,0.0,0.0
4710,4767,9513,446900,2,44,59,41,4,식당/카페,7,1369915,48.512195,47.0,ADS_CODE_DAILY_UPDATE,18,14,2025-08-25 09:45:41,2025-08-25 09:56:38,서빙카트 식당 업소용 주방 서빙카 다용도 서빙카트 소형 2단 기본고무바퀴 180213,2025-08-25 09:42:08,0,1,41.0,0.7,4,0.3,164,SMALL,59.000000,44.000000,LARGE,HIGH,LARGE_HIGH,2,3,2,0,1,8,LARGE,0,,0.0,0.0
4711,4768,9514,446902,1,2,2,0,8,미디어/컨텐츠,7,2735900,,,NONE,200,105,2025-08-25 10:37:49,2025-08-25 11:11:29,[네이버카페가입인사] 대장TV,2025-08-25 10:00:00,0,1,0.0,0.0,95,0.9,0,MEDIUM,2.000000,2.000000,SMALL,LOW,SMALL_LOW,1,0,0,0,0,1,SMALL,0,,0.0,0.0
4712,4769,9515,446908,1,1,1,0,0,금융,2,1682806,,,NONE,230,180,2025-08-25 10:47:48,2025-08-25 10:47:48,NOL,2025-08-25 10:37:11,0,1,0.0,0.0,50,0.3,0,MEDIUM,1.000000,1.000000,SMALL,LOW,SMALL_LOW,1,0,0,0,0,1,SMALL,0,,0.0,0.0


In [91]:
print(click.head())
print(click.columns)

   Unnamed: 0                                 click_key  ads_idx   dvc_idx  \
0           0  000000d54b9faad47ee99d6cd3cf53894dd4baa5   313780  61906528   
1           1  000002b4d92f7648b455877c2676452efcd22a09   412426  34422806   
2           2  0000057e97361ff3d0263aaecee34cfaa3ba30fb   443660  38366075   
3           3  00000607f60139015da3ee1dd5499db3faa100dc   360192  61894110   
4           4  0000066bc25d4a6d147c27326cf972a4de88024e   372307  61956954   

   mda_idx  contract_price  media_price   click_day  click_time  \
0      539            6000         4500  2025-08-17          21   
1       58             180          170  2025-07-26           2   
2      808             170          120  2025-08-12          18   
3      539            6000         4500  2025-08-17           3   
4      539           15600        11700  2025-08-18           8   

            click_date     exp_day network          user_ip      rwd_idx  \
0  2025-08-17 21:07:37  2025-09-16       0    16.184

In [96]:
ads_list = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/광고도메인.csv")

In [97]:
import numpy as np, pandas as pd

# =========================
# 0) 하이퍼/기본 설정
# =========================
L = 30   # 과거 윈도우(피처)
H = 30   # 예측 기간(전환 라벨 기간, 시나리오 B의 일클릭*H에도 사용)
K = 50   # 코호트(top-K 유사 광고 수)
BETA = 1.0  # 광고 유사도 가중의 온도 (0.5~1.0 권장)
ALPHA_PRIOR, BETA_PRIOR = 2.0, 400.0  # Beta-Binomial 스무딩(라플라스)용 prior (대략 CVR~0.5% 가정)
EPS = 1e-12

# =========================
# 1) 유틸
# =========================
def _to_ts(x):
    return pd.to_datetime(x) if not np.issubdtype(pd.Series([x]).dtype, np.datetime64) else x

def _clr_block(df, eps=1e-6):
    if df.empty: return df
    Z = df.clip(lower=eps)
    g = np.exp(np.log(Z).mean(axis=1))
    return np.log(Z.div(g, axis=0))

def _cosine(a, B):
    a = a.reshape(1,-1)
    num = (B * a).sum(axis=1)
    den = (np.sqrt((B**2).sum(axis=1)) * np.sqrt((a**2).sum()))
    den = np.where(den==0, 1e-12, den)
    return (num/den).ravel()

# =========================
# 2) 광고 피처 만들기 (과거 광고 + 신규 광고)
#   - 카테/OS/도메인 분포(조성) -> CLR -> z-score
#   - 과거 광고는 클릭 로그로 계산(기간 L일)
#   - 신규 광고는 광고 메타 파일로 분포 생성
# =========================
def build_ad_feature_space(
    click_df: pd.DataFrame,
    ad_meta_df: pd.DataFrame,
    window_end=None, L_days=30,
    target_ads_id=None,
    domain_col_in_meta="domain", cat_col_in_meta="ads_category", os_col_in_meta="ads_os_type"
):
    df = click_df.copy()
    if not np.issubdtype(df["click_day"].dtype, np.datetime64):
        df["click_day"] = pd.to_datetime(df["click_day"])
    if window_end is None:
        window_end = df["click_day"].max()
    window_end = pd.to_datetime(window_end)
    start = window_end - pd.Timedelta(days=L_days-1)
    hist = df[(df["click_day"]>=start) & (df["click_day"]<=window_end)].copy()

    # 과거 광고들의 분포(클릭 기준)
    def _pct_ad(col):
        ct = pd.crosstab(hist["ads_idx"], hist[col])  # ads × value
        ct = ct.div(ct.sum(axis=1), axis=0).fillna(0.0)
        return ct

    dom_pct = _pct_ad("domain");       dom_pct.columns = [f"ad_domain_{c}_pct" for c in dom_pct.columns]
    cat_pct = _pct_ad("ads_category"); cat_pct.columns = [f"ad_cat_{c}_pct"     for c in cat_pct.columns]
    os_pct  = _pct_ad("ads_os_type");  os_pct.columns  = [f"ad_os_{c}_pct"      for c in os_pct.columns]

    # 과거 광고 특징 테이블
    A_hist = dom_pct.join([cat_pct, os_pct], how="outer").fillna(0.0)
    # CLR
    hist_blocks = [
        [c for c in A_hist.columns if c.startswith("ad_domain_")],
        [c for c in A_hist.columns if c.startswith("ad_cat_")],
        [c for c in A_hist.columns if c.startswith("ad_os_")],
    ]
    for blk in hist_blocks:
        if blk: A_hist[blk] = _clr_block(A_hist[blk])
    # z-score
    A_hist = (A_hist - A_hist.mean()) / (A_hist.std() + 1e-9)
    A_hist = A_hist.replace([np.inf,-np.inf],0).fillna(0.0)

    # 신규 광고(타깃)의 분포를 광고 메타에서 생성
    # - ad_meta_df에는 target_ads_id에 대한 행이 하나 이상 있을 수 있음(도메인 여러 개 등)
    # - 각 축별로 value_counts 비율을 만든다.
    meta = ad_meta_df.copy()
    if target_ads_id is not None and "ads_idx" in meta.columns:
        meta = meta[meta["ads_idx"]==target_ads_id].copy()
    # 도메인/카테/OS 비율
    def _ratio_from_meta(col):
        if col not in meta.columns or meta[col].dropna().empty:
            return {}
        vc = meta[col].dropna().astype(str).value_counts(normalize=True)
        return vc.to_dict()

    r_domain = _ratio_from_meta(domain_col_in_meta)
    r_cat    = _ratio_from_meta(cat_col_in_meta)
    r_os     = _ratio_from_meta(os_col_in_meta)

    # 히스토리에서 쓰인 열들 기준으로 벡터화를 맞춘다(없는 값은 0)
    target_series = pd.Series(dtype=float)
    for col in A_hist.columns:
        if col.startswith("ad_domain_"):
            key = col.replace("ad_domain_","")
            target_series[col] = r_domain.get(key, 0.0)
        elif col.startswith("ad_cat_"):
            key = col.replace("ad_cat_","")
            target_series[col] = r_cat.get(key, 0.0)
        elif col.startswith("ad_os_"):
            key = col.replace("ad_os_","")
            target_series[col] = r_os.get(key, 0.0)
        else:
            target_series[col] = 0.0
    target_series = target_series.to_frame().T
    # CLR -> z-score(A_hist의 통계 사용)
    for blk in hist_blocks:
        if blk:
            target_series[blk] = _clr_block(target_series[blk])
    target_series = (target_series - A_hist.mean()) / (A_hist.std() + 1e-9)
    target_series = target_series.replace([np.inf,-np.inf],0).fillna(0.0)
    target_series.index = [("target" if target_ads_id is None else int(target_ads_id))]

    # 광고 피처 공간(과거 광고 + 타깃 1개)
    A_all = pd.concat([A_hist, target_series], axis=0)
    return A_all, window_end

# =========================
# 3) 유사 광고 코호트 선택
# =========================
def pick_similar_ads(A_all: pd.DataFrame, target_id, K=50, beta=1.0):
    target_vec = A_all.loc[target_id].values
    pool = A_all.drop(index=target_id)
    sims = _cosine(target_vec, pool.values)
    pool = pool.copy()
    pool["sim"] = sims
    pool = pool.sort_values("sim", ascending=False).head(K)
    # 가중치: sim^beta / 합
    w = np.power(np.clip(pool["sim"].values, 0, 1), beta)
    w = w / (w.sum() + 1e-12)
    pool["weight"] = w
    return pool[["weight","sim"]]

# =========================
# 4) 코호트 → 매체별 CVR 추정 (가중 평균 + 다단계 스무딩)
#    - cohort의 (ads_idx, mda_idx)에서 L일 라벨이 아닌 "과거" 전환/클릭 집계 사용 금지
#    - 여기선 간단히: 과거 L일 같은 창을 사용(라벨 누출 방지)
# =========================
def estimate_media_cvr_for_target(
    click_df: pd.DataFrame,
    cohort_weights: pd.DataFrame,  # index: ads_idx, cols: weight, sim
    window_end, L_days=30,
    target_meta_cat=None,  # 신규 광고의 카테고리 (있으면 매체×카테 baseline에 사용)
    alpha_prior=2.0, beta_prior=400.0
):
    df = click_df.copy()
    if not np.issubdtype(df["click_day"].dtype, np.datetime64):
        df["click_day"] = pd.to_datetime(df["click_day"])
    start = pd.to_datetime(window_end) - pd.Timedelta(days=L_days-1)
    hist = df[(df["click_day"]>=start) & (df["click_day"]<=window_end)].copy()

    # 4-1) 코호트 ads만 남기고 매체별 클릭/전환 집계
    cohort_ids = cohort_weights.index.astype(int).tolist()
    Hc = hist[hist["ads_idx"].isin(cohort_ids)].copy()
    g = Hc.groupby(["ads_idx","mda_idx"]).agg(
        clicks=("click_key","count"),
        convs=("conversion","sum")
    ).reset_index()

    # 4-2) 가중 합계(ads별 weight 적용)
    w_map = cohort_weights["weight"].to_dict()
    g["w"] = g["ads_idx"].map(w_map).fillna(0.0)
    g["w_clicks"] = g["w"] * g["clicks"]
    g["w_convs"]  = g["w"] * g["convs"]
    agg = g.groupby("mda_idx").agg(
        clicks_w=("w_clicks","sum"),
        convs_w=("w_convs","sum")
    )

    # 4-3) 베이스라인들
    #   (a) 매체 글로벌 CVR
    base_m = hist.groupby("mda_idx").agg(
        clicks=("click_key","count"),
        convs=("conversion","sum")
    )
    base_m["cvr_m"] = (base_m["convs"] + alpha_prior) / (base_m["clicks"] + alpha_prior + beta_prior)

    #   (b) 매체×카테 CVR (있으면)
    if target_meta_cat is not None and "ads_category" in hist.columns:
        sub = hist[hist["ads_category"]==target_meta_cat]
        base_mc = sub.groupby("mda_idx").agg(
            clicks=("click_key","count"),
            convs=("conversion","sum")
        )
        base_mc["cvr_mc"] = (base_mc["convs"] + alpha_prior) / (base_mc["clicks"] + alpha_prior + beta_prior)
    else:
        base_mc = pd.DataFrame(columns=["cvr_mc"])

    # 4-4) 코호트 추정 CVR (Beta-Binomial 스무딩)
    out = agg.join(base_m[["cvr_m"]], how="outer").join(base_mc[["cvr_mc"]], how="left")
    out = out.fillna({"clicks_w":0.0, "convs_w":0.0})
    # 코호트 가중 누적을 "유효 클릭"으로 보고 prior를 더함
    out["cvr_cohort"] = (out["convs_w"] + alpha_prior) / (out["clicks_w"] + alpha_prior + beta_prior)
    # 다단계 스무딩: 코호트 ↔ 매체×카테 ↔ 매체글로벌 ↔ 전역
    global_cvr = (hist["conversion"].sum() + alpha_prior) / (len(hist) + alpha_prior + beta_prior)

    # 증거량 기반 가중(간단): eff_clicks = clicks_w
    eff = out["clicks_w"].fillna(0.0)
    w1 = (eff / (eff + 50)).fillna(0.0)              # 코호트 신뢰(50은 임계; 조절)
    # cvr_mc 있으면 더 가중, 없으면 cvr_m로
    cvr_mc = out["cvr_mc"].fillna(np.nan)
    has_mc = cvr_mc.notna().astype(float)
    # 최종 CVR
    out["pred_cvr"] = (
        w1 * out["cvr_cohort"] +
        (1-w1) * ( has_mc * cvr_mc.fillna(0) + (1-has_mc) * out["cvr_m"].fillna(global_cvr) )
    )
    out["pred_cvr"] = out["pred_cvr"].fillna(global_cvr)

    # 보조 컬럼
    out["coverage_ads"] = (g.groupby("mda_idx")["ads_idx"].nunique()).reindex(out.index).fillna(0).astype(int)
    out["cohort_eff_clicks"] = eff
    return out[["pred_cvr","cvr_cohort","cvr_m","cvr_mc","coverage_ads","cohort_eff_clicks"]].reset_index()

# =========================
# 5) 전환수 시나리오 산출
#   - A: per_1000_clicks (클릭 1000건 보낼 경우)
#   - B: 코호트 평균 일클릭 × H일
# =========================
def scenario_conversions(pred_cvr_df: pd.DataFrame, click_df: pd.DataFrame,
                         cohort_ads: pd.Index, H_days=30):
    # 코호트의 매체별 "일평균 클릭" 추정
    df = click_df.copy()
    if not np.issubdtype(df["click_day"].dtype, np.datetime64):
        df["click_day"] = pd.to_datetime(df["click_day"])
    # 코호트 ads만
    C = df[df["ads_idx"].isin(cohort_ads)].copy()
    agg = C.groupby(["mda_idx","click_day"]).size().rename("clk").reset_index()
    per_day = agg.groupby("mda_idx")["clk"].mean()  # 코호트 기준 일평균 클릭
    out = pred_cvr_df.copy()
    out["per_1000_clicks_conv"] = out["pred_cvr"] * 1000.0
    out["scenarioB_clicks"] = per_day.reindex(out["mda_idx"]).fillna(0.0).values * H_days
    out["scenarioB_conv"] = out["pred_cvr"] * out["scenarioB_clicks"]
    return out

# =========================
# 6) 전체 파이프라인 함수
# =========================
def predict_for_new_ad(
    click_df, ad_meta_df, target_ads_id,
    L_days=30, H_days=30, K=50, beta=1.0,
    target_category_col="ads_category"  # ad_meta에서 찾을 컬럼명
):
    # 광고 피처 공간
    A_all, wend = build_ad_feature_space(
        click_df, ad_meta_df, window_end=None, L_days=L_days, target_ads_id=target_ads_id
    )
    # 코호트 선택
    cohort = pick_similar_ads(A_all, target_id=target_ads_id, K=K, beta=beta)
    # 타깃 카테고리(있으면 꺼내기)
    if target_category_col in ad_meta_df.columns:
        cat_val = (ad_meta_df.loc[ad_meta_df["ads_idx"]==target_ads_id, target_category_col]
                   .dropna().astype(int))
        target_cat = int(cat_val.mode().iat[0]) if not cat_val.empty else None
    else:
        target_cat = None
    # 매체별 CVR 추정
    pred_cvr = estimate_media_cvr_for_target(
        click_df, cohort, wend, L_days=L_days, target_meta_cat=target_cat,
        alpha_prior=ALPHA_PRIOR, beta_prior=BETA_PRIOR
    )
    # 시나리오 산출
    pred_all = scenario_conversions(pred_cvr, click_df, cohort.index, H_days=H_days)
    # 정리
    pred_all = pred_all.sort_values("per_1000_clicks_conv", ascending=False).reset_index(drop=True)
    return pred_all, cohort


In [99]:


# 2) 타깃 광고 ID 지정 (신규 광고의 ads_idx)
TARGET_AD = 9982  # 예시값; 실제 ID로 바꿔

# 3) 실행
pred_df, cohort_info = predict_for_new_ad(
    click_df=click,           # 너의 클릭 DF
    ad_meta_df=ads_list,       # 방금 로드한 광고 메타 DF
    target_ads_id=TARGET_AD,
    L_days=L, H_days=H,
    K=K, beta=BETA
)

# 4) 결과 보기
pred_df.head(20)[[
    "mda_idx","pred_cvr","per_1000_clicks_conv","scenarioB_clicks","scenarioB_conv",
    "coverage_ads","cohort_eff_clicks","cvr_cohort","cvr_m","cvr_mc"
]]


ValueError: operands could not be broadcast together with shapes (9149,31) (1,62) 

In [100]:
# =========================
# Cohort 기반 예측 (빠른 버전 + 캐싱)
# =========================
import numpy as np, pandas as pd

# ---- 전역 캐시: (L_days, window_end_date) -> {A_hist_z, mu, sigma, cols, blocks}
_AD_HIST_CACHE: dict = {}

# ---- 하이퍼
ALPHA_PRIOR, BETA_PRIOR = 2.0, 400.0   # Beta-Binomial 스무딩(prior)
EPS = 1e-12

# ---------- 유틸 ----------
def _clr_block(df, eps=1e-6):
    if df.empty: return df
    Z = df.clip(lower=eps)
    g = np.exp(np.log(Z).mean(axis=1))
    return np.log(Z.div(g, axis=0))

def _drop_rare_columns(df, min_frac=1e-3):
    # 전체 샘플 중 0이 아닌 행 비율이 너무 낮은 열 제거(차원↓→속도↑)
    if df.empty: return df
    col_nz = (df != 0).sum(axis=0)
    keep = col_nz[col_nz >= max(1, int(len(df)*min_frac))].index
    return df[keep]

def clear_ad_hist_cache():
    _AD_HIST_CACHE.clear()

# ---------- 1) 광고 히스토리 피처 (창별 캐싱) ----------
def _build_or_get_ad_hist_space(click_df: pd.DataFrame, window_end=None, L_days=30,
                                min_frac=1e-3, use_float32=True):
    """창(L_days, window_end) 기준으로 광고 히스토리 피처 공간을 캐싱해 재사용"""
    df = click_df.copy()
    if not np.issubdtype(df["click_day"].dtype, np.datetime64):
        df["click_day"] = pd.to_datetime(df["click_day"])
    if window_end is None:
        window_end = df["click_day"].max()
    window_end = pd.to_datetime(window_end).normalize()
    start = window_end - pd.Timedelta(days=L_days-1)

    cache_key = (int(L_days), pd.Timestamp(window_end).date())
    if cache_key in _AD_HIST_CACHE:
        return _AD_HIST_CACHE[cache_key], window_end

    hist = df[(df["click_day"]>=start) & (df["click_day"]<=window_end)].copy()

    # 광고×도메인/카테/OS 분포(클릭 비율)
    def _ad_pct(col, prefix):
        ct = pd.crosstab(hist["ads_idx"], hist[col])
        ct = _drop_rare_columns(ct, min_frac=min_frac)
        ct = ct.div(ct.sum(axis=1), axis=0).fillna(0.0)
        ct.columns = [f"{prefix}{c}_pct" for c in ct.columns]
        return ct

    dom_pct = _ad_pct("domain", "ad_domain_")
    cat_pct = _ad_pct("ads_category", "ad_cat_")
    os_pct  = _ad_pct("ads_os_type", "ad_os_")

    A_hist = dom_pct.join([cat_pct, os_pct], how="outer").fillna(0.0)

    # 블록 구분
    blocks = {
        "domain": [c for c in A_hist.columns if c.startswith("ad_domain_")],
        "cat":    [c for c in A_hist.columns if c.startswith("ad_cat_")],
        "os":     [c for c in A_hist.columns if c.startswith("ad_os_")],
    }

    # CLR
    for cols in blocks.values():
        if cols: A_hist[cols] = _clr_block(A_hist[cols])

    # z-score 기준(히스토리 통계)
    mu = A_hist.mean()
    sigma = A_hist.std().replace(0, 1.0)  # 0 분산 보호
    A_hist_z = (A_hist - mu) / (sigma + 1e-9)

    if use_float32:
        A_hist_z = A_hist_z.astype(np.float32)
        mu = mu.astype(np.float32); sigma = sigma.astype(np.float32)

    store = {"A_hist_z": A_hist_z, "mu": mu, "sigma": sigma,
             "cols": A_hist_z.columns.tolist(), "blocks": blocks}
    _AD_HIST_CACHE[cache_key] = store
    return store, window_end

# ---------- 2) 광고 메타 → 타깃 벡터 만들기(히스토리 통계로 정규화) ----------
def _target_ad_vector_from_meta(ad_meta_df: pd.DataFrame, hist_store: dict,
                                target_ads_id=None,
                                domain_col="domain", cat_col="ads_category", os_col="ads_os_type",
                                use_float32=True):
    cols = hist_store["cols"]
    blocks = hist_store["blocks"]
    mu, sigma = hist_store["mu"], hist_store["sigma"]

    meta = ad_meta_df.copy()
    if target_ads_id is not None and "ads_idx" in meta.columns:
        meta = meta[meta["ads_idx"]==target_ads_id].copy()

    def _ratio(col):
        if col not in meta.columns or meta[col].dropna().empty:
            return {}
        vc = meta[col].dropna().astype(str).value_counts(normalize=True)
        return vc.to_dict()

    r_domain = _ratio(domain_col)
    r_cat    = _ratio(cat_col)
    r_os     = _ratio(os_col)

    s = pd.Series(0.0, index=cols, dtype=float)

    for c in cols:
        if c.startswith("ad_domain_"):
            key = c.replace("ad_domain_", "").replace("_pct","")
            s[c] = r_domain.get(key, 0.0)
        elif c.startswith("ad_cat_"):
            key = c.replace("ad_cat_", "").replace("_pct","")
            s[c] = r_cat.get(key, 0.0)
        elif c.startswith("ad_os_"):
            key = c.replace("ad_os_", "").replace("_pct","")
            s[c] = r_os.get(key, 0.0)

    # CLR: 블록 단위로
    df_t = s.to_frame().T
    if blocks["domain"]:
        df_t[blocks["domain"]] = _clr_block(df_t[blocks["domain"]])
    if blocks["cat"]:
        df_t[blocks["cat"]] = _clr_block(df_t[blocks["cat"]])
    if blocks["os"]:
        df_t[blocks["os"]] = _clr_block(df_t[blocks["os"]])

    # z-score: 히스토리 mu/sigma 사용
    t_z = (df_t.iloc[0] - mu) / (sigma + 1e-9)
    if use_float32:
        t_z = t_z.astype(np.float32)
    # 타깃 행 라벨은 충돌 없게 특별 라벨 사용
    label = f"__target_{'NA' if target_ads_id is None else int(target_ads_id)}__"
    t_z.name = label
    return t_z

# ---------- 3) 안전한 유사 광고 Top-K ----------
def pick_similar_ads(A_all: pd.DataFrame, target_label, K=30, beta=1.0):
    # 숫자 컬럼만
    cols = A_all.select_dtypes(include=[np.number]).columns
    A = A_all[cols]

    if target_label not in A.index:
        raise ValueError(f"target '{target_label}' not in feature space")

    # 타깃 한 행만 뽑고 제거
    target_row = A.loc[target_label]
    pool = A.drop(index=target_label)

    a = target_row.values.astype(float)
    B = pool.values.astype(float)

    a_norm = np.sqrt((a*a).sum()) + 1e-12
    B_norm = np.sqrt((B*B).sum(axis=1)) + 1e-12
    sims = (B @ a) / (B_norm * a_norm)

    pool = pool.copy()
    pool["sim"] = sims
    pool = pool.sort_values("sim", ascending=False).head(K)

    w = np.power(np.clip(pool["sim"].values, 0, 1), beta)
    w = w / (w.sum() + 1e-12)
    pool["weight"] = w
    return pool[["weight","sim"]]

# ---------- 4) 코호트 → 매체별 CVR 추정 ----------
def estimate_media_cvr_for_target(
    click_df: pd.DataFrame,
    cohort_weights: pd.DataFrame,  # index=ads_idx, cols: weight
    window_end, L_days=30,
    target_meta_cat=None,
    alpha_prior=ALPHA_PRIOR, beta_prior=BETA_PRIOR
):
    df = click_df.copy()
    if not np.issubdtype(df["click_day"].dtype, np.datetime64):
        df["click_day"] = pd.to_datetime(df["click_day"])

    window_end = pd.to_datetime(window_end)
    start = window_end - pd.Timedelta(days=L_days-1)
    hist = df[(df["click_day"]>=start) & (df["click_day"]<=window_end)].copy()

    cohort_ids = cohort_weights.index.astype(int).tolist()
    Hc = hist[hist["ads_idx"].isin(cohort_ids)].copy()

    # (ads, mda) 집계
    g = Hc.groupby(["ads_idx","mda_idx"]).agg(
        clicks=("click_key","count"),
        convs=("conversion","sum")
    ).reset_index()

    w_map = cohort_weights["weight"].to_dict()
    g["w"] = g["ads_idx"].map(w_map).fillna(0.0)
    g["w_clicks"] = g["w"] * g["clicks"]
    g["w_convs"]  = g["w"] * g["convs"]

    agg = g.groupby("mda_idx").agg(
        clicks_w=("w_clicks","sum"),
        convs_w=("w_convs","sum"),
        cohort_ads=("ads_idx","nunique")
    )

    # 매체 별 글로벌 CVR
    base_m = hist.groupby("mda_idx").agg(
        clicks=("click_key","count"),
        convs=("conversion","sum")
    )
    base_m["cvr_m"] = (base_m["convs"] + alpha_prior) / (base_m["clicks"] + alpha_prior + beta_prior)

    # 매체×카테 CVR (있으면)
    if (target_meta_cat is not None) and ("ads_category" in hist.columns):
        sub = hist[hist["ads_category"]==target_meta_cat]
        base_mc = sub.groupby("mda_idx").agg(
            clicks=("click_key","count"),
            convs=("conversion","sum")
        )
        base_mc["cvr_mc"] = (base_mc["convs"] + alpha_prior) / (base_mc["clicks"] + alpha_prior + beta_prior)
    else:
        base_mc = pd.DataFrame()

    out = agg.join(base_m[["cvr_m"]], how="outer").join(base_mc[["cvr_mc"]], how="left").fillna(0.0)

    # 코호트 CVR (스무딩)
    out["cvr_cohort"] = (out["convs_w"] + alpha_prior) / (out["clicks_w"] + alpha_prior + beta_prior)
    global_cvr = (hist["conversion"].sum() + alpha_prior) / (len(hist) + alpha_prior + beta_prior)

    eff = out["clicks_w"]
    w1 = (eff / (eff + 50.0)).fillna(0.0)  # 증거량에 따른 신뢰 가중
    cvr_mc = out["cvr_mc"].replace(0.0, np.nan)

    out["pred_cvr"] = (
        w1 * out["cvr_cohort"] +
        (1.0 - w1) * ( cvr_mc.fillna(out["cvr_m"].replace(0.0, global_cvr)) )
    )
    out["pred_cvr"] = out["pred_cvr"].replace(0.0, global_cvr)
    out.rename(columns={"cohort_ads":"coverage_ads","clicks_w":"cohort_eff_clicks"}, inplace=True)

    return out.reset_index()[["mda_idx","pred_cvr","cvr_cohort","cvr_m","cvr_mc","coverage_ads","cohort_eff_clicks"]]

# ---------- 5) 시나리오 산출 ----------
def scenario_conversions(pred_cvr_df: pd.DataFrame, click_df: pd.DataFrame,
                         cohort_ads: pd.Index, H_days=30):
    df = click_df.copy()
    if not np.issubdtype(df["click_day"].dtype, np.datetime64):
        df["click_day"] = pd.to_datetime(df["click_day"])
    C = df[df["ads_idx"].isin(cohort_ads)].copy()
    # 코호트의 매체별 일평균 클릭
    agg = C.groupby(["mda_idx","click_day"]).size().rename("clk").reset_index()
    per_day = agg.groupby("mda_idx")["clk"].mean()

    out = pred_cvr_df.copy()
    out["per_1000_clicks_conv"] = out["pred_cvr"] * 1000.0
    out["scenarioB_clicks"] = per_day.reindex(out["mda_idx"]).fillna(0.0).values * H_days
    out["scenarioB_conv"] = out["pred_cvr"] * out["scenarioB_clicks"]
    return out

# ---------- 6) 전체 파이프라인 ----------
def predict_for_new_ad(
    click_df, ad_meta_df, target_ads_id,
    L_days=30, H_days=30, K=30, beta=1.0,
    target_category_col="ads_category"
):
    # 6-1) 창별 히스토리 공간 가져오기(캐시)
    hist_store, wend = _build_or_get_ad_hist_space(click_df, window_end=None, L_days=L_days)

    # 6-2) 타깃 벡터 생성 & 결합
    t_vec = _target_ad_vector_from_meta(ad_meta_df, hist_store, target_ads_id)
    target_label = t_vec.name
    A_all = pd.concat([hist_store["A_hist_z"], t_vec.to_frame().T], axis=0)

    # 6-3) 유사 광고 코호트
    cohort = pick_similar_ads(A_all, target_label=target_label, K=K, beta=beta)

    # 6-4) 타깃 카테고리(있으면)
    if (target_category_col in ad_meta_df.columns) and \
       (not ad_meta_df.loc[ad_meta_df.get("ads_idx", pd.Series([target_ads_id]*len(ad_meta_df)))==target_ads_id,
                           target_category_col].dropna().empty):
        target_cat = int(ad_meta_df.loc[ad_meta_df["ads_idx"]==target_ads_id, target_category_col]
                         .dropna().astype(int).mode().iat[0])
    else:
        target_cat = None

    # 6-5) 매체별 CVR 추정
    pred_cvr = estimate_media_cvr_for_target(click_df, cohort, wend, L_days=L_days, target_meta_cat=target_cat)

    # 6-6) 시나리오 산출
    pred_all = scenario_conversions(pred_cvr, click_df, cohort.index, H_days=H_days)

    # 6-7) 정렬/컬럼
    pred_all = pred_all.sort_values("per_1000_clicks_conv", ascending=False).reset_index(drop=True)
    return pred_all, cohort

# ---------- 사용 예 ----------
# ad_meta = pd.read_csv("/mnt/data/광고도메인리스트.csv")
# TARGET_AD = 9982
# pred_df, cohort_info = predict_for_new_ad(click, ad_meta, target_ads_id=TARGET_AD,
#                                           L_days=30, H_days=30, K=30, beta=1.0)
# pred_df.head(20)[["mda_idx","pred_cvr","per_1000_clicks_conv","scenarioB_clicks","scenarioB_conv",
#                   "coverage_ads","cohort_eff_clicks","cvr_cohort","cvr_m","cvr_mc"]]


In [None]:

TARGET_AD = 9982
pred_df, cohort_info = predict_for_new_ad(click, ads_list, target_ads_id=TARGET_AD,
                                          L_days=30, H_days=30, K=30, beta=1.0)
pred_df.head(20)[["mda_idx","pred_cvr","per_1000_clicks_conv","scenarioB_clicks","scenarioB_conv",
                  "coverage_ads","cohort_eff_clicks","cvr_cohort","cvr_m","cvr_mc"]]

Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv,coverage_ads,cohort_eff_clicks,cvr_cohort,cvr_m,cvr_mc
0,396,0.57641,576.410198,0.0,0.0,0.0,0.0,0.004975,0.57641,0.0
1,344,0.568879,568.879037,0.0,0.0,0.0,0.0,0.004975,0.568879,0.0
2,563,0.567196,567.19614,397.5,225.460466,2.0,0.258468,0.005287,0.570101,0.0
3,108,0.549722,549.721515,60.0,32.983291,1.0,0.006675,0.004983,0.549794,0.0
4,562,0.543693,543.693225,1980.0,1076.512586,1.0,1.512734,0.008206,0.559894,0.0
5,342,0.537397,537.397427,0.0,0.0,0.0,0.0,0.004975,0.537397,0.0
6,397,0.516583,516.583445,0.0,0.0,0.0,0.0,0.004975,0.516583,0.0
7,343,0.496358,496.358432,0.0,0.0,0.0,0.0,0.004975,0.496358,0.0
8,1025,0.388488,388.487614,1082.0,420.343598,5.0,46.280801,0.103379,0.508664,0.652389
9,678,0.381568,381.568295,3676.8,1402.950307,4.0,72.481616,0.142278,0.534237,0.728451


In [104]:
TARGET_AD = 9982
pred_df, cohort_info = predict_for_new_ad(click, ads_list, target_ads_id=TARGET_AD,
                                          L_days=30, H_days=7, K=5, beta=1.0)
pred_df.head(20)[["mda_idx","pred_cvr","per_1000_clicks_conv","scenarioB_clicks","scenarioB_conv",
                  "coverage_ads","cohort_eff_clicks","cvr_cohort","cvr_m","cvr_mc"]]

Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv,coverage_ads,cohort_eff_clicks,cvr_cohort,cvr_m,cvr_mc
0,396,0.57641,576.410198,0.0,0.0,0.0,0.0,0.004975,0.57641,0.0
1,563,0.570101,570.100846,0.0,0.0,0.0,0.0,0.004975,0.570101,0.0
2,344,0.568879,568.879037,0.0,0.0,0.0,0.0,0.004975,0.568879,0.0
3,562,0.559894,559.894235,0.0,0.0,0.0,0.0,0.004975,0.559894,0.0
4,108,0.549794,549.794239,0.0,0.0,0.0,0.0,0.004975,0.549794,0.0
5,1032,0.539743,539.742952,7.0,3.778201,1.0,0.201059,0.004973,0.640547,0.541893
6,342,0.537397,537.397427,0.0,0.0,0.0,0.0,0.004975,0.537397,0.0
7,397,0.516583,516.583445,0.0,0.0,0.0,0.0,0.004975,0.516583,0.0
8,1047,0.503091,503.091442,0.0,0.0,0.0,0.0,0.004975,0.493925,0.503091
9,343,0.496358,496.358432,0.0,0.0,0.0,0.0,0.004975,0.496358,0.0


In [102]:
ads_9982_pf

Unnamed: 0,ads_idx,mda_idx,total_clicks,total_conversions,contract_price,media_price,domain,ads_category,cvr,profit_per_conversion,total_profit,first_click,last_click,days_active_calc,daily_clicks,daily_conversions,daily_profit,배분그룹
0,9982,1020,2949,1570,230,160,미디어/컨텐츠,3,0.5324,70,109900,2025-07-26 03:24:26,2025-08-25 11:21:22,31,95.129032,50.645161,3545.16129,잘 배분
1,9982,1047,2458,1443,230,160,미디어/컨텐츠,3,0.5871,70,101010,2025-07-26 03:16:33,2025-08-25 11:21:15,31,79.290323,46.548387,3258.387097,잘 배분
2,9982,845,2502,1433,230,160,미디어/컨텐츠,3,0.5727,70,100310,2025-07-26 03:14:53,2025-08-25 11:23:37,31,80.709677,46.225806,3235.806452,잘 배분
3,9982,824,2428,1392,230,160,미디어/컨텐츠,3,0.5733,70,97440,2025-07-26 03:18:25,2025-08-25 11:23:42,31,78.322581,44.903226,3143.225806,잘 배분
4,9982,1022,2017,1143,230,160,미디어/컨텐츠,3,0.5667,70,80010,2025-07-26 03:16:18,2025-08-25 11:21:27,31,65.064516,36.870968,2580.967742,잘 배분
5,9982,1046,2451,1137,230,160,미디어/컨텐츠,3,0.4639,70,79590,2025-07-26 03:16:36,2025-08-25 11:23:10,31,79.064516,36.677419,2567.419355,잘 배분
6,9982,1021,1926,1100,230,160,미디어/컨텐츠,3,0.5711,70,77000,2025-07-26 03:10:52,2025-08-25 11:22:40,31,62.129032,35.483871,2483.870968,잘 배분
7,9982,851,1753,768,230,160,미디어/컨텐츠,3,0.4381,70,53760,2025-07-26 03:15:41,2025-08-25 11:21:54,31,56.548387,24.774194,1734.193548,잘 배분
8,9982,850,1696,738,230,160,미디어/컨텐츠,3,0.4351,70,51660,2025-07-26 03:30:15,2025-08-25 11:17:11,31,54.709677,23.806452,1666.451613,잘 배분
9,9982,1032,451,434,230,160,미디어/컨텐츠,3,0.9623,70,30380,2025-07-26 04:28:07,2025-08-25 11:14:38,31,14.548387,14.0,980.0,잘 배분


In [113]:
import numpy as np, pandas as pd
from scipy.sparse import csr_matrix

# ========== 경로 ==========
PERF_CSV = "/Users/Jiwon/Documents/GitHub/final_project/Jiwon/수정_시간별적립보고서(최종).csv"   # rpt_time_date, ads_idx, mda_idx, rpt_time_clk, rpt_time_turn
META_CSV = "/Users/Jiwon/Documents/GitHub/final_project/Jiwon/광고도메인리스트.csv"             # ads_idx, ads_category, ads_os_type, domain ...

# ========== 내부 유틸 ==========
ALPHA_PRIOR, BETA_PRIOR = 2.0, 400.0
EPS = 1e-12
_HOURLY_CACHE = {}   # (L_days, window_end_date) -> store

def _clr_block(df, eps=1e-6):
    if df.empty: return df
    Z = df.clip(lower=eps)
    g = np.exp(np.log(Z).mean(axis=1))
    return np.log(Z.div(g, axis=0))

def clear_hourly_cache():
    _HOURLY_CACHE.clear()

# 광고 메타 → ads별 분포(도메인/카테/OS) 만들기
def _ad_meta_pct(df, col, prefix):
    df = df.dropna(subset=[col])
    vc = (df.groupby(["ads_idx", col]).size().rename("cnt").reset_index())
    total = vc.groupby("ads_idx")["cnt"].transform("sum")
    vc["pct"] = vc["cnt"] / total
    piv = vc.pivot(index="ads_idx", columns=col, values="pct").fillna(0.0)
    piv.columns = [f"{prefix}{c}_pct" for c in piv.columns]
    return piv

# 창별 스토어(임베딩 + CSR + 베이스라인) 만들기/캐시
def _build_store_from_hourly(perf_df, meta_df, L_days=30, window_end=None, min_frac=0.002):
    perf = perf_df.copy()
    if not np.issubdtype(perf["rpt_time_date"].dtype, np.datetime64):
        perf["rpt_time_date"] = pd.to_datetime(perf["rpt_time_date"])
    perf = perf.rename(columns={"rpt_time_clk":"clicks", "rpt_time_turn":"conversions"})

    if window_end is None:
        window_end = perf["rpt_time_date"].max()
    window_end = pd.to_datetime(window_end).normalize()
    start = window_end - pd.Timedelta(days=L_days-1)
    hist = perf[(perf["rpt_time_date"]>=start) & (perf["rpt_time_date"]<=window_end)].copy()

    key = (int(L_days), pd.Timestamp(window_end).date())
    if key in _HOURLY_CACHE:
        return _HOURLY_CACHE[key]

    # --- 광고 임베딩 (메타만 사용 → 매우 빠름)
    dom_pct = _ad_meta_pct(meta_df, "domain", "ad_domain_")
    cat_pct = _ad_meta_pct(meta_df, "ads_category", "ad_cat_")
    os_pct  = _ad_meta_pct(meta_df, "ads_os_type", "ad_os_")
    A = dom_pct.join([cat_pct, os_pct], how="outer").fillna(0.0)

    dom_cols = [c for c in A.columns if c.startswith("ad_domain_")]
    cat_cols = [c for c in A.columns if c.startswith("ad_cat_")]
    os_cols  = [c for c in A.columns if c.startswith("ad_os_")]

    for cols in [dom_cols, cat_cols, os_cols]:
        if cols: A[cols] = _clr_block(A[cols])
    mu, sigma = A.mean(), A.std().replace(0, 1.0)
    A_z = ((A - mu) / (sigma + 1e-9)).astype(np.float32)

    # 창에 실제로 등장한 광고로만 축소(속도↑)
    A_z = A_z.loc[A_z.index.intersection(hist["ads_idx"].unique())]

    # --- (ads × mda) 집계 → CSR
    grp = (hist.groupby(["ads_idx","mda_idx"])
               .agg(clicks=("clicks","sum"), convs=("conversions","sum"))
               .reset_index())
    ads_uni = pd.Index(grp["ads_idx"].unique()).sort_values()
    mda_uni = pd.Index(grp["mda_idx"].unique()).sort_values()
    n_ads, n_mda = len(ads_uni), len(mda_uni)
    ar = ads_uni.get_indexer(grp["ads_idx"])
    mr = mda_uni.get_indexer(grp["mda_idx"])
    clicks_csr = csr_matrix((grp["clicks"].astype(np.float32).values, (ar, mr)), shape=(n_ads, n_mda))
    convs_csr  = csr_matrix((grp["convs"].astype(np.float32).values,  (ar, mr)), shape=(n_ads, n_mda))

    # --- 매체 베이스라인
    m_clicks = np.asarray(clicks_csr.sum(0)).ravel()
    m_convs  = np.asarray(convs_csr.sum(0)).ravel()
    cvr_m = (m_convs + ALPHA_PRIOR) / (m_clicks + ALPHA_PRIOR + BETA_PRIOR)

    # 매체×카테고리 베이스라인 (meta와 결합)
    cats = meta_df[["ads_idx","ads_category"]].dropna().drop_duplicates()
    grp_cat = grp.merge(cats, on="ads_idx", how="left")
    m_clicks_by_cat, m_convs_by_cat = {}, {}
    for c in grp_cat["ads_category"].dropna().unique():
        sub = grp_cat[grp_cat["ads_category"]==c]
        if sub.empty: continue
        ar2 = ads_uni.get_indexer(sub["ads_idx"]); mr2 = mda_uni.get_indexer(sub["mda_idx"])
        cc = csr_matrix((sub["clicks"].astype(np.float32).values, (ar2, mr2)), shape=(n_ads, n_mda))
        cv = csr_matrix((sub["convs"].astype(np.float32).values,  (ar2, mr2)), shape=(n_ads, n_mda))
        m_clicks_by_cat[int(c)] = np.asarray(cc.sum(0)).ravel()
        m_convs_by_cat[int(c)]  = np.asarray(cv.sum(0)).ravel()

    store = {
        "L_days": int(L_days), "window_end": window_end,
        "A_z": A_z, "A_cols": A_z.columns.tolist(), "A_mu": mu, "A_sigma": sigma,
        "ads_index": ads_uni, "mda_index": mda_uni,
        "clicks_csr": clicks_csr, "convs_csr": convs_csr,
        "cvr_m": cvr_m, "m_clicks_by_cat": m_clicks_by_cat, "m_convs_by_cat": m_convs_by_cat
    }
    _HOURLY_CACHE[key] = store
    return store

# 타깃 광고 벡터(메타에서 생성 → CLR → z-score → 코사인 정규화)
def _target_vec_from_meta(meta_df, store, target_ads_id):
    cols, mu, sigma = store["A_cols"], store["A_mu"], store["A_sigma"]
    rows = meta_df[meta_df["ads_idx"]==target_ads_id]
    s = pd.Series(0.0, index=cols, dtype=np.float32)

    def _ratio(df, col):
        if col not in df.columns or df[col].dropna().empty: return {}
        return df[col].dropna().astype(str).value_counts(normalize=True).to_dict()

    r_dom = _ratio(rows, "domain"); r_cat = _ratio(rows, "ads_category"); r_os = _ratio(rows, "ads_os_type")
    for c in cols:
        if c.startswith("ad_domain_"): s[c] = r_dom.get(c.replace("ad_domain_","").replace("_pct",""), 0.0)
        elif c.startswith("ad_cat_"):  s[c] = r_cat.get(c.replace("ad_cat_","").replace("_pct",""), 0.0)
        elif c.startswith("ad_os_"):   s[c] = r_os.get(c.replace("ad_os_","").replace("_pct",""), 0.0)

    df_t = s.to_frame().T
    dom = [c for c in cols if c.startswith("ad_domain_")]
    cat = [c for c in cols if c.startswith("ad_cat_")]
    os_ = [c for c in cols if c.startswith("ad_os_")]
    if dom: df_t[dom] = _clr_block(df_t[dom])
    if cat: df_t[cat] = _clr_block(df_t[cat])
    if os_: df_t[os_] = _clr_block(df_t[os_])

    z = (df_t.iloc[0] - mu) / (sigma + 1e-9)
    a = z.values.astype(np.float32)
    a = a / (np.sqrt((a*a).sum()) + 1e-12)
    return z, a

def predict_for_new_ad_from_hourly(
    perf_csv=PERF_CSV, meta_csv=META_CSV, target_ads_id=None,
    L_days=30, H_days=30, K=25, beta=1.0
):
    perf = pd.read_csv(perf_csv, encoding="utf-8-sig")
    meta = pd.read_csv(meta_csv, encoding="utf-8-sig")

    store = _build_store_from_hourly(perf, meta, L_days=L_days)

    # 타깃 카테고리
    cat_series = meta.loc[meta["ads_idx"]==target_ads_id, "ads_category"].dropna().astype(int)
    target_cat = int(cat_series.mode().iat[0]) if not cat_series.empty else None

    # 타깃 벡터 + 유사 광고 Top-K
    z, a = _target_vec_from_meta(meta, store, target_ads_id)
    A = store["A_z"].reindex(store["ads_index"], fill_value=0.0).values
    norms = np.sqrt((A*A).sum(1)) + 1e-12
    sims = (A @ a) / norms
    K = min(K, sims.shape[0])
    idx = np.argpartition(-sims, K-1)[:K]; idx = idx[np.argsort(-sims[idx])]
    w = np.power(np.clip(sims[idx], 0, 1), beta); w = w / (w.sum() + EPS)
    cohort_ads = store["ads_index"][idx].astype(int)
    cohort = pd.DataFrame({"weight": w, "sim": sims[idx]}, index=cohort_ads)

    # 가중 클릭/전환 (코호트 → 매체)
    W = np.zeros(len(store["ads_index"]), dtype=np.float32)
    pos = store["ads_index"].get_indexer(cohort.index)
    W[pos] = cohort["weight"].values.astype(np.float32)
    clicks_w = store["clicks_csr"].T.dot(W)  # (n_mda,)
    convs_w  = store["convs_csr"].T.dot(W)

    cvr_cohort = (convs_w + ALPHA_PRIOR) / (clicks_w + ALPHA_PRIOR + BETA_PRIOR)
    # 베이스라인 blend (매체×카테가 있으면 우선)
    if (target_cat is not None) and (target_cat in store["m_clicks_by_cat"]):
        m_clicks_c = store["m_clicks_by_cat"][target_cat]
        m_convs_c  = store["m_convs_by_cat"][target_cat]
        cvr_mc = (m_convs_c + ALPHA_PRIOR) / (m_clicks_c + ALPHA_PRIOR + BETA_PRIOR)
    else:
        cvr_mc = None
    base = cvr_mc if cvr_mc is not None else store["cvr_m"]

    eff = clicks_w
    w1 = eff / (eff + 50.0)
    pred_cvr = w1 * cvr_cohort + (1.0 - w1) * base

    out = pd.DataFrame({
        "mda_idx": store["mda_index"].astype(int).values,
        "pred_cvr": pred_cvr,
    })
    # 시나리오
    out["per_1000_clicks_conv"] = out["pred_cvr"] * 1000.0
    out["scenarioB_clicks"] = (eff / float(store["L_days"])) * float(H_days)
    out["scenarioB_conv"]   = out["pred_cvr"] * out["scenarioB_clicks"]

    out = out.sort_values("per_1000_clicks_conv", ascending=False).reset_index(drop=True)
    return out, cohort

# === 사용 예 ===
# TARGET_AD = 9982
# pred_df, cohort = predict_for_new_ad_from_hourly(target_ads_id=TARGET_AD, L_days=30, H_days=30, K=25, beta=1.0)
# pred_df.head(20)[["mda_idx","pred_cvr","per_1000_clicks_conv","scenarioB_clicks","scenarioB_conv"]]


In [114]:
# === 사용 예 ===
TARGET_AD = 9982
pred_df, cohort = predict_for_new_ad_from_hourly(target_ads_id=TARGET_AD, L_days=30, H_days=30, K=25, beta=1.0)
pred_df.head(20)[["mda_idx","pred_cvr","per_1000_clicks_conv","scenarioB_clicks","scenarioB_conv"]]


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv
0,757,0.403731,403.730774,52.784973,21.310919
1,678,0.40282,402.819672,60.211109,24.254219
2,756,0.401237,401.237366,67.444458,27.061237
3,1025,0.393753,393.752899,44.126884,17.37509
4,1032,0.369534,369.534485,28.826815,10.652502
5,1026,0.359301,359.300903,27.289814,9.805255
6,1029,0.358431,358.430908,22.611919,8.104811
7,725,0.352734,352.734314,46.283947,16.325937
8,796,0.345792,345.791656,48.806904,16.87702
9,760,0.344142,344.142273,49.587502,17.065157


In [120]:
ads_9982_pf['mda_idx'].unique()

array([1020, 1047,  845,  824, 1022, 1046, 1021,  851,  850, 1032, 1025,
       1048, 1027, 1029, 1026, 1031, 1045, 1013, 1030,  997, 1034, 1035,
       1033, 1010,  371, 1028, 1036,  270, 1012,  990,  281,  761,  766,
        994, 1000, 1016])

In [124]:
# ============================================================
# Fast cohort-based CVR prediction (one-cell version)
# - Uses hourly aggregated report: ['rpt_time_date','ads_idx','mda_idx','rpt_time_clk','rpt_time_turn']
# - Uses ad meta: ['ads_idx','ads_category','ads_os_type','domain', ...]
# - No file saving. All functions return DataFrames / dicts.
# - Upgrades: IDF feature weighting, time decay, mixed baselines, cohort size weighting, calibration.
# ============================================================

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

EPS = 1e-12

# ----------------------- utils -----------------------
def _ensure_df(maybe_df_or_path, usecols=None, encoding="utf-8-sig"):
    if isinstance(maybe_df_or_path, pd.DataFrame):
        return maybe_df_or_path.copy()
    return pd.read_csv(maybe_df_or_path, usecols=usecols, encoding=encoding)

def _clr_block(df, eps=1e-6):
    if df.empty: return df
    Z = df.clip(lower=eps)
    g = np.exp(np.log(Z).mean(axis=1))
    return np.log(Z.div(g, axis=0))

def _ad_meta_pct(df, col, prefix):
    d = df.dropna(subset=[col])
    if d.empty: return pd.DataFrame()
    vc = (d.groupby(["ads_idx", col]).size().rename("cnt").reset_index())
    total = vc.groupby("ads_idx")["cnt"].transform("sum")
    vc["pct"] = vc["cnt"] / total
    piv = vc.pivot(index="ads_idx", columns=col, values="pct").fillna(0.0)
    piv.columns = [f"{prefix}{c}_pct" for c in piv.columns]
    return piv

# ---------------- store builder (fast & rich) ----------------
def _build_store_from_hourly(
    perf, meta,
    L_days=30, window_end=None, min_frac=0.002,
    decay_half_life_days=None,         # e.g., 14 → recent clicks weighted more
    use_idf=True, idf_smooth=1.0       # IDF column weighting for meta features
):
    perf = _ensure_df(perf, usecols=["rpt_time_date","ads_idx","mda_idx","rpt_time_clk","rpt_time_turn"])
    meta = _ensure_df(meta)

    if not np.issubdtype(perf["rpt_time_date"].dtype, np.datetime64):
        perf["rpt_time_date"] = pd.to_datetime(perf["rpt_time_date"])
    perf = perf.rename(columns={"rpt_time_clk":"clicks", "rpt_time_turn":"conversions"})

    if window_end is None:
        window_end = perf["rpt_time_date"].max()
    window_end = pd.to_datetime(window_end).normalize()
    start = window_end - pd.Timedelta(days=L_days-1)
    hist = perf[(perf["rpt_time_date"]>=start) & (perf["rpt_time_date"]<=window_end)].copy()

    # time decay (optional)
    if decay_half_life_days is not None and decay_half_life_days > 0:
        age = (window_end - hist["rpt_time_date"]).dt.total_seconds()/86400.0
        decay = np.power(0.5, age/float(decay_half_life_days))
        hist["clicks"] = hist["clicks"] * decay
        hist["conversions"] = hist["conversions"] * decay

    # meta embedding → CLR → z-score
    dom_pct = _ad_meta_pct(meta, "domain", "ad_domain_")
    cat_pct = _ad_meta_pct(meta, "ads_category", "ad_cat_")
    os_pct  = _ad_meta_pct(meta, "ads_os_type", "ad_os_")
    A = dom_pct.join([cat_pct, os_pct], how="outer").fillna(0.0)

    # drop very rare columns
    if len(A) > 0 and min_frac > 0:
        nz = (A != 0).sum(0)
        keep = nz[nz >= max(1, int(len(A)*min_frac))].index
        A = A[keep]

    dom_cols = [c for c in A.columns if c.startswith("ad_domain_")]
    cat_cols = [c for c in A.columns if c.startswith("ad_cat_")]
    os_cols  = [c for c in A.columns if c.startswith("ad_os_")]
    for cols in (dom_cols, cat_cols, os_cols):
        if cols: A[cols] = _clr_block(A[cols])

    mu, sigma = A.mean(), A.std().replace(0, 1.0)
    A_z = ((A - mu) / (sigma + 1e-9)).astype(np.float32)

    # IDF weighting (optional)
    idf_vec = None
    if use_idf:
        df_col = (A_z != 0).sum(0).astype(float)
        N = float(len(A_z))
        idf = np.log((N + 1.0) / (df_col + idf_smooth))
        idf = idf / (idf.mean() + 1e-12)
        A_z = A_z.mul(idf, axis=1)
        idf_vec = idf.astype(np.float32)

    # restrict to ads in the window
    A_z = A_z.loc[A_z.index.intersection(hist["ads_idx"].unique())]

    # build (ads×mda) CSR matrices
    grp = (hist.groupby(["ads_idx","mda_idx"])
           .agg(clicks=("clicks","sum"), convs=("conversions","sum"))
           .reset_index())
    ads_uni = pd.Index(grp["ads_idx"].unique()).sort_values()
    mda_uni = pd.Index(grp["mda_idx"].unique()).sort_values()
    ar = ads_uni.get_indexer(grp["ads_idx"])
    mr = mda_uni.get_indexer(grp["mda_idx"])
    clicks_csr = csr_matrix((grp["clicks"].astype(np.float32).values, (ar, mr)),
                            shape=(len(ads_uni), len(mda_uni)))
    convs_csr  = csr_matrix((grp["convs"].astype(np.float32).values,  (ar, mr)),
                            shape=(len(ads_uni), len(mda_uni)))

    row_clicks = np.asarray(clicks_csr.sum(1)).ravel().astype(np.float32)   # per-ad size

    # baselines: mda only
    m_clicks = np.asarray(clicks_csr.sum(0)).ravel()
    m_convs  = np.asarray(convs_csr.sum(0)).ravel()

    # baselines by cat / domain / os (dict[val] -> clicks/convs arrays)
    def _mk_baseline(key_col):
        if key_col not in meta.columns: return {}
        keys = meta[["ads_idx", key_col]].dropna().drop_duplicates()
        g2 = grp.merge(keys, on="ads_idx", how="left").dropna(subset=[key_col])
        res = {}
        for val in g2[key_col].unique():
            sub = g2[g2[key_col]==val]
            if sub.empty: continue
            ar2 = ads_uni.get_indexer(sub["ads_idx"])
            mr2 = mda_uni.get_indexer(sub["mda_idx"])
            cc = csr_matrix((sub["clicks"].astype(np.float32).values, (ar2, mr2)),
                            shape=(len(ads_uni), len(mda_uni)))
            cv = csr_matrix((sub["convs"].astype(np.float32).values,  (ar2, mr2)),
                            shape=(len(ads_uni), len(mda_uni)))
            res[val] = {
                "clicks": np.asarray(cc.sum(0)).ravel(),
                "convs":  np.asarray(cv.sum(0)).ravel()
            }
        return res

    base_cat = _mk_baseline("ads_category")
    base_dom = _mk_baseline("domain")
    base_os  = _mk_baseline("ads_os_type")

    store = {
        "L_days": int(L_days), "window_end": window_end,
        "A_z": A_z, "A_cols": A_z.columns.tolist(), "A_mu": mu, "A_sigma": sigma,
        "idf_vec": idf_vec,
        "ads_index": ads_uni, "mda_index": mda_uni,
        "clicks_csr": clicks_csr, "convs_csr": convs_csr,
        "row_clicks": row_clicks,
        "m_clicks": m_clicks, "m_convs": m_convs,
        "base_cat": base_cat, "base_dom": base_dom, "base_os": base_os,
        "hist_df": hist, "meta_df": meta
    }
    return store

# ---------------- target vector (meta → z + idf) ----------------
def _target_vec_from_meta(meta_df, store, target_ads_id):
    cols, mu, sigma = store["A_cols"], store["A_mu"], store["A_sigma"]
    rows = meta_df[meta_df["ads_idx"]==target_ads_id]
    s = pd.Series(0.0, index=cols, dtype=np.float32)

    def _ratio(df, col):
        if col not in df.columns or df[col].dropna().empty: return {}
        return df[col].dropna().astype(str).value_counts(normalize=True).to_dict()

    r_dom = _ratio(rows, "domain")
    r_cat = _ratio(rows, "ads_category")
    r_os  = _ratio(rows, "ads_os_type")
    for c in cols:
        if c.startswith("ad_domain_"): s[c] = r_dom.get(c.replace("ad_domain_","").replace("_pct",""), 0.0)
        elif c.startswith("ad_cat_"):  s[c] = r_cat.get(c.replace("ad_cat_","").replace("_pct",""), 0.0)
        elif c.startswith("ad_os_"):   s[c] = r_os.get(c.replace("ad_os_","").replace("_pct",""), 0.0)

    df_t = s.to_frame().T
    dom = [c for c in cols if c.startswith("ad_domain_")]
    cat = [c for c in cols if c.startswith("ad_cat_")]
    os_ = [c for c in cols if c.startswith("ad_os_")]
    if dom: df_t[dom] = _clr_block(df_t[dom])
    if cat: df_t[cat] = _clr_block(df_t[cat])
    if os_: df_t[os_] = _clr_block(df_t[os_])

    z = (df_t.iloc[0] - mu) / (sigma + 1e-9)

    if store.get("idf_vec") is not None:
        idf = store["idf_vec"]
        if isinstance(idf, pd.Series):
            z = z * idf.reindex(z.index).fillna(1.0)
        else:
            z = z * idf

    a = z.values.astype(np.float32)
    a = a / (np.sqrt((a*a).sum()) + 1e-12)
    return z, a

# ---------------- mixed baseline for a target ----------------
def _mix_baseline_for_target(store, meta_df, target_ads_id,
                             alpha=2.0, beta=120.0,
                             weights=(0.5,0.3,0.2)):   # (cat, domain, os)
    def _ratio(col):
        s = meta_df.loc[meta_df["ads_idx"]==target_ads_id, col].dropna()
        if s.empty: return {}
        return s.astype(str).value_counts(normalize=True).to_dict()

    r_cat = _ratio("ads_category"); r_dom = _ratio("domain"); r_os = _ratio("ads_os_type")

    def _blend(map_dict, r):
        if not map_dict or not r: return None
        acc_clicks = np.zeros_like(store["m_clicks"], dtype=float)
        acc_convs  = np.zeros_like(store["m_convs"], dtype=float)
        for k, p in r.items():
            if k in map_dict:
                acc_clicks += p * map_dict[k]["clicks"]
                acc_convs  += p * map_dict[k]["convs"]
        denom = acc_clicks + alpha + beta
        return (acc_convs + alpha) / np.where(denom==0, np.inf, denom)

    c_cat = _blend(store["base_cat"], r_cat)
    c_dom = _blend(store["base_dom"], r_dom)
    c_os  = _blend(store["base_os"],  r_os)

    parts, ws = [], []
    if c_cat is not None: parts.append(c_cat); ws.append(weights[0])
    if c_dom is not None: parts.append(c_dom); ws.append(weights[1])
    if c_os  is not None: parts.append(c_os);  ws.append(weights[2])

    if not parts:
        return (store["m_convs"] + alpha) / (store["m_clicks"] + alpha + beta)
    ws = np.asarray(ws, dtype=float); ws = ws / ws.sum()
    base = np.average(np.vstack(parts), axis=0, weights=ws)
    return base

# ---------------- prediction ----------------
def predict_for_new_ad_from_hourly(
    perf, meta, target_ads_id,
    L_days=30, H_days=30, K=50, beta=1.0,
    decay_half_life_days=14, use_idf=True, idf_smooth=1.0,
    cohort_size_power=0.5, blend_kappa=20.0,
    alpha_prior=2.0, beta_prior=120.0, calibrate=True
):
    store = _build_store_from_hourly(
        perf, meta, L_days=L_days,
        decay_half_life_days=decay_half_life_days,
        use_idf=use_idf, idf_smooth=idf_smooth
    )

    meta_df = store["meta_df"]
    z, a = _target_vec_from_meta(meta_df, store, target_ads_id)
    A = store["A_z"].reindex(store["ads_index"], fill_value=0.0).values
    norms = np.sqrt((A*A).sum(1)) + 1e-12
    sims = (A @ a) / norms

    K = min(K, sims.shape[0])
    idx = np.argpartition(-sims, K-1)[:K]
    idx = idx[np.argsort(-sims[idx])]

    w = np.power(np.clip(sims[idx], 0, 1), beta)
    if cohort_size_power and cohort_size_power > 0:
        rc = store["row_clicks"][idx]
        w = w * np.power(rc + 1.0, cohort_size_power)
    w = w / (w.sum() + EPS)

    cohort_ads = store["ads_index"][idx].astype(int)
    cohort = pd.DataFrame({"weight": w, "sim": sims[idx]}, index=cohort_ads)

    # cohort-weighted clicks/conversions
    W = np.zeros(len(store["ads_index"]), dtype=np.float32)
    pos = store["ads_index"].get_indexer(cohort.index)
    W[pos] = cohort["weight"].values.astype(np.float32)
    clicks_w = store["clicks_csr"].T.dot(W)
    convs_w  = store["convs_csr"].T.dot(W)
    cvr_cohort = (convs_w + alpha_prior) / (clicks_w + alpha_prior + beta_prior)

    # mixed baseline
    base = _mix_baseline_for_target(store, meta_df, target_ads_id,
                                    alpha=alpha_prior, beta=beta_prior,
                                    weights=(0.5,0.3,0.2))

    eff = clicks_w
    w1 = eff / (eff + float(blend_kappa))
    pred_cvr = w1 * cvr_cohort + (1.0 - w1) * base

    # simple calibration (align mean within window)
    if calibrate:
        hist = store["hist_df"]
        true_mean = (hist["conversions"].sum() + alpha_prior) / (hist["clicks"].sum() + alpha_prior + beta_prior)
        pred_mean = float(np.average(pred_cvr, weights=np.maximum(eff, 1.0)))
        scale = np.clip(true_mean / (pred_mean + 1e-12), 0.5, 2.0)
        pred_cvr = pred_cvr * scale

    out = pd.DataFrame({
        "mda_idx": store["mda_index"].astype(int).values,
        "pred_cvr": pred_cvr,
        "cvr_cohort": cvr_cohort,
        "baseline_cvr": base,
        "cohort_eff_clicks": eff
    })
    out["per_1000_clicks_conv"] = out["pred_cvr"] * 1000.0
    out["scenarioB_clicks"] = (eff / float(store["L_days"])) * float(H_days)
    out["scenarioB_conv"]   = out["pred_cvr"] * out["scenarioB_clicks"]
    out = out.sort_values("per_1000_clicks_conv", ascending=False).reset_index(drop=True)

    return out, cohort

# ---------------- evaluation (attach metrics next to table) ----------------
def evaluate_and_attach(perf, pred_df, target_ads_id, L_days=30):
    perf = _ensure_df(perf, usecols=["rpt_time_date","ads_idx","mda_idx","rpt_time_clk","rpt_time_turn"])
    perf["rpt_time_date"] = pd.to_datetime(perf["rpt_time_date"])
    end = perf["rpt_time_date"].max().normalize()
    start = end - pd.Timedelta(days=L_days-1)
    hist = perf[(perf["rpt_time_date"]>=start)&(perf["rpt_time_date"]<=end)]
    hist = hist.rename(columns={"rpt_time_clk":"clicks","rpt_time_turn":"conversions"})

    act = hist[hist["ads_idx"]==target_ads_id].groupby("mda_idx").agg(
        clicks=("clicks","sum"),
        conv=("conversions","sum")
    ).reset_index()
    act["cvr"] = act["conv"] / act["clicks"].replace(0,np.nan)

    joined = pred_df.merge(act[["mda_idx","cvr","clicks"]], on="mda_idx", how="left")
    eval_df = joined.dropna(subset=["cvr"]).copy()

    mae = float(np.nanmean(np.abs(eval_df["pred_cvr"] - eval_df["cvr"]))) if len(eval_df) else np.nan
    rmse = float(np.sqrt(np.nanmean((eval_df["pred_cvr"] - eval_df["cvr"])**2))) if len(eval_df) else np.nan
    wmae = float(np.nansum(np.abs(eval_df["pred_cvr"] - eval_df["cvr"]) * eval_df["clicks"]) /
                 (np.nansum(eval_df["clicks"])+1e-12)) if len(eval_df) else np.nan
    pear = float(eval_df[["pred_cvr","cvr"]].corr().iloc[0,1]) if len(eval_df)>=2 else np.nan
    spear = float(eval_df[["pred_cvr","cvr"]].rank().corr().iloc[0,1]) if len(eval_df)>=2 else np.nan

    def p_at(n):
        L = set(joined.sort_values("per_1000_clicks_conv", ascending=False).head(n)["mda_idx"])
        R = set(eval_df.sort_values("cvr", ascending=False).head(n)["mda_idx"])
        return len(L & R) / max(1, len(L))
    P5 = p_at(5); P10 = p_at(10)

    metrics = dict(RMSE_cvr=rmse, MAE_cvr=mae, WMAE_cvr=wmae, Pearson=pear, Spearman=spear, P5=P5, P10=P10)

    for k,v in metrics.items():
        joined[k] = v
    return metrics, joined

# ---------------- how to run ----------------
# perf_csv = "/path/수정_시간별적립보고서(최종).csv"
# meta_csv = "/path/광고도메인리스트.csv"
# TARGET_AD = 9982
# pred_df, cohort = predict_for_new_ad_from_hourly(
#     perf_csv, meta_csv, TARGET_AD,
#     L_days=30, H_days=30, K=50, beta=1.0,
#     decay_half_life_days=14, use_idf=True, cohort_size_power=0.5,
#     blend_kappa=20.0, alpha_prior=2.0, beta_prior=120.0, calibrate=True
# )
# metrics, table = evaluate_and_attach(perf_csv, pred_df, TARGET_AD, L_days=30)
# display(table.head(20)); print(metrics)


In [None]:
# --- 설정 ---
TARGET_AD = 73878            # 보고 싶은 광고 ID
L_DAYS   = 30
H_DAYS   = 30
K        = 100               # 커버리지 늘리고 싶으면 100~200
BETA     = 0.5               # 유사도 완화 (0.5 추천)

# (선택) 라벨 정규화: 공백/슬래시 등 정리
def _norm(df):
    for c in ["domain", "ads_category", "ads_os_type"]:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip()
    return df
click   = _norm(click)
ad_meta = _norm(ad_meta)

# 1) 최근 L일 히스토리 임베딩 공간(캐시) 만들기 — 희귀열 드롭을 막고 싶으면 min_frac=0.0
hist_store, wend = _build_or_get_ad_hist_space(click, L_days=L_DAYS, min_frac=0.0)

# 2) 타깃 광고를 같은 공간으로 투영하여 't_vec' 생성
t_vec = _target_ad_vector_from_meta(ad_meta, hist_store, TARGET_AD)

# 3) 예측 실행(코호트/시나리오 포함)
pred_df, cohort = predict_for_new_ad(
    click, ad_meta, TARGET_AD,
    L_days=L_DAYS, H_days=H_DAYS, K=K, beta=BETA
)

# 4) 진단 체크
print("||t_vec||₁:", float(np.abs(t_vec).sum()),
      " nonzero:", int((t_vec != 0).sum()))

print("예: ad_domain_* 샘플:",
      [c for c in hist_store["cols"] if c.startswith("ad_domain_")][:5])
print("타깃 광고 도메인 값들:",
      ad_meta.loc[ad_meta["ads_idx"]==TARGET_AD, "domain"].dropna().unique())

pos = hist_store["A_hist_z"].index.get_indexer(cohort.index)
print("코호트 매칭률:", float((pos >= 0).mean()))
print("cohort_eff_clicks 합:", float(pred_df["cohort_eff_clicks"].sum()))


In [130]:
perf_csv = "/Users/Jiwon/Documents/GitHub/final_project/Jiwon/수정_시간별적립보고서(최종).csv"
meta_csv = "/Users/Jiwon/Documents/GitHub/final_project/Jiwon/광고도메인리스트.csv"

TARGET_AD = 73878
pred_df, cohort = predict_for_new_ad_from_hourly(
    perf_csv, meta_csv, TARGET_AD,
    L_days=30, H_days=7, K=50, beta=1.0,
    decay_half_life_days=14, use_idf=True, cohort_size_power=0.5,
    blend_kappa=20.0, alpha_prior=2.0, beta_prior=120.0, calibrate=True
)
metrics, table = evaluate_and_attach(perf_csv, pred_df, TARGET_AD, L_days=30)
display(table.head(20)); print(metrics)


Unnamed: 0,mda_idx,pred_cvr,cvr_cohort,baseline_cvr,cohort_eff_clicks,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv,cvr,clicks,RMSE_cvr,MAE_cvr,WMAE_cvr,Pearson,Spearman,P5,P10
0,1032,0.370694,0.016393,0.224743,0.0,370.693715,0.0,0.0,,,0.057802,0.041104,0.015622,0.030254,0.391311,0.0,0.0
1,1025,0.340846,0.016393,0.206647,0.0,340.845689,0.0,0.0,,,0.057802,0.041104,0.015622,0.030254,0.391311,0.0,0.0
2,756,0.319704,0.016393,0.193829,0.0,319.703865,0.0,0.0,,,0.057802,0.041104,0.015622,0.030254,0.391311,0.0,0.0
3,1047,0.299616,0.016393,0.18165,0.0,299.615858,0.0,0.0,,,0.057802,0.041104,0.015622,0.030254,0.391311,0.0,0.0
4,757,0.294313,0.016393,0.178435,0.0,294.313268,0.0,0.0,,,0.057802,0.041104,0.015622,0.030254,0.391311,0.0,0.0
5,1027,0.29405,0.016393,0.178276,0.0,294.049938,0.0,0.0,,,0.057802,0.041104,0.015622,0.030254,0.391311,0.0,0.0
6,1026,0.292439,0.016393,0.177299,0.0,292.438756,0.0,0.0,,,0.057802,0.041104,0.015622,0.030254,0.391311,0.0,0.0
7,1029,0.283893,0.016393,0.172118,0.0,283.893318,0.0,0.0,,,0.057802,0.041104,0.015622,0.030254,0.391311,0.0,0.0
8,678,0.272364,0.016393,0.165128,0.0,272.364242,0.0,0.0,,,0.057802,0.041104,0.015622,0.030254,0.391311,0.0,0.0
9,1020,0.261853,0.016393,0.158756,0.0,261.853333,0.0,0.0,,,0.057802,0.041104,0.015622,0.030254,0.391311,0.0,0.0


{'RMSE_cvr': 0.057802172436283175, 'MAE_cvr': 0.04110442436035462, 'WMAE_cvr': 0.015622424112463876, 'Pearson': 0.030254462600882184, 'Spearman': 0.3913107207028266, 'P5': 0.0, 'P10': 0.0}


In [131]:
# 1) 타깃 벡터가 비어있는지
print("||t_vec||₁:", float(t_vec.abs().sum()), "nonzero:", int((t_vec!=0).sum()))

# 2) 히스토리 열과 타깃 메타 레이블이 맞는지
print("예: ad_domain_* 샘플:", [c for c in hist_store["cols"] if c.startswith("ad_domain_")][:5])
print("타깃 광고 메타 도메인 값들:", ad_meta_df.loc[ad_meta_df["ads_idx"]==TARGET_AD, "domain"].dropna().unique())

# 3) 코호트가 히스토리 ads에 매칭되었는지
pos = hist_store["A_hist_z"].index.get_indexer(cohort.index)
print("코호트 매칭률:", (pos>=0).mean(), "cohort_eff_clicks 합:", pred_df["cohort_eff_clicks"].sum())


NameError: name 't_vec' is not defined

# 다시 처음

In [132]:
# =========================================
# Step 1) 광고 메타 기반 "유사 광고" 찾기 + (선택) 클릭로그로 품질 평가
# =========================================
import numpy as np
import pandas as pd

# ---------- 유틸 ----------
def _clr_block(df, eps=1e-6):
    """구성비(합=1) 블록에 CLR 변환"""
    if df.empty: return df
    Z = df.clip(lower=eps)
    g = np.exp(np.log(Z).mean(axis=1))
    return np.log(Z.div(g, axis=0))

def _norm_meta(df):
    """라벨 통일(공백 정리 등)"""
    df = df.copy()
    for c in ["domain","ads_category","ads_os_type"]:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip()
    return df

# ---------- 메타 → 임베딩 공간 ----------
def build_ad_meta_space(ad_meta: pd.DataFrame, min_frac=0.002, use_clr=True):
    """
    ad_meta 스키마(최소): ads_idx, domain, ads_category, ads_os_type
    광고별 domain/카테/OS 분포(%)를 만들고 CLR+표준화까지 적용해 임베딩 행렬 반환
    """
    meta = _norm_meta(ad_meta)

    def _ad_pct(col, prefix):
        # 광고별 라벨 빈도 → 분포
        vc = (meta.groupby(["ads_idx", col]).size().rename("cnt").reset_index())
        total = vc.groupby("ads_idx")["cnt"].transform("sum")
        vc["pct"] = vc["cnt"] / total
        piv = vc.pivot(index="ads_idx", columns=col, values="pct").fillna(0.0)
        piv.columns = [f"{prefix}{c}_pct" for c in piv.columns]
        return piv

    dom = _ad_pct("domain", "ad_domain_")
    cat = _ad_pct("ads_category", "ad_cat_")
    os_ = _ad_pct("ads_os_type", "ad_os_")
    X = dom.join([cat, os_], how="outer").fillna(0.0)

    # 너무 희귀한 열 드롭(속도/노이즈↓)
    if len(X) and min_frac>0:
        nz = (X!=0).sum(0)
        keep = nz[nz >= max(1, int(len(X)*min_frac))].index
        X = X[keep]

    blocks = {
        "domain": [c for c in X.columns if c.startswith("ad_domain_")],
        "cat":    [c for c in X.columns if c.startswith("ad_cat_")],
        "os":     [c for c in X.columns if c.startswith("ad_os_")],
    }
    if use_clr:
        for cols in blocks.values():
            if cols: X[cols] = _clr_block(X[cols])

    mu, sigma = X.mean(), X.std().replace(0, 1.0)
    Xz = ((X - mu) / (sigma + 1e-9)).astype(np.float32)

    return {"Xz": Xz, "mu": mu.astype(np.float32), "sigma": sigma.astype(np.float32),
            "cols": Xz.columns.tolist(), "blocks": blocks}

# ---------- 타깃 광고 벡터 ----------
def target_vec_from_meta(ad_meta: pd.DataFrame, space: dict, target_ads_id):
    cols, mu, sigma = space["cols"], space["mu"], space["sigma"]
    blocks = space["blocks"]
    meta = _norm_meta(ad_meta)
    rows = meta[meta["ads_idx"]==target_ads_id]
    if rows.empty:
        raise ValueError(f"ad_meta에 ads_idx={target_ads_id}가 없습니다.")

    def _ratio(col):
        return (rows[col].dropna().astype(str).value_counts(normalize=True)
                if col in rows.columns else pd.Series(dtype=float))

    r_dom = _ratio("domain").to_dict()
    r_cat = _ratio("ads_category").to_dict()
    r_os  = _ratio("ads_os_type").to_dict()

    s = pd.Series(0.0, index=cols, dtype=np.float32)
    for c in cols:
        if   c.startswith("ad_domain_"): s[c] = r_dom.get(c.replace("ad_domain_","").replace("_pct",""), 0.0)
        elif c.startswith("ad_cat_"   ): s[c] = r_cat.get(c.replace("ad_cat_"   ,"").replace("_pct",""), 0.0)
        elif c.startswith("ad_os_"    ): s[c] = r_os .get(c.replace("ad_os_"    ,"").replace("_pct",""), 0.0)
    df = s.to_frame().T
    if blocks["domain"]: df[blocks["domain"]] = _clr_block(df[blocks["domain"]])
    if blocks["cat"]:    df[blocks["cat"]]    = _clr_block(df[blocks["cat"]])
    if blocks["os"]:     df[blocks["os"]]     = _clr_block(df[blocks["os"]])

    z = ((df.iloc[0] - mu) / (sigma + 1e-9)).astype(np.float32)
    a = z.values
    a = a / (np.sqrt((a*a).sum()) + 1e-12)  # 코사인 정규화
    return z, a

# ---------- 유사 광고 Top-K ----------
def find_similar_ads(space: dict, target_ads_id, K=50, beta=1.0):
    Xz = space["Xz"]
    z, a = target_vec_from_meta(ad_meta, space, target_ads_id)
    M = Xz.values
    norms = np.sqrt((M*M).sum(1)) + 1e-12
    sims = (M @ a) / norms
    ads_ids = Xz.index.to_numpy()

    # 자기 자신 제거
    mask = (ads_ids != target_ads_id)
    sims, ads_ids = sims[mask], ads_ids[mask]

    if K > len(sims): K = len(sims)
    top_idx = np.argpartition(-sims, K-1)[:K]
    top_idx = top_idx[np.argsort(-sims[top_idx])]

    w = np.power(np.clip(sims[top_idx], 0, 1), beta)
    w = w / (w.sum() + 1e-12)

    out = pd.DataFrame({
        "ads_idx": ads_ids[top_idx].astype(int),
        "sim": sims[top_idx],
        "weight": w
    })
    return out

# ---------- (선택) 클릭 로그로 품질 평가 ----------
def evaluate_similarity_with_clicks(click_df: pd.DataFrame, target_ads_id, cand_ads,
                                    L_days=30, window_end=None, top_mda=10):
    """
    광고-매체 분포 기반의 '행동 유사도'로 평가
      - 코사인(타깃 vs 후보) on mda click-share
      - 상위 top_mda 매체 Jaccard 겹침
    """
    df = click_df.copy()
    if "click_day" in df.columns:
        df["click_day"] = pd.to_datetime(df["click_day"])
        if window_end is None:
            window_end = df["click_day"].max()
        start = pd.to_datetime(window_end).normalize() - pd.Timedelta(days=L_days-1)
        df = df[(df["click_day"]>=start) & (df["click_day"]<=window_end)]

    # 광고×매체 클릭 분포
    C = pd.crosstab(df["ads_idx"], df["mda_idx"])
    C = C.div(C.sum(axis=1), axis=0).fillna(0.0)

    if target_ads_id not in C.index:
        return pd.DataFrame({"ads_idx": cand_ads,
                             "cos_mda": np.nan, "jaccard_top": np.nan})

    t = C.loc[target_ads_id].values
    t /= (np.linalg.norm(t) + 1e-12)

    res = []
    target_top = set(C.loc[target_ads_id].sort_values(ascending=False).head(top_mda).index)
    for a in cand_ads:
        if a in C.index:
            v = C.loc[a].values
            v /= (np.linalg.norm(v) + 1e-12)
            cos = float(v @ t)
            cand_top = set(C.loc[a].sort_values(ascending=False).head(top_mda).index)
            inter = len(target_top & cand_top)
            union = len(target_top | cand_top) or 1
            jac = inter / union
        else:
            cos, jac = np.nan, np.nan
        res.append((int(a), cos, jac))
    return pd.DataFrame(res, columns=["ads_idx","cos_mda","jaccard_top"])

# ===================== 사용 예 =====================
# 1) 파일 경로 지정
# ad_meta_path = "/mnt/data/광고도메인리스트.csv"
# click_path   = "/mnt/data/ads_pool.csv"   # (선택) ads_idx, mda_idx, click_day, ...

# 2) 데이터 로드
# ad_meta = pd.read_csv(ad_meta_path, encoding="utf-8-sig")
# click   = pd.read_csv(click_path, encoding="utf-8-sig")   # 평가에만 필요

# 3) 공간 만들기 & 유사 광고 추출
# TARGET_AD = 9982
# space = build_ad_meta_space(ad_meta, min_frac=0.002, use_clr=True)
# sim_ads = find_similar_ads(space, TARGET_AD, K=30, beta=1.0)
# display(sim_ads.head(20))

# 4) (선택) 클릭 로그로 품질 확인
# eval_df = evaluate_similarity_with_clicks(click, TARGET_AD, sim_ads["ads_idx"], L_days=30, top_mda=10)
# result = sim_ads.merge(eval_df, on="ads_idx", how="left")
# display(result.head(20))
# print({
#     "mean_cos_mda": float(result["cos_mda"].mean()),
#     "mean_jaccard": float(result["jaccard_top"].mean())
# })


In [135]:

ad_meta_path = "/Users/Jiwon/Documents/GitHub/final_project/Jiwon/광고도메인리스트.csv"
click_path   = "/Users/Jiwon/Documents/GitHub/final_project/Jiwon/유저테이블.csv"   # (선택) ads_idx, mda_idx, click_day, ...

ad_meta = pd.read_csv(ad_meta_path, encoding="utf-8-sig")
click   = pd.read_csv(click_path, encoding="utf-8-sig")   # 평가에만 필요


TARGET_AD = 9982
space = build_ad_meta_space(ad_meta, min_frac=0.002, use_clr=True)
sim_ads = find_similar_ads(space, TARGET_AD, K=30, beta=1.0)
display(sim_ads.head(20))


eval_df = evaluate_similarity_with_clicks(click, TARGET_AD, sim_ads["ads_idx"], L_days=30, top_mda=10)
result = sim_ads.merge(eval_df, on="ads_idx", how="left")
display(result.head(20))
print({
    "mean_cos_mda": float(result["cos_mda"].mean()),
    "mean_jaccard": float(result["jaccard_top"].mean())
})

Unnamed: 0,ads_idx,sim,weight
0,34045,1.0,0.039676
1,445574,1.0,0.039676
2,438420,0.828716,0.03288
3,438389,0.828716,0.03288
4,438150,0.828716,0.03288
5,438407,0.828716,0.03288
6,438419,0.828716,0.03288
7,438417,0.828716,0.03288
8,438059,0.828716,0.03288
9,443852,0.828716,0.03288


Unnamed: 0,ads_idx,sim,weight,cos_mda,jaccard_top
0,34045,1.0,0.039676,0.000472,0.0
1,445574,1.0,0.039676,5.2e-05,0.0
2,438420,0.828716,0.03288,0.0,0.0
3,438389,0.828716,0.03288,4.8e-05,0.052632
4,438150,0.828716,0.03288,0.0,0.052632
5,438407,0.828716,0.03288,0.0,0.052632
6,438419,0.828716,0.03288,0.0,0.0
7,438417,0.828716,0.03288,0.0,0.052632
8,438059,0.828716,0.03288,0.0,0.0
9,443852,0.828716,0.03288,0.0,0.052632


{'mean_cos_mda': 2.1872987300235968e-05, 'mean_jaccard': 0.029038112522686017}


In [136]:
ads_list[ads_list['ads_idx'].isin([9982, 34045, 445574, 438420, 438389])]

Unnamed: 0,ads_idx,ads_code,aff_idx,adv_idx,ads_type,ads_category,ads_name,ads_icon_img,ads_summary,ads_save_way,ads_day_cap,ads_sdate,ads_edate,ads_age_min,ads_age_max,ads_os_type,ads_contract_price,ads_reward_price,ads_order,ads_rejoin_type,regdate,domain
6,9982,ChpB9DGeNo,8,50,11,3,강원일보 네이버 뉴스,https://static.adbrix.igaworks.com/adpopcorn/2...,[참여방법]\n1.\'참여하기\' 버튼 터치하고 이벤트페이지로 이동\n2. 이벤트 ...,네이버 뉴스 구독,False,2021-02-05 14:00:00,2030-01-01 00:00:00,0,100,7,230,160,2729900,NONE,2021-02-05 14:56:37 UTC,미디어/컨텐츠\n
227,34045,IhvxEGKBOh,8,50,10,3,이마트 유튜브,https://static.adbrix.igaworks.com/adpopcorn/2...,이마트 유튜브 채널 \'구독 및 좋아요\'하면 리워드 지급\n(20~59세만 참여 ...,구독 및 좋아요,False,2024-04-19 15:00:00,2029-11-01 00:00:00,20,59,7,230,160,2729400,NONE,2024-04-19 15:31:37 UTC,미디어/컨텐츠\n
3486,438389,kAX6YrB9NG,86,1655,3,4,인포하이브 블로그 네이 169945,https://cashplan-r2.uk/quiz-mission.png,[참여방법]\n1. 본문에서 퀴즈와 상품/장소 확인\n1-1. 퀴즈: 1. 다양한 ...,일반정답미션,False,2025-07-30 00:28:28,9999-12-31 23:59:59,0,100,7,18,12,1369722,ADS_CODE_DAILY_UPDATE,2025-07-30 00:28:27 UTC,미디어/컨텐츠\n
3514,438420,dhM65dzD1u,86,1655,3,4,정답 미션 169931,https://cashplan-r2.uk/quiz-mission.png,[참여방법]\n1. 퀴즈 확인\n1-1. 정답으로 가는 광고 이미지를 찾아 클릭하세...,일반정답미션,False,2025-07-30 09:57:48,9999-12-31 23:59:59,0,100,7,18,12,1369722,ADS_CODE_DAILY_UPDATE,2025-07-30 09:57:47 UTC,미디어/컨텐츠\n
8660,445574,vbkRIvIpae,57,84,7,3,[인스타팔로우] tamz.kr,https://lh3.googleusercontent.com/2sREY-8Upjma...,<< 적립방법 >>\n\n1. 최초 참여시 인스타그램 로그인 진행\n\n2. 프로필...,팔로우,False,2025-08-20 14:00:00,2031-01-03 00:00:00,0,100,7,200,150,2728900,NONE,2025-08-20 14:35:48 UTC,미디어/컨텐츠\n


In [138]:
ads_pool[ads_pool['ads_idx'].isin([9982, 34045, 445574, 438420, 438389])]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ads_idx,media_count,user_count,total_clicks,total_conversions,ads_category,domain,ads_os_type,ads_order,ctit_mean,ctit_median,ads_rejoin_type,contract_price,media_price,first_click,last_click,ads_name,ads_sdate,expire,days_active,daily_avg_conversions,cvr,margin,roi,total_net_return,ads_level_detailed,daily_clicks,daily_users,scale_level,performance_level,ads_level,media_score,conv_score,clicks_score,stability_score,cvr_score,total_score,ads_size,cluster,mda_idx_arr,M,A
6,6,6,9982,36,22510,23187,12981,3,미디어/컨텐츠,7,2729900,114.674062,118.0,NONE,230,160,2025-07-26 03:09:06,2025-08-25 11:23:42,강원일보 네이버 뉴스,2021-02-05 14:00:00,0,31,418.741935,0.6,70,0.4,908670,MEGA,747.967742,726.129032,MEGA,HIGH,MEGA_HIGH,4,4,4,4,1,17,MEGA,0,,0.0,1.0
217,225,227,34045,10,5749,16083,727,3,미디어/컨텐츠,7,2729400,8071.803301,76.0,NONE,230,160,2025-07-26 06:16:06,2025-08-25 11:21:18,이마트 유튜브,2024-04-19 15:00:00,0,31,23.451613,0.0,70,0.4,50890,MEGA,518.806452,185.451613,MEGA,GOOD,MEGA_GOOD,3,2,4,4,0,13,MEGA,0,,0.0,1.0
1996,2030,3486,438389,3,135,183,7,4,미디어/컨텐츠,7,1369722,69.571429,30.0,ADS_CODE_DAILY_UPDATE,18,14,2025-07-30 00:31:29,2025-07-31 00:52:42,인포하이브 블로그 네이 169945,2025-07-30 00:28:28,0,2,3.5,0.0,4,0.3,28,MEDIUM,91.5,67.5,MEGA,LOW,MEGA_LOW,2,1,3,1,0,7,MEDIUM,0,562563.0,1.0,1.0
4332,4386,8660,445574,13,1756,3050,1029,3,미디어/컨텐츠,7,2728900,39595.033042,31297.0,NONE,200,150,2025-08-20 14:42:52,2025-08-22 14:40:46,[인스타팔로우] tamz.kr,2025-08-20 14:00:00,0,2,514.5,0.3,50,0.3,51450,MEGA,1525.0,878.0,MEGA,HIGH,MEGA_HIGH,4,4,4,1,0,13,MEGA,0,,0.0,1.0


In [139]:
ads_pool[ads_pool['ads_idx'].isin([9982, 34045, 445574, 438420, 438389])]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ads_idx,media_count,user_count,total_clicks,total_conversions,ads_category,domain,ads_os_type,ads_order,ctit_mean,ctit_median,ads_rejoin_type,contract_price,media_price,first_click,last_click,ads_name,ads_sdate,expire,days_active,daily_avg_conversions,cvr,margin,roi,total_net_return,ads_level_detailed,daily_clicks,daily_users,scale_level,performance_level,ads_level,media_score,conv_score,clicks_score,stability_score,cvr_score,total_score,ads_size,cluster,mda_idx_arr,M,A
6,6,6,9982,36,22510,23187,12981,3,미디어/컨텐츠,7,2729900,114.674062,118.0,NONE,230,160,2025-07-26 03:09:06,2025-08-25 11:23:42,강원일보 네이버 뉴스,2021-02-05 14:00:00,0,31,418.741935,0.6,70,0.4,908670,MEGA,747.967742,726.129032,MEGA,HIGH,MEGA_HIGH,4,4,4,4,1,17,MEGA,0,,0.0,1.0
217,225,227,34045,10,5749,16083,727,3,미디어/컨텐츠,7,2729400,8071.803301,76.0,NONE,230,160,2025-07-26 06:16:06,2025-08-25 11:21:18,이마트 유튜브,2024-04-19 15:00:00,0,31,23.451613,0.0,70,0.4,50890,MEGA,518.806452,185.451613,MEGA,GOOD,MEGA_GOOD,3,2,4,4,0,13,MEGA,0,,0.0,1.0
1996,2030,3486,438389,3,135,183,7,4,미디어/컨텐츠,7,1369722,69.571429,30.0,ADS_CODE_DAILY_UPDATE,18,14,2025-07-30 00:31:29,2025-07-31 00:52:42,인포하이브 블로그 네이 169945,2025-07-30 00:28:28,0,2,3.5,0.0,4,0.3,28,MEDIUM,91.5,67.5,MEGA,LOW,MEGA_LOW,2,1,3,1,0,7,MEDIUM,0,562563.0,1.0,1.0
4332,4386,8660,445574,13,1756,3050,1029,3,미디어/컨텐츠,7,2728900,39595.033042,31297.0,NONE,200,150,2025-08-20 14:42:52,2025-08-22 14:40:46,[인스타팔로우] tamz.kr,2025-08-20 14:00:00,0,2,514.5,0.3,50,0.3,51450,MEGA,1525.0,878.0,MEGA,HIGH,MEGA_HIGH,4,4,4,1,0,13,MEGA,0,,0.0,1.0


In [140]:
import numpy as np
import pandas as pd

# ===============================
# 단일 카테고리(원-핫) 기반 유사 광고 Top-K
# ===============================

CAT_COLS = ["domain", "ads_category", "ads_os_type", "ads_type", "ads_rejoin_type"]
PRICE_CANDIDATES = ("ads_media_price", "media_price", "contract_price")

def _norm_meta(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in CAT_COLS:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip()
    return df

def _mode_or_unk(s: pd.Series, unk="UNK") -> str:
    s = s.dropna().astype(str)
    if s.empty:
        return unk
    m = s.mode()
    return (m.iat[0] if not m.empty else unk)

def build_ad_feature_space_singlecat(
    ad_meta: pd.DataFrame,
    id_col: str = "ads_idx",
    drop_rare_min_ads: int | None = None,   # 예: 3 → 3개 미만 광고에서만 등장한 더미 열은 드롭
    group_balance: bool = False,            # True면 그룹(도메인/카테/OS/타입/재가입)별 스케일 균형화
    use_float32: bool = True
):
    """
    ad_meta에 같은 ads_idx가 여러 행이어도 '단일 카테고리'로 축약해서 한 행으로 만듭니다.
    - 각 범주형은 광고별 최빈값(mode)으로 대표
    - 원-핫 후 가격(있으면) 합쳐 z-score
    """
    meta = _norm_meta(ad_meta)
    if id_col not in meta.columns:
        raise ValueError(f"'{id_col}' column not found in ad_meta")

    # 1) 광고별 단일 라벨로 축약 (mode)
    cols_present = [c for c in CAT_COLS if c in meta.columns]
    red = (meta[[id_col] + cols_present]
           .groupby(id_col, as_index=False)
           .agg({c: _mode_or_unk for c in cols_present}))

    # 2) 범주형 원-핫
    X_parts = []
    group_cols = {}  # 그룹별 열 추적
    for c in cols_present:
        d = pd.get_dummies(red[c], prefix=f"ad_{c}")
        d.index = red[id_col].astype(int).values
        X_parts.append(d)
        group_cols[c] = list(d.columns)

    X = pd.concat(X_parts, axis=1).astype(float)

    # 3) 희귀 더미 열 드롭(옵션)
    if drop_rare_min_ads is not None and drop_rare_min_ads > 1 and not X.empty:
        nz = X.sum(axis=0)  # 원-핫이라 열 합 = 등장 광고 수
        keep = nz[nz >= float(drop_rare_min_ads)].index
        X = X[keep]
        # group_cols 재계산
        new_group_cols = {}
        for c, cols in group_cols.items():
            new_group_cols[c] = [col for col in cols if col in X.columns]
        group_cols = new_group_cols

    # 4) 수치형 가격 추가 (있는 컬럼 하나라도 발견되면 사용)
    price_col = next((c for c in PRICE_CANDIDATES if c in meta.columns), None)
    if price_col is not None:
        price = (meta[[id_col, price_col]]
                 .dropna()
                 .groupby(id_col)[price_col]
                 .median())
        X["ad_media_price_log"] = np.log1p(price).reindex(X.index).fillna(0.0)

    # 5) (옵션) 그룹 균형화: 그룹별 분산 합이 비슷해지도록 스케일링
    if group_balance and not X.empty:
        for g, cols in group_cols.items():
            if not cols:
                continue
            # 그룹 내 열들의 표준편차 합으로 나눠서 그룹 간 영향 균형
            sd_sum = X[cols].std(ddof=0).replace(0, 1.0).sum()
            scale = 1.0 / float(sd_sum if sd_sum > 0 else 1.0)
            X[cols] = X[cols] * scale

    # 6) z-score (열 단위)
    mu = X.mean()
    sigma = X.std(ddof=0).replace(0, 1.0)
    A_z = (X - mu) / (sigma + 1e-9)

    if use_float32:
        A_z = A_z.astype(np.float32)
        mu = mu.astype(np.float32)
        sigma = sigma.astype(np.float32)

    store = dict(
        A_z=A_z,
        mu=mu,
        sigma=sigma,
        cols=A_z.columns.tolist(),
        id_col=id_col,
        group_cols=group_cols,
        price_col=price_col
    )
    return store

def find_similar_ads_singlecat(
    ad_meta: pd.DataFrame,
    target_ads_id: int,
    K: int = 30,
    beta: float = 1.0,
    **space_kwargs
):
    """
    단일 카테고리 원-핫 + 가격(log) 기반 코사인 유사도 Top-K
    """
    store = build_ad_feature_space_singlecat(ad_meta, **space_kwargs)
    A = store["A_z"]
    if target_ads_id not in A.index:
        raise ValueError(f"ads_idx={target_ads_id} not found in feature space (ad_meta를 확인하세요)")

    M = A.values
    a = A.loc[target_ads_id].values
    # L2 정규화(코사인)
    a = a / (np.linalg.norm(a) + 1e-12)
    norms = np.sqrt((M*M).sum(axis=1)) + 1e-12
    sims = (M @ a) / norms

    # 자기 자신 제외
    idx_all = np.arange(len(A))
    self_pos = int(np.where(A.index.values == int(target_ads_id))[0][0])
    sims[self_pos] = -np.inf

    K = min(K, len(sims))
    top = np.argpartition(-sims, K-1)[:K]
    top = top[np.argsort(-sims[top])]

    sim_vals = sims[top]
    weights = np.power(np.clip(sim_vals, 0, 1), beta)
    weights = weights / (weights.sum() + 1e-12)

    out = pd.DataFrame({
        "ads_idx": A.index.values[top].astype(int),
        "sim": sim_vals,
        "weight": weights
    }).reset_index(drop=True)

    # 디버그용: 어떤 피처가 비슷하게 맞았는지 간단 요약
    # (동일 원-핫이 많이 겹칠수록 sim이 올라감)
    return out, store

# ================= 사용 예 =================
# ad_meta = pd.read_csv("/mnt/data/광고도메인리스트.csv", encoding="utf-8-sig")
# target_id = 9982
# sim_ads, feat_store = find_similar_ads_singlecat(
#     ad_meta, target_id, K=30, beta=1.0,
#     drop_rare_min_ads=3,    # 희귀 라벨 열 제거(없애려면 None)
#     group_balance=False     # 그룹 균형화 원하면 True
# )
# display(sim_ads.head(20))


In [142]:
ad_meta = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/광고도메인리스트.csv", encoding="utf-8-sig")
target_id = 9982
sim_ads, feat_store = find_similar_ads_singlecat(
    ad_meta, target_id, K=5, beta=1.0,
    drop_rare_min_ads=3,    # 희귀 라벨 열 제거(없애려면 None)
    group_balance=False     # 그룹 균형화 원하면 True
)
display(sim_ads.head(20))


Unnamed: 0,ads_idx,sim,weight
0,34045,1.0,0.209587
1,19488,0.965198,0.202293
2,446054,0.941255,0.197275
3,18504,0.939069,0.196816
4,11442,0.925773,0.19403


In [143]:
ads_seg = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/ads_segment.csv")

In [145]:
ads_seg[ads_seg['ads_idx'].isin([9982,34045,19488,446054,180504,11442])]

Unnamed: 0.1,Unnamed: 0,ads_idx,media_count,user_count,total_clicks,total_conversions,ads_category,domain,ads_os_type,ads_order,ctit_mean,ctit_median,ads_rejoin_type,contract_price,media_price,first_click,last_click,ads_name,ads_sdate,expire,days_active,daily_avg_conversions,cvr,margin,roi,total_net_return,ads_level_detailed,daily_clicks,daily_users,scale_level,performance_level,ads_level,media_score,conv_score,clicks_score,stability_score,cvr_score,total_score,ads_size,cluster
6,6,9982,36,22510,23187,12981,3,미디어/컨텐츠,7,2729900,114.674062,118.0,NONE,230,160,2025-07-26 03:09:06,2025-08-25 11:23:42,강원일보 네이버 뉴스,2021-02-05 14:00:00,0,31,418.741935,0.6,70,0.4,908670,MEGA,747.967742,726.129032,MEGA,HIGH,MEGA_HIGH,4,4,4,4,1,17,MEGA,0
27,27,11442,20,1333,2314,502,3,식음료,7,2760000,1966.926295,2050.0,NONE,280,120,2025-08-12 00:09:48,2025-08-18 03:16:44,[친한친구] 베지밀 공식 인스타그램,2021-06-18 00:00:00,1,7,71.714286,0.2,160,1.3,80320,MEGA,330.571429,190.428571,MEGA,HIGH,MEGA_HIGH,4,3,3,2,0,12,LARGE,0
64,64,19488,97,12099,12337,6372,3,미디어/컨텐츠,2,2729100,90.032486,103.0,NONE,230,160,2025-07-26 06:09:04,2025-08-25 06:39:19,광주방송 네이버 뉴스,2022-09-23 12:00:00,0,31,205.548387,0.5,70,0.4,446040,MEGA,397.967742,390.290323,MEGA,HIGH,MEGA_HIGH,4,4,3,4,0,15,MEGA,0
227,227,34045,10,5749,16083,727,3,미디어/컨텐츠,7,2729400,8071.803301,76.0,NONE,230,160,2025-07-26 06:16:06,2025-08-25 11:21:18,이마트 유튜브,2024-04-19 15:00:00,0,31,23.451613,0.0,70,0.4,50890,MEGA,518.806452,185.451613,MEGA,GOOD,MEGA_GOOD,3,2,4,4,0,13,MEGA,0
9017,9017,446054,2,30,46,0,3,커머스,7,2730100,,,NONE,200,160,2025-08-21 11:48:32,2025-08-21 15:06:22,[쇼핑라이브하트+채팅] 캔코 새학기 특가할인 이벤트!,2025-08-21 11:00:00,0,1,0.0,0.0,40,0.2,0,MEDIUM,46.0,30.0,LARGE,LOW,LARGE_LOW,2,0,2,0,0,4,SMALL,0


In [148]:
import pandas as pd
import numpy as np

# 1) 데이터 로드 -------------------------------------------------
# 시간별 집계 파일 예시(열: rpt_time_date, ads_idx, mda_idx, rpt_time_clk, rpt_time_turn)
perf_df = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/수정_시간별적립보고서(최종).csv",
                      encoding="utf-8-sig")

# 광고 메타(열: ads_idx, domain, ads_category, ads_os_type, ads_type, ads_rejoin_type, ads_media_price ...)
ad_meta_df = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/광고도메인리스트.csv",
                         encoding="utf-8-sig")

# 2) 최소 컬럼 체크/보정 -----------------------------------------
# 날짜 컬럼 통일 (함수는 아래 셋 중 아무거나 있으면 자동 인식)
if "rpt_time_date" not in perf_df.columns:
    if "click_day" in perf_df.columns:
        perf_df["rpt_time_date"] = pd.to_datetime(perf_df["click_day"])
    elif "click_date" in perf_df.columns:
        perf_df["rpt_time_date"] = pd.to_datetime(perf_df["click_date"])
    else:
        raise ValueError("perf_df에 날짜 열이 없습니다. (rpt_time_date / click_day / click_date 중 하나 필요)")

# 클릭/전환 집계 컬럼이 없고 로그라면, 그냥 그대로 두세요.
# predict_media_cvr가 로그 포맷(click_key/conversion)도 자동 처리합니다.

# ads_idx/mda_idx는 반드시 필요
missing = [c for c in ["ads_idx","mda_idx"] if c not in perf_df.columns]
if missing:
    raise ValueError(f"perf_df에 {missing} 컬럼이 필요합니다.")

# 3) 예측 + 평가 -------------------------------------------------
# 아래 predict_media_cvr / evaluate_against_actual 함수는 이전 셀에서 정의해둔 그대로 사용
TARGET_AD = 9982  # 원하는 광고 ID로 변경

pred_df, cohort_df, info = predict_media_cvr(
    perf=perf_df,
    ad_meta=ad_meta_df,
    target_ad=TARGET_AD,
    K=50, beta_sim=1.0,
    L_days=30, H_days=30,
    alpha_prior=2.0, beta_prior=120.0,
    blend_kappa=15.0,
    domain_weight=1.0,           # 도메인 영향 키우려면 2~3
    restrict_same_domain=False   # 같은 도메인만 후보로 제한하려면 True
)

metrics, table = evaluate_against_actual(perf_df, pred_df, target_ad=TARGET_AD, L_days=30)

display(table.head(20))   # 예측과 실측이 조인된 상위 20행 미리보기
print(metrics)            # RMSE/MAE/WMAE/상관/Precision@N


Unnamed: 0,mda_idx,cohort_eff_clicks,cohort_eff_convs,coverage_ads,cvr_m,cvr_mc,cvr_cohort,baseline_cvr,pred_cvr,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv,cvr,per_1000_clicks_conv_act,clicks,RMSE_cvr,MAE_cvr,WMAE_cvr,Pearson,Spearman,P5,P10
0,1025,1.019608,1.019608,1,0.549827,0.847966,0.024546,0.847966,0.795557,795.557122,78.0,62.053456,0.965831,965.831435,439.0,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
1,1032,1.72549,1.647059,1,0.706681,0.780031,0.029477,0.780031,0.7026,702.600055,99.0,69.557405,0.962389,962.389381,452.0,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
2,1027,0.496732,0.496732,1,0.464252,0.720559,0.020382,0.720559,0.698115,698.115423,38.0,26.528386,0.951299,951.298701,308.0,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
3,1029,1.281046,1.228758,1,0.698301,0.751852,0.02619,0.751852,0.694754,694.754429,77.368421,53.752053,0.96648,966.480447,179.0,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
4,1026,1.437908,1.30719,1,0.601795,0.754414,0.026792,0.754414,0.690765,690.7653,86.842105,59.987513,0.906475,906.47482,278.0,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
5,678,6.614379,6.379085,1,0.545877,0.889945,0.065149,0.889945,0.637543,637.542633,303.6,193.557943,,,,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
6,757,7.03268,6.901961,1,0.586401,0.880464,0.06899,0.880464,0.621447,621.446987,322.8,200.603087,,,,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
7,756,9.542484,9.333333,1,0.579098,0.89628,0.086157,0.89628,0.581292,581.292422,438.0,254.606081,,,,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
8,677,4.54902,2.69281,1,0.596462,0.71308,0.037083,0.71308,0.555777,555.776903,200.769231,111.582901,,,,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
9,725,7.189542,4.888889,1,0.532367,0.759278,0.053324,0.759278,0.530545,530.544597,284.482759,150.93079,,,,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5


{'RMSE_cvr': 0.3452279254290426, 'MAE_cvr': 0.27263382631901967, 'WMAE_cvr': 0.20277130909980418, 'Pearson': 0.5275134862832426, 'Spearman': 0.3889245847371603, 'P5': 0.6, 'P10': 0.5}


In [149]:
import numpy as np
import pandas as pd

# =========================
# 0) 작은 유틸
# =========================
def _norm_meta(df):
    df = df.copy()
    for c in ["domain","ads_type","ads_rejoin_type"]:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip()
    return df

def _pick_date_col(df):
    for c in ["rpt_time_date","click_day","click_date"]:
        if c in df.columns:
            return c
    raise ValueError("날짜 열(rpt_time_date/click_day/click_date) 없음")

def _clicks_convs_cols(df):
    # 집계 or 로그 모두 지원
    clicks = None; convs = None
    if "rpt_time_clk" in df.columns: clicks = "rpt_time_clk"
    elif "clicks" in df.columns:     clicks = "clicks"
    elif "click_key" in df.columns:  clicks = None  # 로그면 size()로 계산

    if "rpt_time_turn" in df.columns: convs = "rpt_time_turn"
    elif "conversions" in df.columns: convs = "conversions"
    elif "conversion" in df.columns:  convs = "conversion"

    return clicks, convs

def _standardize(s):
    s = pd.to_numeric(s, errors="coerce")
    mu, sd = np.nanmean(s), np.nanstd(s)
    sd = 1.0 if (sd is None or sd == 0 or np.isnan(sd)) else sd
    return (s - mu) / sd

def _cosine_sim_matrix(A, a):
    # A: (N, D), a: (D,)
    A = np.asarray(A, dtype=float)
    a = np.asarray(a, dtype=float)
    a = a / (np.linalg.norm(a) + 1e-12)
    norms = np.sqrt((A*A).sum(1)) + 1e-12
    return (A @ a) / norms

# =========================
# 1) 유사 광고 코호트 뽑기
# =========================
def build_feature_space(
    ad_meta,
    cat_cols=("domain","ads_category","ads_os_type","ads_type","ads_rejoin_type"),
    num_cols=("ads_media_price",),
    domain_weight=1.0
):
    """
    - 범주형: 원-핫
    - 수치형: log1p 후 표준화
    """
    df = _norm_meta(ad_meta).copy()
    df = df.drop_duplicates("ads_idx")

    # 수치형 준비 (대체 컬럼 허용)
    num_cols = list(num_cols)
    if ("ads_media_price" not in df.columns) and ("media_price" in df.columns):
        num_cols = ["media_price"] + [c for c in num_cols if c!="ads_media_price"]

    X_list = []

    # 원-핫
    for c in cat_cols:
        if c in df.columns:
            one = pd.get_dummies(df[c].astype(str), prefix=c, dtype=float)
            # 도메인 가중치 (영향력 키우고 싶을 때)
            if c == "domain" and domain_weight != 1.0:
                one = one * float(domain_weight)
            X_list.append(one)

    # 수치형
    for c in num_cols:
        if c in df.columns:
            col = pd.Series(np.log1p(pd.to_numeric(df[c], errors="coerce")))
            X_list.append(_standardize(col).to_frame(f"{c}_z"))

    if not X_list:
        raise ValueError("쓸 수 있는 피처가 없습니다. ad_meta 컬럼을 확인하세요.")

    X = pd.concat(X_list, axis=1).fillna(0.0)
    X.index = df["ads_idx"].astype(int)
    return X, df[["ads_idx"] + [c for c in cat_cols if c in df.columns]]

def find_similar_ads(
    ad_meta, target_ad, K=50, beta_sim=1.0,
    domain_weight=1.0, restrict_same_domain=False
):
    X, meta_small = build_feature_space(ad_meta, domain_weight=domain_weight)

    if target_ad not in X.index:
        raise ValueError(f"target_ad({target_ad})가 ad_meta에 없습니다.")

    # 같은 도메인만 후보로 제한 (옵션)
    cand_idx = X.index
    if restrict_same_domain and "domain" in meta_small.columns:
        tdom = (meta_small.loc[meta_small["ads_idx"]==target_ad, "domain"]
                         .dropna().astype(str).str.strip())
        if not tdom.empty:
            tdom = tdom.iat[0]
            same_dom_ads = meta_small[meta_small["domain"].astype(str).str.strip()==tdom]["ads_idx"]
            cand_idx = pd.Index(same_dom_ads.astype(int))

    # 코사인 유사도
    a = X.loc[target_ad].values
    A = X.loc[cand_idx].drop(index=target_ad, errors="ignore")
    sims = _cosine_sim_matrix(A.values, a)

    A = A.assign(sim=sims)
    A = A.sort_values("sim", ascending=False).head(K)

    w = np.power(np.clip(A["sim"].values, 0, 1), beta_sim)
    w = w / (w.sum() + 1e-12)
    cohort = A.assign(weight=w)[["weight","sim"]]
    cohort.index.name = "ads_idx"
    return cohort

# =========================
# 2) 코호트 기반 매체별 CVR/전환수 예측
# =========================
def predict_media_cvr(
    perf, ad_meta, target_ad,
    K=50, beta_sim=1.0,
    L_days=30, H_days=30,
    alpha_prior=2.0, beta_prior=120.0,
    blend_kappa=15.0,
    domain_weight=1.0, restrict_same_domain=False
):
    # --- 코호트 추출
    cohort = find_similar_ads(
        ad_meta, target_ad, K=K, beta_sim=beta_sim,
        domain_weight=domain_weight, restrict_same_domain=restrict_same_domain
    )

    # --- 데이터 윈도우링 + 표준화
    perf = perf.copy()
    date_col = _pick_date_col(perf)
    perf[date_col] = pd.to_datetime(perf[date_col])
    wend = perf[date_col].max().normalize()
    start = wend - pd.Timedelta(days=L_days-1)
    hist = perf[(perf[date_col]>=start) & (perf[date_col]<=wend)].copy()

    # 클릭/전환 열 파악
    clk_col, cv_col = _clicks_convs_cols(hist)

    # target의 카테고리(있으면) 매핑
    cat_map = ad_meta.drop_duplicates("ads_idx").set_index("ads_idx")["ads_category"] if "ads_category" in ad_meta.columns else None
    if cat_map is not None and "ads_category" not in hist.columns:
        hist = hist.merge(cat_map.rename("ads_category"), left_on="ads_idx", right_index=True, how="left")

    # 코호트 가중 집계 (ads_idx, mda_idx)
    sub = hist[hist["ads_idx"].isin(cohort.index)].copy()
    if sub.empty:
        raise ValueError("코호트에 해당하는 히스토리 데이터가 없습니다.")

    # 집계 방식 분기
    if clk_col is None:
        g = sub.groupby(["ads_idx","mda_idx"]).agg(
            clicks=("ads_idx","size"),
            convs=("conversion","sum")
        ).reset_index()
    else:
        g = sub.groupby(["ads_idx","mda_idx"]).agg(
            clicks=(clk_col,"sum"),
            convs=(cv_col,"sum")
        ).reset_index()

    w_map = cohort["weight"].to_dict()
    g["w"] = g["ads_idx"].map(w_map).fillna(0.0)
    g["w_clicks"] = g["w"] * g["clicks"]
    g["w_convs"]  = g["w"] * g["convs"]

    agg = g.groupby("mda_idx").agg(
        cohort_eff_clicks=("w_clicks","sum"),
        cohort_eff_convs=("w_convs","sum"),
        coverage_ads=("ads_idx","nunique")
    )

    # 매체 베이스라인
    if clk_col is None:
        base_m = hist.groupby("mda_idx").agg(
            clicks=("ads_idx","size"), convs=("conversion","sum")
        )
    else:
        base_m = hist.groupby("mda_idx").agg(
            clicks=(clk_col,"sum"), convs=(cv_col,"sum")
        )
    base_m["cvr_m"] = (base_m["convs"] + alpha_prior) / (base_m["clicks"] + alpha_prior + beta_prior)

    # 매체×카테고리 베이스라인(있으면)
    if "ads_category" in hist.columns:
        tcat = ad_meta.loc[ad_meta["ads_idx"]==target_ad, "ads_category"].dropna()
        tcat = int(tcat.mode().iat[0]) if not tcat.empty else None
    else:
        tcat = None

    base_mc = pd.DataFrame()
    if (tcat is not None) and ("ads_category" in hist.columns):
        subcat = hist[hist["ads_category"]==tcat]
        if not subcat.empty:
            if clk_col is None:
                base_mc = subcat.groupby("mda_idx").agg(
                    clicks=("ads_idx","size"), convs=("conversion","sum")
                )
            else:
                base_mc = subcat.groupby("mda_idx").agg(
                    clicks=(clk_col,"sum"), convs=(cv_col,"sum")
                )
            base_mc["cvr_mc"] = (base_mc["convs"] + alpha_prior) / (base_mc["clicks"] + alpha_prior + beta_prior)

    # 코호트 CVR + 블렌딩
    out = agg.join(base_m[["cvr_m"]], how="left").join(base_mc[["cvr_mc"]], how="left")
    out = out.fillna({"cvr_m":0.0, "cvr_mc":np.nan})

    cohort_cvr = (out["cohort_eff_convs"] + alpha_prior) / (out["cohort_eff_clicks"] + alpha_prior + beta_prior)
    base = out["cvr_mc"].fillna(out["cvr_m"])  # 카테고리 우선
    w1 = out["cohort_eff_clicks"] / (out["cohort_eff_clicks"] + float(blend_kappa))
    pred_cvr = w1 * cohort_cvr + (1.0 - w1) * base

    pred = out.copy()
    pred["cvr_cohort"] = cohort_cvr
    pred["baseline_cvr"] = base
    pred["pred_cvr"] = pred_cvr
    pred["per_1000_clicks_conv"] = pred["pred_cvr"] * 1000.0

    # 시나리오: 코호트 일평균 클릭 × H_days
    if clk_col is None:
        per_day = (sub.groupby(["mda_idx", hist[date_col].dt.normalize()])["ads_idx"]
                     .size().rename("clk").reset_index())
    else:
        per_day = (sub.groupby(["mda_idx", hist[date_col].dt.normalize()])[clk_col]
                     .sum().rename("clk").reset_index())
    daily = per_day.groupby("mda_idx")["clk"].mean()
    pred["scenarioB_clicks"] = daily.reindex(pred.index).fillna(0.0).values * float(H_days)
    pred["scenarioB_conv"]   = pred["pred_cvr"] * pred["scenarioB_clicks"]

    pred = pred.reset_index().rename(columns={"index":"mda_idx"})
    pred = pred.sort_values("per_1000_clicks_conv", ascending=False).reset_index(drop=True)
    return pred, cohort, {"window_end": str(wend.date()), "L_days": L_days, "H_days": H_days}

# =========================
# 3) (선택) 실제와 비교 평가
# =========================
def evaluate_against_actual(perf, pred_df, target_ad, L_days=30):
    perf = perf.copy()
    date_col = _pick_date_col(perf)
    perf[date_col] = pd.to_datetime(perf[date_col])
    wend = perf[date_col].max().normalize()
    start = wend - pd.Timedelta(days=L_days-1)
    hist = perf[(perf[date_col]>=start) & (perf[date_col]<=wend)]
    clk_col, cv_col = _clicks_convs_cols(hist)

    act = hist[hist["ads_idx"]==target_ad]
    if act.empty:
        # 실측이 없으면 평가 불가
        metrics = {"RMSE_cvr":np.nan,"MAE_cvr":np.nan,"WMAE_cvr":np.nan,"Pearson":np.nan,"Spearman":np.nan,"P5":np.nan,"P10":np.nan}
        table = pred_df.copy()
        for k,v in metrics.items(): table[k]=v
        return metrics, table

    if clk_col is None:
        g = act.groupby("mda_idx").agg(clicks=("ads_idx","size"), convs=("conversion","sum")).reset_index()
    else:
        g = act.groupby("mda_idx").agg(clicks=(clk_col,"sum"), convs=(cv_col,"sum")).reset_index()
    g["cvr"] = g["convs"] / g["clicks"].replace(0, np.nan)
    g["per_1000_clicks_conv"] = g["cvr"] * 1000.0

    join = pred_df.merge(g[["mda_idx","cvr","per_1000_clicks_conv","clicks"]], on="mda_idx", how="left",
                         suffixes=("","_act"))

    eval_df = join.dropna(subset=["cvr"]).copy()
    if eval_df.empty:
        metrics = {"RMSE_cvr":np.nan,"MAE_cvr":np.nan,"WMAE_cvr":np.nan,"Pearson":np.nan,"Spearman":np.nan,"P5":np.nan,"P10":np.nan}
        table = join.copy()
        for k,v in metrics.items(): table[k]=v
        return metrics, table

    # 지표
    rmse = float(np.sqrt(np.mean((eval_df["pred_cvr"]-eval_df["cvr"])**2)))
    mae  = float(np.mean(np.abs(eval_df["pred_cvr"]-eval_df["cvr"])))
    wmae = float((np.abs(eval_df["pred_cvr"]-eval_df["cvr"]) * eval_df["clicks"]).sum() /
                 (eval_df["clicks"].sum() + 1e-12))
    pear = float(eval_df[["pred_cvr","cvr"]].corr().iloc[0,1])
    spear = float(eval_df[["pred_cvr","cvr"]].rank().corr().iloc[0,1])

    def precision_at(n):
        A = set(join.sort_values("per_1000_clicks_conv", ascending=False).head(n)["mda_idx"])
        B = set(eval_df.sort_values("per_1000_clicks_conv_act", ascending=False).head(n)["mda_idx"])
        return len(A & B) / max(1,len(A))
    P5, P10 = precision_at(5), precision_at(10)

    metrics = {"RMSE_cvr":rmse,"MAE_cvr":mae,"WMAE_cvr":wmae,"Pearson":pear,"Spearman":spear,"P5":P5,"P10":P10}
    table = join.copy()
    for k,v in metrics.items(): table[k]=v
    return metrics, table

# =========================
# 4) 사용 예 (한 줄)
# =========================
# pred_df, cohort_df, info = predict_media_cvr(
#     perf=perf_df, ad_meta=ad_meta_df, target_ad=9982,
#     K=50, beta_sim=1.0, L_days=30, H_days=30,
#     alpha_prior=2.0, beta_prior=120.0, blend_kappa=15.0,
#     domain_weight=1.0, restrict_same_domain=False
# )
# metrics, table = evaluate_against_actual(perf_df, pred_df, target_ad=9982, L_days=30)
# display(table.head(20)); print(metrics)


In [150]:
pred_df, cohort_df, info = predict_media_cvr(
    perf=perf_df, ad_meta=ad_meta_df, target_ad=9982,
    K=50, beta_sim=1.0, L_days=30, H_days=30,
    alpha_prior=2.0, beta_prior=120.0, blend_kappa=15.0,
    domain_weight=1.0, restrict_same_domain=False
)
metrics, table = evaluate_against_actual(perf_df, pred_df, target_ad=9982, L_days=30)
display(table.head(20)); print(metrics)

Unnamed: 0,mda_idx,cohort_eff_clicks,cohort_eff_convs,coverage_ads,cvr_m,cvr_mc,cvr_cohort,baseline_cvr,pred_cvr,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv,cvr,per_1000_clicks_conv_act,clicks,RMSE_cvr,MAE_cvr,WMAE_cvr,Pearson,Spearman,P5,P10
0,1025,1.019608,1.019608,1,0.549827,0.847966,0.024546,0.847966,0.795557,795.557122,78.0,62.053456,0.965831,965.831435,439.0,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
1,1032,1.72549,1.647059,1,0.706681,0.780031,0.029477,0.780031,0.7026,702.600055,99.0,69.557405,0.962389,962.389381,452.0,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
2,1027,0.496732,0.496732,1,0.464252,0.720559,0.020382,0.720559,0.698115,698.115423,38.0,26.528386,0.951299,951.298701,308.0,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
3,1029,1.281046,1.228758,1,0.698301,0.751852,0.02619,0.751852,0.694754,694.754429,77.368421,53.752053,0.96648,966.480447,179.0,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
4,1026,1.437908,1.30719,1,0.601795,0.754414,0.026792,0.754414,0.690765,690.7653,86.842105,59.987513,0.906475,906.47482,278.0,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
5,678,6.614379,6.379085,1,0.545877,0.889945,0.065149,0.889945,0.637543,637.542633,303.6,193.557943,,,,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
6,757,7.03268,6.901961,1,0.586401,0.880464,0.06899,0.880464,0.621447,621.446987,322.8,200.603087,,,,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
7,756,9.542484,9.333333,1,0.579098,0.89628,0.086157,0.89628,0.581292,581.292422,438.0,254.606081,,,,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
8,677,4.54902,2.69281,1,0.596462,0.71308,0.037083,0.71308,0.555777,555.776903,200.769231,111.582901,,,,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5
9,725,7.189542,4.888889,1,0.532367,0.759278,0.053324,0.759278,0.530545,530.544597,284.482759,150.93079,,,,0.345228,0.272634,0.202771,0.527513,0.388925,0.6,0.5


{'RMSE_cvr': 0.3452279254290426, 'MAE_cvr': 0.27263382631901967, 'WMAE_cvr': 0.20277130909980418, 'Pearson': 0.5275134862832426, 'Spearman': 0.3889245847371603, 'P5': 0.6, 'P10': 0.5}


In [152]:
# ============================================================
# Cohort 기반 매체별 전환율/전환수 예측 (원-핫 + 가격 + 코호트 블렌딩)
# + 단일카테고리(원-핫) 유사광고 Top-K 헬퍼 포함
# ------------------------------------------------------------
# - perf_df: 시간별 집계 또는 클릭로그 (ads_idx, mda_idx, 날짜, 클릭/전환)
# - ad_meta_df: 광고 메타 (domain, ads_category, ads_os_type, ads_type, ads_rejoin_type, ads_media_price 등)
# ============================================================

import numpy as np
import pandas as pd

# ------------------------------
# 공통 유틸
# ------------------------------
def _norm_meta(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in ["domain","ads_type","ads_rejoin_type"]:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip()
    return df

def _pick_date_col(df: pd.DataFrame) -> str:
    for c in ["rpt_time_date","click_day","click_date"]:
        if c in df.columns:
            return c
    raise ValueError("날짜 열(rpt_time_date/click_day/click_date) 없음")

def _clicks_convs_cols(df: pd.DataFrame):
    # 집계/로그 모두 지원
    clicks = None; convs = None
    if "rpt_time_clk" in df.columns: clicks = "rpt_time_clk"
    elif "clicks" in df.columns:     clicks = "clicks"
    elif "click_key" in df.columns:  clicks = None  # 로그면 size() 사용

    if "rpt_time_turn" in df.columns: convs = "rpt_time_turn"
    elif "conversions" in df.columns: convs = "conversions"
    elif "conversion" in df.columns:  convs = "conversion"

    return clicks, convs

def _standardize(s: pd.Series) -> pd.Series:
    s = pd.to_numeric(s, errors="coerce")
    mu, sd = np.nanmean(s), np.nanstd(s)
    sd = 1.0 if (sd is None or sd == 0 or np.isnan(sd)) else sd
    return (s - mu) / sd

def _cosine_sim_matrix(A, a):
    A = np.asarray(A, dtype=float)
    a = np.asarray(a, dtype=float)
    a = a / (np.linalg.norm(a) + 1e-12)
    norms = np.sqrt((A*A).sum(1)) + 1e-12
    return (A @ a) / norms

# ------------------------------
# (A) 단일 카테고리 원-핫 + 가격(log) 유사광고 Top-K (빠른 버전)
# ------------------------------
CAT_COLS = ["domain", "ads_category", "ads_os_type", "ads_type", "ads_rejoin_type"]
PRICE_CANDIDATES = ("ads_media_price", "media_price", "contract_price")

def _mode_or_unk(s: pd.Series, unk="UNK") -> str:
    s = s.dropna().astype(str)
    if s.empty:
        return unk
    m = s.mode()
    return (m.iat[0] if not m.empty else unk)

def build_ad_feature_space_singlecat(
    ad_meta: pd.DataFrame,
    id_col: str = "ads_idx",
    drop_rare_min_ads: int | None = None,
    group_balance: bool = False,
    use_float32: bool = True
):
    meta = _norm_meta(ad_meta)
    if id_col not in meta.columns:
        raise ValueError(f"'{id_col}' column not found in ad_meta")

    cols_present = [c for c in CAT_COLS if c in meta.columns]
    red = (meta[[id_col] + cols_present]
           .groupby(id_col, as_index=False)
           .agg({c: _mode_or_unk for c in cols_present}))

    X_parts = []
    group_cols = {}
    for c in cols_present:
        d = pd.get_dummies(red[c], prefix=f"ad_{c}")
        d.index = red[id_col].astype(int).values
        X_parts.append(d)
        group_cols[c] = list(d.columns)

    X = pd.concat(X_parts, axis=1).astype(float)

    if drop_rare_min_ads is not None and drop_rare_min_ads > 1 and not X.empty:
        nz = X.sum(axis=0)
        keep = nz[nz >= float(drop_rare_min_ads)].index
        X = X[keep]
        group_cols = {c:[col for col in cols if col in X.columns] for c,cols in group_cols.items()}

    price_col = next((c for c in PRICE_CANDIDATES if c in meta.columns), None)
    if price_col is not None:
        price = (meta[[id_col, price_col]].dropna()
                 .groupby(id_col)[price_col].median())
        X["ad_media_price_log"] = np.log1p(price).reindex(X.index).fillna(0.0)

    if group_balance and not X.empty:
        for g, cols in group_cols.items():
            if not cols:
                continue
            sd_sum = X[cols].std(ddof=0).replace(0, 1.0).sum()
            scale = 1.0 / float(sd_sum if sd_sum > 0 else 1.0)
            X[cols] = X[cols] * scale

    mu = X.mean()
    sigma = X.std(ddof=0).replace(0, 1.0)
    A_z = (X - mu) / (sigma + 1e-9)

    if use_float32:
        A_z = A_z.astype(np.float32)
        mu = mu.astype(np.float32)
        sigma = sigma.astype(np.float32)

    return dict(A_z=A_z, mu=mu, sigma=sigma, cols=A_z.columns.tolist(),
                id_col=id_col, group_cols=group_cols, price_col=price_col)

def find_similar_ads_singlecat(
    ad_meta: pd.DataFrame,
    target_ads_id: int,
    K: int = 30,
    beta: float = 1.0,
    **space_kwargs
):
    store = build_ad_feature_space_singlecat(ad_meta, **space_kwargs)
    A = store["A_z"]
    if target_ads_id not in A.index:
        raise ValueError(f"ads_idx={target_ads_id} not found in feature space")

    M = A.values
    a = A.loc[target_ads_id].values
    a = a / (np.linalg.norm(a) + 1e-12)
    sims = _cosine_sim_matrix(M, a)

    pos_self = int(np.where(A.index.values == int(target_ads_id))[0][0])
    sims[pos_self] = -np.inf

    K = min(K, len(sims))
    top = np.argpartition(-sims, K-1)[:K]
    top = top[np.argsort(-sims[top])]

    sim_vals = sims[top]
    weights = np.power(np.clip(sim_vals, 0, 1), beta)
    weights = weights / (weights.sum() + 1e-12)

    out = pd.DataFrame({
        "ads_idx": A.index.values[top].astype(int),
        "sim": sim_vals,
        "weight": weights
    }).reset_index(drop=True)
    return out, store

# ------------------------------
# (B) 전체 파이프라인 (원-핫 + 가격 + 코호트 블렌딩)
# ------------------------------
def build_feature_space(
    ad_meta: pd.DataFrame,
    cat_cols=("domain","ads_category","ads_os_type","ads_type","ads_rejoin_type"),
    num_cols=("ads_media_price",),
    domain_weight=1.0
):
    df = _norm_meta(ad_meta).copy()
    df = df.drop_duplicates("ads_idx")

    num_cols = list(num_cols)
    if ("ads_media_price" not in df.columns) and ("media_price" in df.columns):
        num_cols = ["media_price"] + [c for c in num_cols if c!="ads_media_price"]

    X_list = []
    for c in cat_cols:
        if c in df.columns:
            one = pd.get_dummies(df[c].astype(str), prefix=c, dtype=float)
            if c == "domain" and domain_weight != 1.0:
                one = one * float(domain_weight)
            X_list.append(one)

    for c in num_cols:
        if c in df.columns:
            col = pd.Series(np.log1p(pd.to_numeric(df[c], errors="coerce")))
            X_list.append(_standardize(col).to_frame(f"{c}_z"))

    if not X_list:
        raise ValueError("쓸 수 있는 피처가 없습니다. ad_meta 컬럼을 확인하세요.")

    X = pd.concat(X_list, axis=1).fillna(0.0)
    X.index = df["ads_idx"].astype(int)
    return X, df[["ads_idx"] + [c for c in cat_cols if c in df.columns]]

def find_similar_ads(
    ad_meta: pd.DataFrame, target_ad: int, K=50, beta_sim=1.0,
    domain_weight=1.0, restrict_same_domain=False
):
    X, meta_small = build_feature_space(ad_meta, domain_weight=domain_weight)

    if target_ad not in X.index:
        raise ValueError(f"target_ad({target_ad})가 ad_meta에 없습니다.")

    cand_idx = X.index
    if restrict_same_domain and "domain" in meta_small.columns:
        tdom = (meta_small.loc[meta_small["ads_idx"]==target_ad, "domain"]
                          .dropna().astype(str).str.strip())
        if not tdom.empty:
            tdom = tdom.iat[0]
            same_dom_ads = meta_small[meta_small["domain"].astype(str).str.strip()==tdom]["ads_idx"]
            cand_idx = pd.Index(same_dom_ads.astype(int))

    a = X.loc[target_ad].values
    A = X.loc[cand_idx].drop(index=target_ad, errors="ignore")
    sims = _cosine_sim_matrix(A.values, a)

    A = A.assign(sim=sims).sort_values("sim", ascending=False).head(K)
    w = np.power(np.clip(A["sim"].values, 0, 1), beta_sim)
    w = w / (w.sum() + 1e-12)
    cohort = A.assign(weight=w)[["weight","sim"]]
    cohort.index.name = "ads_idx"
    return cohort

def predict_media_cvr(
    perf: pd.DataFrame, ad_meta: pd.DataFrame, target_ad: int,
    K=50, beta_sim=1.0,
    L_days=30, H_days=30,
    alpha_prior=2.0, beta_prior=120.0,
    blend_kappa=15.0,
    domain_weight=1.0, restrict_same_domain=False
):
    # 1) 유사광고 코호트
    cohort = find_similar_ads(
        ad_meta, target_ad, K=K, beta_sim=beta_sim,
        domain_weight=domain_weight, restrict_same_domain=restrict_same_domain
    )

    # 2) 윈도우링
    perf = perf.copy()
    date_col = _pick_date_col(perf)
    perf[date_col] = pd.to_datetime(perf[date_col])
    wend = perf[date_col].max().normalize()
    start = wend - pd.Timedelta(days=L_days-1)
    hist = perf[(perf[date_col]>=start) & (perf[date_col]<=wend)].copy()

    clk_col, cv_col = _clicks_convs_cols(hist)

    # 3) 타깃 카테고리 붙이기(있으면)
    cat_map = ad_meta.drop_duplicates("ads_idx").set_index("ads_idx")["ads_category"] if "ads_category" in ad_meta.columns else None
    if cat_map is not None and "ads_category" not in hist.columns:
        hist = hist.merge(cat_map.rename("ads_category"), left_on="ads_idx", right_index=True, how="left")

    # 4) 코호트 가중 집계
    sub = hist[hist["ads_idx"].isin(cohort.index)].copy()
    if sub.empty:
        raise ValueError("코호트에 해당하는 히스토리 데이터가 없습니다.")

    if clk_col is None:
        g = sub.groupby(["ads_idx","mda_idx"]).agg(
            clicks=("ads_idx","size"), convs=("conversion","sum")
        ).reset_index()
    else:
        g = sub.groupby(["ads_idx","mda_idx"]).agg(
            clicks=(clk_col,"sum"), convs=(cv_col,"sum")
        ).reset_index()

    w_map = cohort["weight"].to_dict()
    g["w"] = g["ads_idx"].map(w_map).fillna(0.0)
    g["w_clicks"] = g["w"] * g["clicks"]
    g["w_convs"]  = g["w"] * g["convs"]

    agg = g.groupby("mda_idx").agg(
        cohort_eff_clicks=("w_clicks","sum"),
        cohort_eff_convs=("w_convs","sum"),
        coverage_ads=("ads_idx","nunique")
    )

    # 5) 베이스라인
    if clk_col is None:
        base_m = hist.groupby("mda_idx").agg(
            clicks=("ads_idx","size"), convs=("conversion","sum")
        )
    else:
        base_m = hist.groupby("mda_idx").agg(
            clicks=(clk_col,"sum"), convs=(cv_col,"sum")
        )
    base_m["cvr_m"] = (base_m["convs"] + alpha_prior) / (base_m["clicks"] + alpha_prior + beta_prior)

    tcat = None
    if "ads_category" in hist.columns:
        tser = ad_meta.loc[ad_meta["ads_idx"]==target_ad, "ads_category"].dropna()
        tcat = int(tser.mode().iat[0]) if not tser.empty else None

    base_mc = pd.DataFrame()
    if (tcat is not None) and ("ads_category" in hist.columns):
        subcat = hist[hist["ads_category"]==tcat]
        if not subcat.empty:
            if clk_col is None:
                base_mc = subcat.groupby("mda_idx").agg(
                    clicks=("ads_idx","size"), convs=("conversion","sum")
                )
            else:
                base_mc = subcat.groupby("mda_idx").agg(
                    clicks=(clk_col,"sum"), convs=(cv_col,"sum")
                )
            base_mc["cvr_mc"] = (base_mc["convs"] + alpha_prior) / (base_mc["clicks"] + alpha_prior + beta_prior)

    out = agg.join(base_m[["cvr_m"]], how="left").join(base_mc[["cvr_mc"]], how="left")
    out = out.fillna({"cvr_m":0.0, "cvr_mc":np.nan})

    cohort_cvr = (out["cohort_eff_convs"] + alpha_prior) / (out["cohort_eff_clicks"] + alpha_prior + beta_prior)
    base = out["cvr_mc"].fillna(out["cvr_m"])
    w1 = out["cohort_eff_clicks"] / (out["cohort_eff_clicks"] + float(blend_kappa))
    pred_cvr = w1 * cohort_cvr + (1.0 - w1) * base

    pred = out.copy()
    pred["cvr_cohort"] = cohort_cvr
    pred["baseline_cvr"] = base
    pred["pred_cvr"] = pred_cvr
    pred["per_1000_clicks_conv"] = pred["pred_cvr"] * 1000.0

    # 6) H일 시나리오 클릭/전환
    if clk_col is None:
        per_day = (sub.groupby(["mda_idx", hist[date_col].dt.normalize()])["ads_idx"]
                     .size().rename("clk").reset_index())
    else:
        per_day = (sub.groupby(["mda_idx", hist[date_col].dt.normalize()])[clk_col]
                     .sum().rename("clk").reset_index())
    daily = per_day.groupby("mda_idx")["clk"].mean()
    pred["scenarioB_clicks"] = daily.reindex(pred.index).fillna(0.0).values * float(H_days)
    pred["scenarioB_conv"]   = pred["pred_cvr"] * pred["scenarioB_clicks"]

    pred = pred.reset_index().rename(columns={"index":"mda_idx"})
    pred = pred.sort_values("per_1000_clicks_conv", ascending=False).reset_index(drop=True)
    return pred, cohort, {"window_end": str(wend.date()), "L_days": L_days, "H_days": H_days}

def evaluate_against_actual(perf: pd.DataFrame, pred_df: pd.DataFrame, target_ad: int, L_days=30):
    perf = perf.copy()
    date_col = _pick_date_col(perf)
    perf[date_col] = pd.to_datetime(perf[date_col])
    wend = perf[date_col].max().normalize()
    start = wend - pd.Timedelta(days=L_days-1)
    hist = perf[(perf[date_col]>=start) & (perf[date_col]<=wend)]
    clk_col, cv_col = _clicks_convs_cols(hist)

    act = hist[hist["ads_idx"]==target_ad]
    if act.empty:
        metrics = {"RMSE_cvr":np.nan,"MAE_cvr":np.nan,"WMAE_cvr":np.nan,"Pearson":np.nan,"Spearman":np.nan,"P5":np.nan,"P10":np.nan}
        table = pred_df.copy()
        for k,v in metrics.items(): table[k]=v
        return metrics, table

    if clk_col is None:
        g = act.groupby("mda_idx").agg(clicks=("ads_idx","size"), convs=("conversion","sum")).reset_index()
    else:
        g = act.groupby("mda_idx").agg(clicks=(clk_col,"sum"), convs=(cv_col,"sum")).reset_index()
    g["cvr"] = g["convs"] / g["clicks"].replace(0, np.nan)
    g["per_1000_clicks_conv"] = g["cvr"] * 1000.0

    join = pred_df.merge(g[["mda_idx","cvr","per_1000_clicks_conv","clicks"]],
                         on="mda_idx", how="left", suffixes=("","_act"))

    eval_df = join.dropna(subset=["cvr"]).copy()
    if eval_df.empty:
        metrics = {"RMSE_cvr":np.nan,"MAE_cvr":np.nan,"WMAE_cvr":np.nan,"Pearson":np.nan,"Spearman":np.nan,"P5":np.nan,"P10":np.nan}
        table = join.copy()
        for k,v in metrics.items(): table[k]=v
        return metrics, table

    rmse = float(np.sqrt(np.mean((eval_df["pred_cvr"]-eval_df["cvr"])**2)))
    mae  = float(np.mean(np.abs(eval_df["pred_cvr"]-eval_df["cvr"])))
    wmae = float((np.abs(eval_df["pred_cvr"]-eval_df["cvr"]) * eval_df["clicks"]).sum() /
                 (eval_df["clicks"].sum() + 1e-12))
    pear = float(eval_df[["pred_cvr","cvr"]].corr().iloc[0,1])
    spear = float(eval_df[["pred_cvr","cvr"]].rank().corr().iloc[0,1])

    def precision_at(n):
        A = set(join.sort_values("per_1000_clicks_conv", ascending=False).head(n)["mda_idx"])
        B = set(eval_df.sort_values("per_1000_clicks_conv_act", ascending=False).head(n)["mda_idx"])
        return len(A & B) / max(1,len(A))
    P5, P10 = precision_at(5), precision_at(10)

    metrics = {"RMSE_cvr":rmse,"MAE_cvr":mae,"WMAE_cvr":wmae,"Pearson":pear,"Spearman":spear,"P5":P5,"P10":P10}
    table = join.copy()
    for k,v in metrics.items(): table[k]=v
    return metrics, table

# ------------------------------------------------------------
# 사용 예시 (주석 해제해서 실행)
# ------------------------------------------------------------
# perf_df = pd.read_csv("/path/수정_시간별적립보고서(최종).csv", encoding="utf-8-sig")
# ad_meta_df = pd.read_csv("/path/광고도메인리스트.csv", encoding="utf-8-sig")
# TARGET_AD = 9982
#
# pred_df, cohort_df, info = predict_media_cvr(
#     perf=perf_df, ad_meta=ad_meta_df, target_ad=TARGET_AD,
#     K=50, beta_sim=1.0, L_days=30, H_days=30,
#     alpha_prior=2.0, beta_prior=120.0, blend_kappa=15.0,
#     domain_weight=1.0, restrict_same_domain=False
# )
# metrics, table = evaluate_against_actual(perf_df, pred_df, target_ad=TARGET_AD, L_days=30)
# display(table.head(20)); print(metrics)
#
# # 유사광고 Top-K만 보고 싶으면:
# sim_ads, feat_store = find_similar_ads_singlecat(
#     ad_meta_df, TARGET_AD, K=30, beta=1.0,
#     drop_rare_min_ads=3, group_balance=False
# )
# display(sim_ads.head(20))


In [160]:
perf_df = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/수정_시간별적립보고서(최종).csv", encoding="utf-8-sig")
ad_meta_df = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/광고도메인리스트.csv", encoding="utf-8-sig")
TARGET_AD = 13928	

pred_df, cohort_df, info = predict_media_cvr(
    perf=perf_df, ad_meta=ad_meta_df, target_ad=TARGET_AD,
    K=5, beta_sim=1.0, L_days=30, H_days=7,
    alpha_prior=2.0, beta_prior=120.0, blend_kappa=15.0,
    domain_weight=1.0, restrict_same_domain=False
)
metrics, table = evaluate_against_actual(perf_df, pred_df, target_ad=TARGET_AD, L_days=30)
display(table.head(20)); print(metrics)

# 유사광고 Top-K만 보고 싶으면:
sim_ads, feat_store = find_similar_ads_singlecat(
    ad_meta_df, TARGET_AD, K=5, beta=1.0,
    drop_rare_min_ads=3, group_balance=False
)
display(sim_ads.head(20))

Unnamed: 0,mda_idx,cohort_eff_clicks,cohort_eff_convs,coverage_ads,cvr_m,cvr_mc,cvr_cohort,baseline_cvr,pred_cvr,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv,cvr,per_1000_clicks_conv_act,clicks,RMSE_cvr,MAE_cvr,WMAE_cvr,Pearson,Spearman,P5,P10
0,341,305.8,290.4,1,0.637559,0.35559,0.683497,0.35559,0.668165,668.164667,1189.222222,794.596271,0.959016,959.016393,122.0,0.517885,0.493728,0.425509,0.133438,-0.053775,0.0,0.0
1,340,309.4,289.8,1,0.670431,0.392349,0.676402,0.392349,0.663268,663.267987,1203.222222,798.058781,0.966102,966.101695,118.0,0.517885,0.493728,0.425509,0.133438,-0.053775,0.0,0.0
2,303,244.0,233.6,1,0.682367,0.426667,0.643716,0.426667,0.631145,631.145431,948.888889,598.886887,0.897959,897.959184,98.0,0.517885,0.493728,0.425509,0.133438,-0.053775,0.0,0.0
3,324,245.6,232.4,1,0.697718,0.416717,0.63765,0.416717,0.624933,624.932875,955.111111,596.880333,0.960784,960.784314,102.0,0.517885,0.493728,0.425509,0.133438,-0.053775,0.0,0.0
4,701,270.2,252.4,1,0.38212,0.039478,0.648649,0.039478,0.61661,616.609541,1050.777778,647.919603,0.960526,960.526316,76.0,0.517885,0.493728,0.425509,0.133438,-0.053775,0.0,0.0
5,579,258.6,237.2,1,0.345042,0.027143,0.628481,0.027143,0.595513,595.51326,1005.666667,598.887835,0.943396,943.396226,53.0,0.517885,0.493728,0.425509,0.133438,-0.053775,0.0,0.0
6,801,209.0,195.0,1,0.655407,0.494006,0.595166,0.494006,0.588392,588.392066,812.777778,478.231996,0.930233,930.232558,86.0,0.517885,0.493728,0.425509,0.133438,-0.053775,0.0,0.0
7,1020,208.6,190.8,1,0.614679,0.573438,0.583182,0.573438,0.582528,582.528445,811.222222,472.56002,0.893617,893.617021,94.0,0.517885,0.493728,0.425509,0.133438,-0.053775,0.0,0.0
8,621,208.8,197.8,1,0.403661,0.058674,0.60399,0.058674,0.567441,567.440966,812.0,460.762065,0.921875,921.875,64.0,0.517885,0.493728,0.425509,0.133438,-0.053775,0.0,0.0
9,563,3730.4,2143.4,1,0.570481,0.570949,0.5569,0.570949,0.556956,556.955862,43521.333333,24239.461741,,,,0.517885,0.493728,0.425509,0.133438,-0.053775,0.0,0.0


{'RMSE_cvr': 0.5178846717751774, 'MAE_cvr': 0.49372812293256735, 'WMAE_cvr': 0.42550908406707055, 'Pearson': 0.13343767935925324, 'Spearman': -0.053775005541993375, 'P5': 0.0, 'P10': 0.0}


Unnamed: 0,ads_idx,sim,weight
0,445205,0.982368,0.203219
1,369718,0.969353,0.200526
2,446308,0.961088,0.198817
3,441181,0.960617,0.198719
4,444350,0.960617,0.198719


In [159]:
ads_pool[ads_pool['ads_size']=='MEGA']

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ads_idx,media_count,user_count,total_clicks,total_conversions,ads_category,domain,ads_os_type,ads_order,ctit_mean,ctit_median,ads_rejoin_type,contract_price,media_price,first_click,last_click,ads_name,ads_sdate,expire,days_active,daily_avg_conversions,cvr,margin,roi,total_net_return,ads_level_detailed,daily_clicks,daily_users,scale_level,performance_level,ads_level,media_score,conv_score,clicks_score,stability_score,cvr_score,total_score,ads_size,cluster,mda_idx_arr,M,A
6,6,6,9982,36,22510,23187,12981,3,미디어/컨텐츠,7,2729900,114.674062,118.0,NONE,230,160,2025-07-26 03:09:06,2025-08-25 11:23:42,강원일보 네이버 뉴스,2021-02-05 14:00:00,0,31,418.741935,0.6,70,0.4,908670,MEGA,747.967742,726.129032,MEGA,HIGH,MEGA_HIGH,4,4,4,4,1,17,MEGA,0,,0.0,1.0
28,31,31,13928,77,3111,3124,2834,1,생활,2,2758000,76.66302,70.0,NONE,170,120,2025-07-26 00:08:39,2025-08-25 00:14:22,다톡_실시간 동네친구 채팅,2021-11-12 10:07:00,0,31,91.419355,0.9,50,0.4,141700,MEGA,100.774194,100.354839,MEGA,HIGH,MEGA_HIGH,4,3,3,4,2,16,MEGA,0,,0.0,1.0
60,64,64,19488,97,12099,12337,6372,3,미디어/컨텐츠,2,2729100,90.032486,103.0,NONE,230,160,2025-07-26 06:09:04,2025-08-25 06:39:19,광주방송 네이버 뉴스,2022-09-23 12:00:00,0,31,205.548387,0.5,70,0.4,446040,MEGA,397.967742,390.290323,MEGA,HIGH,MEGA_HIGH,4,4,3,4,0,15,MEGA,0,,0.0,1.0
217,225,227,34045,10,5749,16083,727,3,미디어/컨텐츠,7,2729400,8071.803301,76.0,NONE,230,160,2025-07-26 06:16:06,2025-08-25 11:21:18,이마트 유튜브,2024-04-19 15:00:00,0,31,23.451613,0.0,70,0.4,50890,MEGA,518.806452,185.451613,MEGA,GOOD,MEGA_GOOD,3,2,4,4,0,13,MEGA,0,,0.0,1.0
224,232,234,49983,24,8433,8699,5694,1,채용,7,2748000,105.611345,77.0,NONE,200,140,2025-07-26 00:18:04,2025-08-25 03:23:23,사람인,2024-06-01 13:00:00,0,31,183.677419,0.7,60,0.4,341640,MEGA,280.612903,272.032258,MEGA,HIGH,MEGA_HIGH,4,3,3,4,1,15,MEGA,0,,0.0,1.0
256,264,266,56225,40,1731,1995,1343,3,생활,7,2729000,149.570365,178.0,NONE,230,180,2025-07-26 10:17:59,2025-08-25 11:19:08,LG전자 인스타그램,2024-06-25 11:00:00,0,31,43.322581,0.7,50,0.3,67150,MEGA,64.354839,55.83871,MEGA,HIGH,MEGA_HIGH,4,3,2,4,1,14,MEGA,0,,0.0,1.0
271,279,281,64438,54,6819,6876,6403,2,게임,2,2756000,252.312666,198.0,NONE,170,120,2025-07-26 01:27:29,2025-07-28 12:40:53,루나 모바일,2024-07-08 16:21:27,0,3,2134.333333,0.9,50,0.4,320150,MEGA,2292.0,2273.0,MEGA,HIGH,MEGA_HIGH,4,4,4,1,2,15,MEGA,0,,0.0,1.0
283,291,293,73878,27,2609,17138,220,8,미디어/컨텐츠,7,2708200,171.704545,139.0,NONE,4000,3200,2025-07-26 00:03:38,2025-08-25 11:21:00,굿툰 7일 무료체험,2024-07-19 13:00:00,0,31,7.096774,0.0,800,0.2,176000,MEGA,552.83871,84.16129,MEGA,LOW,MEGA_LOW,4,1,4,4,0,13,MEGA,0,270,1.0,1.0
288,296,298,86708,9,1403,1708,1376,8,금융,7,2701200,69.428052,60.0,NONE,1650,1230,2025-07-26 00:33:04,2025-08-25 00:21:42,케이뱅크 최초 회원 가입,2024-08-08 00:00:00,0,30,45.866667,0.8,420,0.3,577920,LARGE,56.933333,46.766667,MEGA,HIGH,MEGA_HIGH,3,3,2,3,2,13,MEGA,0,,0.0,1.0
302,310,312,95751,81,5034,5576,2785,1,뷰티,2,2750400,146.77702,87.0,NONE,230,180,2025-07-26 00:19:23,2025-08-25 00:38:55,[오픈하기] 여신티켓,2024-09-10 10:44:45,0,31,89.83871,0.5,50,0.3,139250,MEGA,179.870968,162.387097,MEGA,HIGH,MEGA_HIGH,4,3,3,4,0,14,MEGA,0,,0.0,1.0


In [161]:
# ============================================================
# Cohort-based media CVR prediction (fast, guarded, calibratable)
# - Similar-ads via one-hot (domain/category/os/type/rejoin) + price(log z)
# - Cohort-weighted clicks/conv -> media CVR with Bayesian smoothing
# - Evidence guardrails + blend with baseline (media / media×category)
# - Optional isotonic calibration (PAV) on target's observed points
# - Offline evaluation vs. actual in the same window
# - No file IO: returns DataFrames + metrics dict
# ============================================================

import numpy as np
import pandas as pd

# --------------------------- small utils ---------------------------
CAT_COLS_DEFAULT = ["domain", "ads_category", "ads_os_type", "ads_type", "ads_rejoin_type"]
PRICE_CANDS      = ("ads_media_price","media_price","contract_price")

def _norm_meta(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in ["domain","ads_type","ads_rejoin_type"]:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip()
    return df

def _pick_date_col(df: pd.DataFrame) -> str:
    for c in ["rpt_time_date","click_day","click_date"]:
        if c in df.columns: return c
    raise ValueError("날짜 열이 없습니다. (rpt_time_date / click_day / click_date 중 하나가 필요)")

def _clicks_convs_cols(df: pd.DataFrame):
    # 집계 or 로그 모두 지원
    clk = None; cv = None
    if "rpt_time_clk" in df.columns: clk = "rpt_time_clk"
    elif "clicks" in df.columns:     clk = "clicks"
    elif "click_key" in df.columns:  clk = None  # 로그면 size() 사용

    if "rpt_time_turn" in df.columns: cv = "rpt_time_turn"
    elif "conversions" in df.columns: cv = "conversions"
    elif "conversion"  in df.columns: cv = "conversion"
    return clk, cv

def _z(s):
    s = pd.to_numeric(s, errors="coerce")
    mu, sd = float(np.nanmean(s)), float(np.nanstd(s))
    sd = 1.0 if (sd == 0 or np.isnan(sd)) else sd
    return (s - mu) / sd

def _cosine_to(a, A):
    a = np.asarray(a, float)
    A = np.asarray(A, float)
    a = a / (np.linalg.norm(a) + 1e-12)
    norms = np.sqrt((A*A).sum(1)) + 1e-12
    return (A @ a) / norms

# --------- simple isotonic regression (PAV; 1D, weighted) ----------
def _isotonic_fit(x_pred, y_true, w=None):
    # returns piecewise-constant y_hat(x) evaluated at x_pred points
    x = np.asarray(x_pred, float)
    y = np.asarray(y_true, float)
    if w is None: w = np.ones_like(y)
    else: w = np.asarray(w, float)

    # sort by x
    order = np.argsort(x)
    x, y, w = x[order], y[order], w[order]

    # PAV
    avg = y.copy()
    weight = w.copy()
    idx = np.arange(len(y))
    while True:
        viol = np.where(np.diff(avg) < 0)[0]
        if len(viol) == 0: break
        i = viol[0]
        new_w = weight[i] + weight[i+1]
        new_avg = (weight[i]*avg[i] + weight[i+1]*avg[i+1]) / (new_w + 1e-12)
        avg[i] = new_avg; weight[i] = new_w
        avg = np.delete(avg, i+1); weight = np.delete(weight, i+1)
        # merge indices
        idx = np.delete(idx, i+1)

        # backtrack if needed
        j = i
        while j>0 and avg[j-1] > avg[j]:
            new_w = weight[j-1] + weight[j]
            new_avg = (weight[j-1]*avg[j-1] + weight[j]*avg[j]) / (new_w + 1e-12)
            avg[j-1] = new_avg; weight[j-1] = new_w
            avg = np.delete(avg, j); weight = np.delete(weight, j)
            idx = np.delete(idx, j)
            j -= 1

    # expand step function back to each original sorted point
    y_hat_sorted = np.empty_like(x)
    # segments are given by idx boundaries; fill forward
    prev = 0
    for k, boundary in enumerate(np.r_[idx[1:], len(x)]):
        y_hat_sorted[prev:boundary] = avg[k]
        prev = boundary
    # unsort to match original x_pred order
    inv = np.argsort(order)
    return y_hat_sorted[inv]

# ----------------------- feature space & cohort -----------------------
def build_feature_space(
    ad_meta: pd.DataFrame,
    cat_cols=CAT_COLS_DEFAULT,
    num_cols=("ads_media_price",),
    drop_rare_min_ads: int | None = 3,
    domain_weight: float = 1.0
):
    """
    광고별 단일행 메타(ads_idx 고유)로 원-핫 + 가격(log z) 피처를 만듭니다.
    - 희귀 원-핫 열은 드롭(drop_rare_min_ads)
    - 도메인 열에 가중치(domain_weight)
    """
    meta = _norm_meta(ad_meta).copy()
    base = meta.drop_duplicates("ads_idx").set_index("ads_idx")

    # price 후보 보정
    if ("ads_media_price" not in base.columns):
        for c in ("media_price","contract_price"):
            if c in base.columns:
                base["ads_media_price"] = base[c]
                break

    X_parts = []
    for c in cat_cols:
        if c in base.columns:
            one = pd.get_dummies(base[c].astype(str), prefix=c, dtype=float)
            if c == "domain" and domain_weight != 1.0:
                one = one * float(domain_weight)
            X_parts.append(one)

    if "ads_media_price" in base.columns:
        X_parts.append(_z(np.log1p(base["ads_media_price"])).to_frame("price_z"))

    if not X_parts:
        raise ValueError("사용 가능한 피처가 없습니다. ad_meta 컬럼 확인")

    X = pd.concat(X_parts, axis=1).fillna(0.0)
    # 희귀 열 드롭
    if drop_rare_min_ads and drop_rare_min_ads > 1:
        keep = X.columns[(X != 0).sum(0) >= drop_rare_min_ads]
        X = X[keep]

    X.index = X.index.astype(int)
    return X, base.reset_index()[["ads_idx"] + [c for c in cat_cols if c in base.columns]]

def find_similar_ads(
    ad_meta: pd.DataFrame,
    target_ad: int,
    K: int = 50,
    beta_sim: float = 1.0,
    domain_weight: float = 1.0,
    restrict_same_domain: bool = False,
    drop_rare_min_ads: int | None = 3
):
    X, meta_small = build_feature_space(
        ad_meta, drop_rare_min_ads=drop_rare_min_ads, domain_weight=domain_weight
    )
    if target_ad not in X.index:
        raise ValueError(f"target_ad({target_ad})가 ad_meta에 없습니다.")

    cand_idx = X.index
    if restrict_same_domain and "domain" in meta_small.columns:
        tdom = (meta_small.loc[meta_small["ads_idx"]==target_ad, "domain"]
                         .dropna().astype(str).str.strip())
        if not tdom.empty:
            cand_idx = pd.Index(
                meta_small[meta_small["domain"].astype(str).str.strip()==tdom.iat[0]]["ads_idx"].astype(int)
            )

    a = X.loc[target_ad].values
    A = X.loc[cand_idx].drop(index=target_ad, errors="ignore")
    sims = _cosine_to(a, A.values)
    top = np.argpartition(-sims, min(K, len(sims))-1)[:K]
    top = top[np.argsort(-sims[top])]
    sim_vals = sims[top]
    w = np.power(np.clip(sim_vals, 0, 1), beta_sim)
    w = w / (w.sum() + 1e-12)

    cohort = A.iloc[top][[]].copy()
    cohort["sim"] = sim_vals
    cohort["weight"] = w
    cohort.index.name = "ads_idx"
    return cohort[["weight","sim"]]

# --------------------- main: predict media CVR ---------------------
def predict_media_cvr(
    perf: pd.DataFrame,
    ad_meta: pd.DataFrame,
    target_ad: int,
    # similarity
    K: int = 50, beta_sim: float = 1.0,
    domain_weight: float = 1.0, restrict_same_domain: bool = False,
    drop_rare_min_ads: int | None = 3,
    # window & smoothing
    L_days: int = 30, H_days: int = 30,
    alpha_prior: float = 2.0, beta_prior: float = 120.0,
    blend_kappa: float = 20.0,
    # guardrails
    min_eff_clicks: float = 3.0,   # 코호트 가중 클릭 하한
    min_coverage_ads: int = 2,     # 코호트에 의해 관측된 광고 수 하한
    # optional calibration
    calibrate: bool = False,       # True면 target의 관측점으로 isotonic 보정(주의: 같은 윈도우라 오프라인 평가는 편향 가능)
    min_calib_points: int = 6
):
    # 0) cohort
    cohort = find_similar_ads(
        ad_meta, target_ad, K=K, beta_sim=beta_sim,
        domain_weight=domain_weight, restrict_same_domain=restrict_same_domain,
        drop_rare_min_ads=drop_rare_min_ads
    )

    # 1) window slice
    perf = perf.copy()
    dcol = _pick_date_col(perf); perf[dcol] = pd.to_datetime(perf[dcol])
    wend = perf[dcol].max().normalize()
    start = wend - pd.Timedelta(days=L_days-1)
    hist = perf[(perf[dcol]>=start) & (perf[dcol]<=wend)].copy()

    clk_col, cv_col = _clicks_convs_cols(hist)

    # 2) ensure ads_category in hist
    if "ads_category" not in hist.columns and "ads_category" in ad_meta.columns:
        cat_map = ad_meta.drop_duplicates("ads_idx").set_index("ads_idx")["ads_category"]
        hist = hist.merge(cat_map.rename("ads_category"), left_on="ads_idx", right_index=True, how="left")

    # 3) cohort-weighted (ads,mda) -> (mda)
    sub = hist[hist["ads_idx"].isin(cohort.index)].copy()
    if sub.empty:
        raise ValueError("코호트와 겹치는 히스토리 데이터가 없습니다.")

    if clk_col is None:
        g = sub.groupby(["ads_idx","mda_idx"]).agg(
            clicks=("ads_idx","size"), convs=("conversion","sum")
        ).reset_index()
    else:
        g = sub.groupby(["ads_idx","mda_idx"]).agg(
            clicks=(clk_col,"sum"), convs=(cv_col,"sum")
        ).reset_index()

    w_map = cohort["weight"].to_dict()
    g["w"] = g["ads_idx"].map(w_map).fillna(0.0)
    g["w_clicks"] = g["w"] * g["clicks"]
    g["w_convs"]  = g["w"] * g["convs"]

    agg = g.groupby("mda_idx").agg(
        cohort_eff_clicks=("w_clicks","sum"),
        cohort_eff_convs=("w_convs","sum"),
        coverage_ads=("ads_idx","nunique")
    )

    # 4) baselines
    if clk_col is None:
        base_m = hist.groupby("mda_idx").agg(
            clicks=("ads_idx","size"), convs=("conversion","sum")
        )
    else:
        base_m = hist.groupby("mda_idx").agg(
            clicks=(clk_col,"sum"), convs=(cv_col,"sum")
        )
    base_m["cvr_m"] = (base_m["convs"] + alpha_prior) / (base_m["clicks"] + alpha_prior + beta_prior)

    tcat = None
    if "ads_category" in hist.columns:
        _t = ad_meta.loc[ad_meta["ads_idx"]==target_ad, "ads_category"].dropna()
        if not _t.empty: tcat = int(_t.mode().iat[0])

    base_mc = pd.DataFrame()
    if tcat is not None:
        subcat = hist[hist["ads_category"]==tcat]
        if not subcat.empty:
            if clk_col is None:
                base_mc = subcat.groupby("mda_idx").agg(
                    clicks=("ads_idx","size"), convs=("conversion","sum")
                )
            else:
                base_mc = subcat.groupby("mda_idx").agg(
                    clicks=(clk_col,"sum"), convs=(cv_col,"sum")
                )
            base_mc["cvr_mc"] = (base_mc["convs"] + alpha_prior) / (base_mc["clicks"] + alpha_prior + beta_prior)

    out = agg.join(base_m[["cvr_m"]], how="left").join(base_mc[["cvr_mc"]], how="left").fillna({"cvr_m":0.0})
    cvr_cohort = (out["cohort_eff_convs"] + alpha_prior) / (out["cohort_eff_clicks"] + alpha_prior + beta_prior)
    baseline = out["cvr_mc"].fillna(out["cvr_m"])

    w1 = out["cohort_eff_clicks"] / (out["cohort_eff_clicks"] + float(blend_kappa))
    pred = out.copy()
    pred["cvr_cohort"] = cvr_cohort
    pred["baseline_cvr"] = baseline
    pred["pred_cvr"] = w1 * cvr_cohort + (1.0 - w1) * baseline

    # 5) scenario clicks/conv using cohort daily avg
    if clk_col is None:
        per_day = (sub.groupby(["mda_idx", sub[dcol].dt.normalize()])["ads_idx"].size().rename("clk").reset_index())
    else:
        per_day = (sub.groupby(["mda_idx", sub[dcol].dt.normalize()])[clk_col].sum().rename("clk").reset_index())
    daily = per_day.groupby("mda_idx")["clk"].mean()
    pred["per_1000_clicks_conv"] = pred["pred_cvr"] * 1000.0
    pred["scenarioB_clicks"] = daily.reindex(pred.index).fillna(0.0).values * float(H_days)
    pred["scenarioB_conv"]   = pred["pred_cvr"] * pred["scenarioB_clicks"]

    # 6) guardrails
    pred = pred[(pred["cohort_eff_clicks"] >= float(min_eff_clicks)) & (pred["coverage_ads"] >= int(min_coverage_ads))].copy()

    # 7) optional isotonic calibration on target's observed points (use with caution)
    calib_info = None
    if calibrate:
        # build (x=pred, y=actual) on media where target has data in window
        act = hist[hist["ads_idx"]==target_ad].copy()
        if not act.empty:
            if clk_col is None:
                gact = act.groupby("mda_idx").agg(clicks=("ads_idx","size"), convs=("conversion","sum"))
            else:
                gact = act.groupby("mda_idx").agg(clicks=(clk_col,"sum"), convs=(cv_col,"sum"))
            gact["cvr"] = gact["convs"] / gact["clicks"].replace(0, np.nan)
            join = pred.join(gact[["cvr","clicks"]], how="inner")
            join = join.dropna(subset=["cvr","pred_cvr"]).copy()
            if len(join) >= int(min_calib_points):
                x = join["pred_cvr"].values
                y = join["cvr"].values
                w = np.clip(join["clicks"].values, 1.0, None)
                yhat = _isotonic_fit(x, y, w)
                # map all pred via isotonic (monotone) by projecting to nearest fitted levels
                # 여기서는 x 자체에 대한 fitted 값을 썼으므로, 동일 x의 분포 유지
                # 실서비스라면 균일 grid에 fit 후 보간 권장
                # 간단히 평균 스케일 보정도 저장
                pred["pred_cvr_raw"] = pred["pred_cvr"]
                # 최근접 수치대체: x->yhat로 매핑하기 위해, pred의 값이 join의 x에 없을 수 있으니
                # rank 기반 선형보간
                order = np.argsort(x); x_s, y_s = x[order], yhat[order]
                pred["pred_cvr"] = np.interp(pred["pred_cvr"], x_s, y_s, left=y_s[0], right=y_s[-1])
                pred["per_1000_clicks_conv"] = pred["pred_cvr"] * 1000.0
                pred["scenarioB_conv"] = pred["pred_cvr"] * pred["scenarioB_clicks"]
                calib_info = {"points": len(join), "note": "isotonic(PAV) on target observed media"}
            else:
                calib_info = {"points": len(join) if 'join' in locals() else 0, "note": "insufficient for calibration"}

    pred = pred.reset_index().rename(columns={"index":"mda_idx"})
    pred = pred.sort_values("per_1000_clicks_conv", ascending=False).reset_index(drop=True)

    info = {
        "window_end": str(wend.date()),
        "L_days": int(L_days), "H_days": int(H_days),
        "cohort_size": int(len(cohort)),
        "calibration": calib_info
    }
    return pred, cohort.reset_index(), info

# --------------------------- evaluation ---------------------------
def evaluate_against_actual(perf: pd.DataFrame, pred_df: pd.DataFrame, target_ad: int, L_days: int = 30):
    perf = perf.copy()
    dcol = _pick_date_col(perf); perf[dcol] = pd.to_datetime(perf[dcol])
    wend = perf[dcol].max().normalize()
    start = wend - pd.Timedelta(days=L_days-1)
    hist = perf[(perf[dcol]>=start) & (perf[dcol]<=wend)]

    clk_col, cv_col = _clicks_convs_cols(hist)
    act = hist[hist["ads_idx"]==target_ad]
    if act.empty:
        metrics = {"RMSE_cvr":np.nan,"MAE_cvr":np.nan,"WMAE_cvr":np.nan,"Pearson":np.nan,"Spearman":np.nan,"P5":np.nan,"P10":np.nan}
        table = pred_df.copy()
        for k,v in metrics.items(): table[k]=v
        return metrics, table

    if clk_col is None:
        g = act.groupby("mda_idx").agg(clicks=("ads_idx","size"), convs=("conversion","sum")).reset_index()
    else:
        g = act.groupby("mda_idx").agg(clicks=(clk_col,"sum"), convs=(cv_col,"sum")).reset_index()
    g["cvr"] = g["convs"] / g["clicks"].replace(0, np.nan)
    g["per_1000_clicks_conv_act"] = g["cvr"] * 1000.0

    df = pred_df.merge(g[["mda_idx","cvr","per_1000_clicks_conv_act","clicks"]], on="mda_idx", how="left")
    eval_df = df.dropna(subset=["cvr"]).copy()

    def _rmse(a,b): return float(np.sqrt(np.nanmean((a-b)**2)))
    def _mae(a,b):  return float(np.nanmean(np.abs(a-b)))
    def _wmae(a,b,w): w=np.asarray(w); return float(np.nansum(np.abs(a-b)*w)/(np.nansum(w)+1e-12))

    if not eval_df.empty:
        rmse = _rmse(eval_df["pred_cvr"], eval_df["cvr"])
        mae  = _mae(eval_df["pred_cvr"], eval_df["cvr"])
        wmae = _wmae(eval_df["pred_cvr"], eval_df["cvr"], eval_df["clicks"].fillna(0.0))
        pear = float(eval_df[["pred_cvr","cvr"]].corr().iloc[0,1])
        spear= float(eval_df[["pred_cvr","cvr"]].rank().corr().iloc[0,1])
        def precision_at(n):
            A = set(df.sort_values("per_1000_clicks_conv", ascending=False).head(n)["mda_idx"])
            B = set(eval_df.sort_values("per_1000_clicks_conv_act", ascending=False).head(n)["mda_idx"])
            return len(A & B) / max(1,len(A))
        P5, P10 = precision_at(5), precision_at(10)
    else:
        rmse=mae=wmae=pear=spear=P5=P10=np.nan

    metrics = {"RMSE_cvr":rmse,"MAE_cvr":mae,"WMAE_cvr":wmae,"Pearson":pear,"Spearman":spear,"P5":P5,"P10":P10}
    table = df.copy()
    for k,v in metrics.items(): table[k]=v
    return metrics, table

# --------------------------- quick usage ---------------------------
# perf_df = pd.read_csv("/path/수정_시간별적립보고서(최종).csv", encoding="utf-8-sig")
# ad_meta_df = pd.read_csv("/path/광고도메인리스트.csv", encoding="utf-8-sig")
# TARGET_AD = 9982
# pred_df, cohort_df, info = predict_media_cvr(
#     perf=perf_df, ad_meta=ad_meta_df, target_ad=TARGET_AD,
#     K=50, beta_sim=1.0,
#     L_days=30, H_days=30,
#     alpha_prior=2.0, beta_prior=120.0, blend_kappa=20.0,
#     domain_weight=1.5, restrict_same_domain=False,
#     drop_rare_min_ads=3,
#     min_eff_clicks=3, min_coverage_ads=2,
#     calibrate=False  # True로 두면 같은 창의 관측으로 보정(오프라인 평가는 편향 가능)
# )
# metrics, table = evaluate_against_actual(perf_df, pred_df, target_ad=TARGET_AD, L_days=30)
# display(pred_df.head(20)[["mda_idx","pred_cvr","per_1000_clicks_conv","cohort_eff_clicks","coverage_ads","scenarioB_clicks","scenarioB_conv"]])
# display(cohort_df.head(20))
# display(table.head(20)); print(info); print(metrics)


In [165]:

TARGET_AD = 9982
pred_df, cohort_df, info = predict_media_cvr(
    perf=perf_df, ad_meta=ad_meta_df, target_ad=TARGET_AD,
    K=50, beta_sim=1.0,
    L_days=30, H_days=7,
    alpha_prior=2.0, beta_prior=120.0, blend_kappa=20.0,
    domain_weight=1.5, restrict_same_domain=False,
    drop_rare_min_ads=3,
    min_eff_clicks=3, min_coverage_ads=2,
    calibrate=False  # True로 두면 같은 창의 관측으로 보정(오프라인 평가는 편향 가능)
)
metrics, table = evaluate_against_actual(perf_df, pred_df, target_ad=TARGET_AD, L_days=30)
display(pred_df.head(20)[["mda_idx","pred_cvr","per_1000_clicks_conv","cohort_eff_clicks","coverage_ads","scenarioB_clicks","scenarioB_conv"]])
display(cohort_df.head(20))
display(table.head(20)); print(info); print(metrics)

Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,scenarioB_clicks,scenarioB_conv
0,678,0.711111,711.111303,5.467193,2,71.12,50.574236
1,562,0.486762,486.76175,3.571837,26,84.466667,41.115142
2,1047,0.378362,378.362438,10.240962,2,115.966667,43.877431
3,824,0.347649,347.649496,11.412727,2,128.1,44.5339
4,1021,0.30542,305.42033,16.229402,2,186.2,56.869266
5,845,0.297794,297.794442,16.961394,2,194.6,57.950798
6,1046,0.294503,294.502609,10.551508,2,118.533333,34.908376
7,1022,0.286153,286.152522,18.070244,2,206.733333,59.157265
8,371,0.283267,283.266976,3.404905,10,43.296296,12.264411
9,1045,0.267854,267.853815,4.450915,2,78.75,21.093488


Unnamed: 0,ads_idx,weight,sim
0,34045,0.026598,1.0
1,445574,0.024377,0.916515
2,19488,0.021531,0.809524
3,438658,0.019734,0.741941
4,443132,0.019734,0.741941
5,442829,0.019734,0.741941
6,443424,0.019734,0.741941
7,442904,0.019734,0.741941
8,441071,0.019734,0.741941
9,433368,0.019734,0.741941


Unnamed: 0,mda_idx,cohort_eff_clicks,cohort_eff_convs,coverage_ads,cvr_m,cvr_mc,cvr_cohort,baseline_cvr,pred_cvr,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv,cvr,per_1000_clicks_conv_act,clicks,RMSE_cvr,MAE_cvr,WMAE_cvr,Pearson,Spearman,P5,P10
0,678,5.467193,5.253676,2,0.545877,0.889945,0.056906,0.889945,0.711111,711.111303,71.12,50.574236,,,,0.267021,0.25711,0.242552,0.784338,0.521978,0.2,0.8
1,562,3.571837,1.894455,26,0.568155,,0.031014,0.568155,0.486762,486.76175,84.466667,41.115142,,,,0.267021,0.25711,0.242552,0.784338,0.521978,0.2,0.8
2,1047,10.240962,2.756027,2,0.505827,0.553686,0.035965,0.553686,0.378362,378.362438,115.966667,43.877431,0.582888,582.887701,2431.0,0.267021,0.25711,0.242552,0.784338,0.521978,0.2,0.8
3,824,11.412727,2.734495,2,0.491066,0.52578,0.035488,0.52578,0.347649,347.649496,128.1,44.5339,0.569173,569.173245,2407.0,0.267021,0.25711,0.242552,0.784338,0.521978,0.2,0.8
4,1021,16.229402,2.605307,2,0.438942,0.526225,0.033316,0.526225,0.30542,305.42033,186.2,56.869266,0.569118,569.118414,1917.0,0.267021,0.25711,0.242552,0.784338,0.521978,0.2,0.8
5,845,16.961394,2.131614,2,0.426366,0.52513,0.029732,0.52513,0.297794,297.794442,194.6,57.950798,0.569472,569.472412,2483.0,0.267021,0.25711,0.242552,0.784338,0.521978,0.2,0.8
6,1046,10.551508,2.282335,2,0.487614,0.432831,0.032307,0.432831,0.294503,294.502609,118.533333,34.908376,0.462233,462.233169,2436.0,0.267021,0.25711,0.242552,0.784338,0.521978,0.2,0.8
7,1022,18.070244,2.777558,2,0.429247,0.513878,0.034108,0.513878,0.286153,286.152522,206.733333,59.157265,0.563437,563.436563,2002.0,0.267021,0.25711,0.242552,0.784338,0.521978,0.2,0.8
8,371,3.404905,0.615459,10,0.428818,0.327941,0.020856,0.327941,0.283267,283.266976,43.296296,12.264411,0.666667,666.666667,3.0,0.267021,0.25711,0.242552,0.784338,0.521978,0.2,0.8
9,1045,4.450915,0.0,2,0.682419,0.323944,0.015816,0.323944,0.267854,267.853815,78.75,21.093488,0.576923,576.923077,156.0,0.267021,0.25711,0.242552,0.784338,0.521978,0.2,0.8


{'window_end': '2025-08-25', 'L_days': 30, 'H_days': 7, 'cohort_size': 50, 'calibration': None}
{'RMSE_cvr': 0.2670206327217126, 'MAE_cvr': 0.25711023388790205, 'WMAE_cvr': 0.242551798759298, 'Pearson': 0.7843380519673492, 'Spearman': 0.521978021978022, 'P5': 0.2, 'P10': 0.8}


In [169]:
TARGET_AD = 73878
pred_df, cohort_df, info = predict_media_cvr(
    perf=perf_df, ad_meta=ad_meta_df, target_ad=TARGET_AD,
    K=50, beta_sim=1.0,
    L_days=30, H_days=7,
    alpha_prior=2.0, beta_prior=120.0, blend_kappa=20.0,
    domain_weight=1.5, restrict_same_domain=False,
    drop_rare_min_ads=3,
    min_eff_clicks=3, min_coverage_ads=2,
    calibrate=False  # True로 두면 같은 창의 관측으로 보정(오프라인 평가는 편향 가능)
)
metrics, table = evaluate_against_actual(perf_df, pred_df, target_ad=TARGET_AD, L_days=30)
display(pred_df.head(20)[["mda_idx","pred_cvr","per_1000_clicks_conv","cohort_eff_clicks","coverage_ads","scenarioB_clicks","scenarioB_conv"]])
display(cohort_df.head(20))
display(table.head(20)); print(info); print(metrics)

Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,scenarioB_clicks,scenarioB_conv
0,563,0.151175,151.175047,44.868852,27,821.1,124.129831
1,270,0.104683,104.682843,14.43898,19,155.4,16.267714
2,539,0.076453,76.453177,45.182149,2,990.266667,75.709033
3,281,0.072456,72.456362,3.735883,10,45.769231,3.316272
4,371,0.051586,51.585677,3.89071,6,50.0,2.579284
5,761,0.041836,41.836211,32.43898,6,349.533333,14.62315
6,667,0.035804,35.804266,50.113843,2,513.566667,18.387878
7,583,0.026975,26.975364,52.398907,3,819.7,22.111706
8,22,0.023654,23.654322,291.467213,9,2989.233333,70.708289
9,1020,0.006306,6.306151,11.039162,2,188.611111,1.18941


Unnamed: 0,ads_idx,weight,sim
0,439985,0.022769,1.0
1,355220,0.022769,1.0
2,438658,0.022769,1.0
3,441597,0.022769,1.0
4,439405,0.022769,1.0
5,435357,0.022769,1.0
6,446902,0.022769,1.0
7,443941,0.022769,1.0
8,16523,0.022769,1.0
9,441863,0.022769,1.0


Unnamed: 0,mda_idx,cohort_eff_clicks,cohort_eff_convs,coverage_ads,cvr_m,cvr_mc,cvr_cohort,baseline_cvr,pred_cvr,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv,cvr,per_1000_clicks_conv_act,clicks,RMSE_cvr,MAE_cvr,WMAE_cvr,Pearson,Spearman,P5,P10
0,563,44.868852,25.571038,27,0.570481,0.119653,0.165226,0.119653,0.151175,151.175047,821.1,124.129831,,,,0.037082,0.026812,0.019858,-0.009467,0.1,0.6,0.5
1,270,14.43898,1.726776,19,0.488986,0.160539,0.027315,0.160539,0.104683,104.682843,155.4,16.267714,0.029412,29.411765,34.0,0.037082,0.026812,0.019858,-0.009467,0.1,0.6,0.5
2,539,45.182149,6.76776,2,0.000671,0.130692,0.052444,0.130692,0.076453,76.453177,990.266667,75.709033,,,,0.037082,0.026812,0.019858,-0.009467,0.1,0.6,0.5
3,281,3.735883,0.42623,10,0.084266,0.082386,0.019296,0.082386,0.072456,72.456362,45.769231,3.316272,0.071429,71.428571,14.0,0.037082,0.026812,0.019858,-0.009467,0.1,0.6,0.5
4,371,3.89071,0.481785,6,0.428818,0.057786,0.019714,0.057786,0.051586,51.585677,50.0,2.579284,0.02439,24.390244,41.0,0.037082,0.026812,0.019858,-0.009467,0.1,0.6,0.5
5,761,32.43898,2.808743,6,0.184864,0.05919,0.031137,0.05919,0.041836,41.836211,349.533333,14.62315,0.027523,27.522936,109.0,0.037082,0.026812,0.019858,-0.009467,0.1,0.6,0.5
6,667,50.113843,2.504554,2,0.319281,0.05994,0.026172,0.05994,0.035804,35.804266,513.566667,18.387878,0.052055,52.054795,365.0,0.037082,0.026812,0.019858,-0.009467,0.1,0.6,0.5
7,583,52.398907,1.007286,3,0.041749,0.052472,0.017244,0.052472,0.026975,26.975364,819.7,22.111706,,,,0.037082,0.026812,0.019858,-0.009467,0.1,0.6,0.5
8,22,291.467213,6.077413,9,0.410914,0.083675,0.019536,0.083675,0.023654,23.654322,2989.233333,70.708289,,,,0.037082,0.026812,0.019858,-0.009467,0.1,0.6,0.5
9,1020,11.039162,0.0,2,0.614679,0.001489,0.015033,0.001489,0.006306,6.306151,188.611111,1.18941,,,,0.037082,0.026812,0.019858,-0.009467,0.1,0.6,0.5


{'window_end': '2025-08-25', 'L_days': 30, 'H_days': 7, 'cohort_size': 50, 'calibration': None}
{'RMSE_cvr': 0.037081889629052955, 'MAE_cvr': 0.026811621226831, 'WMAE_cvr': 0.019858274245371246, 'Pearson': -0.009466504586187948, 'Spearman': 0.1, 'P5': 0.6, 'P10': 0.5}


In [177]:
TARGET_AD = 446302	
pred_df, cohort_df, info = predict_media_cvr(
    perf=perf_df, ad_meta=ad_meta_df, target_ad=TARGET_AD,
    K=50, beta_sim=1.0,
    L_days=30, H_days=7,
    alpha_prior=2.0, beta_prior=120.0, blend_kappa=20.0,
    domain_weight=1.5, restrict_same_domain=False,
    drop_rare_min_ads=3,
    min_eff_clicks=3, min_coverage_ads=2,
    calibrate=False  # True로 두면 같은 창의 관측으로 보정(오프라인 평가는 편향 가능)
)
metrics, table = evaluate_against_actual(perf_df, pred_df, target_ad=TARGET_AD, L_days=30)
display(pred_df.head(20)[["mda_idx","pred_cvr","per_1000_clicks_conv","cohort_eff_clicks","coverage_ads","scenarioB_clicks","scenarioB_conv"]])
display(cohort_df.head(20))
display(table.head(20)); print(info); print(metrics)

Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,scenarioB_clicks,scenarioB_conv
0,563,0.336395,336.395119,21.48,26,501.2,168.601234
1,371,0.297032,297.032422,12.02,15,155.814815,46.282052


Unnamed: 0,ads_idx,weight,sim
0,437533,0.02,1.0
1,439444,0.02,1.0
2,437524,0.02,1.0
3,437527,0.02,1.0
4,443777,0.02,1.0
5,440592,0.02,1.0
6,422797,0.02,1.0
7,445588,0.02,1.0
8,445587,0.02,1.0
9,445583,0.02,1.0


Unnamed: 0,mda_idx,cohort_eff_clicks,cohort_eff_convs,coverage_ads,cvr_m,cvr_mc,cvr_cohort,baseline_cvr,pred_cvr,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv,cvr,per_1000_clicks_conv_act,clicks,RMSE_cvr,MAE_cvr,WMAE_cvr,Pearson,Spearman,P5,P10
0,563,21.48,11.48,26,0.570481,0.596781,0.09395,0.596781,0.336395,336.395119,501.2,168.601234,0.637037,637.037037,270.0,0.300642,0.300642,0.300642,,,0.5,0.5
1,371,12.02,4.82,15,0.428818,0.444965,0.050888,0.444965,0.297032,297.032422,155.814815,46.282052,,,,0.300642,0.300642,0.300642,,,0.5,0.5


{'window_end': '2025-08-25', 'L_days': 30, 'H_days': 7, 'cohort_size': 50, 'calibration': None}
{'RMSE_cvr': 0.30064191781004357, 'MAE_cvr': 0.30064191781004357, 'WMAE_cvr': 0.3006419178100424, 'Pearson': nan, 'Spearman': nan, 'P5': 0.5, 'P10': 0.5}


In [173]:
ads_pool[ads_pool['media_count']>5]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ads_idx,media_count,user_count,total_clicks,total_conversions,ads_category,domain,ads_os_type,ads_order,ctit_mean,ctit_median,ads_rejoin_type,contract_price,media_price,first_click,last_click,ads_name,ads_sdate,expire,days_active,daily_avg_conversions,cvr,margin,roi,total_net_return,ads_level_detailed,daily_clicks,daily_users,scale_level,performance_level,ads_level,media_score,conv_score,clicks_score,stability_score,cvr_score,total_score,ads_size,cluster,mda_idx_arr,M,A
5,5,5,9935,10,524,889,24,8,금융,7,2592600,145.083333,140.0,NONE,1500,1200,2025-07-26 00:39:57,2025-08-25 10:37:05,숨어있는 휴면포인트 조회,2021-02-01 10:00:00,0,31,0.774194,0.0,300,0.2,7200,MEGA,28.677419,16.903226,MEGA,LOW,MEGA_LOW,3,0,2,4,0,9,LARGE,0,,0.0,1.0
6,6,6,9982,36,22510,23187,12981,3,미디어/컨텐츠,7,2729900,114.674062,118.0,NONE,230,160,2025-07-26 03:09:06,2025-08-25 11:23:42,강원일보 네이버 뉴스,2021-02-05 14:00:00,0,31,418.741935,0.6,70,0.4,908670,MEGA,747.967742,726.129032,MEGA,HIGH,MEGA_HIGH,4,4,4,4,1,17,MEGA,0,,0.0,1.0
24,24,24,11059,9,111,260,7,5,게임,2,2097400,5449.571429,1817.0,NONE,300,200,2025-07-26 11:40:34,2025-08-25 00:14:56,파이널삼국지2,2021-05-03 17:00:00,0,30,0.233333,0.0,100,0.5,700,LARGE,8.666667,3.700000,LARGE,LOW,LARGE_LOW,3,0,1,3,0,7,MEDIUM,0,,0.0,1.0
26,29,29,13209,14,758,2162,169,8,금융,2,2496600,3529.295858,3527.0,NONE,1900,1425,2025-07-26 00:57:18,2025-08-25 10:06:34,아이부자,2021-10-01 17:00:00,0,31,5.451613,0.1,475,0.3,80275,MEGA,69.741935,24.451613,MEGA,LOW,MEGA_LOW,4,1,3,4,0,12,LARGE,0,22,1.0,1.0
28,31,31,13928,77,3111,3124,2834,1,생활,2,2758000,76.663020,70.0,NONE,170,120,2025-07-26 00:08:39,2025-08-25 00:14:22,다톡_실시간 동네친구 채팅,2021-11-12 10:07:00,0,31,91.419355,0.9,50,0.4,141700,MEGA,100.774194,100.354839,MEGA,HIGH,MEGA_HIGH,4,3,3,4,2,16,MEGA,0,,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4507,4561,9191,446301,6,443,661,104,4,커머스,7,1369936,161.394231,140.5,ADS_CODE_DAILY_UPDATE,18,14,2025-08-22 15:10:41,2025-08-25 11:15:11,이지은웨딩 179438,2025-08-22 15:06:56,0,3,34.666667,0.2,4,0.3,416,LARGE,220.333333,147.666667,MEGA,GOOD,MEGA_GOOD,3,2,3,1,0,9,LARGE,0,562563,1.0,1.0
4508,4562,9192,446302,7,229,299,186,4,생활,7,1369936,48.091398,40.0,NONE,18,14,2025-08-22 15:35:59,2025-08-25 10:53:04,용가리전자담배 제우스서브옴 코일 맞추기 8.22,2025-08-22 15:12:23,0,3,62.000000,0.6,4,0.3,744,LARGE,99.666667,76.333333,MEGA,HIGH,MEGA_HIGH,3,3,3,1,1,11,LARGE,0,562563,1.0,1.0
4522,4579,9229,446393,12,82,100,12,10,의료/건강,7,2701000,25.666667,23.5,NONE,1280,960,2025-08-22 17:28:20,2025-08-25 08:11:39,헬스케어,2025-08-22 17:00:00,0,3,4.000000,0.1,320,0.3,3840,MEGA,33.333333,27.333333,MEGA,LOW,MEGA_LOW,4,1,2,1,0,8,LARGE,0,,0.0,1.0
4529,4586,9243,446407,6,206,332,1,8,식음료,7,2759700,149896.000000,149896.0,NONE,200,160,2025-08-22 19:38:14,2025-08-25 10:25:01,[쇼핑라이브하트+채팅] [마라떡볶이 끝판왕] 진또배기 마라맛의 세계로,2025-08-22 19:00:00,0,3,0.333333,0.0,40,0.2,40,LARGE,110.666667,68.666667,MEGA,LOW,MEGA_LOW,3,0,3,1,0,7,MEDIUM,0,,0.0,1.0


In [190]:
# ==============================================================
# 신규(가상) 광고 → 유사 광고 코호트 기반 매체별 CVR/전환수 예측 (원샷 셀)
# ==============================================================

import numpy as np
import pandas as pd

# ----------------- 파일 경로 -----------------
PERF_CSV = "/Users/Jiwon/Documents/GitHub/final_project/Jiwon/수정_시간별적립보고서(최종).csv"   # 시간별 집계(또는 로그)
META_CSV = "/Users/Jiwon/Documents/GitHub/final_project/Jiwon/광고도메인리스트.csv"             # 기존 광고 메타(ads_idx 존재)
NEW_ADS_CSV = "/Users/Jiwon/Documents/GitHub/final_project/Jiwon/신규가상광고.csv"                                                    # 신규 가상광고 목록

# ----------------- 하이퍼파라미터 -----------------
L_DAYS = 30                 # 예측에 사용할 과거 창 길이
H_DAYS = 30                 # 시나리오B(향후 H일) 클릭/전환 예측 길이
K = 10                      # 유사 광고 코호트 크기
BETA_SIM = 1.0              # 유사도 가중 지수 (cos sim^beta)
ALPHA_PRIOR = 2.0           # 베타-바이노믹 스무딩 alpha
BETA_PRIOR = 120.0          # 베타-바이노믹 스무딩 beta
BLEND_KAPPA = 15.0          # 블렌딩 전환: eff/(eff+kappa)
DOMAIN_WEIGHT = 1.0         # 도메인 가중치(영향 키우려면 2~3)
RESTRICT_SAME_DOMAIN = False# True면 같은 도메인 후보만 코호트로
DROP_RARE_MIN_ADS = 3       # 희귀 원-핫 열 제거(3개 미만 광고에서만 등장)

CAT_COLS = ["domain", "ads_category", "ads_os_type", "ads_type", "ads_rejoin_type"]
PRICE_CANDIDATES = ("ads_media_price", "media_price", "contract_price")

# ----------------- 작은 유틸 -----------------
def _norm_meta(df):
    df = df.copy()
    for c in CAT_COLS:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip()
    return df

def _pick_date_col(df):
    for c in ["rpt_time_date","click_day","click_date"]:
        if c in df.columns:
            return c
    raise ValueError("날짜 컬럼(rpt_time_date / click_day / click_date) 없음")

def _clicks_convs_cols(df):
    clicks = None; convs = None
    if "rpt_time_clk" in df.columns: clicks = "rpt_time_clk"
    elif "clicks" in df.columns:     clicks = "clicks"
    elif "click_key" in df.columns:  clicks = None           # 로그면 size()로 산출

    if "rpt_time_turn" in df.columns: convs = "rpt_time_turn"
    elif "conversions" in df.columns: convs = "conversions"
    elif "conversion" in df.columns:  convs = "conversion"
    return clicks, convs

def _z(s):
    s = pd.to_numeric(s, errors="coerce")
    mu, sd = np.nanmean(s), np.nanstd(s)
    sd = 1.0 if (sd is None or sd == 0 or np.isnan(sd)) else sd
    return (s - mu) / sd, float(mu), float(sd)

# ----------------- 1) 기존 광고로 피처 공간 학습 -----------------
def build_feature_space(ad_meta, drop_rare_min_ads=3, domain_weight=1.0):
    meta = _norm_meta(ad_meta).drop_duplicates("ads_idx").copy()

    X_list = []
    group_cols = {}

    # (a) 범주형 원-핫
    for c in CAT_COLS:
        if c in meta.columns:
            one = pd.get_dummies(meta[c].astype(str), prefix=c, dtype=float)
            if c == "domain" and domain_weight != 1.0:
                one = one * float(domain_weight)
            X_list.append(one)
            group_cols[c] = list(one.columns)

    # (b) 수치형(가격) - 후보 중 존재하는 첫 컬럼 사용
    price_col = next((c for c in PRICE_CANDIDATES if c in meta.columns), None)
    price_mu = price_sd = None
    if price_col is not None:
        price_log = np.log1p(pd.to_numeric(meta[price_col], errors="coerce"))
        price_z, price_mu, price_sd = _z(price_log)
        X_list.append(price_z.to_frame("price_z"))

    if not X_list:
        raise ValueError("ad_meta에서 만들 수 있는 피처가 없습니다.")

    X = pd.concat(X_list, axis=1).fillna(0.0)
    # 희귀 원-핫 열 제거
    if drop_rare_min_ads and drop_rare_min_ads > 1:
        nz = (X != 0).sum(0)
        keep = nz[nz >= float(drop_rare_min_ads)].index
        X = X[keep]
        # 그룹 열 목록 갱신
        for g in list(group_cols.keys()):
            group_cols[g] = [col for col in group_cols[g] if col in X.columns]

    # z-score (열 단위)
    mu = X.mean()
    sd = X.std(ddof=0).replace(0, 1.0)
    A_z = (X - mu) / (sd + 1e-9)
    A_z.index = meta["ads_idx"].astype(int).values

    store = dict(
        A_z=A_z.astype(np.float32),
        mu=mu.astype(np.float32), sd=sd.astype(np.float32),
        cols=A_z.columns.tolist(),
        group_cols=group_cols,
        price_col=price_col,
        price_mu=price_mu, price_sd=price_sd,
        meta_small=meta[["ads_idx"] + [c for c in CAT_COLS if c in meta.columns]]
    )
    return store

# ----------------- 2) 신규 광고 1건 인코딩(기존 공간에 맞춤) -----------------
def encode_new_ad_row(row, store):
    cols = store["cols"]
    x = pd.Series(0.0, index=cols, dtype=float)

    # 범주형: 기존에 있던 열만 1로 세움(새로운 카테고리는 정보 없음 → 0)
    for c in CAT_COLS:
        if c in row.index:
            val = str(row[c]).strip()
            one_col = f"{c}_{val}"
            if one_col in x.index:
                x[one_col] = 1.0

    # 가격: log1p 후 기존 z스케일 사용
    pcol = store["price_col"]
    if pcol and pcol in row.index:
        val = pd.to_numeric(row[pcol], errors="coerce")
        if pd.notnull(val):
            z = (np.log1p(val) - store["price_mu"]) / (store["price_sd"] + 1e-9)
            if "price_z" in x.index:
                x["price_z"] = float(z)

    # L2 정규화용 벡터 반환
    a = x.values.astype(np.float32)
    a = a / (np.linalg.norm(a) + 1e-12)
    return x, a

# ----------------- 3) 신규 광고 → 유사 광고 코호트 추출 -----------------
def cohort_for_new_ad(row, store, K=50, beta=BETA_SIM, restrict_same_domain=False):
    A = store["A_z"]
    # 후보 제한(같은 도메인)
    cand = A
    if restrict_same_domain and "domain" in row.index and "domain" in store["meta_small"].columns:
        dom = str(row["domain"]).strip()
        ok_ids = store["meta_small"].loc[
            store["meta_small"]["domain"].astype(str).str.strip() == dom, "ads_idx"
        ].astype(int)
        cand = A.loc[A.index.intersection(ok_ids)]

    # 인코딩
    x, a = encode_new_ad_row(row, store)
    M = cand.values
    norms = np.sqrt((M*M).sum(1)) + 1e-12
    sims = (M @ a) / norms

    if len(sims) == 0:
        return pd.DataFrame(columns=["weight","sim"])

    k = min(K, len(sims))
    top = np.argpartition(-sims, k-1)[:k]
    top = top[np.argsort(-sims[top])]

    sim_vals = sims[top]
    w = np.power(np.clip(sim_vals, 0, 1), beta); w = w / (w.sum() + 1e-12)

    out = pd.DataFrame({"ads_idx": cand.index.values[top].astype(int),
                        "sim": sim_vals, "weight": w})
    out = out.set_index("ads_idx")
    return out

# ----------------- 4) 코호트 기반 매체사 CVR/전환수 예측 -----------------
def predict_media_from_cohort(perf_df, ad_meta_df, cohort_df, new_row,
                              L_days=30, H_days=30,
                              alpha_prior=ALPHA_PRIOR, beta_prior=BETA_PRIOR,
                              blend_kappa=BLEND_KAPPA):
    if cohort_df.empty:
        return pd.DataFrame(), {"window_end": None, "L_days": L_days, "H_days": H_days}

    perf = perf_df.copy()
    date_col = _pick_date_col(perf)
    perf[date_col] = pd.to_datetime(perf[date_col])
    wend = perf[date_col].max().normalize()
    start = wend - pd.Timedelta(days=L_days-1)
    hist = perf[(perf[date_col]>=start) & (perf[date_col]<=wend)].copy()

    clk_col, cv_col = _clicks_convs_cols(hist)

    # ads_category를 히스토리에 붙임(매체×카테 베이스라인용)
    if "ads_category" in ad_meta_df.columns and "ads_category" not in hist.columns:
        cat_map = ad_meta_df.drop_duplicates("ads_idx").set_index("ads_idx")["ads_category"]
        hist = hist.merge(cat_map.rename("ads_category"), left_on="ads_idx", right_index=True, how="left")

    # 코호트 가중 집계
    sub = hist[hist["ads_idx"].isin(cohort_df.index)].copy()
    if sub.empty:
        return pd.DataFrame(), {"window_end": str(wend.date()), "L_days": L_days, "H_days": H_days}

    if clk_col is None:
        g = sub.groupby(["ads_idx","mda_idx"]).agg(
            clicks=("ads_idx","size"), convs=("conversion","sum")
        ).reset_index()
    else:
        g = sub.groupby(["ads_idx","mda_idx"]).agg(
            clicks=(clk_col,"sum"), convs=(cv_col,"sum")
        ).reset_index()

    w_map = cohort_df["weight"].to_dict()
    g["w"] = g["ads_idx"].map(w_map).fillna(0.0)
    g["w_clicks"] = g["w"] * g["clicks"]
    g["w_convs"]  = g["w"] * g["convs"]

    agg = g.groupby("mda_idx").agg(
        cohort_eff_clicks=("w_clicks","sum"),
        cohort_eff_convs=("w_convs","sum"),
        coverage_ads=("ads_idx","nunique")
    )

    # 베이스라인 (매체 전체)
    if clk_col is None:
        base_m = hist.groupby("mda_idx").agg(
            clicks=("ads_idx","size"), convs=("conversion","sum")
        )
    else:
        base_m = hist.groupby("mda_idx").agg(
            clicks=(clk_col,"sum"), convs=(cv_col,"sum")
        )
    base_m["cvr_m"] = (base_m["convs"] + alpha_prior) / (base_m["clicks"] + alpha_prior + beta_prior)

    # 베이스라인 (매체×카테고리: 신규 광고의 카테고리 사용)
    tcat = None
    if "ads_category" in new_row.index:
        try:
            tcat = int(pd.to_numeric(new_row["ads_category"], errors="coerce"))
        except Exception:
            tcat = None

    base_mc = pd.DataFrame()
    if (tcat is not None) and ("ads_category" in hist.columns):
        subcat = hist[hist["ads_category"]==tcat]
        if not subcat.empty:
            if clk_col is None:
                base_mc = subcat.groupby("mda_idx").agg(
                    clicks=("ads_idx","size"), convs=("conversion","sum")
                )
            else:
                base_mc = subcat.groupby("mda_idx").agg(
                    clicks=(clk_col,"sum"), convs=(cv_col,"sum")
                )
            base_mc["cvr_mc"] = (base_mc["convs"] + alpha_prior) / (base_mc["clicks"] + alpha_prior + beta_prior)

    out = agg.join(base_m[["cvr_m"]], how="left").join(base_mc[["cvr_mc"]], how="left").fillna({"cvr_m":0.0})
    out["cvr_cohort"] = (out["cohort_eff_convs"] + alpha_prior) / (out["cohort_eff_clicks"] + alpha_prior + beta_prior)
    base = out["cvr_mc"].fillna(out["cvr_m"])
    eff = out["cohort_eff_clicks"]
    w1 = eff / (eff + float(blend_kappa))
    out["pred_cvr"] = w1 * out["cvr_cohort"] + (1.0 - w1) * base
    out["per_1000_clicks_conv"] = out["pred_cvr"] * 1000.0

    # 시나리오: 코호트 일평균 클릭 × H_days
    if clk_col is None:
        per_day = (sub.groupby(["mda_idx", sub[date_col].dt.normalize()])["ads_idx"]
                   .size().rename("clk").reset_index())
    else:
        per_day = (sub.groupby(["mda_idx", sub[date_col].dt.normalize()])[clk_col]
                   .sum().rename("clk").reset_index())
    daily = per_day.groupby("mda_idx")["clk"].mean()
    out["scenarioB_clicks"] = daily.reindex(out.index).fillna(0.0).values * float(H_DAYS)
    out["scenarioB_conv"]   = out["pred_cvr"] * out["scenarioB_clicks"]

    out = out.reset_index().sort_values("per_1000_clicks_conv", ascending=False).reset_index(drop=True)
    info = {"window_end": str(wend.date()), "L_days": L_DAYS, "H_days": H_DAYS}
    return out, info

# ----------------- 5) 배치 실행: 신규 광고 목록 전체 처리 -----------------
def run_new_ads_batch(new_ads_df, ad_meta_df, perf_df,
                      topN_media=20, show_first_n=1):
    store = build_feature_space(ad_meta_df, drop_rare_min_ads=DROP_RARE_MIN_ADS,
                                domain_weight=DOMAIN_WEIGHT)

    results = {}  # key: new_ad_key → (pred_df, cohort_df, info)
    # 신규 광고의 key 컬럼 결정(ads_idx가 없으면 행번호 사용)
    key_col = "ads_idx" if "ads_idx" in new_ads_df.columns else None

    for i, row in new_ads_df.iterrows():
        new_key = int(row[key_col]) if key_col else int(i)
        cohort = cohort_for_new_ad(row, store, K=K, beta=BETA_SIM,
                                   restrict_same_domain=RESTRICT_SAME_DOMAIN)
        pred, info = predict_media_from_cohort(perf_df, ad_meta_df, cohort, row,
                                               L_days=L_DAYS, H_days=H_DAYS,
                                               alpha_prior=ALPHA_PRIOR, beta_prior=BETA_PRIOR,
                                               blend_kappa=BLEND_KAPPA)
        results[new_key] = (pred, cohort, info)

    # 미리보기
    shown = 0
    for k, (pred, cohort, info) in results.items():
        print(f"\n=== 신규광고 {k} : window_end={info['window_end']}, L_days={info['L_days']} ===")
        if not cohort.empty:
            disp = cohort.reset_index().rename(columns={"index":"ads_idx"})[["ads_idx","sim","weight"]].head(10)
            display(disp.style.format({"sim":"{:.3f}","weight":"{:.3f}"}))
        else:
            print("코호트가 비어 있습니다.")

        if not pred.empty:
            cols = ["mda_idx","pred_cvr","per_1000_clicks_conv",
                    "cohort_eff_clicks","coverage_ads",
                    "cvr_m","cvr_mc","cvr_cohort",
                    "scenarioB_clicks","scenarioB_conv"]
            display(pred.head(topN_media)[[c for c in cols if c in pred.columns]]
                    .style.format({"pred_cvr":"{:.6f}","per_1000_clicks_conv":"{:.3f}",
                                   "cvr_m":"{:.6f}","cvr_mc":"{:.6f}","cvr_cohort":"{:.6f}",
                                   "scenarioB_clicks":"{:.3f}","scenarioB_conv":"{:.3f}"}))
        else:
            print("히스토리 구간에서 코호트 데이터가 없습니다.")
        shown += 1
        if shown >= show_first_n:
            break

    return results

# ================== 실행 ==================
# 1) 데이터 로드(네 파일 스키마에 맞춰 자동 인식)
perf_df = pd.read_csv(PERF_CSV, encoding="utf-8-sig")
ad_meta_df = pd.read_csv(META_CSV, encoding="utf-8-sig")
new_ads_df = pd.read_csv(NEW_ADS_CSV, encoding="utf-8-sig")

# 2) 배치 실행 (상위 1개 신규광고만 미리보기)
results = run_new_ads_batch(new_ads_df, ad_meta_df, perf_df,
                            topN_media=20, show_first_n=1)

# 3) 특정 신규광고 결과 꺼내쓰기 예:
# new_id = list(results.keys())[0]
# pred_df, cohort_df, info = results[new_id]
# display(pred_df.head(30))
# display(cohort_df.head(20))
# print(info)



=== 신규광고 500000 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,21617,0.798,0.1
1,158724,0.798,0.1
2,57009,0.798,0.1
3,53626,0.798,0.1
4,375578,0.798,0.1
5,157268,0.798,0.1
6,150733,0.798,0.1
7,375579,0.798,0.1
8,119127,0.798,0.1
9,158693,0.798,0.1


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,270,0.052244,52.244,0.7,3,0.488986,0.053922,0.0163,35.0,1.829
1,54,0.022963,22.963,0.9,2,0.333544,0.023364,0.016273,38.571,0.886
2,281,0.020439,20.439,0.4,1,0.084266,0.020548,0.01634,40.0,0.818
3,337,0.01889,18.89,1.1,6,0.143646,0.019084,0.016247,47.143,0.891
4,714,0.017408,17.408,8.400001,6,0.017279,0.017279,0.017638,86.897,1.513
5,564,0.016261,16.261,0.1,1,0.340357,0.01626,0.01638,30.0,0.488
6,1049,0.016261,16.261,0.1,1,0.016129,0.01626,0.01638,30.0,0.488
7,1037,0.016003,16.003,0.1,1,0.042254,0.016,0.01638,30.0,0.48
8,73,0.01381,13.81,0.1,1,0.026316,0.013793,0.01638,30.0,0.414


In [191]:
# 1) 데이터 로드(네 파일 스키마에 맞춰 자동 인식)
perf_df = pd.read_csv(PERF_CSV, encoding="utf-8-sig")
ad_meta_df = pd.read_csv(META_CSV, encoding="utf-8-sig")
new_ads_df = pd.read_csv(NEW_ADS_CSV, encoding="utf-8-sig")

# 2) 배치 실행 (상위 1개 신규광고만 미리보기)
results = run_new_ads_batch(new_ads_df, ad_meta_df, perf_df,
                            topN_media=20, show_first_n=8)

# 3) 특정 신규광고 결과 꺼내쓰기 예:
new_id = list(results.keys())[0]
pred_df, cohort_df, info = results[new_id]
display(pred_df.head(30))
display(cohort_df.head(20))
print(info)


=== 신규광고 500000 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,21617,0.798,0.1
1,158724,0.798,0.1
2,57009,0.798,0.1
3,53626,0.798,0.1
4,375578,0.798,0.1
5,157268,0.798,0.1
6,150733,0.798,0.1
7,375579,0.798,0.1
8,119127,0.798,0.1
9,158693,0.798,0.1


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,270,0.052244,52.244,0.7,3,0.488986,0.053922,0.0163,35.0,1.829
1,54,0.022963,22.963,0.9,2,0.333544,0.023364,0.016273,38.571,0.886
2,281,0.020439,20.439,0.4,1,0.084266,0.020548,0.01634,40.0,0.818
3,337,0.01889,18.89,1.1,6,0.143646,0.019084,0.016247,47.143,0.891
4,714,0.017408,17.408,8.400001,6,0.017279,0.017279,0.017638,86.897,1.513
5,564,0.016261,16.261,0.1,1,0.340357,0.01626,0.01638,30.0,0.488
6,1049,0.016261,16.261,0.1,1,0.016129,0.01626,0.01638,30.0,0.488
7,1037,0.016003,16.003,0.1,1,0.042254,0.016,0.01638,30.0,0.48
8,73,0.01381,13.81,0.1,1,0.026316,0.013793,0.01638,30.0,0.414



=== 신규광고 500001 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,431654,0.642,0.1
1,436211,0.642,0.1
2,436212,0.642,0.1
3,443096,0.642,0.1
4,431643,0.642,0.1
5,443080,0.642,0.1
6,444876,0.642,0.1
7,436210,0.642,0.1
8,436216,0.642,0.1
9,444878,0.642,0.1


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,270,0.528866,528.866,0.1,1,0.488986,0.532283,0.01638,30.0,15.866
1,562,0.502525,502.525,1.9,4,0.568155,0.562907,0.025827,285.0,143.22
2,563,0.263548,263.548,33.900001,10,0.570481,0.596781,0.1161,1452.857,382.898
3,854,0.257742,257.742,0.1,1,0.281761,0.259346,0.017199,30.0,7.732



=== 신규광고 500002 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,441662,0.599,0.1
1,441660,0.599,0.1
2,437891,0.599,0.1
3,444673,0.599,0.1
4,437904,0.599,0.1
5,437905,0.599,0.1
6,437906,0.599,0.1
7,441664,0.599,0.1
8,441663,0.599,0.1
9,437888,0.599,0.1


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,634,0.533448,533.448,3.2,10,0.61607,0.641457,0.027157,137.143,73.159
1,270,0.488919,488.919,1.4,3,0.488986,0.532283,0.024311,70.0,34.224
2,492,0.41264,412.64,1.3,9,0.447681,0.446645,0.020276,65.0,26.822
3,371,0.304228,304.228,8.3,1,0.428818,0.444965,0.049885,355.714,108.218
4,442,0.181755,181.755,0.1,1,0.237931,0.182857,0.01638,30.0,5.453
5,481,0.015063,15.063,0.3,2,0.02,0.015038,0.016353,90.0,1.356



=== 신규광고 500003 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,34045,0.642,0.108
1,9982,0.642,0.108
2,445268,0.584,0.098
3,386953,0.584,0.098
4,445234,0.584,0.098
5,437205,0.584,0.098
6,433368,0.584,0.098
7,445264,0.584,0.098
8,412816,0.584,0.098
9,443424,0.584,0.098


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,562,0.519505,519.505,1.47079,4,0.568155,,0.023345,225.0,116.889
1,990,0.499125,499.125,1.940206,1,0.339799,0.560811,0.022225,54.0,26.953
2,1025,0.418021,418.021,47.31946,1,0.549827,0.847966,0.281731,470.357,196.619
3,1047,0.411087,411.087,262.035554,1,0.505827,0.553686,0.402924,2431.0,999.353
4,1029,0.411068,411.068,19.294267,1,0.698301,0.751852,0.146131,335.625,137.965
5,845,0.403136,403.136,267.640592,1,0.426366,0.52513,0.396298,2483.0,1000.986
6,1032,0.402575,402.575,48.72072,1,0.706681,0.780031,0.286364,502.222,202.182
7,824,0.399667,399.667,259.448613,1,0.491066,0.52578,0.392376,2407.0,961.999
8,1020,0.390136,390.136,315.28342,1,0.614679,0.479071,0.385905,3025.862,1180.499
9,1026,0.379552,379.552,29.965399,1,0.601795,0.754414,0.191905,297.857,113.052



=== 신규광고 500004 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,56225,0.597,0.113
1,442071,0.596,0.113
2,445574,0.593,0.112
3,155357,0.592,0.112
4,393760,0.571,0.108
5,446054,0.495,0.094
6,433924,0.458,0.087
7,445095,0.458,0.087
8,438250,0.458,0.087
9,438247,0.458,0.087


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,1032,0.774314,774.314,0.113141,1,0.706681,0.780031,0.016378,30.0,23.229
1,1027,0.53368,533.68,5.883352,1,0.464252,0.720559,0.057221,82.105,43.818
2,756,0.513666,513.666,102.899571,2,0.579098,0.89628,0.457891,9110.0,4679.493
3,678,0.511681,511.681,100.301682,2,0.545877,0.889945,0.455112,8880.0,4543.726
4,757,0.480809,480.809,82.45683,2,0.586401,0.880464,0.408107,3128.571,1504.247
5,1020,0.465622,465.622,0.451807,1,0.614679,0.479071,0.0191,120.0,55.875
6,676,0.442084,442.084,81.213033,2,0.563935,0.738946,0.387254,4314.0,1907.149
7,685,0.429003,429.003,73.418607,1,0.566522,0.744485,0.364548,6500.0,2788.52
8,725,0.423036,423.036,67.77216,2,0.532367,0.759278,0.348615,2250.0,951.831
9,760,0.422865,422.865,67.440323,2,0.529225,0.745747,0.35105,852.857,360.643



=== 신규광고 500005 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,27889,0.62,0.1
1,16730,0.62,0.1
2,80973,0.62,0.1
3,439973,0.62,0.1
4,111078,0.62,0.1
5,72968,0.62,0.1
6,93248,0.62,0.1
7,111724,0.62,0.1
8,100023,0.62,0.1
9,371469,0.62,0.1


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,337,0.112328,112.328,3.9,3,0.143646,0.132653,0.034154,167.143,18.775
1,294,0.071912,71.912,1.8,2,0.199506,0.078603,0.016155,67.5,4.854
2,32,0.070469,70.469,45.500004,1,0.097403,0.12305,0.053134,650.0,45.805
3,54,0.042139,42.139,40.700004,5,0.333544,0.066421,0.03319,407.0,17.151
4,356,0.040921,40.921,11.600001,1,0.151277,0.054622,0.023204,174.0,7.12
5,22,0.040274,40.274,14.400001,5,0.410914,0.052897,0.027126,205.714,8.285
6,651,0.034604,34.604,3.5,1,0.068886,0.038217,0.019124,87.5,3.028
7,270,0.033156,33.156,8.600001,6,0.488986,0.04119,0.019142,112.174,3.719
8,634,0.03165,31.65,0.1,1,0.61607,0.031746,0.017199,30.0,0.949
9,281,0.030715,30.715,1.5,6,0.084266,0.032086,0.017004,56.25,1.728



=== 신규광고 500006 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,163118,0.737,0.1
1,163126,0.737,0.1
2,183573,0.737,0.1
3,177225,0.737,0.1
4,163114,0.737,0.1
5,232537,0.737,0.1
6,163144,0.737,0.1
7,435606,0.737,0.1
8,430591,0.737,0.1
9,163134,0.737,0.1


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,655,0.046223,46.223,0.7,5,0.375385,0.047619,0.0163,35.0,1.618
1,562,0.044489,44.489,10.9,2,0.568155,0.063694,0.018059,125.769,5.595
2,652,0.019723,19.723,2.0,2,0.046512,0.020202,0.016129,100.0,1.972
3,401,0.015526,15.526,0.4,1,0.067834,0.015504,0.01634,40.0,0.621
4,761,0.009324,9.324,2.0,4,0.184864,0.008416,0.016129,66.667,0.622
5,54,0.007635,7.635,0.2,2,0.333544,0.007519,0.016367,30.0,0.229



=== 신규광고 500007 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,163214,0.731,0.1
1,163221,0.731,0.1
2,163207,0.731,0.1
3,163209,0.731,0.1
4,163212,0.731,0.1
5,163205,0.731,0.1
6,163217,0.731,0.1
7,163228,0.731,0.1
8,163226,0.731,0.1
9,163201,0.731,0.1


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,645,0.135125,135.125,0.3,1,0.532399,0.1375,0.016353,90.0,12.161
1,643,0.125578,125.578,0.5,5,0.242537,0.129193,0.017143,37.5,4.709
2,651,0.033026,33.026,0.2,1,0.068886,0.033237,0.017185,30.0,0.991
3,790,0.032345,32.345,0.2,1,0.477273,0.032558,0.016367,30.0,0.97
4,270,0.032001,32.001,3.9,7,0.488986,0.035571,0.018268,61.579,1.971
5,371,0.020467,20.467,1.9,4,0.428818,0.020913,0.016949,47.5,0.972
6,568,0.016007,16.007,0.3,1,0.145455,0.016,0.016353,90.0,1.441
7,337,0.015885,15.885,4.2,7,0.143646,0.015674,0.01664,63.0,1.001
8,281,0.00994,9.94,0.6,3,0.084266,0.009685,0.016313,45.0,0.447
9,761,0.009473,9.473,2.4,4,0.184864,0.008416,0.016077,51.429,0.487


Unnamed: 0,mda_idx,cohort_eff_clicks,cohort_eff_convs,coverage_ads,cvr_m,cvr_mc,cvr_cohort,pred_cvr,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv
0,270,0.7,0.0,3,0.488986,0.053922,0.0163,0.052244,52.24417,35.0,1.828546
1,54,0.9,0.0,2,0.333544,0.023364,0.016273,0.022963,22.963103,38.571429,0.88572
2,281,0.4,0.0,1,0.084266,0.020548,0.01634,0.020439,20.438645,40.0,0.817546
3,337,1.1,0.0,6,0.143646,0.019084,0.016247,0.01889,18.890136,47.142857,0.890535
4,714,8.400001,0.3,6,0.017279,0.017279,0.017638,0.017408,17.40764,86.896552,1.512664
5,564,0.1,0.0,1,0.340357,0.01626,0.01638,0.016261,16.260956,30.0,0.487829
6,1049,0.1,0.0,1,0.016129,0.01626,0.01638,0.016261,16.260956,30.0,0.487829
7,1037,0.1,0.0,1,0.042254,0.016,0.01638,0.016003,16.002517,30.0,0.480075
8,73,0.1,0.0,1,0.026316,0.013793,0.01638,0.01381,13.810235,30.0,0.414307


Unnamed: 0_level_0,sim,weight
ads_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
21617,0.79787,0.1
158724,0.79787,0.1
57009,0.79787,0.1
53626,0.79787,0.1
375578,0.79787,0.1
157268,0.79787,0.1
150733,0.79787,0.1
375579,0.79787,0.1
119127,0.79787,0.1
158693,0.79787,0.1


{'window_end': '2025-08-25', 'L_days': 30, 'H_days': 30}


In [187]:
# 1) 데이터 로드(네 파일 스키마에 맞춰 자동 인식)
perf_df = pd.read_csv(PERF_CSV, encoding="utf-8-sig")
ad_meta_df = pd.read_csv(META_CSV, encoding="utf-8-sig")
new_ads_df = pd.read_csv(NEW_ADS_CSV, encoding="utf-8-sig")

# 2) 배치 실행 (상위 1개 신규광고만 미리보기)
results = run_new_ads_batch(new_ads_df, ad_meta_df, perf_df,
                            topN_media=20, show_first_n=8)

# 3) 특정 신규광고 결과 꺼내쓰기 예:
new_id = list(results.keys())[0]
pred_df, cohort_df, info = results[new_id]
display(pred_df.head(30))
display(cohort_df.head(20))
print(info)


=== 신규광고 500000 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,403909,0.798,0.021
1,119127,0.798,0.021
2,150733,0.798,0.021
3,158724,0.798,0.021
4,157268,0.798,0.021
5,21617,0.798,0.021
6,53626,0.798,0.021
7,57009,0.798,0.021
8,158693,0.798,0.021
9,375578,0.798,0.021


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,980,0.09421,94.21,73.381846,3,0.333211,0.333211,0.045356,5046.923,475.471
1,270,0.052732,52.732,0.494678,13,0.488986,0.053922,0.016664,55.385,2.921
2,854,0.027352,27.352,0.061835,3,0.281761,0.027397,0.016385,30.0,0.821
3,645,0.023583,23.583,0.082446,3,0.532399,0.023622,0.016551,60.0,1.415
4,54,0.022934,22.934,1.103456,17,0.333544,0.023364,0.017084,75.0,1.72
5,281,0.020491,20.491,0.206116,6,0.084266,0.020548,0.016366,75.0,1.537
6,337,0.018911,18.911,0.981686,23,0.143646,0.019084,0.016263,90.0,1.702
7,714,0.01715,17.15,3.477124,34,0.017279,0.017279,0.016596,170.0,2.916
8,564,0.01626,16.26,0.020612,1,0.340357,0.01626,0.016391,30.0,0.488
9,711,0.01626,16.26,0.020612,1,0.18595,0.01626,0.016391,30.0,0.488



=== 신규광고 500001 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,439605,0.642,0.02
1,444876,0.642,0.02
2,436210,0.642,0.02
3,444878,0.642,0.02
4,436211,0.642,0.02
5,431654,0.642,0.02
6,431643,0.642,0.02
7,443096,0.642,0.02
8,436212,0.642,0.02
9,436214,0.642,0.02


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,634,0.637323,637.323,0.1,2,0.61607,0.641457,0.017199,50.0,31.866
1,270,0.526173,526.173,0.18,5,0.488986,0.532283,0.017024,54.0,28.413
2,562,0.515588,515.588,1.44,11,0.568155,0.562907,0.022683,360.0,185.612
3,492,0.445501,445.501,0.04,2,0.447681,0.446645,0.016552,30.0,13.365
4,563,0.265391,265.391,38.600003,41,0.570481,0.596781,0.136613,2631.818,698.461
5,854,0.257102,257.102,0.14,2,0.281761,0.259346,0.016702,105.0,26.996



=== 신규광고 500002 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,444871,0.599,0.02
1,437906,0.599,0.02
2,441664,0.599,0.02
3,441663,0.599,0.02
4,441662,0.599,0.02
5,441661,0.599,0.02
6,441660,0.599,0.02
7,441659,0.599,0.02
8,443954,0.599,0.02
9,444713,0.599,0.02


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,634,0.542082,542.082,2.88,47,0.61607,0.641457,0.024504,187.826,101.817
1,270,0.500906,500.906,0.98,13,0.488986,0.532283,0.020654,98.0,49.089
2,492,0.432292,432.292,0.52,21,0.447681,0.446645,0.018283,78.0,33.719
3,371,0.325975,325.975,6.259999,6,0.428818,0.444965,0.040855,586.875,191.307
4,854,0.259022,259.022,0.02,1,0.281761,0.259346,0.016391,30.0,7.771
5,645,0.240602,240.602,0.16,5,0.532399,0.242991,0.016699,48.0,11.549
6,817,0.207678,207.678,0.06,2,0.585023,0.208443,0.016385,45.0,9.346
7,442,0.182195,182.195,0.06,3,0.237931,0.182857,0.016549,45.0,8.199
8,978,0.116558,116.558,0.06,1,0.125654,0.116959,0.016385,90.0,10.49
9,686,0.070777,70.777,0.04,1,0.086093,0.070922,0.016388,60.0,4.247



=== 신규광고 500003 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,34045,0.642,0.022
1,9982,0.642,0.022
2,444834,0.584,0.02
3,443407,0.584,0.02
4,443424,0.584,0.02
5,444851,0.584,0.02
6,444011,0.584,0.02
7,445988,0.584,0.02
8,446353,0.584,0.02
9,446605,0.584,0.02


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,1029,0.605604,605.604,3.919919,1,0.698301,0.751852,0.04597,335.625,203.256
1,1026,0.553566,553.566,6.087919,1,0.601795,0.754414,0.058698,297.857,164.884
2,1025,0.550256,550.256,9.613657,1,0.549827,0.847966,0.085745,470.357,258.817
3,990,0.546901,546.901,0.394182,1,0.339799,0.560811,0.017593,54.0,29.533
4,1027,0.517331,517.331,6.744889,1,0.464252,0.720559,0.065373,318.621,164.833
5,1032,0.50467,504.67,9.898344,1,0.706681,0.780031,0.087386,502.222,253.456
6,562,0.450283,450.283,4.223225,31,0.568155,,0.031627,397.5,178.987
7,997,0.418614,418.614,2.058505,1,0.262781,0.473098,0.021594,134.286,56.214
8,1013,0.388405,388.405,2.759273,1,0.274104,0.455128,0.025685,145.385,56.468
9,1010,0.370163,370.163,1.401535,1,0.288085,0.402936,0.019402,120.0,44.42



=== 신규광고 500004 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,56225,0.597,0.025
1,442071,0.596,0.025
2,445574,0.593,0.025
3,155357,0.592,0.025
4,393760,0.571,0.024
5,446054,0.495,0.021
6,438248,0.458,0.019
7,438249,0.458,0.019
8,438250,0.458,0.019
9,445095,0.458,0.019


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,1032,0.778745,778.745,0.025302,1,0.706681,0.780031,0.01639,30.0,23.362
1,1027,0.664538,664.538,1.315709,1,0.464252,0.720559,0.025862,82.105,54.562
2,1026,0.598863,598.863,4.243796,2,0.601795,0.754414,0.049058,1008.0,603.654
3,1029,0.582274,582.274,4.800866,2,0.698301,0.751852,0.052438,518.182,301.724
4,1025,0.571022,571.022,8.437199,2,0.549827,0.847966,0.078661,910.909,520.149
5,562,0.511897,511.897,1.727418,4,0.568155,,0.023381,534.0,273.353
6,1020,0.475979,475.979,0.101039,1,0.614679,0.479071,0.017001,120.0,57.118
7,757,0.474309,474.309,18.440032,2,0.586401,0.880464,0.143924,3128.571,1483.911
8,677,0.457482,457.482,10.482809,2,0.596462,0.71308,0.091743,3112.5,1423.913
9,678,0.456955,456.955,22.430722,2,0.545877,0.889945,0.167403,8880.0,4057.756



=== 신규광고 500005 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,13988,0.62,0.023
1,357626,0.62,0.023
2,93248,0.62,0.023
3,111724,0.62,0.023
4,371469,0.62,0.023
5,27889,0.62,0.023
6,111078,0.62,0.023
7,439973,0.62,0.023
8,80973,0.62,0.023
9,100023,0.62,0.023


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,389,0.57771,577.71,0.015807,1,0.578301,,0.016521,30.0,17.331
1,562,0.561835,561.835,0.17388,3,0.568155,,0.016629,110.0,61.802
2,563,0.427175,427.175,5.643192,12,0.570481,,0.046257,823.846,351.926
3,337,0.12127,121.27,1.697751,11,0.143646,0.132653,0.020695,130.588,15.836
4,32,0.083337,83.337,10.615095,1,0.097403,0.12305,0.02722,650.0,54.169
5,294,0.070122,70.122,2.49152,4,0.199506,0.078603,0.019064,110.69,7.762
6,356,0.049039,49.039,2.706266,1,0.151277,0.054622,0.018096,174.0,8.533
7,54,0.048556,48.556,9.793772,9,0.333544,0.066421,0.021194,420.0,20.393
8,22,0.043022,43.022,6.17211,11,0.410914,0.052897,0.019025,284.483,12.239
9,651,0.037124,37.124,0.816546,1,0.068886,0.038217,0.017044,87.5,3.248



=== 신규광고 500006 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,172889,0.737,0.02
1,163222,0.737,0.02
2,163175,0.737,0.02
3,163179,0.737,0.02
4,163188,0.737,0.02
5,163189,0.737,0.02
6,163192,0.737,0.02
7,163197,0.737,0.02
8,163208,0.737,0.02
9,435837,0.737,0.02


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,562,0.048012,48.012,8.38,15,0.568155,0.063694,0.019942,419.0,20.117
1,655,0.046807,46.807,0.4,13,0.375385,0.047619,0.01634,46.154,2.16
2,14,0.026425,26.425,0.38,1,0.271691,0.026681,0.016343,570.0,15.063
3,667,0.023032,23.032,1.86,6,0.319281,0.023786,0.016955,99.643,2.295
4,375,0.021977,21.977,0.22,8,0.342593,0.022059,0.016364,55.0,1.209
5,652,0.019871,19.871,1.48,7,0.046512,0.020202,0.016521,123.333,2.451
6,401,0.015512,15.512,0.14,3,0.067834,0.015504,0.016375,42.0,0.652
7,540,0.01324,13.24,17.6,8,0.348375,0.011124,0.015043,880.0,11.651
8,761,0.00961,9.61,2.72,24,0.184864,0.008416,0.016196,145.714,1.4
9,26,0.00803,8.03,1.34,5,0.090909,0.007299,0.016215,77.308,0.621



=== 신규광고 500007 : window_end=2025-08-25, L_days=30 ===


Unnamed: 0,ads_idx,sim,weight
0,163234,0.731,0.02
1,163257,0.731,0.02
2,177226,0.731,0.02
3,176518,0.731,0.02
4,439976,0.731,0.02
5,172888,0.731,0.02
6,163274,0.731,0.02
7,163272,0.731,0.02
8,163268,0.731,0.02
9,163263,0.731,0.02


Unnamed: 0,mda_idx,pred_cvr,per_1000_clicks_conv,cohort_eff_clicks,coverage_ads,cvr_m,cvr_mc,cvr_cohort,scenarioB_clicks,scenarioB_conv
0,645,0.136857,136.857,0.08,2,0.532399,0.1375,0.016383,60.0,8.211
1,643,0.121749,121.749,1.06,18,0.242537,0.129193,0.016415,83.684,10.188
2,634,0.105515,105.515,0.02,1,0.61607,0.105634,0.016391,30.0,3.165
3,651,0.032676,32.676,0.52,4,0.068886,0.033237,0.016487,60.0,1.961
4,790,0.03196,31.96,0.58,6,0.477273,0.032558,0.016479,108.75,3.476
5,270,0.031328,31.328,4.88,34,0.488986,0.035571,0.018285,244.0,7.644
6,371,0.020437,20.437,1.8,19,0.428818,0.020913,0.016478,100.0,2.044
7,563,0.016363,16.363,39.439999,5,0.570481,0.019973,0.01499,2191.111,35.853
8,568,0.016002,16.002,0.06,1,0.145455,0.016,0.016385,90.0,1.44
9,337,0.015814,15.814,3.46,28,0.143646,0.015674,0.01642,173.0,2.736


Unnamed: 0,mda_idx,cohort_eff_clicks,cohort_eff_convs,coverage_ads,cvr_m,cvr_mc,cvr_cohort,pred_cvr,per_1000_clicks_conv,scenarioB_clicks,scenarioB_conv
0,980,73.381846,6.861723,3,0.333211,0.333211,0.045356,0.09421,94.210082,5046.923077,475.471036
1,270,0.494678,0.041223,13,0.488986,0.053922,0.016664,0.052732,52.732089,55.384615,2.920546
2,854,0.061835,0.0,3,0.281761,0.027397,0.016385,0.027352,27.352051,30.0,0.820562
3,645,0.082446,0.020612,3,0.532399,0.023622,0.016551,0.023583,23.583395,60.0,1.415004
4,54,1.103456,0.103058,17,0.333544,0.023364,0.017084,0.022934,22.934106,75.0,1.720058
5,281,0.206116,0.0,6,0.084266,0.020548,0.016366,0.020491,20.491257,75.0,1.536844
6,337,0.981686,0.0,23,0.143646,0.019084,0.016263,0.018911,18.910664,90.0,1.70196
7,714,3.477124,0.082446,34,0.017279,0.017279,0.016596,0.01715,17.150201,170.0,2.915534
8,564,0.020612,0.0,1,0.340357,0.01626,0.016391,0.01626,16.260342,30.0,0.48781
9,711,0.020612,0.0,1,0.18595,0.01626,0.016391,0.01626,16.260342,30.0,0.48781


Unnamed: 0_level_0,sim,weight
ads_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
403909,0.79787,0.020612
119127,0.79787,0.020612
150733,0.79787,0.020612
158724,0.79787,0.020612
157268,0.79787,0.020612
21617,0.79787,0.020612
53626,0.79787,0.020612
57009,0.79787,0.020612
158693,0.79787,0.020612
375578,0.79787,0.020612


{'window_end': '2025-08-25', 'L_days': 30, 'H_days': 30}


In [None]:
ads_seg[ads_seg['ads_idx']==13988]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ads_idx,media_count,user_count,total_clicks,total_conversions,ads_category,domain,ads_os_type,ads_order,ctit_mean,ctit_median,ads_rejoin_type,contract_price,media_price,first_click,last_click,ads_name,ads_sdate,expire,days_active,daily_avg_conversions,cvr,margin,roi,total_net_return,ads_level_detailed,daily_clicks,daily_users,scale_level,performance_level,ads_level,media_score,conv_score,clicks_score,stability_score,cvr_score,total_score,ads_size,cluster,mda_idx_arr,M,A
29,32,32,13988,9,32,55,2,7,금융,7,1554607,41.5,41.5,NONE,6000,4500,2025-07-26 23:17:18,2025-08-24 13:56:08,주식투자 정보서비스,2021-11-16 13:00:00,0,29,0.068966,0.0,1500,0.3,3000,LARGE,1.896552,1.103448,LARGE,LOW,LARGE_LOW,3,0,0,3,0,6,MEDIUM,0,,0.0,1.0


In [185]:
ads_13988_analysis = analyze_ads_performance(13988, click)

In [186]:
ads_13988_analysis

Unnamed: 0,ads_idx,mda_idx,total_clicks,total_conversions,contract_price,media_price,domain,ads_category,cvr,profit_per_conversion,total_profit,first_click,last_click,days_active_calc,daily_clicks,daily_conversions,daily_profit,배분그룹
0,13988,761,9,1,6000,4500,금융,7,0.1111,1500,1500,2025-07-29 07:08:54,2025-07-30 15:45:20,2,4.5,0.5,750.0,잘 배분
1,13988,337,5,1,6000,4500,금융,7,0.2,1500,1500,2025-08-07 15:33:58,2025-08-16 13:48:38,9,0.555556,0.111111,166.666667,잘 배분
2,13988,270,8,0,6000,3150,금융,7,0.0,2850,0,2025-07-27 12:01:45,2025-08-24 10:47:52,28,0.285714,0.0,0.0,잘 배분
3,13988,371,13,0,6000,4500,금융,7,0.0,1500,0,2025-07-28 21:37:52,2025-08-24 00:11:14,27,0.481481,0.0,0.0,잘 배분
4,13988,564,1,0,6000,4500,금융,7,0.0,1500,0,2025-08-22 07:56:36,2025-08-22 07:56:36,1,1.0,0.0,0.0,잘 배분
5,13988,772,1,0,6000,4500,금융,7,0.0,1500,0,2025-08-24 13:56:08,2025-08-24 13:56:08,1,1.0,0.0,0.0,잘 배분
6,13988,790,5,0,6000,4500,금융,7,0.0,1500,0,2025-07-26 23:17:18,2025-08-16 13:04:53,21,0.238095,0.0,0.0,잘 배분
7,13988,854,1,0,6000,4500,금융,7,0.0,1500,0,2025-07-31 08:01:01,2025-07-31 08:01:01,1,1.0,0.0,0.0,잘 배분
8,13988,1042,12,0,6000,4500,금융,7,0.0,1500,0,2025-08-06 14:01:20,2025-08-21 15:57:30,16,0.75,0.0,0.0,잘 배분


In [None]:
print('hi')