In [None]:
# 필요한 라이브러리
import os
import re
from typing import List, Optional, Union

import pandas as pd
import numpy as np
from pandas_gbq import to_gbq
from google.cloud import bigquery
from google.oauth2 import service_account

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import re

# 엑셀 to CSV

In [None]:
# 코드 by 유선 튜터님
def excel_to_csv(
    xlsx_path: str,
    out_dir: Optional[str] = None,
    sheet: Optional[Union[str, int]] = None,
    header: bool = True,
    encoding: str = "utf-8-sig",
    index: bool = False,
    na_rep: str = ""
) -> List[str]:
    """Export one sheet, or every sheet, of an Excel workbook to CSV files.

    Each exported sheet is written to ``<out_dir>/<workbook>__<sheet>.csv``,
    where characters that are invalid in file names are replaced by ``_`` in
    the sheet part. All cells are read as strings (``dtype=str``).

    Args:
        xlsx_path: Path to the ``.xlsx`` workbook.
        out_dir: Output directory; defaults to the workbook's own directory.
            Created if it does not exist.
        sheet: ``None`` for all sheets, a 0-based ``int`` index, or a sheet name.
        header: Whether the first row of each sheet holds column names. When
            ``False`` the sheet is read without a header row and the CSV is
            written without one as well.
        encoding: Encoding of the written CSV files.
        index: Whether to write the DataFrame index to the CSV.
        na_rep: Text used for missing values in the CSV.

    Returns:
        Paths of the CSV files that were created, in sheet order.

    Raises:
        FileNotFoundError: ``xlsx_path`` is not an existing file.
        IndexError: ``sheet`` is an ``int`` outside the workbook's sheet range.
        ValueError: ``sheet`` is a ``str`` that names no sheet in the workbook.
        TypeError: ``sheet`` is neither ``None``, ``str`` nor ``int``.
        RuntimeError: No CSV could be produced (all sheets skipped or failed).
    """
    if not os.path.isfile(xlsx_path):
        raise FileNotFoundError(f"파일을 찾을 수 없습니다: {xlsx_path}")

    if out_dir is None:
        out_dir = os.path.dirname(os.path.abspath(xlsx_path)) or "."
    os.makedirs(out_dir, exist_ok=True)

    def _safe_sheet_name(name: str) -> str:
        # Replace characters that are not allowed in file names.
        name = re.sub(r"[\\/:*?\"<>|]", "_", name)
        return name.strip() or "Sheet"

    # pandas expects the header row index (0) or None for "no header row".
    header_opt = 0 if header else None

    # The workbook's base name is the same for every sheet, so compute it once.
    base = os.path.splitext(os.path.basename(xlsx_path))[0]

    created_paths: List[str] = []

    # The context manager guarantees the workbook handle is closed, even when
    # sheet selection or a conversion raises.
    with pd.ExcelFile(xlsx_path, engine="openpyxl") as xl:
        sheet_names = xl.sheet_names  # e.g. ['Sheet1', 'Result 1', '요약']

        if sheet is None:
            targets = sheet_names
        elif isinstance(sheet, int):
            # Only non-negative 0-based indexes are accepted.
            if sheet < 0 or sheet >= len(sheet_names):
                raise IndexError(f"시트 인덱스 범위 초과: {sheet} (총 {len(sheet_names)}개)")
            targets = [sheet_names[sheet]]
        elif isinstance(sheet, str):
            if sheet not in sheet_names:
                raise ValueError(f"시트를 찾을 수 없습니다: '{sheet}' (존재 시트: {sheet_names})")
            targets = [sheet]
        else:
            raise TypeError("sheet는 None, str, int 중 하나여야 합니다.")

        for sn in targets:
            # Best effort per sheet: a failing sheet is reported and skipped so
            # the remaining sheets are still exported.
            try:
                df = xl.parse(sn, header=header_opt, dtype=str)

                if df.empty and not header:
                    print(f"[SKIP] 시트가 비어 있어 건너뜁니다: '{sn}'")
                    continue

                out_name = f"{base}__{_safe_sheet_name(sn)}.csv"
                out_path = os.path.join(out_dir, out_name)

                # header=header: without it, a sheet read with header=False
                # would get its auto-generated integer column labels
                # (0, 1, 2, ...) written as a bogus first CSV row.
                df.to_csv(
                    out_path,
                    index=index,
                    header=header,
                    encoding=encoding,
                    na_rep=na_rep,
                )
                created_paths.append(out_path)
                print(f"[OK] {sn} → {out_path}")

            except ValueError as e:
                print(f"[SKIP] '{sn}' 변환 중 ValueError: {e}")
            except Exception as e:
                print(f"[ERROR] '{sn}' 변환 실패: {e}")

    if not created_paths:
        raise RuntimeError("변환된 CSV가 없습니다. 시트가 비어있거나 옵션을 확인하세요.")
    return created_paths

In [None]:
# excel_to_csv('/Users/t2023-m0052/Documents/GitHub/final_project/data/아이브1년치_참여데이터.xlsx')
# excel_to_csv('/Users/t2023-m0052/Documents/GitHub/final_project/data/IVE_광고목록.xlsx')
# excel_to_csv('/Users/t2023-m0052/Documents/GitHub/final_project/data/IVE_광고적립.xlsx')
# excel_to_csv('/Users/t2023-m0052/Documents/GitHub/final_project/data/IVE_광고참여정보.xlsx')
# excel_to_csv('/Users/t2023-m0052/Documents/GitHub/final_project/data/jc_cpi_ads_schedule.xlsx')

In [None]:
# 여러 시트 csv 파일 하나로 합치기 
# df = pd.DataFrame()
# for i in range(1, 18, 1):
# 	# file 이름 중에 바뀌지 않는 부분
#     file = "/Users/t2023-m0052/Desktop/vscode/python/IVE_KOREA/광고참여정보/IVE_광고참여정보__Result "
#     temp = pd.read_csv(file + str(i) + '.csv')
#     df = pd.concat([df, temp], ignore_index=True)
# print("완료")
# print(df.head())
# df.to_csv('광고참여정보')

# 광고목록

In [6]:
# Path to the ad-list (광고목록) CSV
path_sh = "/Users/t2023-m0052/Documents/GitHub/final_project/data/IVE_광고목록.csv" # Sehee's local path
# low_memory=False: infer each column's dtype from the whole file instead of chunk by chunk
ad_list = pd.read_csv(path_sh, low_memory= False)
ad_list.head()

Unnamed: 0,ads_idx,ads_code,aff_idx,adv_idx,sch_idx,ads_type,ads_category,ads_name,ads_search,ads_icon_img,...,ads_age_min,ads_age_max,ads_os_type,ads_contract_price,ads_reward_price,ads_order,ads_rejoin_type,ads_require_adid,regdate,delyn
0,160,tr71opm6C1,6,10,13604,1,1,리니지레드나이츠,"리니지레드나이츠,com.ncsoft.redknights,ecaab3f9d2616f6...",https://lh3.googleusercontent.com/oDGXxngO9oOP...,...,0,100,2,170,150,982800,NONE,Y,2017-04-06 13:19:53,N
1,284,xlNpfkgtLX,6,10,13605,1,1,강철의함대:Ocean Overlord,"강철의함대:Ocean Overlord,com.gamepub.lw.g,a370ab70...",http://nextapps-nas.aws.appang.kr/icon/2016122...,...,0,100,2,180,160,987200,NONE,Y,2017-01-10 10:37:22,N
2,292,7Hj8V5QKiy,6,10,13606,1,1,스노우 SNOW,"스노우 SNOW,com.campmobile.snow,5b892d7e0cff73621...",http://nextapps-nas.aws.appang.kr/icon/2016092...,...,0,100,2,160,160,987300,NONE,Y,2017-01-02 12:51:34,N
3,304,1RfxHJcuEe,6,10,13607,2,1,서머너즈 워: 천공의 아레나,"서머너즈 워: 천공의 아레나,com.com2us.smon.normal.freeful...",http://nextapps-nas.aws.appang.kr/icon/EecpHzP...,...,0,100,2,230,210,931000,NONE,Y,2017-03-30 17:05:23,N
4,306,l3GyilYEVk,6,10,23543,1,1,하이마트,"하이마트,com.himart.main,8230c6a95581533fad10b0e3b...",http://nextapps-nas.aws.appang.kr/icon/2016032...,...,0,100,2,170,140,1155100,NONE,Y,2017-06-12 15:24:46,N


In [7]:
# Count missing values per column
ad_list.isnull().sum()

ads_idx                    0
ads_code                   0
aff_idx                    0
adv_idx                    0
sch_idx                    0
ads_type                   0
ads_category               0
ads_name                   0
ads_search                 2
ads_icon_img               5
ads_summary             7061
ads_guide                303
ads_limit             444717
ads_payment           442925
ads_save_way             514
ads_day_cap                0
ads_sdate                  0
ads_edate                  0
ads_package           433998
ads_sex_type          445250
ads_age_min                0
ads_age_max                0
ads_os_type                0
ads_contract_price         0
ads_reward_price           0
ads_order                  0
ads_rejoin_type            0
ads_require_adid           0
regdate                    0
delyn                      0
dtype: int64

In [8]:
# Inspect the ads flagged as deleted (delyn == 'Y'), showing only type, name and date columns
ad_list.loc[ad_list['delyn'] == 'Y', ['ads_type', 'ads_name', 'ads_sdate', 'ads_edate', 'regdate']]

Unnamed: 0,ads_type,ads_name,ads_sdate,ads_edate,regdate
7,3,데저트스톰_사전예약(desert_storm_cpa),2016-12-28 15:42:00,2017-01-31 15:42:00,2017-01-05 11:31:16
14,1,지하철종결자,0000-00-00 00:00:00,0000-00-00 00:00:00,2017-01-11 16:47:10
26,1,메탈크래쉬,0000-00-00 00:00:00,0000-00-00 00:00:00,2017-01-19 18:34:22
456,3,핀크 가입 이벤트,2017-12-08 15:00:00,2018-01-01 00:00:00,2017-12-08 15:06:55
475,1,변호사님닷컴,0000-00-00 00:00:00,0000-00-00 00:00:00,2017-12-15 13:21:11
...,...,...,...,...,...
439413,3,[클릭메이트-단골맺기] 홍선수,0000-00-00 00:00:00,0000-00-00 00:00:00,2025-08-06 14:16:36
440342,1,초월자 키우기,0000-00-00 00:00:00,0000-00-00 00:00:00,2025-08-08 16:54:36
442692,1,[원스토어] 조조의 꿈,2025-08-14 00:00:00,2025-09-02 00:00:00,2025-08-14 15:33:46
442693,2,조조의 꿈,2025-08-14 00:00:00,2025-08-27 00:00:00,2025-08-14 15:33:53


In [5]:
# List the column names of the raw ad list
ad_list.columns

Index(['ads_idx', 'ads_code', 'aff_idx', 'adv_idx', 'sch_idx', 'ads_type',
       'ads_category', 'ads_name', 'ads_search', 'ads_icon_img', 'ads_summary',
       'ads_guide', 'ads_limit', 'ads_payment', 'ads_save_way', 'ads_day_cap',
       'ads_sdate', 'ads_edate', 'ads_package', 'ads_sex_type', 'ads_age_min',
       'ads_age_max', 'ads_os_type', 'ads_contract_price', 'ads_reward_price',
       'ads_order', 'ads_rejoin_type', 'ads_require_adid', 'regdate', 'delyn'],
      dtype='object')

## 전처리

In [13]:
# Preprocess the ad list in a single chain:
#   1. drop the columns that are not needed,
#   2. keep only ads that are not deleted (delyn == 'N', checked in the earlier cell),
#   3. drop the now-constant delyn column,
#   4. convert the registration date to a datetime.
ad_list = (
    ad_list
    .drop(columns=['sch_idx', 'ads_search', 'ads_guide', 'ads_limit', 'ads_payment', 'ads_package', 'ads_sex_type', 'ads_require_adid'])
    .loc[lambda frame: frame['delyn'] == 'N']
    .drop(columns=['delyn'])
    .assign(regdate=lambda frame: pd.to_datetime(frame['regdate']))
)
ad_list

Unnamed: 0,ads_idx,ads_code,aff_idx,adv_idx,ads_type,ads_category,ads_name,ads_icon_img,ads_summary,ads_save_way,...,ads_sdate,ads_edate,ads_age_min,ads_age_max,ads_os_type,ads_contract_price,ads_reward_price,ads_order,ads_rejoin_type,regdate
0,160,tr71opm6C1,6,10,1,1,리니지레드나이츠,https://lh3.googleusercontent.com/oDGXxngO9oOP...,[앱설명]<br />[설치형] 매력만점 영웅들과 소환수들이 함께하는 취향저격 RPG...,받기,...,2016-12-01 00:00:00,9999-12-31 23:59:59,0,100,2,170,150,982800,NONE,2017-04-06 13:19:53
1,284,xlNpfkgtLX,6,10,1,1,강철의함대:Ocean Overlord,http://nextapps-nas.aws.appang.kr/icon/2016122...,"[앱설명]<br />[설치형] 한, 중, 일 아시아 유저들과 함께 대규모 해상 전투...",받기,...,2016-12-29 00:00:00,9999-12-31 23:59:59,0,100,2,180,160,987200,NONE,2017-01-10 10:37:22
2,292,7Hj8V5QKiy,6,10,1,1,스노우 SNOW,http://nextapps-nas.aws.appang.kr/icon/2016092...,"[앱설명]<br />[설치형] 셀카, 얼굴인식 스티커, 꿀잼 카메라<br /><br...",받기,...,2016-12-29 00:00:00,9999-12-31 23:59:59,0,100,2,160,160,987300,NONE,2017-01-02 12:51:34
3,304,1RfxHJcuEe,6,10,2,1,서머너즈 워: 천공의 아레나,http://nextapps-nas.aws.appang.kr/icon/EecpHzP...,[앱설명]<br />[실행형] 전 세계 5000만 소환사들을 사로잡은 명작 RPG<...,최초 오픈,...,2017-01-01 00:00:00,9999-12-31 23:59:59,0,100,2,230,210,931000,NONE,2017-03-30 17:05:23
4,306,l3GyilYEVk,6,10,1,1,하이마트,http://nextapps-nas.aws.appang.kr/icon/2016032...,새롭게 단장한 하이마트 쇼핑몰 앱!,받기,...,2017-01-01 00:00:00,2019-11-01 00:00:00,0,100,2,170,140,1155100,NONE,2017-06-12 15:24:46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445255,446914,qy1HUatIL9,85,1631,3,4,한식뷔페&카페 봄날,https://i.imgur.com/XdcGFx7.jpeg,"1. 가이드라인 대로 진행하지 않으시면, 매크로로 인식하여 추후 문제가 발생할 수...",정답입력,...,2025-08-25 11:08:15,9999-12-31 23:59:59,0,100,7,16,10,1369920,ADS_CODE_DAILY_UPDATE,2025-08-25 11:08:15
445256,446915,UKnGFJUnmg,84,1623,3,4,용융소금,https://adwize.co.kr/common_img/reward/GNSQ_IC...,[참여방법]\n- 미션 참여 방법을 꼭 확인하신 후 참여해 주세요.\n- 제공되는 ...,퀴즈 맞추기,...,2025-08-25 11:12:12,9999-12-31 23:59:59,0,100,7,15,10,1369919,NONE,2025-08-25 11:12:12
445257,446916,g3lUg3nuxR,86,1655,3,4,1kg 원두커피 블렌드 7종 맛있는 당일로스팅 고소한 납품 도매 납품 홀빈 180244,https://cashplan-r2.uk/quiz-mission.png,[참여방법]\n1. 본문에서 퀴즈와 상품/장소 확인\n1-1. 퀴즈: 7번째 태그 ...,네이버스마트스토어 퀴즈,...,2025-08-25 11:12:16,9999-12-31 23:59:59,0,100,7,18,12,1369918,ADS_CODE_DAILY_UPDATE,2025-08-25 11:12:16
445258,446917,0GLHsRVZNv,57,84,3,8,[가입하기] 슈퍼티비,https://s3.ap-northeast-2.amazonaws.com/com.ga...,1. 회원가입하기 \n**성인인증 받을필요는 없음❌\n\n2. 왼쪽상단 메뉴바 클릭...,참여,...,2025-08-25 11:00:00,2030-12-27 00:00:00,0,100,7,300,225,2764400,NONE,2025-08-25 11:18:35


In [14]:
# ads_idx of the ads whose start date is the all-zero string '0000-00-00 00:00:00'
print(ad_list.loc[ad_list['ads_sdate'] == '0000-00-00 00:00:00', 'ads_idx']) # 227 rows
# Rows where both the start and the end date are the all-zero string
ad_list[(ad_list['ads_sdate'] == '0000-00-00 00:00:00') & (ad_list['ads_edate'] == '0000-00-00 00:00:00')] # also 227 rows, the same count

67          1306
81          1341
123         1448
124         1450
125         1452
           ...  
370994    372653
410717    412376
438476    440135
442294    443953
442588    444247
Name: ads_idx, Length: 227, dtype: int64


Unnamed: 0,ads_idx,ads_code,aff_idx,adv_idx,ads_type,ads_category,ads_name,ads_icon_img,ads_summary,ads_save_way,...,ads_sdate,ads_edate,ads_age_min,ads_age_max,ads_os_type,ads_contract_price,ads_reward_price,ads_order,ads_rejoin_type,regdate
67,1306,lLU3dyqi3P,15,62,2,1,파이널 드래곤 나이츠 실행형_aos,https://lh3.googleusercontent.com/KybE5bKufVNQ...,3D 초대형 판타지 모바일 게임의 레볼루션! 파이널 드래곤 나이츠! \n이건 ARP...,최초 오픈,...,0000-00-00 00:00:00,0000-00-00 00:00:00,0,100,2,260,210,933000,NONE,2017-04-06 14:13:53
81,1341,rOAourUQgw,1,109,3,2,프로야구 H2 사전예약,https://lh3.googleusercontent.com/NW3yM9glVbFq...,지금 사전 예약해주세요~! \n선수뽑기 카드와 두둑한 지원금을 드립니다!!\n지금 ...,사전예약 완료,...,0000-00-00 00:00:00,0000-00-00 00:00:00,0,100,7,1500,1200,659100,NONE,2017-03-24 10:29:58
123,1448,oaHWT18w24,30,80,2,1,여명 for kakao,https://cdn-ad-static.buzzvil.com/uploads/1489...,"4월 26일 대규모 업데이트!\n- 신규 직업 \""대검사\"" 추가\n- 캐릭터 최고...",최초 오픈,...,0000-00-00 00:00:00,0000-00-00 00:00:00,0,100,2,200,180,933400,NONE,2017-05-11 13:10:24
124,1450,bY9l99IGOw,30,80,1,1,몬스터,https://d3aulf22blzf9p.cloudfront.net/uploads/...,▶한 번 시작하면 멈출 수 없는 액션\n- 고퀄리티 그래픽과 화려한 이펙트의 극한 ...,받기,...,0000-00-00 00:00:00,0000-00-00 00:00:00,0,100,2,160,150,996800,NONE,2017-05-11 13:13:47
125,1452,5GgMwttXai,30,80,2,1,다음-Daum,https://buzzvil.akamaized.net/adfit.image/uplo...,[서비스 접근 권한 안내]\n\n*다음앱에서 요청하는 모든 권한은 선택적 접근 권한...,최초 오픈,...,0000-00-00 00:00:00,0000-00-00 00:00:00,0,100,2,200,180,933500,NONE,2017-05-11 13:33:33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370994,372653,KfzMhzCv4W,1,557,2,2,삼국지 올스타,https://i.imgur.com/a2hE0UL.jpeg,[참여 방법]\n1. 광고 참여 버튼 터치하여 마켓으로 이동\n2. App 최초 오...,최초 오픈,...,0000-00-00 00:00:00,0000-00-00 00:00:00,0,100,1,280,180,2748100,NONE,2025-05-30 13:10:07
410717,412376,gL1vFwGwZR,1,1660,3,8,나두기업몰,https://i.imgur.com/grRbX8G.jpeg,일반인 누구나 회원가입 가능!\n\n[참여 방법]\n1. 광고 참여 버튼 터치하여 ...,참여,...,0000-00-00 00:00:00,0000-00-00 00:00:00,0,100,2,1200,800,2752600,NONE,2025-07-02 13:34:23
438476,440135,MEnre3naZI,1,1662,2,1,보스포커,https://i.imgur.com/V9xckPU.png,[참여 방법]\n1. 광고 참여 버튼 터치하여 마켓으로 이동\n2. App 최초 오...,최초 오픈,...,0000-00-00 00:00:00,0000-00-00 00:00:00,0,100,1,280,180,2756700,NONE,2025-08-04 17:46:45
442294,443953,HiAARvqMP1,1,1670,2,1,신트노트,https://i.imgur.com/AYHFLoW.png,[참여 방법]\n1. 광고 참여 버튼 터치하여 마켓으로 이동\n2. App 최초 오...,최초 오픈,...,0000-00-00 00:00:00,0000-00-00 00:00:00,0,100,2,10,10,2761400,NONE,2025-08-13 12:43:15


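광고 시작·종료 날짜(`ads_sdate`, `ads_edate`)에는 `0000-00-00 00:00:00`, `9999-12-31 23:59:59` 같은 값이 섞여 있어 날짜 타입으로 변환하기 어려우므로 그대로 두고, 등록일자(`regdate`)만 날짜 타입으로 변환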

In [None]:
# # ad_list 저장
# ad_list.to_csv('수정2_광고목록.csv', index = False, encoding='utf-8')

In [None]:
# Entry point when resuming from the saved, preprocessed ad list: start running from this cell.
# parse_dates restores regdate as a datetime column; a plain read_csv would load it back
# as strings, unlike the frame produced by the preprocessing cell.
ad_list = pd.read_csv('수정2_광고목록.csv', parse_dates=['regdate'])

### 도메인 붙이기

1. 위의 광고목록과 광고 참여 정보에 있는 광고 아이디만을 돌려서 광고의 도메인을 분류해서 붙임

2. 위의 광고목록과 1년치 시간별 데이터에 있는 광고 아이디만을 돌려서 광고의 도메인을 분류해서 붙임

In [None]:
# Google Bigquery에 올리기
# # row_id 컬럼 추가
# df = ad_list.head(200).reset_index().rename(columns={"index": "row_id"})

# # 업로드
# to_gbq(
#     df,
#     destination_table="test_ad_list.test",  # dataset.table 형식
#     project_id="sodium-primer-471002-n4",   # gcb 프로젝트 id 
#     if_exists="replace"  # append 도 가능
# )

In [None]:
# 광고참여정보에 기록이 있는 광고 목록

# ads = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/수정2_광고목록.csv")
# participate = pd.read_csv("/Users/Jiwon/Documents/GitHub/final_project/Jiwon/수정_광고참여정보.csv")

# ads_participate = ads.merge(participate, on='ads_idx', how='left')
# ads_participate = ads_participate[ads_participate['mda_idx'].notna()]
# ads_participate_list = ads_participate.drop_duplicates(subset=['ads_idx'], keep='first')
# df = ads_participate_list.reset_index().rename(columns={"index": "row_id"})

In [None]:
# Ads that appear only in the participation & reward tables — load the CSV whose domains were classified on GCP
path_sh = "/Users/Jiwon/Documents/GitHub/final_project/Jiwon/광고도메인.csv"
ads_list = pd.read_csv(path_sh)
ads_list.head()

Unnamed: 0,ads_idx,ads_code,aff_idx,adv_idx,ads_type,ads_category,ads_name,ads_icon_img,ads_summary,ads_save_way,...,ads_edate,ads_age_min,ads_age_max,ads_os_type,ads_contract_price,ads_reward_price,ads_order,ads_rejoin_type,regdate,domain
0,6508,3u4lOKJAek,13,335,3,10,스마트피싱보호,https://seimg.pincrux.com/icon/1551770735.png,"[상품소개]\n알면 피하고, 모르면 당하는 보이스피싱\n나와 소중한 가족을 안전하게...",서비스 가입,...,9999-12-31 00:00:00,0,100,7,1400,1100,2381500,NONE,2020-04-23 17:53:01 UTC,금융\n
1,6985,SijDrtH4GV,13,335,3,7,DB손해보험 다이렉트,https://seimg.pincrux.com/icon/1575337232.jpg,[참여방법]\n- 빠르고 간편한 DB손해보험 다이렉트에서\n- 내 차 보험료를 지금...,내차 보험료 조회,...,9999-06-06 00:00:00,0,100,7,2700,2300,1201938,NONE,2020-06-30 15:22:59 UTC,금융\n
2,8327,k1RZftVHoW,13,335,3,10,파일썬,https://static.i-screen.kr/img/ad/icon/aff_13_...,[참여 방법]\n1. 광고 참여 버튼 터치하여 이벤트 페이지로 이동\n2. 이벤트 ...,첫 결제,...,9999-12-31 23:59:00,0,100,7,3200,2420,1673458,NONE,2020-11-10 11:21:43 UTC,미디어/컨텐츠\n
3,9264,EGZcaVOtlc,23,220,3,10,하나 가득담은 운전자보험가입,https://i.imgur.com/kaWiYyT.png,하나손해보험 다이렉트 운전자보험 가입시\n\n01.참여방법\n1) 계산하기 페이지를...,보험 가입 완료,...,9999-12-31 23:59:59,0,100,7,11000,8250,2372800,NONE,2020-11-30 15:16:45 UTC,금융\n
4,9716,NwZZq45Kgb,6,10,3,8,베이비러브 참여 신청,https://nextapps-nas.aws.appang.kr/icon/SDl2uC...,[참여방법]\n- 페이지 이동 > 베이버러브 참여신청 완료\n- 25-55세 SKT...,참여신청 (25세 ~ 55세),...,2029-11-29 00:00:00,25,55,2,600,480,2699900,NONE,2021-01-11 16:11:27 UTC,생활\n


#### 1년치 광고 목록 - GCP에서 도메인 분류해 본 테이블 불러오기

In [None]:
# Build a BigQuery client from the local service-account key file
key_path = "/Users/t2023-m0052/Desktop/kids/sodium-primer-471002-n4-82c44c68a51f.json"
credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(
    credentials=credentials,
    project=credentials.project_id,
)

# Fetch the whole table that holds the domain classification results
query = """
SELECT *
FROM `sodium-primer-471002-n4.test_ad_list.plus_domain1`
"""

# Run the query and materialise the result as a DataFrame
ads_list = client.query(query).to_dataframe()
print(ads_list.head())

# Drop the text-generation bookkeeping columns and expose the model answer as `domain`
ads_list = (
    ads_list
    .drop(columns=['prompt', 'result', 'ml_generate_text_rai_result', 'ml_generate_text_status'])
    .rename(columns={'ml_generate_text_llm_result': 'domain'})
)

In [11]:
# domain 도메인명만 나오게 정제

# The closed set of domain labels; anything else collapses to "기타".
ALLOWED = [
    "게임",
    "뷰티",
    "커머스",
    "의료/건강",
    "금융",
    "미디어/컨텐츠",
    "식음료",
    "생활",
    "채용",
    "운세",
    "식당/카페",
    "운동/스포츠",
    "교육",
    "비영리/공공",
    "기타",
]

# (regex, label) pairs, tried in this order; the first regex that matches wins.
PATTERNS = [
    (r'\b게임\b', "게임"),
    (r'\b뷰티\b', "뷰티"),
    (r'\b커머스\b', "커머스"),
    (r'의료\s*/?\s*건강', "의료/건강"),
    (r'\b금융\b', "금융"),
    (r'미디어\s*/?\s*[컨콘]텐츠|미디어콘텐츠', "미디어/컨텐츠"),
    (r'\b식음료\b', "식음료"),
    (r'\b생활\b', "생활"),
    (r'\b채용\b', "채용"),
    (r'\b운세\b', "운세"),
    (r'식당\s*/?\s*카페|식당카페', "식당/카페"),
    (r'운동\s*/?\s*스포츠|운동스포츠', "운동/스포츠"),
    (r'\b교육\b', "교육"),
    (r'비영리\s*/?\s*공공', "비영리/공공"),
    (r'\b기타\b', "기타"),
]


def normalize_domain(text: str) -> str:
    """Reduce a raw domain answer to exactly one label from ``ALLOWED``.

    Processing order:
      1. Missing values map to "기타".
      2. If the text contains a ``"domain": "..."`` JSON field, only that
         field's value is considered.
      3. Real and literal (backslash-n) line breaks become spaces, and the
         formatting characters `` ` `` ``"`` ``*`` ``#`` are removed.
      4. The first entry of ``PATTERNS`` whose regex matches decides the label.
      5. Otherwise the first ``ALLOWED`` label contained as a substring wins.
      6. If nothing matches, "기타" is returned.
    """
    if pd.isna(text):
        return "기타"

    raw = str(text)

    # Prefer the value of a JSON-style "domain" field when one is present.
    json_hit = re.search(r'"domain"\s*:\s*"([^"]+)"', raw)
    candidate = json_hit.group(1) if json_hit else raw

    # Collapse line breaks, then strip formatting characters and outer whitespace.
    candidate = re.sub(r'(\\n|[\r\n])+', ' ', candidate)
    candidate = re.sub(r'[`"*#]+', '', candidate).strip()

    # Regex pass: tolerant of spacing / slash / spelling variants.
    by_pattern = next(
        (label for regex, label in PATTERNS if re.search(regex, candidate)),
        None,
    )
    if by_pattern is not None:
        return by_pattern

    # Substring pass: catches labels glued to surrounding text (e.g. "게임입니다").
    return next((label for label in ALLOWED if label in candidate), "기타")

# Usage: overwrite the raw `domain` column with its standardised label
ads_list["domain"] = ads_list["domain"].apply(normalize_domain)

# 광고적립

In [4]:
# Path to the ad-reward (광고적립) CSV
path_sh = '/Users/Jiwon/Documents/GitHub/final_project/data/IVE_광고적립.csv'
rewards = pd.read_csv(path_sh)
rewards.head()

Unnamed: 0,rwd_idx,ads_idx,ads_code,mda_idx,pub_sub_rel_id,dvc_idx,advid,click_key,show_cost,adv_cost,rwd_cost,earn_cost,click_date,regdate,ctit
0,238304818,413490,CFZ6lzEmeD,22,27496512,32947806,09b6a808-4f76-4584-87c5-626f21f472a0,b0e22dba943810794d25c993dff425c4e16d0744,190,190,120,120,2025-07-26 00:00:31,2025-07-26 00:05:02,271
1,238304829,413490,CFZ6lzEmeD,22,27496512,47387244,ba4fd0c8-4f06-4dfa-9c3f-c6b39ee7066e,0679ff4c6a14beba296e44d7ffdd88129f42ffbf,190,190,120,120,2025-07-26 00:00:32,2025-07-26 00:08:17,465
2,238304816,413490,CFZ6lzEmeD,22,27496512,32425124,64c675d0-8edd-4478-9906-a806651fdf92,68eb28d636724061da21e490a1afff6a39cd78f7,190,190,120,120,2025-07-26 00:00:42,2025-07-26 00:04:43,241
3,238304817,413490,CFZ6lzEmeD,22,27496513,34006545,a08ebb04-e70a-4492-89d4-17fa96bf8c75,37da42f12b6f732d8c8bf1a609c01e569b7f8a09,190,190,120,120,2025-07-26 00:00:54,2025-07-26 00:04:45,231
4,238304802,133653,rl8ud4uNQC,801,1,56867331,dbe54f40-41b5-4423-ae2f-048fd6ecf5af,624b1bbc57c9ea81005c74e1956d48e76d974816,260,260,97,195,2025-07-26 00:01:07,2025-07-26 00:02:04,57


In [6]:
# Show floats in fixed-point notation (six decimals) instead of scientific notation
pd.set_option('display.float_format', '{:f}'.format)

In [7]:
# Quick profile of the rewards table: cardinality, missing values, schema, statistics.
print("===== 고유값 개수 =====")
print (rewards.nunique())
print()

print("===== null 값 개수 =====")
# sep="\n" puts each Series on its own lines, so its first row is not pushed
# out of alignment by the label.
print("null 개수 : ", rewards.isnull().sum(), sep="\n")
print("null 비중 : ", rewards.isnull().mean(), sep="\n")
print("\n 광고 식별자만 null 값 가짐\n")

print("===== info =====")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() added a stray "None" line.
rewards.info()
print()

print("===== describe =====")
print(rewards.describe())

===== 고유값 개수 =====
rwd_idx           1475031
ads_idx              8379
ads_code            18701
mda_idx               174
pub_sub_rel_id       4062
dvc_idx            642059
advid              642164
click_key         1475031
show_cost             178
adv_cost              177
rwd_cost              343
earn_cost             282
click_date         881394
regdate            871979
ctit                18551
dtype: int64

===== null 값 개수 =====
null 개수 :  rwd_idx              0
ads_idx              0
ads_code             0
mda_idx              0
pub_sub_rel_id       0
dvc_idx              0
advid             7171
click_key            0
show_cost            0
adv_cost             0
rwd_cost             0
earn_cost            0
click_date           0
regdate              0
ctit                 0
dtype: int64
null 비중 :  rwd_idx          0.000000
ads_idx          0.000000
ads_code         0.000000
mda_idx          0.000000
pub_sub_rel_id   0.000000
dvc_idx          0.000000
advid            0.

In [8]:
# Drop show_cost and rwd_cost so that, of the cost columns, only adv_cost (ad spend)
# and earn_cost (publisher revenue) remain, then convert both timestamp columns to datetime.
rewards = (
    rewards
    .drop(columns=['show_cost', 'rwd_cost'])
    .assign(
        click_date=lambda frame: pd.to_datetime(frame['click_date']),
        regdate=lambda frame: pd.to_datetime(frame['regdate']),
    )
)

# 광고참여

In [16]:
# Path to the ad-participation (광고참여정보) CSV
path_sh = "/Users/t2023-m0052/Documents/GitHub/final_project/data/IVE_광고참여정보.csv"
ad_prpt = pd.read_csv(path_sh)
ad_prpt.head()

Unnamed: 0,click_key,ads_idx,dvc_idx,mda_idx,pub_sub_rel_id,adv_price,contract_price,media_price,reward_price,reward_point,click_day,click_time,click_date,exp_day,network,carrier,user_ip
0,000000d54b9faad47ee99d6cd3cf53894dd4baa5,313780,61906528,539,144350110,6000,6000,4500,4500,4500,2025-08-17,21,2025-08-17 21:07:37,2025-09-16,0,,16.184.28.219
1,000002b4d92f7648b455877c2676452efcd22a09,412426,34422806,58,46032732,180,180,170,170,170,2025-07-26,2,2025-07-26 02:18:24,2025-08-25,,,35.78.117.76
2,0000057e97361ff3d0263aaecee34cfaa3ba30fb,443660,38366075,808,1,170,170,120,60,60,2025-08-12,18,2025-08-12 18:17:59,2025-09-11,WIFI,SKTelecom,39.7.55.192
3,00000607f60139015da3ee1dd5499db3faa100dc,360192,61894110,539,144350110,6000,6000,4500,4500,4500,2025-08-17,3,2025-08-17 03:35:02,2025-09-16,0,,211.252.100.157
4,0000066bc25d4a6d147c27326cf972a4de88024e,372307,61956954,539,144350110,15600,15600,11700,11700,11700,2025-08-18,8,2025-08-18 08:01:26,2025-09-17,0,,3.38.148.211


In [19]:
# Column dtypes and memory footprint of the participation data.
# NOTE(review): the saved output lists the post-preprocessing columns and datetime dtypes,
# so this cell appears to have been run after the preprocessing cell below
# (execution count 19 vs 17) — confirm the intended order.
ad_prpt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16854865 entries, 0 to 16854864
Data columns (total 13 columns):
 #   Column          Dtype         
---  ------          -----         
 0   click_key       object        
 1   ads_idx         int64         
 2   dvc_idx         int64         
 3   mda_idx         int64         
 4   pub_sub_rel_id  int64         
 5   contract_price  int64         
 6   media_price     int64         
 7   click_day       datetime64[ns]
 8   click_time      int64         
 9   click_date      datetime64[ns]
 10  exp_day         datetime64[ns]
 11  network         object        
 12  user_ip         object        
dtypes: datetime64[ns](3), int64(7), object(3)
memory usage: 1.6+ GB


## 전처리

In [17]:
# Remove the columns that are not needed
ad_prpt = ad_prpt.drop(columns=['adv_price', 'reward_price', 'reward_point', 'carrier'])
# Parse the three date/time columns into datetimes, column by column
ad_prpt[['click_day', 'click_date', 'exp_day']] = (
    ad_prpt[['click_day', 'click_date', 'exp_day']].apply(pd.to_datetime)
)
ad_prpt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16854865 entries, 0 to 16854864
Data columns (total 13 columns):
 #   Column          Dtype         
---  ------          -----         
 0   click_key       object        
 1   ads_idx         int64         
 2   dvc_idx         int64         
 3   mda_idx         int64         
 4   pub_sub_rel_id  int64         
 5   contract_price  int64         
 6   media_price     int64         
 7   click_day       datetime64[ns]
 8   click_time      int64         
 9   click_date      datetime64[ns]
 10  exp_day         datetime64[ns]
 11  network         object        
 12  user_ip         object        
dtypes: datetime64[ns](3), int64(7), object(3)
memory usage: 1.6+ GB


In [18]:
# Normalise the network column: first list its distinct raw values
print(ad_prpt['network'].unique())
# The numeric codes '2' and '4' are mapped to their network names below
def change_network(x):
    """Translate the network codes '2' and '4' to 'MOBILE' and 'WIFI'.

    Any other value (including missing values) is returned unchanged.
    """
    for code, label in (('2', 'MOBILE'), ('4', 'WIFI')):
        if x == code:
            return label
    return x
    
# Apply the code-to-name mapping to every row of the network column
ad_prpt['network'] = ad_prpt['network'].apply(change_network)

['0' nan 'WIFI' 'MOBILE' 'LTE' '2' 'ERROR' '4']


## 적립과 조인

In [None]:
# 지원님 - GCP에서 붙이심!

In [None]:
# Load the participation–reward join result (참여적립조인.csv)
part_earn = pd.read_csv('/Users/t2023-m0052/Documents/GitHub/final_project/data/참여적립조인.csv')
# Drop the first column — presumably an index column written when the CSV was saved; TODO confirm
part_earn = part_earn.iloc[:, 1:]
part_earn

# 시간별 광고 적립

In [None]:
# 광고참여정보 파일 경로
path_sh = ''