In [16]:
from skt.gcp import (
    PROJECT_ID,
    bq_insert_overwrite,
    bq_to_df,
    bq_to_pandas,
    get_bigquery_client,
    bq_table_exists,
    get_max_part,
    load_query_result_to_table,
    pandas_to_bq,
    pandas_to_bq_table,
    load_bigquery_ipython_magic,
    get_bigquery_client,
    _print_query_job_results,
    load_query_result_to_partitions,
    df_to_bq_table
    
)

from skt.ye import (
    get_hdfs_conn,
    get_spark,
    hive_execute,
    hive_to_pandas,
    pandas_to_parquet,
    slack_send,
    get_secrets
)

In [17]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    row_number, 
    col, 
    lit, 
    count, 
    log, 
    exp, 
    sum as spark_sum
)
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, DateType

In [18]:
# Parameters
current_dt = "2024-06-21"
state = "stg"
log_duration = "30"
input_nb_url = (
    "gs://nes_notebooks_seoul_28d/2024-07-01/5a62dcf9-bad4-4c51-84ee-3505ece219ea.ipynb"
)
output_nb_url = "https://aim.sktai.io/nes/jobs/202407010726-24ddd1b0-5c4c-44e8-8804-9a4aed16db99/logs"

In [19]:
long_start_dt = '2024-05-23'
short_start_dt = '2024-06-15'

In [20]:
xdr_valid_cat_list = {
    "business": [],
    "communication": ["community", "sns", "dating"],
    "education":["education_info"],
    "entertainment": ["movie, svod", "music", "bideo_broadcasting", "radio"],
    "finance": ["stock", "cryptoCurrency", "assetmanagement"],
    "game": "all",
    "information": ["news", "blog"],
    "leisure": ["travel", "leisure", "sports", "travel", "hotel"],
    "location": ["map_navigation", "ride_request", "transport", "kids"],
    "shopping": ["fresh_delivery", "overseas direct purchase", "itshopping", "openmarket", "homeshopping", "used", "socialcommerce"],
    "life":["weather", "food", "kids", "health", "carlife", "fashion_beauty"],
    "utility": ["sktservice"],
}
 
xdr_where_clauses = []
for key, vals in xdr_valid_cat_list.items():
    if isinstance(vals, str):
        if vals == "all":
            xdr_where_clauses.append(
                f"TRIM(LOWER(cat2)) = '{key.lower()}'"
            )
            xdr_where_clauses.append("OR")
        else:
            continue
    elif isinstance(vals, list):
        if vals:
            item_list_str = ', '.join(f'"{item.lower()}"' for item in vals)
            xdr_where_clauses.append(
                f"(TRIM(LOWER(cat2)) = '{key.lower()}' AND TRIM(LOWER(cat1)) in ({item_list_str}))"
            )
            xdr_where_clauses.append("OR")
    else:
        raise TypeError(f'should be list | str type')
    
xdr_where_clause = ' '.join(xdr_where_clauses[:-1])

In [42]:
cat2_set = {
            "information":"정보", 
            "life":"생활", 
            "location":"위치/이동", 
            "game":"게임", 
            "shopping":"쇼핑",
            "communication":"커뮤니케이션", 
            "finance":"금융", 
            "utility":"유틸리티", 
            "entertainment": "엔터테인먼트",
            "education":"교육", 
            "business":"회사/사업", 
            "leisure":"스포츠/레져/여행"
        }

In [48]:
cat1_set = {
    "references": "지식/위키",
    "fashion_beauty": "스타일/패션/뷰티",
    "art_design": "디자인",
    "location_etc": "location_etc",
    "action": "액션게임",
    "openmarket": "온라인 쇼핑몰",
    "voip": "화상 채팅/회의",
    "stock": "주식투자",
    "blog": "블로그",
    "casual": "캐주얼 게임",
    "bank_card": "은행",
    "webservice": "웹서비스",
    "information_etc": "기타 정보 커뮤니티",
    "payment": "결제",
    "entertainment_etc": "entertainment_etc",
    "cartoon": "만화",
    "sns": "소셜미디어/SNS",
    "religion": "종교",
    "carlife": "자동차/중고차/렌트",
    "government": "정부운영",
    "job": "구직",
    "weather": "날씨/미세먼지",
    "sktservice": "SKT서비스",
    "strategy": "전략게임",
    "sharing": "공유 전기 자전거/킥보드",
    "operation/management": "출퇴근 관리",
    "itshopping": "IT/ 전자기기 쇼핑몰",
    "puzzle_quiz": "퍼즐/퀴즈 게임",
    "messenger": "메신저",
    "travel": "여행",
    "mall_mart_dfs": "대형 쇼핑몰 앱",
    "b2b_solution": "전자상거래 비즈니스/마케팅 제휴",
    "life_etc": "life_etc",
    "lifestyle": "라이프스타일",
    "simulation": "시뮬레이션 게임",
    "food": "맛집/음식점, 배달",
    "hotel": "호텔/숙박",
    "rhythmgame": "리듬게임",
    "move/interior": "이사 및 인테리어",
    "education_info": "학생/아이 케어 및 관리",
    "delivery_cargo": "물류/택배",
    "productivity_tools": "공유 협업",
    "mnd": "군인",
    "ride_request": "공유차(택시앱)",
    "vr/ar": "vr/ar",
    "leisure": "취미 활동",
    "dating": "데이팅",
    "assetmanagement": "자산 관리/투자",
    "racing": "레이싱게임",
    "communication_etc": "communication_etc",
    "telco": "통신사",
    "cloudgaming": "클라우드 게임",
    "portal": "포탈앱",
    "business_etc": "회사 관련",
    "email": "메일",
    "aiservice": "AI서비스",
    "overseas direct purchase": "해외직구",
    "used": "중고거래",
    "movie": "영화관",
    "news": "온라인 뉴스",
    "coupons_discounts": "쿠폰/포인트앱",
    "video_broadcasting": "영상 플랫폼",
    "sportsgame": "스포츠게임",
    "book": "온라인 서점/e 북",
    "socialcommerce": "소셜커머스",
    "culture and arts": "문화예술",
    "board": "보드게임",
    "transport": "운송(공유차/비행기/버스)",
    "smartdevice": "홈 IOT",
    "map_navigation": "지도/네비",
    "game_etc": "game_etc",
    "homeshopping": "홈쇼핑",
    "p2p_webhard": "P2P_Webhard",
    "photo": "사진",
    "shopping_etc": "Shopping_etc",
    "security": "보안",
    "pcs": "PC 쇼핑몰",
    "english": "영어교육",
    "fresh_delivery": "신선 식품/식재료 전문 스토어",
    "finance_etc": "finance_etc",
    "utility_etc": "utility_etc",
    "radio": "라디오",
    "music": "음악",
    "education_etc": "education_etc",
    "insurance": "보험",
    "roleplaying": "롤플레잉게임",
    "kids": "유아/아기",
    "school": "대학교 홈페이지",
    "onlinepayment": "온라인 결제",
    "health": "헬스/비대면진료",
    "svod": "영상 OTT",
    "card": "카드게임",
    "companion animal": "동물 캠페인",
    "realestates": "부동산",
    "cryptocurrency": "암호화폐",
    "sports": "스포츠",
    "adult": "성인관련",
    "humor": "유머/코믹"
}

In [49]:
import json
js_cat2_set = json.dumps(cat2_set, ensure_ascii=False)

In [50]:
js_cat1_set = json.dumps(cat1_set, ensure_ascii=False)

In [25]:
# query = f"""
# CREATE TEMP FUNCTION xdr_cat2_to_text(xdr_cat2 STRING)
# RETURNS STRING
# LANGUAGE js AS '''
#   var cat2_set = {js_cat2_set};
  
#   return (xdr_cat2 in cat2_set) ? cat2_set[xdr_cat2] : '기타';
# ''';
# """

In [26]:
# query = f"""
# CREATE TEMP FUNCTION xdr_cat1_to_text(xdr_cat2 STRING)
# RETURNS STRING
# LANGUAGE js AS '''
#   var cat1_set = {js_cat1_set};
  
#   if (xdr_cat1 in cat1_set) {{
#     if (xdr_cat1 === "community") {{
#       if (xdr_cat2 === "회사/사업") {{
#         return '커뮤니티/게시판';
#       }} else {{
#         return '대학교 관련 정보';
#       }}
#     }} else {{
#       return cat1_set[xdr_cat1];
#     }}
#   }} else {{
#     return "기타";
#   }}
# ''';
# """

In [55]:
query = f"""
CREATE TEMP FUNCTION xdr_cat2_to_text(xdr_cat2 STRING)
RETURNS STRING
LANGUAGE js AS '''
  var cat2_set = {js_cat2_set};
  
  return (xdr_cat2 in cat2_set) ? cat2_set[xdr_cat2] : '기타';
''';

CREATE TEMP FUNCTION xdr_cat1_to_text(xdr_cat1 STRING, xdr_cat2 STRING)
RETURNS STRING
LANGUAGE js AS '''
  var cat1_set = {js_cat1_set};
  
  if (xdr_cat1 in cat1_set) {{
    if (xdr_cat1 === "community") {{
      if (xdr_cat2 === "회사/사업") {{
        return '커뮤니티/게시판';
      }} else {{
        return '대학교 관련 정보';
      }}
    }} else {{
      return cat1_set[xdr_cat1];
    }}
  }} else {{
    return "기타";
  }}
''';


WITH data_sum AS (
  SELECT
    svc_mgmt_num,
    luna_id,
    cat1,
    SUM(cat1_cnt) AS cat1_cnt_sum
  FROM adot_reco_dev.xdr_cat1_cnt
  WHERE dt >= '{long_start_dt}'
  AND ( {xdr_where_clause} )
  GROUP BY svc_mgmt_num, luna_id, cat1
),
count_by_group AS (
  SELECT
    cat1,
    COUNT(*) AS count
  FROM data_sum
  GROUP BY cat1
),
df_with_count AS (
  SELECT
    a.*,
    b.count
  FROM data_sum a
  LEFT JOIN count_by_group b
  ON a.cat1 = b.cat1
),

df_with_cumsum AS (
  SELECT
    *,
    SUM(1) OVER (PARTITION BY cat1 ORDER BY cat1_cnt_sum ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_sum
  FROM df_with_count
),

df_with_ecdf AS (
  SELECT
    svc_mgmt_num,
    luna_id,
    cat1,
    cat1_cnt_sum,
    cum_sum / count AS ecdf
  FROM df_with_cumsum
),

data_dt_cnt AS (
  SELECT
    luna_id,
    cat1,
    COUNT(DISTINCT dt) AS luna_cat1_cnt
  FROM adot_reco_dev.xdr_cat1_cnt
  WHERE dt >= '{long_start_dt}'
  AND ( {xdr_where_clause} )
  GROUP BY luna_id, cat1
),

data_dt_cnt_with_weights AS (
  SELECT
    *,
    1 / LOG(60 / luna_cat1_cnt + 1.0e-8) AS df_weight,
    1.0 / (1.0 + EXP(-1 / LOG(60 / luna_cat1_cnt + 1.0e-8))) AS rev_df_weight
  FROM data_dt_cnt
),

merge_data AS (
  SELECT
    a.luna_id,
    a.cat1,
    a.ecdf,
    b.rev_df_weight,
    a.ecdf * 0.6 + b.rev_df_weight * 0.4 AS score
  FROM df_with_ecdf a
  LEFT JOIN data_dt_cnt_with_weights b
  ON a.luna_id = b.luna_id AND a.cat1 = b.cat1
),

data_cat1_distinct AS (
  SELECT DISTINCT
    cat1,
    cat2
 FROM adot_reco_dev.xdr_cat1_cnt
  WHERE dt >= '{long_start_dt}'
  AND ( {xdr_where_clause} )
),

merge_data_with_cat2 AS (
  SELECT
    a.*,
    b.cat2
  FROM merge_data a
  LEFT JOIN data_cat1_distinct b
  ON a.cat1 = b.cat1
),

ranked_data AS (
  SELECT
    *,
    ROW_NUMBER() OVER (PARTITION BY luna_id ORDER BY score DESC) AS rank
  FROM merge_data_with_cat2
),

filtered_data AS (
  SELECT
    luna_id,
    STRING_AGG(DISTINCT xdr_cat1_to_text(cat1, cat2)) AS cat1_profiles,
    STRING_AGG(DISTINCT xdr_cat2_to_text(cat2)) AS cat2_profiles
  FROM ranked_data
  WHERE rank <= 10 AND score >= 0.6
  GROUP BY luna_id
)
SELECT  luna_id,
        cat1_profiles,
        cat2_profiles
        
FROM filtered_data
"""

In [54]:
df  = bq_to_pandas(query)

BadRequest: 400 SyntaxError: Unexpected token 'else' at xdr_cat2_to_text(STRING) line 5, columns 4-8

Location: asia-northeast3
Job ID: 0ee44c70-82fb-4cec-a1a4-c537caf0b691


In [47]:
df

Unnamed: 0,luna_id,cat1_profiles,cat2_profiles
0,APL00000D2EZMVEI991C,"음악,소셜미디어/SNS,지도/네비","엔터테인먼트,위치/이동,커뮤니케이션"
1,APL00000DDBFRUDGH3I8,"맛집/음식점, 배달,소셜미디어/SNS,음악,주식투자","금융,커뮤니케이션,생활,엔터테인먼트"
2,APL00000CYQMU5QIZI0W,"학생/아이 케어 및 관리,SKT서비스,소셜미디어/SNS,온라인 뉴스,온라인 쇼핑몰,...","쇼핑,정보,교육,게임,유틸리티,위치/이동,커뮤니케이션,생활"
3,APL00000DBRVUG7Y9UDC,"소셜미디어/SNS,소셜커머스,운송(공유차/비행기/버스),음악,game_etc,주식투...","생활,정보,위치/이동,게임,쇼핑,금융,커뮤니케이션,엔터테인먼트"
4,APL00000DE7GACSU8IYO,"음악,여행","스포츠/레져/여행,엔터테인먼트"
...,...,...,...
3149891,APL00000C22LQY1TKPOG,"기타,지도/네비","커뮤니케이션,위치/이동"
3149892,APL00000C6L2S4FI2WOW,"운송(공유차/비행기/버스),기타","커뮤니케이션,위치/이동"
3149893,APL00000C3F7TKYUBGU8,"운송(공유차/비행기/버스),기타","커뮤니케이션,위치/이동"
3149894,APL00000D1N3K83WXN9C,"운송(공유차/비행기/버스),기타,지도/네비","커뮤니케이션,위치/이동"
