In [1]:
# Parameters
# DT: base date of the profile run as a YYYY-MM-DD string; the lookback thresholds are derived from it.
DT = "2024-06-11"   

## **One Model Profile 생성 작업**

- 개요
    - One Model Profile 생성 작업
- 내용
    - item, cat1(도메인), cat3(장르 등) level로 프로파일링 작업 수행
    - 단기 preference 정보
        - 7일 내 item 정보를 통해서 생성
    - 장기 preference 정보
        - 60일 내 cat1, cat3 정보를 통해서 생성
        - cat3는 음악 도메인의 경우 장르와 가수로 분리
- 가정
    - 제대로 하려면 서비스 이용 빈도가 아닌 체류 시간을 기준으로 해야 함 (음악은 한 시간 내 여러 번 클릭이 존재할 수 있으나, 영상은 1-2번 클릭에 그칠 수 있음)
    - 에이닷 사용자들은 여러 도메인을 모두 헤비 유저로서 이용하지는 않는다고 가정함 (검증 필요)
    - 위 가정을 깨려면 도메인 별 평균 체류 시간을 기반한 가중치 조절이 들어가야 함.

In [2]:
from skt.gcp import get_bigquery_client, bq_insert_overwrite, get_max_part, bq_to_df, bq_to_pandas, pandas_to_bq_table, load_query_result_to_table, df_to_bq_table
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql import functions as F
from datetime import datetime, date, timedelta
from skt.ye import get_spark
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# 프로파일 : adot

In [3]:
# Parse the base date once, then derive both lookback window starts from it:
# DT_threshold1 is 60 days before DT and DT_threshold2 is 7 days before DT.
date_format = "%Y-%m-%d"
current_date = datetime.strptime(DT, date_format)
DT_threshold1, DT_threshold2 = (
    (current_date - timedelta(days=lookback_days)).strftime(date_format)
    for lookback_days in (60, 7)
)

# Echo the run date and both thresholds.
for label, value in (
    ("DT : ", DT),
    ("DT_threshold : ", DT_threshold1),
    ("DT_threshold : ", DT_threshold2),
):
    print(label, value)

DT :  2024-06-11
DT_threshold :  2024-04-12
DT_threshold :  2024-06-04


In [4]:
# Select user events from the preprocessed table, starting at the 60-day threshold.
# luna_id is exposed as user_id and basis_date as dt in the result set.
# NOTE(review): the WHERE clause has no upper bound, so a re-run for an older DT would also
# read rows dated after DT — confirm whether an upper bound on DT should be added.
# NOTE(review): `dt` in WHERE presumably resolves to the source table's own dt column rather
# than the basis_date alias — verify that both carry the same date.
query = f"""
    SELECT luna_id as user_id, item, item_id, cat3, cat2, cat1, basis_date as dt
    FROM adot_reco.apollo_daily_preprocessed_prd
    where dt >= '{DT_threshold1}'
"""

In [5]:
# Run the query; later cells use the result through the Spark DataFrame API (withColumn, filter, show).
data = bq_to_df(query)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/12 15:56:10 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
24/06/12 15:56:32 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


In [6]:
# cat3 may hold two parts separated by a pipe (e.g. "genre|artist" for music rows).
# split() takes a regex, so the pipe is escaped; the raw string keeps the backslash
# from being treated as an (invalid) Python escape sequence.
data = data.withColumn("split_cat3", split(data["cat3"], r"\|"))
# Part 0 becomes cat3_genre and part 1 cat3_artist; values without a pipe get a null artist.
data = data.withColumn("cat3_genre", data["split_cat3"].getItem(0)) \
            .withColumn("cat3_artist", data["split_cat3"].getItem(1))

In [7]:
# Preview the first 10 rows, including the derived split_cat3 / cat3_genre / cat3_artist columns.
data.show(n=10)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+----+-------+----------------+----+------------------+----------+------------------+----------------+-----------+
|             user_id|item|item_id|            cat3|cat2|              cat1|        dt|        split_cat3|      cat3_genre|cat3_artist|
+--------------------+----+-------+----------------+----+------------------+----------+------------------+----------------+-----------+
|APL00000CQLOUASE27EO|null|   null|            null|    |      apollo_alarm|2024-06-05|              null|            null|       null|
|APL00000BUSE4V0A8Q2O|null|   null|TBN 대전교통방송|    |      apollo_radio|2024-06-05|[TBN 대전교통방송]|TBN 대전교통방송|       null|
|APL00000D1LFK6NYDYWW|null|   null|            null|    |      apollo_radio|2024-05-23|              null|            null|       null|
|APL00000BPV29QRHWXS0|null|   null|            null|    |      apollo_radio|2024-04-22|              null|            null|       null|
|APL00000CRHT4GPFEWOW|null|   null|            null|    |      apo

                                                                                

In [8]:
# Inspect the distinct cat3 values (show() prints the first 20).
data.select('cat3').distinct().show()



+-------------------------------------+
|                                 cat3|
+-------------------------------------+
|              해외 팝|아리아나 그란데|
|                        트로트|김범룡|
|                해외 팝|휘트니 휴스턴|
|                          가족,드라마|
|                            액션,전쟁|
|                   국내 발라드|김동희|
|                    해외 팝|원 디렉션|
|                    국내 락/메탈|헌서|
|                     국내 발라드|니브|
|                  해외 팝|런던 보이즈|
|                국내 팝/어쿠스틱|디어|
|                        트로트|오세욱|
|                            트로트|린|
|                     국내 힙합|오션검|
|                       OST/BGM|박강성|
|클래식|슬로바키아 필하모니 오케스트라|
|             해외 힙합|Dirty Hipho...|
|                 해외 일렉트로닉|토부|
|               국내 팝/어쿠스틱|샘 옥|
|                       OST/BGM|유주혜|
+-------------------------------------+
only showing top 20 rows



                                                                                

In [9]:
# Inspect the distinct cat2 values (show() prints the first 20, truncating long strings).
data.select('cat2').distinct().show()



+--------------------+
|                cat2|
+--------------------+
|      play.education|
|    general.run.menu|
|ask.weather.tempe...|
|phone.tagdetail.c...|
|radio.ask.radio.c...|
|music.playlistgen...|
|        routine.view|
|    routine.recoedit|
|music.play.music....|
|tmbr.event.detail...|
|          keep.intro|
|congestion.placed...|
|phone.contact.uns...|
|            tmbr.otb|
|           tmbr.0day|
|     request.reserve|
|music.playlistgen...|
|      general.repeat|
|music.show.music....|
|callbrief.detail....|
+--------------------+
only showing top 20 rows



                                                                                

In [10]:
# Inspect the distinct cat1 values (show() prints the first 20).
data.select('cat1').distinct().show()



+-------------------+
|               cat1|
+-------------------+
|      apollo_liveqa|
|    apollo_calendar|
|       apollo_photo|
|   apollo_character|
|apollo_samsungstock|
|   apollo_calendar2|
|apollo_dailycostume|
|        apollo_call|
|       apollo_quest|
|     apollo_fortune|
|        apollo_news|
|        apollo_game|
|    apollo_campaign|
|     apollo_weather|
|        apollo_time|
|        apollo_tmap|
|       apollo_music|
|       apollo_sleep|
|     apollo_english|
|        apollo_mytv|
+-------------------+
only showing top 20 rows



                                                                                

In [11]:
# Inspect the distinct item values (show() prints the first 20).
data.select('item').distinct().show()



+--------------------------+
|                      item|
+--------------------------+
|                송파감자국|
|           충청남도 예산군|
|           경기도 의정부시|
|                    고고기|
|                 붉은 노을|
|                    Winwin|
|                겨울이야기|
|         What A Girl Wants|
|                    Helium|
|    바다보러갈래 (SEE SEA)|
|             브람스 자장가|
|                  1도 없어|
|             그때가 좋았어|
|                   KissHug|
|            Where Are You?|
|                    Heaven|
|             뚫고 지나가요|
|          5882 (OPPA ASAP)|
|      상사화 (With 안예은)|
|Miracle (기적은 너와 내...|
+--------------------------+
only showing top 20 rows



                                                                                

In [119]:
def prefer_list(data, cate="cat1", thre=0.7):
    """Return each user's preferred values of the ``cate`` column.

    Rows with a null ``cate`` are dropped, events are counted per (user_id, cate),
    and each count is divided by that user's largest count. Values whose ratio is
    above ``thre`` are kept, capped at 10 per user, and blank (whitespace-only)
    values are removed afterwards.

    Args:
        data: Spark DataFrame containing ``user_id`` and the ``cate`` column.
        cate: Name of the column to profile.
        thre: Ratio (count / user's max count) that a value must exceed to be kept.

    Returns:
        DataFrame with columns ``user_id`` and ``cate``, one row per kept pair.
    """
    cnt_col = f'{cate}_cnt'
    max_col = f'{cate}_max_cnt'
    prop_col = f'{cate}_prop'

    # NOTE: count/max here are the pyspark.sql.functions versions brought in by the
    # file's star import, not the Python builtins.
    data = data.select(['user_id', cate]).na.drop("any", subset=[cate])
    data = data.groupby('user_id', cate).agg(count("*").alias(cnt_col))
    data_user = data.groupby('user_id').agg(max(cnt_col).alias(max_col))
    data = data.join(data_user, on='user_id', how="left")
    data = data.withColumn(prop_col, col(cnt_col) / col(max_col))
    data = data.filter(col(prop_col) > thre)

    # Rank by ratio within each user; ties are ordered arbitrarily by row_number().
    window_spec = Window.partitionBy("user_id").orderBy(desc(prop_col))

    top_keywords = data.withColumn("rank", row_number().over(window_spec)) \
                    .filter(col("rank") <= 10) \
                    .drop("rank")
    # Blank values are removed after ranking, so they can still occupy one of the 10 slots.
    top_keywords = top_keywords.filter(trim(col(cate)) != "")

    # Return the profiled column itself rather than a hard-coded 'cat1', so the
    # function works for any ``cate``.
    return top_keywords.select(['user_id', cate])

In [84]:
def prefer_list_cat1(data, cate="cat1", thre=0.7):
    """Build one ", "-joined list of preferred ``cate`` values per (user_id, cat1).

    Null ``cate`` rows are dropped, events are counted per (user_id, cat1, cate), and
    each count is divided by the largest count inside the same (user_id, cat1) group.
    Values with a ratio above ``thre`` are kept, limited to 10 per group, blank values
    are removed, and the remainder is concatenated into ``{cate}_list``.

    Args:
        data: Spark DataFrame containing ``user_id``, ``cat1`` and the ``cate`` column.
        cate: Name of the column to profile.
        thre: Ratio (count / group max count) that a value must exceed to be kept.

    Returns:
        DataFrame with columns ``user_id``, ``cat1`` and ``{cate}_list``.
    """
    group_keys = ['user_id', 'cat1']
    cnt_name = f'{cate}_cnt'
    max_name = f'{cate}_max_cnt'
    prop_name = f'{cate}_prop'

    # Event counts per (user, domain, value), ignoring rows where the value is null.
    counted = (
        data.select(group_keys + [f'{cate}'])
        .na.drop("any", subset=f'{cate}')
        .groupby(*group_keys, f'{cate}')
        .agg(count("*").alias(cnt_name))
    )

    # Largest count per (user, domain); it is the denominator of the preference ratio.
    group_peak = counted.groupby(*group_keys).agg(max(cnt_name).alias(max_name))

    above_threshold = (
        counted.join(group_peak, on=group_keys, how="left")
        .withColumn(prop_name, col(cnt_name) / col(max_name))
        .filter(col(prop_name) > thre)
    )

    # Keep at most 10 values per (user, domain), highest ratio first.
    by_group = Window.partitionBy(*group_keys).orderBy(desc(prop_name))
    ranked = above_threshold.withColumn("rank", row_number().over(by_group))
    top_keywords = ranked.filter(col("rank") <= 10).drop("rank")

    # Blank values are discarded only after the ranking step.
    non_blank = top_keywords.filter(trim(col(f'{cate}')) != "")

    joined_list = concat_ws(", ", collect_list(f'{cate}')).alias(f'{cate}_list')
    return non_blank.groupBy(*group_keys).agg(joined_list)

## adot item level

In [164]:
# Short-term window: events on or after the 7-day threshold, reduced to the columns
# the item-level profile needs and without null items.
recent_events = data.where(col("dt") >= DT_threshold2)
data_item = recent_events.select("user_id", "cat1", "item").dropna(how="any", subset=["item"])

In [165]:
# Profile non-music and music items separately (ratio threshold 0.4) so that music
# preferences land in their own column. Both frames end up with the same four columns:
# user_id, cat1, item_list, music_item_list.
non_music_items = data_item.filter(col('cat1') != "apollo_music")
music_items = data_item.filter(col('cat1') == "apollo_music")

# Non-music rows: real item_list, null music_item_list.
data_item1 = prefer_list_cat1(non_music_items, cate="item", thre=0.4) \
    .withColumn("music_item_list", lit(None))

# Music rows: the computed list moves to music_item_list and item_list becomes null.
data_item2 = prefer_list_cat1(music_items, cate="item", thre=0.4).select(
    'user_id',
    'cat1',
    lit(None).alias("item_list"),
    col("item_list").alias("music_item_list"),
)

In [166]:
# Stack the non-music and music item profiles. Columns are matched by name rather than
# by position, so item_list and music_item_list cannot be swapped silently if the column
# order of either frame ever changes.
data_item = data_item1.unionByName(data_item2)

In [167]:
# Preview the combined item-level profile (first 10 rows).
data_item.show(n=10)

                                                                                

+--------------------+--------------+------------------------------+---------------+
|             user_id|          cat1|                     item_list|music_item_list|
+--------------------+--------------+------------------------------+---------------+
|APL00000BJ19IFQXK54W|   apollo_game|요리조리 길 건너기,월드 사커 M|           null|
|APL00000BJ436WSTST8G|     apollo_mj|                        도마29|           null|
|APL00000BJ4RDYF1Q39C|   apollo_mytv|                     YTN 뉴스 |           null|
|APL00000BJB1OGJWJF9C|  apollo_radio|               김현정의 뉴스쇼|           null|
|APL00000BJCAJNIZHBLS|   apollo_mytv|                        집결호|           null|
|APL00000BJEUSHR47DHC|apollo_weather|               대전광역시 서구|           null|
|APL00000BJFZS2Y4RRI8|   apollo_game|                        주시팡|           null|
|APL00000BJQE5Y830Y68|   apollo_mytv|                     YTN 뉴스 |           null|
|APL00000BJQFTBD134SG|     apollo_mj|               팔레드신,피크닉|           null|
|APL00000BJQFTBD134SG|   

## adot cat level

- cat3 : music의 경우는 장르와 artist를 분리한 버전으로, 나머지는 그대로 사용

In [128]:
# Long-term cat3 profile over the full loaded window, ratio threshold 0.5.
# Non-music domains use cat3 as-is; music is profiled separately by genre and by artist.
non_music_events = data.filter(col("cat1") != "apollo_music")
music_events = data.filter(col("cat1") == "apollo_music")

data_cat3_1 = prefer_list_cat1(non_music_events, cate="cat3", thre=0.5)
data_cat3_genre, data_cat3_artist = (
    prefer_list_cat1(music_events, cate=music_column, thre=0.5)
    for music_column in ("cat3_genre", "cat3_artist")
)

In [131]:
# Attach the artist list to each genre row (the left join keeps every genre row).
music_lists = data_cat3_genre.join(data_cat3_artist, on=["user_id", "cat1"], how="left")

# Genre / artist lists are kept aside so they can be joined back onto the unioned cat3 frame.
data_cat3_music_1 = music_lists.select("user_id", "cat1", "cat3_genre_list", "cat3_artist_list")

# Music rows carry a null cat3_list so their columns line up with the non-music frame.
data_cat3_music = music_lists.select("user_id", "cat1", lit(None).alias("cat3_list"))

In [132]:
# Stack non-music and music cat3 rows. Columns are matched by name rather than by
# position, so a change in column order cannot silently misalign the frames.
data_cat3 = data_cat3_1.unionByName(data_cat3_music)

In [133]:
# Add the music genre / artist list columns; the left join leaves them null for rows with no match.
data_cat3 = data_cat3.join(data_cat3_music_1, on=['user_id','cat1'], how="left")

In [134]:
# Check the resulting column layout of the cat3 profile.
data_cat3.columns

['user_id', 'cat1', 'cat3_list', 'cat3_genre_list', 'cat3_artist_list']

- cat1

In [120]:
# Domain-level (cat1) preference per user over the full loaded window; thre=0.5 overrides the 0.7 default.
data_cat1 = prefer_list(data, cate="cat1", thre=0.5)

In [121]:
# Check the column layout of the cat1 profile.
data_cat1.columns

['user_id', 'cat1']

### 전체 데이터 Merge

- user, cat1 별 item, cat3

In [169]:
# Combine the item-level and cat3-level profiles per (user_id, cat1).
# NOTE(review): with an inner join, a (user_id, cat1) pair is kept only when it has both an
# item list from the 7-day window and a cat3 row — confirm that dropping the others is intended.
full_data1 = data_item.join(data_cat3, on=['user_id','cat1'], how="inner")

In [171]:
# Check the merged column layout.
full_data1.columns

['user_id',
 'cat1',
 'item_list',
 'music_item_list',
 'cat3_list',
 'cat3_genre_list',
 'cat3_artist_list']

In [91]:
# Preview the merged profile.
# NOTE(review): the stored output of this cell shows array-style values and only four columns,
# unlike the seven columns listed by full_data1.columns, so it comes from an earlier version
# of the notebook — re-run to refresh it.
full_data1.show(n=5)



+--------------------+------------+-------------------------------+-------------------------+
|             user_id|        cat1|                      item_list|                cat3_list|
+--------------------+------------+-------------------------------+-------------------------+
|APL00000BJ19IFQXK54W| apollo_game|[요리조리 길 건너기, 월드 사...|           [하이퍼캐쥬얼]|
|APL00000BJ4RDYF1Q39C| apollo_mytv|                    [YTN 뉴스 ]|           [뉴스, 드라마]|
|APL00000BJB1OGJWJF9C|apollo_radio|              [김현정의 뉴스쇼]|             [CBS 표준FM]|
|APL00000BJCAJNIZHBLS| apollo_mytv|                       [집결호]|[뉴스, 액션,전쟁, 스포츠]|
|APL00000BJFZS2Y4RRI8| apollo_game|                       [주시팡]|           [하이퍼캐쥬얼]|
+--------------------+------------+-------------------------------+-------------------------+
only showing top 5 rows



                                                                                

In [172]:
# Stamp every row with the run date DT, cast to a DATE column named dt.
full_data1 = full_data1.withColumn("dt", lit(DT).cast("date"))

In [173]:
# Rename the generic profile columns to their adot_-prefixed output names.
adot_column_names = {
    "cat1": "adot_cat1",
    "item_list": "adot_item_prefer",
    "cat3_list": "adot_cat3_prefer",
}
for old_name, new_name in adot_column_names.items():
    full_data1 = full_data1.withColumnRenamed(old_name, new_name)

In [174]:
# Print the schema so it can be compared with the destination table definition.
full_data1.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- adot_cat1: string (nullable = true)
 |-- adot_item_prefer: string (nullable = true)
 |-- music_item_list: string (nullable = true)
 |-- adot_cat3_prefer: string (nullable = true)
 |-- cat3_genre_list: string (nullable = true)
 |-- cat3_artist_list: string (nullable = true)
 |-- dt: date (nullable = true)



- cat1 정보

In [175]:
# Start the cat1 output from the cat1 profile and stamp it with the run date as a DATE column.
full_data2 = data_cat1.withColumn("dt", lit(DT).cast("date"))

In [176]:
# Rename cat1 to its output column name.
full_data2 = full_data2.withColumnRenamed("cat1", "adot_cat1_prefer")

In [177]:
# Print the schema so it can be compared with the destination table definition.
full_data2.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- adot_cat1_prefer: string (nullable = true)
 |-- dt: date (nullable = true)



### AIDP 저장

- user, cat1 별 item, cat3 정보 저장

In [178]:
# Destination dataset and table for the per-(user, cat1) item / cat3 profile.
# NOTE(review): the dataset name is hard-coded here — consider promoting it to a notebook parameter.
dest_dataset = "x1113099"
partitioned_dest_table = "one_model_profile_adot_cat3_item"

In [179]:
# DDL for the item / cat3 profile table, partitioned by dt; a no-op when the table already exists.
item_cat3_table_ddl = f"""
    CREATE TABLE IF NOT EXISTS {dest_dataset}.{partitioned_dest_table}
    (
        user_id STRING,
        adot_cat1 STRING,
        adot_item_prefer STRING,
        music_item_list STRING,
        adot_cat3_prefer STRING,
        cat3_genre_list STRING,
        cat3_artist_list STRING,
        dt DATE,
    )
    PARTITION BY dt
"""
item_cat3_table_job = get_bigquery_client().query(item_cat3_table_ddl)
item_cat3_table_job.result()  # presumably blocks until the DDL job has finished

print(f"생성된 테이블 : {dest_dataset}.{partitioned_dest_table}")

생성된 테이블 : x1113099.one_model_profile_adot_cat3_item


In [180]:
# Write the item / cat3 profile to the destination table.
# NOTE(review): the table is partitioned by dt, and mode="overwrite" is passed without naming a
# partition — if this replaces the whole table, partitions from earlier runs are lost. Confirm
# how df_to_bq_table handles partitions.
df_to_bq_table(df=full_data1,
               dataset=dest_dataset,
               table_name=partitioned_dest_table,
               mode="overwrite")

                                                                                

- cat1 정보 저장

In [181]:
# Destination table for the per-user cat1 (domain) preference profile.
partitioned_dest_table1 = "one_model_profile_adot_cat1"

In [182]:
# DDL for the cat1 preference table, partitioned by dt; a no-op when the table already exists.
cat1_table_ddl = f"""
    CREATE TABLE IF NOT EXISTS {dest_dataset}.{partitioned_dest_table1}
    (
        user_id STRING,
        adot_cat1_prefer STRING,
        dt DATE,
    )
    PARTITION BY dt
"""
cat1_table_job = get_bigquery_client().query(cat1_table_ddl)
cat1_table_job.result()  # presumably blocks until the DDL job has finished

print(f"생성된 테이블 : {dest_dataset}.{partitioned_dest_table1}")

생성된 테이블 : x1113099.one_model_profile_adot_cat1


In [183]:
# Write the cat1 preference profile to the destination table.
# NOTE(review): the table is partitioned by dt, and mode="overwrite" is passed without naming a
# partition — if this replaces the whole table, partitions from earlier runs are lost. Confirm
# how df_to_bq_table handles partitions.
df_to_bq_table(df=full_data2,
               dataset=dest_dataset,
               table_name=partitioned_dest_table1,
               mode="overwrite")

                                                                                