In [17]:
# # # Parameters
# Reference date of the run as a YYYY-MM-DD string. It is parsed with "%Y-%m-%d" to derive
# the look-back windows and is also written to the output rows as the `dt` column.
DT = "2024-06-11"   

## **One Model Profile 생성 작업**

- 개요
    - One Model Profile 생성 작업
- 내용
    - item, cat1, cat2 level로 프로파일링 작업 수행
    - 단기 preference 정보
        - 7일 내 item 정보를 통해서 생성
    - 장기 preference 정보
        - 60일 내 cat1, cat2 정보를 통해서 생성
        - cat2가 cat1보다 상위 레벨임. **TODO**: 장기 preference를 어떤 레벨(cat1 / cat2)로 구성할지 결정 필요

In [18]:
from skt.gcp import get_bigquery_client, bq_insert_overwrite, get_max_part, bq_to_df, bq_to_pandas, pandas_to_bq_table, load_query_result_to_table, df_to_bq_table
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql import functions as F
from datetime import datetime, date, timedelta
from skt.ye import get_spark
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# 프로파일 : tmap

In [19]:
# Parse the run date once, then derive both window start dates from it.
current_date = datetime.strptime(DT, "%Y-%m-%d")

# DT_threshold1: start of the 60-day (long-term) window.
# DT_threshold2: start of the 7-day (short-term) window.
DT_threshold1, DT_threshold2 = (
    (current_date - timedelta(days=days_back)).strftime("%Y-%m-%d")
    for days_back in (60, 7)
)

print("DT : ", DT)
print("DT_threshold : ", DT_threshold1)
print("DT_threshold : ", DT_threshold2)

DT :  2024-06-11
DT_threshold :  2024-04-12
DT_threshold :  2024-06-04


In [20]:
# Load tmap log rows for the long-term window [DT_threshold1, DT].
# The upper bound matters when the notebook is re-run or backfilled for a past DT:
# without it, rows logged after DT would leak into that day's profile.
query = f"""
    select *
    from adot_reco.recgpt_log_sequence_daily_prd
    where dt >= '{DT_threshold1}' and dt <= '{DT}' and type = 'tmap'
"""

In [21]:
# Execute the query through the skt.gcp helper; the result is used below as a Spark
# DataFrame (.show(), .select(), .filter()).
data = bq_to_df(query)

24/06/11 23:17:54 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


In [22]:
# Preview 10 rows to check the loaded columns and value formats.
data.show(n=10)

+--------------------+------+----+----+----+--------------------+--------+----+----+----------+----+
|        svc_mgmt_num|  item|cat1|cat2|cat3|           unix_time|second_s|etc1|etc2|        dt|type|
+--------------------+------+----+----+----+--------------------+--------+----+----+----------+----+
|3b9c9ad074f565d9e...| D.I.Y|가구|쇼핑|null|1714032276.000000000|   61476|null|null|2024-04-25|tmap|
|c88672cabec36d5b3...|이케아|가구|쇼핑|null|1716253321.000000000|   36121|null|null|2024-05-21|tmap|
|3408cfa2572c73d9b...|이케아|가구|쇼핑|null|1716269311.000000000|   52111|null|null|2024-05-21|tmap|
|a404393781e4c743f...|이케아|가구|쇼핑|null|1716251758.000000000|   34558|null|null|2024-05-21|tmap|
|41c3030e5336caad6...|이케아|가구|쇼핑|null|1713060274.000000000|   39874|null|null|2024-04-14|tmap|
|6b33e9c42f92916d1...|이케아|가구|쇼핑|null|1713068544.000000000|   48144|null|null|2024-04-14|tmap|
|243475c726fa363fd...|이케아|가구|쇼핑|null|1713075967.000000000|   55567|null|null|2024-04-14|tmap|
|540a34ba046298e0d...|이케아|가구|쇼핑|null

In [23]:
# Sanity check on cat3: the captured output shows only null, which is why the profile
# below is built from item / cat1 / cat2 only.
data.select('cat3').distinct().show()



+----+
|cat3|
+----+
|null|
+----+



                                                                                

In [24]:
# List the distinct cat2 values (the coarser category level).
data.select('cat2').distinct().show()



+---------+
|     cat2|
+---------+
| 금융편의|
| 의료편의|
| 교통편의|
|여행/레저|
| 공공편의|
| 생활편의|
|건물/시설|
|     쇼핑|
|      AOI|
|기업/단체|
|     배경|
+---------+



                                                                                

In [25]:
# List distinct cat1 values; show() prints only the first 20 rows.
data.select('cat1').distinct().show()



+--------------+
|          cat1|
+--------------+
|  의료편의기타|
|  교통편의기타|
| 여행/레저기타|
|      의료시설|
|          숙박|
|      가정의례|
|   레저/스포츠|
|  공공편의기타|
|      관광명소|
|      건축관련|
|장애인편의시설|
|          카페|
|        살거리|
|    생활서비스|
|    대형유통점|
|  금융편의기타|
|      언론기관|
|      공공시설|
|      산업시설|
|  구역내시설물|
+--------------+
only showing top 20 rows



                                                                                

In [26]:
# List distinct item values; show() prints only the first 20 rows.
data.select('item').distinct().show()



+-------------------+
|               item|
+-------------------+
|        제2금융기타|
|           사회단체|
|               악기|
|       건강진단센터|
|   엔터테인먼트기타|
|         경영자단체|
|               금호|
|       자동차검사소|
|           조명기구|
|             남성복|
|               법원|
|             방송사|
|   정부출연연구기관|
|       의료편의기타|
|       건축관련기타|
|       교통편의기타|
|농/수/축/임협출장소|
|               낚시|
|               연극|
|               연금|
+-------------------+
only showing top 20 rows



                                                                                

In [27]:
def prefer_list(data, cate="cat1", thre=0.7, top_n=10):
    """Build, per user, a ", "-joined list of the user's most frequent `cate` values.

    Rows are counted per (svc_mgmt_num, cate). Each count is divided by that user's
    largest count, giving a relative frequency in (0, 1]. Values whose relative
    frequency is strictly greater than `thre` are kept, ranked by relative frequency
    (ties broken by the value itself), truncated to `top_n`, and joined in rank order.

    Args:
        data: Spark DataFrame containing at least `svc_mgmt_num` and the `cate` column.
        cate: Name of the column to profile (e.g. "item", "cat1", "cat2").
        thre: Relative-frequency threshold; only values with proportion > thre are kept.
        top_n: Maximum number of values kept per user (defaults to 10).

    Returns:
        Spark DataFrame with two columns: `svc_mgmt_num` and `{cate}_list`.
    """
    cnt_col = f"{cate}_cnt"
    max_col = f"{cate}_max_cnt"
    prop_col = f"{cate}_prop"

    per_user = Window.partitionBy("svc_mgmt_num")

    # Occurrences of each value per user; rows with a null `cate` are ignored.
    counts = (
        data.select("svc_mgmt_num", cate)
        .na.drop("any", subset=cate)
        .groupBy("svc_mgmt_num", cate)
        .agg(F.count("*").alias(cnt_col))
    )

    # Relative frequency against the user's top count. A window max replaces the
    # aggregate-then-join of the same quantity. `F.max` is used explicitly so the
    # code does not depend on the star import shadowing Python's builtin `max`.
    scored = (
        counts
        .withColumn(max_col, F.max(cnt_col).over(per_user))
        .withColumn(prop_col, F.col(cnt_col) / F.col(max_col))
        .filter(F.col(prop_col) > thre)
    )

    # Secondary sort key makes the rank (and therefore the top_n cut) deterministic
    # when several values share the same proportion.
    rank_spec = per_user.orderBy(F.desc(prop_col), F.asc(cate))
    ranked = (
        scored
        .withColumn("rank", F.row_number().over(rank_spec))
        .filter(F.col("rank") <= top_n)
    )

    # collect_list gives no ordering guarantee after a groupBy, so collect
    # (rank, value) structs, sort them by rank, then project the value field.
    ordered = F.sort_array(F.collect_list(F.struct(F.col("rank"), F.col(cate))))
    return ranked.groupBy("svc_mgmt_num").agg(
        F.concat_ws(", ", ordered.getField(cate)).alias(f"{cate}_list")
    )

## tmap item level

In [28]:
# Short-term preference: restrict to rows on/after DT_threshold2 (the 7-day window),
# keep non-null items, and list each user's items above 50% of their top item count.
data_item = prefer_list(
    data.filter(col("dt") >= DT_threshold2)
        .select("svc_mgmt_num", "item")
        .na.drop("any", subset="item"),
    cate="item",
    thre=0.5,
)

In [29]:
# Confirm the columns returned by prefer_list for the item level.
data_item.columns

['svc_mgmt_num', 'item_list']

## tmap cat level
- cat2(더 상위 카테고리) > cat1

- cat1

In [30]:
# Long-term cat1 preference over all loaded rows (no extra date filter), keeping
# values whose count exceeds 60% of the user's most frequent cat1.
data_cat1 = prefer_list(data, cate="cat1", thre=0.6)

- cat2

In [31]:
# Long-term cat2 preference over all loaded rows (no extra date filter), keeping
# values whose count exceeds 60% of the user's most frequent cat2.
data_cat2 = prefer_list(data, cate="cat2", thre=0.6)

### 전체 데이터 Merge

In [32]:
# Combine the three per-user preference lists and stamp every row with the run date.
# NOTE(review): these are inner joins (the join() default), so a user missing from any
# of the three frames -- e.g. no item rows inside the 7-day window -- is dropped from
# the profile entirely. Confirm that this is intended.
full_data = (
    data_item
    .join(data_cat1, on="svc_mgmt_num", how="inner")
    .join(data_cat2, on="svc_mgmt_num", how="inner")
    .withColumn("dt", lit(DT).cast("date"))
)

In [33]:
# Preview 10 merged profile rows.
full_data.show(n=10)

[Stage 56:>                                                         (0 + 1) / 1]

+--------------------+-------------------+-----------------------------------+------------------+----------+
|        svc_mgmt_num|          item_list|                          cat1_list|         cat2_list|        dt|
+--------------------+-------------------+-----------------------------------+------------------+----------+
|000dfdda25bce18c3...|성당시설,중고품기타|     슈퍼/마트,종교,중고품,관광명소|              쇼핑|2024-06-11|
|00103359a02e1bceb...|           정비센터|업종별기업,대형유통점,전기/가정용품|    쇼핑,기업/단체|2024-06-11|
|0014b1175b9c9b917...|   섬,중식,쇼핑센터|                          슈퍼/마트|              쇼핑|2024-06-11|
|00197086be4d7002d...|           행정기관|           의료시설,음식점,교육기관| 생활편의,공공편의|2024-06-11|
|001a1b1cf350562bc...|             도서관|                             음식점|          생활편의|2024-06-11|
|001b57774a8240d1f...|             납골당|                        음식점,숙박|여행/레저,생활편의|2024-06-11|
|001e7e48d5d667209...|    현대아울렛,약국|                             음식점|          생활편의|2024-06-11|
|001e8102fb08e9a28...|     패밀리레스토랑| 

                                                                                

In [34]:
# Rename the generic *_list columns to the tmap-specific names used by the destination table.
full_data = (
    full_data
    .withColumnRenamed("item_list", "tmap_item_prefer")
    .withColumnRenamed("cat1_list", "tmap_cat1_prefer")
    .withColumnRenamed("cat2_list", "tmap_cat2_prefer")
)

In [35]:
# Check final column names and types against the destination table definition.
full_data.printSchema()

root
 |-- svc_mgmt_num: string (nullable = true)
 |-- tmap_item_prefer: string (nullable = false)
 |-- tmap_cat1_prefer: string (nullable = false)
 |-- tmap_cat2_prefer: string (nullable = false)
 |-- dt: date (nullable = true)



### AIDP 저장

In [36]:
# BigQuery destination: dataset and table that receive the profile rows.
dest_dataset = "x1113099"
partitioned_dest_table = "one_model_profile_tmap"

In [37]:
# Ensure the dt-partitioned destination table exists; IF NOT EXISTS makes this a
# no-op when the table is already there.
create_table_job = get_bigquery_client().query(f"""
    CREATE TABLE IF NOT EXISTS {dest_dataset}.{partitioned_dest_table}
    (
        svc_mgmt_num STRING,
        tmap_item_prefer STRING,
        tmap_cat1_prefer STRING,
        tmap_cat2_prefer STRING,
        dt DATE,
    )
    PARTITION BY dt
""")
# Fetch the job result before moving on (presumably blocks until the DDL finishes).
create_table_job.result()

print(f"생성된 테이블 : {dest_dataset}.{partitioned_dest_table}")

생성된 테이블 : x1113099.one_model_profile_tmap


In [38]:
# Write the profile rows to the BigQuery destination table through the skt.gcp helper.
# NOTE(review): mode="overwrite" is passed without any partition argument. If the helper
# overwrites the whole table rather than only the `dt` = DT partition, every earlier
# daily partition is lost on each run -- confirm against df_to_bq_table's implementation.
df_to_bq_table(df=full_data,
               dataset=dest_dataset,
               table_name=partitioned_dest_table,
               mode="overwrite")

24/06/12 08:39:22 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /tmp/blockmgr-391429e9-de11-45f9-93f5-2e26fa5f5899. Falling back to Java IO way
java.io.IOException: Failed to delete: /tmp/blockmgr-391429e9-de11-45f9-93f5-2e26fa5f5899
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:171)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:110)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:91)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1193)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:318)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockManager.scala:314)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach