In [1]:
# Parameters
# DT: reference date of the run as a "YYYY-MM-DD" string; the lookback thresholds and
# the dt value written to the output table are derived from it.
DT = "2024-06-10"

## **One Model Profile 생성 작업**

- 개요
    - One Model Profile 중 prod1(요금제) 프로파일을 생성하는 작업
- 내용
    - 가입자(svc_mgmt_num)별로 item, cat1, cat2, cat3에 대한 정보 저장
    - 이때, prod1은 DT 기준 최근 60일 로그 중 가장 최신 요금제 정보를 저장

In [2]:
from skt.gcp import get_bigquery_client, bq_insert_overwrite, get_max_part, bq_to_df, bq_to_pandas, pandas_to_bq_table, load_query_result_to_table, df_to_bq_table
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql import functions as F
from datetime import datetime, date, timedelta
from skt.ye import get_spark
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from pyspark.sql.functions import row_number

# 프로파일 : prod1

In [3]:
# Parse the run date once, then derive the lookback cut-offs from it.
current_date = datetime.strptime(DT, "%Y-%m-%d")

# DT_threshold1 = DT - 60 days, DT_threshold2 = DT - 7 days, both as "YYYY-MM-DD" strings.
DT_threshold1, DT_threshold2 = (
    (current_date - timedelta(days=lookback_days)).strftime("%Y-%m-%d")
    for lookback_days in (60, 7)
)

print("DT : ", DT)
print("DT_threshold : ", DT_threshold1)
print("DT_threshold : ", DT_threshold2)

DT :  2024-06-10
DT_threshold :  2024-04-11
DT_threshold :  2024-06-03


In [4]:
# Pull the prod1 (rate-plan) log rows inside the lookback window that ends at DT.
# The upper bound keeps a re-run for a past DT from picking up rows logged after that
# date; without it those later rows would be stamped with the older DT when the
# profile is built.
query = f"""
    select *
    from adot_reco.recgpt_log_sequence_daily_prd
    where dt >= '{DT_threshold1}' and dt <= '{DT}' and type = 'prod1'
"""

In [5]:
# Run the query through the project's bq_to_df helper. The result is used below as a
# Spark DataFrame (select/distinct/show and window functions are called on it).
data = bq_to_df(query)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/10 10:45:52 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
24/06/10 10:46:16 WARN SQLConf: The SQL config 'spark.sql.execution.arrow.enabled' has been deprecated in Spark v3.0 and may be removed in the future. Use 'spark.sql.execution.arrow.pyspark.enabled' instead of it.


In [6]:
# Sanity check: list the distinct cat3 values present in the loaded rows.
data.select(F.col("cat3")).distinct().show()

[Stage 0:>                                                          (0 + 1) / 1]

+------+
|  cat3|
+------+
|요금제|
+------+



                                                                                

In [7]:
# Sanity check: list the distinct cat2 values present in the loaded rows.
data.select(F.col("cat2")).distinct().show()

[Stage 3:>                                                          (0 + 1) / 1]

+--------+
|    cat2|
+--------+
|    기타|
|  유통망|
|고객센터|
| T world|
+--------+



                                                                                

In [8]:
# Sanity check: list the distinct cat1 values present in the loaded rows.
data.select(F.col("cat1")).distinct().show()

[Stage 6:>                                                          (0 + 1) / 1]

+---------------+
|           cat1|
+---------------+
|다이렉트 요금제|
|    일반 요금제|
+---------------+



                                                                                

In [9]:
# Sanity check: preview the distinct item (plan name) values; show() prints the first 20.
data.select(F.col("item")).distinct().show()

[Stage 9:>                                                          (0 + 1) / 1]

+---------------------+
|                 item|
+---------------------+
|         T끼리 어르신|
|            0 청년 69|
|                 슬림|
|         5GX 플래티넘|
|           5GX 프라임|
|            0 청년 43|
|            0 청년 49|
|         0플랜 미디엄|
|            0 청년 89|
|       다이렉트LTE 48|
|  베이직플러스 30GB업|
|         T플랜 스페셜|
|       다이렉트LTE 22|
|         T플랜 안심4G|
|        다이렉트5G 62|
|  T플랜 시니어 에센스|
|         T플랜 세이브|
|     5GX 프라임플러스|
|T플랜 시니어 안심2.8G|
|            0 청년 99|
+---------------------+
only showing top 20 rows



                                                                                

In [10]:
# Keep one row per subscriber (svc_mgmt_num): the most recent prod1 record.
# Ordering by dt alone lets row_number() break same-day ties arbitrarily, so the selected
# plan could change between runs; unix_time is used as a secondary, finer-grained key.
# NOTE(review): unix_time is cast to a decimal so it sorts numerically whatever its source
# type is (it prints like a NUMERIC/decimal value) — confirm against the table schema.
windowSpec = Window.partitionBy("svc_mgmt_num").orderBy(
    col("dt").desc(),
    col("unix_time").cast("decimal(38,9)").desc(),
)
df_with_row_number = data.withColumn("row_number", row_number().over(windowSpec))
# row_number == 1 is the newest row of each partition; the helper column is dropped afterwards.
df_max_dt = df_with_row_number.filter(col("row_number") == 1).drop("row_number")

In [11]:
# Preview the first 10 "latest record per subscriber" rows.
df_max_dt.show(10)

[Stage 12:>                                                         (0 + 1) / 1]

+--------------------+-------------------+-----------+--------+------+--------------------+--------+----+----+----------+-----+
|        svc_mgmt_num|               item|       cat1|    cat2|  cat3|           unix_time|second_s|etc1|etc2|        dt| type|
+--------------------+-------------------+-----------+--------+------+--------------------+--------+----+----+----------+-----+
|000061c4f6bec017f...|베이직플러스 13GB업|일반 요금제|  유통망|요금제|1714787755.000000000| 36000.0|null|null|2024-05-04|prod1|
|0001562af03ef5091...|          0 청년 43|일반 요금제|  유통망|요금제|1714525806.000000000| 36000.0|null|null|2024-05-01|prod1|
|0001e785daf87ab8e...|          0 청년 49|일반 요금제| T world|요금제|1714356563.000000000| 39600.0|null|null|2024-04-29|prod1|
|0002da519e15825fd...|          0 청년 49|일반 요금제|  유통망|요금제|1714977414.000000000| 54000.0|null|null|2024-05-06|prod1|
|00039af6cdedb3a8e...|       T플랜 세이브|일반 요금제|  유통망|요금제|1714532945.000000000| 43200.0|null|null|2024-05-01|prod1|
|0005438a8e4294e58...|   5GX 레귤러플러스|일반 요금제|  유

                                                                                

In [12]:
# Keep only the profile columns and stamp every row with the run date DT as a DATE column.
full_data1 = (
    df_max_dt
    .select("svc_mgmt_num", "item", "cat1", "cat2", "cat3")
    .withColumn("dt", F.lit(DT).cast("date"))
)

In [13]:
# Preview the first 10 rows of the projected profile.
full_data1.show(10)

[Stage 15:>                                                         (0 + 1) / 1]

+--------------------+-------------------+-----------+--------+------+----------+
|        svc_mgmt_num|               item|       cat1|    cat2|  cat3|        dt|
+--------------------+-------------------+-----------+--------+------+----------+
|000061c4f6bec017f...|베이직플러스 13GB업|일반 요금제|  유통망|요금제|2024-06-10|
|0001562af03ef5091...|          0 청년 43|일반 요금제|  유통망|요금제|2024-06-10|
|0001e785daf87ab8e...|          0 청년 49|일반 요금제| T world|요금제|2024-06-10|
|0002da519e15825fd...|          0 청년 49|일반 요금제|  유통망|요금제|2024-06-10|
|00039af6cdedb3a8e...|       T플랜 세이브|일반 요금제|  유통망|요금제|2024-06-10|
|0005438a8e4294e58...|   5GX 레귤러플러스|일반 요금제|  유통망|요금제|2024-06-10|
|0005a9bd227d0b541...|T플랜 시니어 세이브|일반 요금제|  유통망|요금제|2024-06-10|
|0005f6d6aa3239658...|          0 청년 69|일반 요금제|고객센터|요금제|2024-06-10|
|00077c3fc76bbe411...|베이직플러스 13GB업|일반 요금제|  유통망|요금제|2024-06-10|
|0007ca6073960672e...|           베이직　|일반 요금제|  유통망|요금제|2024-06-10|
+--------------------+-------------------+-----------+--------+------+----------+
only 

                                                                                

In [33]:
# Prefix the generic log columns so they match the destination table's prod1_* schema.
full_data1 = (
    full_data1
    .withColumnRenamed("item", "prod1_item_prefer")
    .withColumnRenamed("cat1", "prod1_cat1_prefer")
    .withColumnRenamed("cat2", "prod1_cat2_prefer")
    .withColumnRenamed("cat3", "prod1_cat3_prefer")
)

In [34]:
# Confirm the renamed columns and the DATE-typed dt column before writing to BigQuery.
full_data1.printSchema()

root
 |-- svc_mgmt_num: string (nullable = true)
 |-- prod1_item_prefer: string (nullable = true)
 |-- prod1_cat1_prefer: string (nullable = true)
 |-- prod1_cat2_prefer: string (nullable = true)
 |-- prod1_cat3_prefer: string (nullable = true)
 |-- dt: date (nullable = true)



### AIDP 저장

In [35]:
# Destination of the profile in BigQuery: dataset and (dt-partitioned) table name.
# NOTE(review): "x1113099" looks like a personal/sandbox dataset — confirm this is the
# intended target for scheduled runs.
dest_dataset, partitioned_dest_table = "x1113099", "one_model_profile_prod1"

In [36]:
# DDL for the destination table, partitioned by dt. IF NOT EXISTS leaves an existing
# table as it is.
create_table_ddl = f"""
    CREATE TABLE IF NOT EXISTS {dest_dataset}.{partitioned_dest_table}
    (
        svc_mgmt_num STRING,
        prod1_item_prefer STRING,
        prod1_cat1_prefer STRING,
        prod1_cat2_prefer STRING,
        prod1_cat3_prefer STRING,
        dt DATE,
    )
    PARTITION BY dt
"""

# Submit the DDL; .result() presumably waits for the job to finish — confirm against the client type.
get_bigquery_client().query(create_table_ddl).result()

# This message is printed whether the table was just created or already existed.
print(f"생성된 테이블 : {dest_dataset}.{partitioned_dest_table}")

생성된 테이블 : x1113099.one_model_profile_prod1


In [37]:
# Write the profile DataFrame to the destination table.
# NOTE(review): mode="overwrite" is passed without any partition argument; verify whether
# the helper replaces only the DT partition or the whole table (which would drop the
# partitions written by earlier runs).
df_to_bq_table(
    df=full_data1,
    dataset=dest_dataset,
    table_name=partitioned_dest_table,
    mode="overwrite",
)

                                                                                