In [1]:
from skt.vault_utils import get_secrets
proxies = get_secrets('proxies')

In [2]:
from skt.gcp import (
    PROJECT_ID,
    bq_insert_overwrite,
    bq_to_df,
    bq_to_pandas,
    df_to_bq_table,
    get_bigquery_client,
    bq_table_exists,
    get_max_part,
    load_query_result_to_table,
    pandas_to_bq,
    pandas_to_bq_table,
    load_bigquery_ipython_magic,
    get_bigquery_client,
    _print_query_job_results,
    load_query_result_to_partitions
    
)

from skt.ye import (
    get_hdfs_conn,
    get_spark,
    hive_execute,
    hive_to_pandas,
    pandas_to_parquet,
    slack_send,
    get_secrets
)

In [3]:
import os

In [4]:
os.environ['http_proxy'] = proxies['http']
os.environ['https_proxy'] = proxies['https']

In [5]:
#!pip install transformers

In [6]:
import torch
from transformers import (
    AdamW,
    AutoModel,
    get_linear_schedule_with_warmup,
    AutoTokenizer,
    AutoConfig
)
import torch.nn.functional as F


# EVENT EMBEDDING

# data

In [30]:
def get_tables(table_nm:str):
    query = f"""
        SELECT * 
        FROM adot_reco_dev.{table_nm}
    """
    return bq_to_pandas(query)

In [31]:
data_df = get_tables(table_nm="adotUser_tdeal_profile_temp")

unsupported operand type(s) for /: 'NoneType' and 'int'
Downloading: 100%|[32m██████████[0m|


In [32]:
#data_df.head(10)

In [33]:
profile_dict = data_df.set_index("luna_id").to_dict()['tdeal_profile']

In [11]:
profiles = []
luna_ids = []
for luna_id, profile in profile_dict.items():
    profiles.append(profile)
    luna_ids.append(luna_id)

# Inference

In [7]:
model = AutoModel.from_pretrained('BM-K/KoDiffCSE-RoBERTa')
tokenizer = AutoTokenizer.from_pretrained('BM-K/KoDiffCSE-RoBERTa')

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [9]:
from tqdm.notebook import tqdm

In [10]:
def batch_embedd(profiles:list, luna_ids:list, batch_size = 32, method="cls"):
    embedding_result = []
    luna_id_result = []
    with torch.no_grad():
        for i in range(0, len(profiles), batch_size):
            batch = profiles[i:i+batch_size]
            batch_luna_ids = luna_ids[i:i+batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
            outputs = model(**inputs, return_dict=True)
            #embeddings, _ = model(**inputs, return_dict=False)
            if method =='cls':
                embeddings = outputs.last_hidden_state[:, 0, :]  
                embeddings = F.normalize(embeddings, p=2, dim=1)
                embeddings = embeddings.unsqueeze(1)
            elif method =='mean_pool':
                embeddings = torch.mean(outputs.last_hidden_state, dim=1)
                embeddings = F.normalize(embeddings, p=2, dim=1)
                embeddings = embeddings.unsqueeze(1) # Shape: (batch_size, 768)
            embedding_result.extend(embeddings.cpu().numpy())
            luna_id_result.extend(batch_luna_ids)
            
    return embedding_result, luna_id_result

In [16]:
embedding_result, luna_id_result = batch_embedd(profiles, luna_ids)

In [17]:
result = []
for i in range(len(luna_id_result)):
    result.append({"profile_text": profiles[i], "profile_vector": embedding_result[i].squeeze(0), "index":luna_id_result[i], 'source_domain':'tdeal', 'dt':'temp'})

In [18]:
result[0]["profile_vector"].shape

(768,)

In [21]:
import pandas as pd
df = pd.DataFrame(result)

In [22]:
df.head(2)

Unnamed: 0,profile_text,profile_vector,index,source_domain,dt
0,"특징: 육아/아기,평소 헬스,테니스,건강식품,디지털/가전,스포츠/레저,화장품/미용 ...","[0.0702496, -0.02036032, -0.042267658, -0.0299...",APL00000DC538LWDCX6O,tdeal,temp
1,"평소 기타스포츠용품,오토바이/스쿠터,디지털/가전,스포츠/레저,화장품/미용 관련 용품...","[0.041700225, -0.021257862, -0.024179967, -0.0...",APL00000D1JXIKIR3BI8,tdeal,temp


In [23]:
PROJECT_ID = "skt-datahub"
db_name = "adot_reco_dev"
table_name = "tdeal_profile_vector_temp"

In [24]:
pandas_to_bq(pd_df = df, destination=f"{PROJECT_ID}.{db_name}.{table_name}")

# TEST SET

In [25]:
profile_vector_dict = dict()
for res in result:
    profile_vector_dict[res["profile_text"]] = res["profile_vector"]

In [203]:
# Quality test
query_set = ["간단한 레슨과 특별한 경기로 골프를 경험해 보세요!",
             "평소 건강체크에 관심이 많은 고객",
             "깊은 수면 부족했던 어젯밤 오늘은 깊게 잠들 수 있게 요가로 릴랙스해 봐요",
             "미워할 수 없는 동물들의 이야기를 함께 들여다볼까요?",
             "골프존, 카카오골프 등 골프 관련 어플이나 웹페이지 경험이 있고, 골프존 마켓, 카카오프렌즈 골프, tdeal 등에서 골프 관련 제품 쇼핑 경험이 있으며, tmap에서 골프 관련 장소를 찾은 적이 있고, 골프 관련 통화내역이 존재하며, 에이닷 미디어에서 골프 관련 영상을 시청한 적이 있는 고객"
            ]     

In [204]:
def embedd(sentences:list, method='cls'):
    result = dict()
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(device)
    outputs  = model(**inputs, return_dict=True)
    if method =='cls':
        embeddings = outputs.last_hidden_state[:, 0, :]  
        embeddings = F.normalize(embeddings, p=2, dim=1)
        embeddings = embeddings.unsqueeze(1)
    elif method =='mean_pool':
        embeddings = torch.mean(outputs.last_hidden_state, dim=1)
        embeddings = F.normalize(embeddings, p=2, dim=1)
        embeddings = embeddings.unsqueeze(1) # Shape: (batch_size, 768)
    for i, sent in enumerate(sentences):
        result[sent] = embeddings[i][0].cpu()
    return result

In [205]:
query_dict = embedd(query_set)

In [206]:
import random
random_indices = random.sample(range(len(profiles)), 1000)
random_profiles = [profiles[i] for i in random_indices] + ["반려동물"]

In [207]:
#profile_dict = embedd(random_profiles)

In [208]:
def get_top_k_profiles(query_dict, profile_dict, k=5):
    # Convert dictionaries to tensors
    query_strings = list(query_dict.keys())
    profile_strings = list(profile_dict.keys())
    
    query_embeddings = torch.stack([torch.tensor(emb) for emb in query_dict.values()])
    profile_embeddings = torch.stack([torch.tensor(emb) for emb in profile_dict.values()])

    # Normalize embeddings
    #query_embeddings = query_embeddings / query_embeddings.norm(dim=1)[:, None]
    #profile_embeddings = profile_embeddings / profile_embeddings.norm(dim=1)[:, None]

    # Calculate cosine similarity
    similarity_matrix = torch.mm(query_embeddings.squeeze(1), profile_embeddings.squeeze(1).t())

    # Get top k similar profiles for each query
    top_k_values, top_k_indices = torch.topk(similarity_matrix, k=k, dim=1)

    results = {}
    for i, query_string in enumerate(query_strings):
        top_profiles = [
            (profile_strings[idx.item()], sim.item())
            for idx, sim in zip(top_k_indices[i], top_k_values[i])
        ]
        results[query_string] = top_profiles

    return results

In [209]:
top_profiles = get_top_k_profiles(query_dict, profile_vector_dict, k=5)

  query_embeddings = torch.stack([torch.tensor(emb) for emb in query_dict.values()])


In [1]:
#top_profiles

# adot

In [10]:
def get_tables(table_nm:str):
    query = f"""
        SELECT  luna_id,
                adot_profile
        FROM x1113099.{table_nm}
    """
    return bq_to_pandas(query)

In [11]:
adot_df = get_tables(table_nm="user_retrieval_profile_adot_text")

unsupported operand type(s) for /: 'NoneType' and 'int'
Downloading: 100%|[32m██████████[0m|


In [12]:
profile_dict = adot_df.set_index("luna_id").to_dict()['adot_profile']

In [13]:
profiles = []
luna_ids = []
for luna_id, profile in profile_dict.items():
    profiles.append(profile)
    luna_ids.append(luna_id)

In [15]:
embedding_result, luna_id_result = batch_embedd(profiles, luna_ids)

In [17]:
result = []
for i in range(len(luna_id_result)):
    result.append({"profile_text": profiles[i], "profile_vector": embedding_result[i].squeeze(0), "index":luna_id_result[i], 'source_domain':'adot', 'dt':'temp'})

In [18]:
import pandas as pd
df = pd.DataFrame(result)

In [22]:
PROJECT_ID = "skt-datahub"
db_name = "adot_reco_dev"
table_name = "adot_profile_vector_temp"

In [23]:
pandas_to_bq(pd_df = df, destination=f"{PROJECT_ID}.{db_name}.{table_name}")

# Tmap

In [46]:
def get_tables(table_nm:str):
    query = f"""
        SELECT  A.luna_id,
                A.tmap_profile
        FROM x1113099.{table_nm} AS A
        
        INNER JOIN 
        (
            SELECT luna_id
            FROM adot_reco.onemodelV3_input_inference_prd
            WHERE dt = '2024-06-27'
        ) AS B
        ON A.luna_id = B.luna_id
    """
    return bq_to_pandas(query)

In [47]:
tmap_df = get_tables(table_nm="user_retrieval_profile_tmap_text")

unsupported operand type(s) for /: 'NoneType' and 'int'
Downloading: 100%|[32m██████████[0m|


In [43]:
tmap_df.luna_id.nunique()

1299844

In [48]:
profile_dict = tmap_df.set_index("luna_id").to_dict()['tmap_profile']

In [52]:
def batch_embedd(profiles:list, luna_ids:list, batch_size = 128, method="cls"):
    embedding_result = []
    luna_id_result = []
    with torch.no_grad():
        for i in range(0, len(profiles), batch_size):
            batch = profiles[i:i+batch_size]
            batch_luna_ids = luna_ids[i:i+batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
            outputs = model(**inputs, return_dict=True)
            #embeddings, _ = model(**inputs, return_dict=False)
            if method =='cls':
                embeddings = outputs.last_hidden_state[:, 0, :]  
                embeddings = F.normalize(embeddings, p=2, dim=1)
                embeddings = embeddings.unsqueeze(1)
            elif method =='mean_pool':
                embeddings = torch.mean(outputs.last_hidden_state, dim=1)
                embeddings = F.normalize(embeddings, p=2, dim=1)
                embeddings = embeddings.unsqueeze(1) # Shape: (batch_size, 768)
            embedding_result.extend(embeddings.cpu().tolist())
            luna_id_result.extend(batch_luna_ids)
            
    return embedding_result, luna_id_result

In [53]:
profiles = []
luna_ids = []
for luna_id, profile in profile_dict.items():
    profiles.append(profile)
    luna_ids.append(luna_id)

In [54]:
embedding_result, luna_id_result = batch_embedd(profiles, luna_ids)

In [63]:
result = []
for i in range(len(luna_id_result)):
    result.append({"profile_text": profiles[i], "profile_vector": embedding_result[i][0], "index":luna_id_result[i], 'source_domain':'tmap', 'dt':'temp'})

In [96]:
PROJECT_ID = "skt-datahub"
db_name = "adot_reco_dev"
table_name = "tmap_profile_vector_temps"

In [93]:
from google.cloud import bigquery

bigquery_schema = [
    bigquery.SchemaField("profile_text", "STRING"),
    bigquery.SchemaField("profile_vector", "FLOAT64", mode="REPEATED"),
    bigquery.SchemaField("index", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("source_domain", "STRING"),
    bigquery.SchemaField("dt", "STRING"),
]

schema = StructType([
    StructField("profile_vector", ArrayType(DoubleType()), True),
    StructField("profile_text", StringType(), True),
    StructField("index", StringType(), True),
    StructField("source_domain", StringType(), True),
    StructField("dt", StringType(), True)
])

In [97]:
table_ref = client.dataset(db_name).table(table_name)
table = bigquery.Table(table_ref, schema=bigquery_schema)
table = client.create_table(table, exists_ok=True)

In [99]:
df_to_bq_table(df=df, dataset="adot_reco_dev", table_name="tmap_profile_vector_temps")

24/06/29 02:06:56 WARN TaskSetManager: Stage 6 contains a task of very large size (85075 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Py4JJavaError: An error occurred while calling o233.save.
: java.lang.RuntimeException: Failed to write to BigQuery
	at com.google.cloud.spark.bigquery.BigQueryWriteHelper.writeDataFrameToBigQuery(BigQueryWriteHelper.scala:69)
	at com.google.cloud.spark.bigquery.BigQueryInsertableRelation.insert(BigQueryInsertableRelation.scala:43)
	at com.google.cloud.spark.bigquery.BigQueryRelationProvider.createRelation(BigQueryRelationProvider.scala:111)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:47)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:97)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:97)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:93)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:93)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:78)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:115)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:848)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:382)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:355)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.NullPointerException
	at com.google.cloud.bigquery.connector.common.BigQueryClient.loadDataIntoTable(BigQueryClient.java:532)
	at com.google.cloud.spark.bigquery.BigQueryWriteHelper.loadDataToBigQuery(BigQueryWriteHelper.scala:87)
	at com.google.cloud.spark.bigquery.BigQueryWriteHelper.writeDataFrameToBigQuery(BigQueryWriteHelper.scala:66)
	... 43 more


24/06/29 08:07:21 WARN Dispatcher: Message RemoteProcessDisconnected(10.40.84.161:4122) dropped. Could not find BlockManagerEndpoint1.


In [None]:
for i in tqdm(range(len(chunk_list))):
    chunk = chunk_list[i]
    df = spark.createDataFrame(chunk, schema)
    df_to_bq_table(df=df, dataset="adot_reco_dev", table_name="tmap_profile_vector_temps")

In [87]:
client = get_bigquery_client()

In [88]:
result = []
for i in range(len(luna_id_result)):
    result.append({"profile_text": profiles[i], "profile_vector": embedding_result[i][0], "index":luna_id_result[i], 'source_domain':'tmap', 'dt':'temp'})

In [89]:
chunk_size = 100000
chunk_list = [result[i:i + chunk_size] for i in range(0, len(result), chunk_size)]

In [92]:
df.printSchema()

root
 |-- profile_vector: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- profile_text: string (nullable = true)
 |-- index: string (nullable = true)
 |-- source_domain: string (nullable = true)
 |-- dt: string (nullable = true)



In [90]:
for i in tqdm(range(len(chunk_list))):
    chunk = chunk_list[i]
    df = spark.createDataFrame(chunk, schema)
    
    #df_to_bq_table(df=df, dataset="adot_reco_dev", table_name="tmap_profile_vector_temps")

  0%|          | 0/13 [00:00<?, ?it/s]

24/06/29 02:03:15 WARN TaskSetManager: Stage 4 contains a task of very large size (85075 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Py4JJavaError: An error occurred while calling o223.save.
: java.lang.RuntimeException: Failed to write to BigQuery
	at com.google.cloud.spark.bigquery.BigQueryWriteHelper.writeDataFrameToBigQuery(BigQueryWriteHelper.scala:69)
	at com.google.cloud.spark.bigquery.BigQueryInsertableRelation.insert(BigQueryInsertableRelation.scala:43)
	at com.google.cloud.spark.bigquery.BigQueryRelationProvider.createRelation(BigQueryRelationProvider.scala:111)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:47)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:97)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:97)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:93)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:93)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:78)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:115)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:848)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:382)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:355)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.NullPointerException
	at com.google.cloud.bigquery.connector.common.BigQueryClient.loadDataIntoTable(BigQueryClient.java:532)
	at com.google.cloud.spark.bigquery.BigQueryWriteHelper.loadDataToBigQuery(BigQueryWriteHelper.scala:87)
	at com.google.cloud.spark.bigquery.BigQueryWriteHelper.writeDataFrameToBigQuery(BigQueryWriteHelper.scala:66)
	... 43 more


In [81]:
def df_to_bq_table(df, dataset, table_name, partition=None, mode="append"):
    import base64
    from skt.vault_utils import get_secrets

    key = get_secrets("gcp/sktaic-datahub/dataflow")["config"]
    table = f"{dataset}.{table_name}${partition}" if partition else f"{dataset}.{table_name}"
    df.write.format("bigquery").option("project", "sktaic-datahub").option(
        "credentials", base64.b64encode(key.encode()).decode()
    ).option("table", table).option("temporaryGcsBucket", "temp-seoul-7d").save(mode=mode)

In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, DoubleType
import numpy as np

In [66]:
schema = StructType([
    StructField("profile_vector", ArrayType(DoubleType()), True),
    StructField("profile_text", StringType(), True),
    StructField("index", StringType(), True),
    StructField("source_domain", StringType(), True),
    StructField("dt", StringType(), True)
])

In [65]:
spark = get_spark()

In [67]:
df = spark.createDataFrame(result, schema)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.readRDDFromFile.
: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.api.java.JavaRDD$.readRDDFromInputStream(JavaRDD.scala:252)
	at org.apache.spark.api.java.JavaRDD$.readRDDFromFile(JavaRDD.scala:239)
	at org.apache.spark.api.python.PythonRDD$.readRDDFromFile(PythonRDD.scala:274)
	at org.apache.spark.api.python.PythonRDD.readRDDFromFile(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


In [33]:
df.printSchema()

root
 |-- profile_vector: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- profile_text: string (nullable = true)
 |-- index: string (nullable = true)
 |-- source_domain: string (nullable = true)
 |-- dt: string (nullable = true)



In [None]:
PROJECT_ID = "skt-datahub"
db_name = "adot_reco_dev"
table_name = "tmap_profile_vector_temp"

In [None]:
df_to_bq_table(df=spark_df, dataset='adot_reco_dev', table_name=table_name)