In [157]:
from google.cloud import bigquery
from google.cloud.bigquery import job
from google.cloud.bigquery import SchemaField
import pandas as pd
import datetime as dt
import numpy as np
import re
import os
from tqdm import tqdm
import pandas as pd
PROJCECT = 'ballosodeuk'
bq = bigquery.Client(project=PROJCECT)

In [158]:
# 클라이언트 설정
client = bigquery.Client()
query_name = "0711_동적쿼리테스트"
# SQL 파일 읽기
# file = './query/쿠팡0403_0509.sql'
file = f'../query/{query_name}.sql'
with open(file, 'r') as file:
    query = file.read()


In [226]:

# 쿼리 실행
job_config = bigquery.QueryJobConfig()
query_job = client.query(query, job_config=job_config)

df = query_job.to_dataframe()

In [227]:
df_copy = df.copy()

### 1. 신규 DF의 컬럼 추론
- 새로운 컬럼 생성 대응
- 기존 스키마 Order 로 정렬

In [14]:
def infer_schema_from_dataframe(df):
    """데이터프레임에서 스키마 추론."""
    def infer_field_type(value):
        if isinstance(value, dict):
            subfields = {k: infer_field_type(v) for k, v in value.items()}
            return {"type": "RECORD", "fields": subfields}
        elif isinstance(value, (np.ndarray, list)) and len(value) > 0 and isinstance(value[0], dict):
            subfields = {k: infer_field_type(v) for k, v in value[0].items()}
            return {"type": "RECORD", "mode": "REPEATED", "fields": subfields}
        elif isinstance(value, (np.ndarray, list)):
            return {"type": "STRING", "mode": "REPEATED"}
        elif pd.api.types.is_integer_dtype(type(value)):
            return "INTEGER"
        elif pd.api.types.is_float_dtype(type(value)):
            return "FLOAT"
        elif pd.api.types.is_bool_dtype(type(value)):
            return "BOOLEAN"
        elif pd.api.types.is_datetime64_any_dtype(type(value)):
            return "TIMESTAMP"
        elif isinstance(value, pd.Timestamp):
            return "DATE"
        else:
            return "STRING"
    
    schema = {}
    
    for row in range(len(df)):
        post_schema = {col: infer_field_type(df[col].iloc[row]) for col in df.columns}        
        schema = merge_schemas(schema, post_schema)

    return schema

### 2. 신규 Df의 filed 타겟 값 찾기

In [162]:
def find_tg_field(tg_schema):
    
    def find_keys(data, pattern):
        matches = []
        if isinstance(data, dict):
            for key, value in data.items():
                if re.match(pattern, key):
                    matches.append(key)
                if isinstance(value, dict):
                    matches.extend(find_keys(value, pattern))
                elif isinstance(value, list):
                    for item in value:
                        matches.extend(find_keys(item, pattern))
        return matches

    tg_field = tg_schema['e']['fields']['d']

    pattern = r'^_field'
    matches = find_keys(tg_field, pattern)

    return matches[0]    

### 2. 신규 DF의 field Key-Value 삭제

In [100]:
def pop_error_new_df(df,tg):
    tg = '_field_10'
    for index, row in df.iterrows():
        for e_item in row['e']:
            for d_item in e_item['d']:
                if tg in d_item:
                    d_item.pop(tg)

### 3. 기존 테이블 수집

In [6]:
def get_current_schema(table_ref):
    """기존 테이블의 스키마를 가져옵니다."""
    table = client.get_table(table_ref)
    # return {field.name: field for field in table.schema}
    return table.schema

### 4. 기존 테이블 스키마의 객체화

In [7]:
# 기존 스키마 추출
def convert_schema_fields(schema):
    def convert_field(field):
        if field.field_type == 'RECORD':
            return {
                "type": field.field_type,
                "mode": field.mode,
                "fields": {subfield.name: convert_field(subfield) for subfield in field.fields}
            }
        elif field.field_type == 'STRING' and field.mode == 'REPEATED':
            return {
                "type": field.field_type,
                "mode": field.mode,
                "fields": {subfield.name: convert_field(subfield) for subfield in field.fields} if field.fields else {}
            }
        else:
            return field.field_type

    return {field.name: convert_field(field) for field in schema}

### 5. 신/구 스키마 병합
- 교집합 요소는 구, 신규는 신

In [8]:
def merge_schemas(schema1, schema2):
    if isinstance(schema1, dict) and isinstance(schema2, dict):
        merged = {}
        for key in set(schema1.keys()).union(set(schema2.keys())):
            if key in schema1 and key in schema2:
                if schema1[key] == schema2[key]:
                    merged[key] = schema1[key]
                else:
                    merged[key] = merge_schemas(schema1[key], schema2[key])
            elif key in schema1:
                merged[key] = schema1[key]
            elif key in schema2:
                merged[key] = schema2[key]
        return merged
    else:
        return schema1 if schema1 == schema2 else "STRING"

### 6. 통합 스키마 정렬

In [9]:
def order_schema(schema, order):
    ordered_schema = {}
    for key in order:
        if key in schema:
            ordered_schema[key] = schema[key]
    return ordered_schema

### 7. 통합 스키마의 BQ 스킴 변환

In [10]:
def convert_to_schema_fields(schema):
    """스키마 딕셔너리를 SchemaField 객체로 변환."""
    fields = []
    for name, dtype in schema.items():
        if isinstance(dtype, dict):
            subfields = convert_to_schema_fields(dtype.get("fields", {}))
            mode = dtype.get("mode", "NULLABLE")
            fields.append(SchemaField(name, dtype["type"], mode=mode, fields=subfields))
        else:
            fields.append(SchemaField(name, dtype))
    return fields


### 8. 기존 BQ 테이블 스키마 업데이트

In [206]:
def update_table_schema(table_ref, merged_schema):
    """테이블 스키마에 새 열을 추가합니다."""
    table = client.get_table(table_ref)
    table.schema = merged_schema
    client.update_table(table, ["schema"])
    print(f"테이블 {table_ref}의 스키마가 업데이트되었습니다: {merged_schema}")
    

### 10. BQ 테이블로 현재 DF 업로드

In [12]:
def load_data_to_bigquery(df, table_ref):
    """데이터프레임을 빅쿼리 테이블로 업로드합니다."""
    job = client.load_table_from_dataframe(df, table_ref)
    job.result()
    print(f"테이블 {table_ref}에 데이터가 성공적으로 업로드되었습니다.")

## 실행

In [160]:
# 기존 데이터셋 수집
dataset_id = 'airbridge_mart'
table_id = 'app_df_2'
table_ref = f'ballosodeuk.{dataset_id}.{table_id}'


In [205]:
client.get_table(table_ref)

Table(TableReference(DatasetReference('ballosodeuk', 'airbridge_mart'), 'app_df_2'))

In [237]:
df = df_copy.copy()

In [240]:
df['Event_Date'] = pd.to_datetime(df.Event_Date)
new_schema = infer_schema_from_dataframe(df)
tg_field = find_tg_field(new_schema)
pop_error_new_df(df,tg_field)
new_schema = infer_schema_from_dataframe(df)
cur_schema = get_current_schema(table_ref)
cur_schema = convert_schema_fields(cur_schema)
merged_schema = merge_schemas(cur_schema, new_schema)
merged_schema = order_schema(merged_schema, df.columns)
merged_schema = convert_to_schema_fields(merged_schema)
update_table_schema(table_ref, merged_schema)


In [249]:
new_schema

{'Event_Count_Total': 'INTEGER',
 'Event_Category': 'STRING',
 'Airbridge_Device_ID_Type': 'STRING',
 'Event_Date': 'DATE',
 'i': {'fields': {'Target_Event_Timestamp': 'STRING',
   'Client_IP_City': 'STRING',
   'Is_First_Event_per_Device_ID': 'BOOLEAN',
   'Device_Type': 'STRING',
   'Term_ID': 'STRING',
   'Client_IP_Country_Code': 'STRING',
   'Platform': 'STRING',
   'Is_First_Event_per_User_ID': 'BOOLEAN',
   'Ad_Creative_ID': 'STRING',
   'Campaign_ID': 'STRING',
   'Device_Model': 'STRING',
   'Ad_Group_ID': 'STRING',
   'Target_Event_Category': 'STRING',
   'Client_IP_Subdivision': 'STRING',
   'Is_Re_engagement': 'STRING',
   'Is_First_Target_Event_per_Device': 'STRING'},
  'type': 'RECORD'},
 'Event_Value_Total': 'FLOAT',
 'e': {'type': 'RECORD',
  'mode': 'REPEATED',
  'fields': {'Label': 'STRING',
   'Action': 'STRING',
   'Event_Value_Sum': 'FLOAT',
   'Event_Count': 'INTEGER',
   'u': {'type': 'RECORD',
    'mode': 'REPEATED',
    'fields': {'User_ID': 'STRING'}},
   'd':

In [239]:
df.loc[0]['i']

{'Device_Model': 'SM-N971N',
 'Device_Type': 'mobile',
 'Platform': 'Android',
 'Client_IP_Country_Code': 'KR',
 'Client_IP_Subdivision': 'Gyeonggi-do',
 'Client_IP_City': 'Suwon',
 'Campaign_ID': None,
 'Ad_Group_ID': None,
 'Ad_Creative_ID': None,
 'Term_ID': None,
 'Is_Re_engagement': True,
 'Is_First_Event_per_User_ID': False,
 'Is_First_Event_per_Device_ID': False,
 'Is_First_Target_Event_per_Device': False,
 'Target_Event_Timestamp': Decimal('1720536693400.000000000'),
 'Target_Event_Category': 'Deeplink Open (App)'}

In [236]:
pd.api.types.is_bool_dtype(type(df.loc[0]['i']['Is_Re_engagement']))

True

In [232]:
list(df.sample(10)['e'])[0][0]['d'][0]

{'Timestamp': '2024-07-11T22:03:18+09:00',
 'Label': '상품상세_구매하기',
 'Action': '아이스 카페 아메리카노 T',
 'Value': 6429.0,
 'tester': 'true',
 'name': None,
 'price': None,
 'productID': None,
 'transactionID': None,
 'products_struct_alias': array([{'name': '아이스 카페 아메리카노 T', 'price': '6429', 'position': None}],
       dtype=object)}

In [167]:
df['e'][0][0]['d'][0]

{'Timestamp': '2024-07-11T15:57:48+09:00',
 'Label': '상품상세_구매하기',
 'Action': '동서)스타벅스커피라떼컵200ml',
 'Value': 3000.0,
 'tester': 'true',
 'name': None,
 'price': None,
 'productID': None,
 'transactionID': None,
 'products_struct_alias': array([{'name': '동서)스타벅스커피라떼컵200ml', 'price': '3000', 'position': None}],
       dtype=object)}

In [168]:
df['e']

0       [{'Label': '상품상세_구매하기', 'Action': '동서)스타벅스커피라떼...
1       [{'Label': '상품상세_구매하기', 'Action': '아이스 카페 아메리카...
2       [{'Label': '상품상세_구매하기', 'Action': '아메리카노(ICED)...
3       [{'Label': '상품상세_구매하기', 'Action': '농심)너구리얼큰(봉지...
4       [{'Label': '상품상세_구매하기', 'Action': '싸이버거', 'Eve...
                              ...                        
1471    [{'Label': '상품상세_구매하기', 'Action': '허니콤보+콜라1.25...
1472    [{'Label': '상품상세_구매하기', 'Action': '패밀리 아이스크림',...
1473    [{'Label': '상품상세_구매하기', 'Action': '패밀리 아이스크림',...
1474    [{'Label': '상품상세_구매하기', 'Action': '허니콤보+콜라1.25...
1475    [{'Label': '상품상세_구매하기', 'Action': '허니콤보+콜라1.25...
Name: e, Length: 1476, dtype: object

In [169]:
merged_schema

{'Event_Category': 'STRING',
 'Event_Count_Total': 'INTEGER',
 'Airbridge_Device_ID_Type': 'STRING',
 'Event_Date': 'STRING',
 'i': {'fields': {'Target_Event_Timestamp': 'STRING',
   'Client_IP_City': 'STRING',
   'Is_First_Event_per_Device_ID': 'BOOLEAN',
   'Device_Type': 'STRING',
   'Term_ID': 'STRING',
   'Client_IP_Country_Code': 'STRING',
   'Platform': 'STRING',
   'Is_First_Event_per_User_ID': 'BOOLEAN',
   'Ad_Creative_ID': 'STRING',
   'Campaign_ID': 'STRING',
   'Device_Model': 'STRING',
   'Ad_Group_ID': 'STRING',
   'Target_Event_Category': 'STRING',
   'Client_IP_Subdivision': 'STRING',
   'Is_Re_engagement': 'STRING',
   'Is_First_Target_Event_per_Device': 'STRING'},
  'type': 'RECORD',
  'mode': 'NULLABLE'},
 'Event_Value_Total': 'FLOAT',
 'e': {'fields': {'Event_Value_Sum': 'FLOAT',
   'Action': 'STRING',
   'Event_Count': 'INTEGER',
   'u': {'type': 'RECORD',
    'mode': 'REPEATED',
    'fields': {'User_ID': 'STRING'}},
   'Label': 'STRING',
   'd': {'fields': {'name

TypeError: unhashable type: 'dict'