In [1]:
from google.cloud import bigquery
from google.cloud.bigquery import job
from google.cloud.bigquery import SchemaField
import pandas as pd
import datetime as dt
import numpy as np
import re
import os
from tqdm import tqdm
import pandas as pd
# NOTE(review): the constant name is misspelled ("PROJCECT"); it is left
# unchanged here because other cells may reference it — confirm before renaming.
PROJCECT = 'ballosodeuk'
# BigQuery client created with the project above passed explicitly.
bq = bigquery.Client(project=PROJCECT)

In [2]:
# Client setup (created without an explicit project argument)
client = bigquery.Client()
query_name = "0711_동적쿼리테스트"
# Read the SQL file
# file = './query/쿠팡0403_0509.sql'
file = f'../query/{query_name}.sql'
# NOTE: the `as file` target rebinds `file` from the path string to the
# file handle, so `file` no longer holds the path after this statement.
with open(file, 'r') as file:
    query = file.read()


In [3]:

# Run the query with a default QueryJobConfig
job_config = bigquery.QueryJobConfig()
query_job = client.query(query, job_config=job_config)

# Collect the query result into a DataFrame
df = query_job.to_dataframe()

### 1. 신규 DF의 field Key-Value 삭제

In [None]:
def pop_error_new_df(df, tg='_field_9'):
    """Remove the key ``tg`` from every nested ``d`` item of the ``e`` column.

    Each cell of ``df['e']`` is iterated as a sequence of mappings whose
    ``'d'`` entry is itself a sequence of dicts; ``tg`` is deleted from each
    of those dicts when present.

    Args:
        df: DataFrame with an ``e`` column shaped as described above.
        tg: Key to remove. Defaults to ``'_field_9'``, which is the value the
            previous implementation always used because it overwrote the
            argument with that literal.

    Returns:
        None. The dicts referenced by ``df`` are mutated in place.
    """
    # Iterating the column directly yields the same cell objects that
    # iterrows() exposed, without building a Series per row.
    for e_items in df['e']:
        for e_item in e_items:
            for d_item in e_item['d']:
                # pop with a default is a no-op when the key is absent.
                d_item.pop(tg, None)

### 2. 신규 DF의 컬럼 추론 & 정렬
- 새로운 컬럼 생성 대응
- 기존 스키마 Order 로 정렬

In [None]:
def infer_schema_from_dataframe(df):
    """Infer a schema dictionary from the values stored in a DataFrame.

    Every row is classified column by column, and the per-row results are
    folded together with ``merge_schemas`` so the final schema reflects all
    rows. Returns an empty dict when the DataFrame has no rows.
    """
    def infer_field_type(value):
        # A dict becomes a RECORD whose sub-fields are classified recursively.
        if isinstance(value, dict):
            return {
                "type": "RECORD",
                "fields": {name: infer_field_type(item) for name, item in value.items()},
            }

        if isinstance(value, (np.ndarray, list)):
            # Only the first element decides whether this is a repeated RECORD.
            if len(value) > 0 and isinstance(value[0], dict):
                first = value[0]
                return {
                    "type": "RECORD",
                    "mode": "REPEATED",
                    "fields": {name: infer_field_type(item) for name, item in first.items()},
                }
            # Any other list/array is treated as repeated strings.
            return {"type": "STRING", "mode": "REPEATED"}

        value_type = type(value)
        # Checked in this exact order; the first matching predicate wins.
        dtype_checks = (
            (pd.api.types.is_integer_dtype, "INTEGER"),
            (pd.api.types.is_float_dtype, "FLOAT"),
            (pd.api.types.is_bool_dtype, "BOOLEAN"),
            (pd.api.types.is_datetime64_any_dtype, "TIMESTAMP"),
        )
        for matches, label in dtype_checks:
            if matches(value_type):
                return label

        if isinstance(value, pd.Timestamp):
            return "DATE"
        return "STRING"

    combined = {}
    for position in range(len(df)):
        row_schema = {}
        for column in df.columns:
            row_schema[column] = infer_field_type(df[column].iloc[position])
        combined = merge_schemas(combined, row_schema)

    return combined


def order_schema(schema, order):
    """Return the entries of ``schema`` arranged to follow ``order``.

    Names in ``order`` that ``schema`` lacks are skipped, and keys of
    ``schema`` that ``order`` does not mention are left out of the result.
    """
    return {name: schema[name] for name in order if name in schema}


### 3. 기존 테이블 수집

In [None]:
def get_current_schema(table_ref):
    """Fetch the schema of an existing table.

    The table is looked up through the module-level ``client`` and its
    ``schema`` attribute is returned unchanged.
    """
    return client.get_table(table_ref).schema

### 4. 기존 테이블 스키마의 객체화

In [None]:
# 기존 스키마 추출
def convert_schema_fields(schema):
    def convert_field(field):
        if field.field_type == 'RECORD':
            return {
                "type": field.field_type,
                "mode": field.mode,
                "fields": {subfield.name: convert_field(subfield) for subfield in field.fields}
            }
        elif field.field_type == 'STRING' and field.mode == 'REPEATED':
            return {
                "type": field.field_type,
                "mode": field.mode,
                "fields": {subfield.name: convert_field(subfield) for subfield in field.fields} if field.fields else {}
            }
        else:
            return field.field_type

    return {field.name: convert_field(field) for field in schema}

### 5. 신/구 스키마 병합
- 교집합 요소는 구 스키마 값을 유지하고(단, 타입이 서로 다르면 STRING으로 통합), 신규 요소는 신 스키마 값을 사용

In [None]:
def merge_schemas(schema1, schema2):
    """Recursively merge two schema dictionaries with a stable key order.

    Args:
        schema1: Existing schema (dict) or a leaf type value.
        schema2: Incoming schema (dict) or a leaf type value.

    Returns:
        For two dicts: a new dict holding every key of both inputs. Keys of
        ``schema1`` come first in their original order, followed by keys that
        only ``schema2`` has, in ``schema2`` order. Values present on both
        sides are kept when equal and merged recursively otherwise.
        For anything else: ``schema1`` when both values are equal, otherwise
        the fallback type ``"STRING"``.
    """
    if not (isinstance(schema1, dict) and isinstance(schema2, dict)):
        return schema1 if schema1 == schema2 else "STRING"

    # Insertion order is used instead of a set union so the result does not
    # depend on hash ordering, which would shuffle nested field order.
    merged = {}
    for key, value in schema1.items():
        if key in schema2 and value != schema2[key]:
            # NOTE(review): a conflicting leaf under any key, including a
            # "mode" entry, falls back to "STRING" — confirm that is intended.
            merged[key] = merge_schemas(value, schema2[key])
        else:
            merged[key] = value

    for key, value in schema2.items():
        if key not in schema1:
            merged[key] = value

    return merged

### 6. 통합 스키마 정렬

In [None]:
def order_schema(schema, order):
    """Build a copy of ``schema`` whose keys follow the sequence ``order``.

    Only names that appear in both ``order`` and ``schema`` are kept; any
    other key of ``schema`` is omitted from the returned dict.
    """
    present = (name for name in order if name in schema)
    return {name: schema[name] for name in present}

### 7. 통합 스키마의 BQ 스킴 변환

In [None]:
def convert_to_schema_fields(schema):
    """Convert a schema dictionary into a list of SchemaField objects.

    A plain value is passed to ``SchemaField`` as the field type. A dict
    value supplies ``"type"``, an optional ``"mode"`` (``"NULLABLE"`` when
    absent) and optional nested ``"fields"``, which are converted
    recursively.
    """
    def build(name, spec):
        if not isinstance(spec, dict):
            return SchemaField(name, spec)
        children = convert_to_schema_fields(spec.get("fields", {}))
        return SchemaField(
            name,
            spec["type"],
            mode=spec.get("mode", "NULLABLE"),
            fields=children,
        )

    return [build(name, spec) for name, spec in schema.items()]


### 8. 기존 BQ 테이블 스키마 업데이트

In [None]:
def update_table_schema(table_ref, new_columns):
    """Add new columns (and new nested sub-fields) to a table's schema.

    Fields already on the table keep their position and type. Names found in
    ``new_columns`` but not on the table are appended. When an existing
    RECORD field is also described in ``new_columns`` with a ``"fields"``
    entry, its sub-fields are extended the same way, recursively.

    Args:
        table_ref: Table identifier passed to ``Client.get_table``.
        new_columns: Schema dictionary mapping a field name either to a type
            string or to a dict with ``"type"`` and optional ``"mode"`` /
            ``"fields"`` entries.

    Returns:
        None. The table is updated through ``Client.update_table`` and a
        confirmation message is printed.
    """
    client = bigquery.Client()
    table = client.get_table(table_ref)
    updated_schema = list(table.schema)  # keep the existing fields and their order

    def add_new_fields(new_fields, parent_fields):
        # Snapshot of the fields present before this call starts appending.
        existing_by_name = {field.name: field for field in parent_fields}
        for name, dtype in new_fields.items():
            existing_field = existing_by_name.get(name)

            if existing_field is None:
                if isinstance(dtype, dict):
                    subfields = convert_to_schema_fields(dtype.get("fields", {}))
                    mode = dtype.get("mode", "NULLABLE")
                    parent_fields.append(SchemaField(name, dtype["type"], mode=mode, fields=subfields))
                else:
                    parent_fields.append(SchemaField(name, dtype))
                continue

            # The isinstance guard keeps `'fields' in dtype` from becoming a
            # substring test when dtype is a plain type string.
            if (
                existing_field.field_type == 'RECORD'
                and isinstance(dtype, dict)
                and 'fields' in dtype
            ):
                merged_subfields = list(existing_field.fields)
                add_new_fields(dtype['fields'], merged_subfields)

                rebuilt_kwargs = {
                    "name": existing_field.name,
                    "field_type": existing_field.field_type,
                    "mode": existing_field.mode,
                    "fields": tuple(merged_subfields),
                }
                # Carry the description over so rebuilding the field does not
                # drop it; it is only passed when one is actually set.
                if existing_field.description is not None:
                    rebuilt_kwargs["description"] = existing_field.description

                parent_fields[parent_fields.index(existing_field)] = SchemaField(**rebuilt_kwargs)

    add_new_fields(new_columns, updated_schema)

    table.schema = updated_schema
    client.update_table(table, ["schema"])
    print(f"테이블 {table_ref}의 스키마가 업데이트되었습니다: {new_columns}")

### 9. BQ 테이블로 현재 DF 업로드

In [None]:
def load_data_to_bigquery(df, table_ref):
    """Upload a DataFrame into a BigQuery table.

    Starts a load job through the module-level ``client``, calls ``result()``
    on it, and then prints a confirmation message.
    """
    load_job = client.load_table_from_dataframe(df, table_ref)
    load_job.result()
    print(f"테이블 {table_ref}에 데이터가 성공적으로 업로드되었습니다.")