In [1]:
# Create a virtual environment named "venv" in the current working directory.
!python3 -m venv venv
# NOTE(review): each `!` command presumably runs in its own short-lived subshell,
# so this activation would not carry over to the kernel or to later cells — confirm.
!source venv/bin/activate

In [None]:
!pip3 install "google-cloud-bigquery>=3.17"
!pip3 install "google-cloud-aiplatform>=1.38"
!pip3 install "pandas>=2.2.0"

In [36]:
import os
import re
import time

import pandas as pd
from flask import Flask, request
from google.cloud import bigquery
from google.cloud import storage
from google.cloud.exceptions import NotFound
from proto.marshal.collections import maps
from proto.marshal.collections import repeated
from vertexai.preview import generative_models
from vertexai.preview.generative_models import GenerativeModel

# BigQuery project and dataset that every query and load below runs against.
project_id = "ai-sandbox-sw"
dataset_id = "mstudy"

# Target schema: name of the raw table and of the derived table read below.
raw_target_table = "target2"
target_table = "target2_ordered"

# Source schema: the individual raw tables, a wildcard pattern for them, and the
# name of the derived table read below.
raw_source_tables = ["source-uipetmis","source-uispet"]
raw_source_tables_wildcard = 'source-*'
source_table = "source_ordered"

# Cloud Storage bucket names, all prefixed with the project id.
job_scheduler_bucket_name = f"{project_id}-job-scheduler-bucket-test"
queued_jobs_bucket_name = f"{project_id}-queued-jobs-bucket-test"

in_progress_jobs_bucket_name = f"{project_id}-in_progress_jobs_bucket_test"
failed_jobs_bucket_name = f"{project_id}-failed_jobs_bucket_test"
# NOTE(review): unlike its siblings this name has no "test" suffix — confirm that is intended.
successful_jobs_bucket_name = f"{project_id}-completed_jobs_bucket"
bq_upload_queue_bucket_name = f"{project_id}-bq_upload_queue_bucket_test"


# Shared BigQuery client bound to the project above.
client = bigquery.Client(project=project_id)

In [37]:
def add_unique_ref_and_create_new_table(project_id, dataset_id, raw_table, new_table, new_col, prefix):
    """Copy a raw BigQuery table into a new table with prefixed columns and a row reference.

    Every row of ``raw_table`` is read into a DataFrame, the columns are renamed,
    a 1-based sequential reference column is appended, and the result is loaded
    into ``new_table`` in the same dataset. Uses the module-level ``client``.

    Args:
        project_id: BigQuery project ID.
        dataset_id: BigQuery dataset ID.
        raw_table: Name of the raw table to read; interpolated as-is into the
            FROM clause of the query.
        new_table: Desired name of the finalised table.
        new_col: Name of the sequential reference column added to the new table.
        prefix: Prefix applied to every original column name, except
            ``Unique_Ref`` which becomes ``Original_Unique_Ref``.

    Returns:
        The value returned by ``result()`` on the completed load job.
    """

    raw_query = f"""
        SELECT *
        FROM `{project_id}.{dataset_id}.{raw_table}`
    """
    raw_df = client.query(raw_query).to_dataframe()

    # Build the complete old -> new mapping first and rename once. Renaming column
    # by column in place would rename a column a second time whenever a freshly
    # produced name collides with another original column name.
    column_mapping = {}
    for col in raw_df.columns:
        print(f"col is {col}")
        new_name = f"Original_{col}" if col == "Unique_Ref" else f"{prefix}_{col}"
        print(f"therefore renaming {col} to {new_name}")
        column_mapping[col] = new_name

    renamed_df = raw_df.rename(columns=column_mapping)
    renamed_df[new_col] = range(1, len(renamed_df) + 1)

    new_table_id = f"{project_id}.{dataset_id}.{new_table}"

    job_config = bigquery.LoadJobConfig()
    job = client.load_table_from_dataframe(renamed_df, new_table_id, job_config=job_config)

    # Block until the load has finished (raises if the job failed).
    return job.result()

In [38]:
# Source table setup: reuse the derived source table when it already exists,
# otherwise build it from the raw source tables.
source_table_ref = bigquery.DatasetReference(project_id, dataset_id).table(source_table)
try:
    client.get_table(source_table_ref)  # Will raise NotFound if the table doesn't exist
    print(f"Source table '{source_table}' exists.")
except NotFound:
    # Only a genuinely missing table triggers creation; any other failure
    # (credentials, permissions, network) now surfaces instead of being
    # mistaken for "table does not exist".
    print(f"Source table '{source_table}' does not exist.")
    print("creating Source table...")
    new_source_col = 'Source_Unique_Ref'
    source_prefix = "Source"
    add_unique_ref_and_create_new_table(project_id, dataset_id, raw_source_tables_wildcard, source_table, new_source_col, source_prefix)

source_query = f"""
    SELECT *
    FROM `{project_id}.{dataset_id}.{source_table}`
"""
source_df = client.query(source_query).to_dataframe()
print(f"source_df length is {source_df.shape[0]}")


# Target table setup: reuse the derived target table when it already exists,
# otherwise build it from the raw target table.
target_table_ref = bigquery.DatasetReference(project_id, dataset_id).table(target_table)
try:
    client.get_table(target_table_ref)  # Will raise NotFound if the table doesn't exist
    print(f"Target table {target_table} exists.")
except NotFound:
    # Only a genuinely missing table triggers creation; any other failure
    # (credentials, permissions, network) now surfaces instead of being
    # mistaken for "table does not exist".
    print(f"Target table {target_table} does not exist.")
    print("creating Target table...")
    new_target_col = 'Target_Unique_Ref'
    target_prefix = "Target"
    add_unique_ref_and_create_new_table(project_id, dataset_id, raw_target_table, target_table, new_target_col, target_prefix)

# NOTE(review): the query has no ORDER BY, so row order is not guaranteed, yet
# rows of target_df are later selected by position — confirm that is intended.
target_query = f"""
    SELECT *
    FROM `{project_id}.{dataset_id}.{target_table}`
"""
target_df = client.query(target_query).to_dataframe()
print(f"target_df length is {target_df.shape[0]}")

Source table 'source_ordered' exists.
source_df length is 5411
Target table target2_ordered exists.
target_df length is 997


In [39]:
# Preview the first rows of the source schema table loaded above.
source_df.head()

Unnamed: 0,Source_SchemaName,Source_TableName,Source_Column_Name,Source_Data_type,Source_Max_Length,Source_precision,Source_scale,Source_is_nullable,Source_Unique_Ref
0,Aggregator,coinsurance_override,force_coinsurance,bit,1,1,0,0,1
1,dbo,bad_quotes_adu_20160722,confirm_excluded_breeds,bit,1,1,0,0,2
2,dbo,blog,show,bit,1,1,0,0,3
3,dbo,charity,active,bit,1,1,0,0,4
4,dbo,discount,staff,bit,1,1,0,0,5


In [40]:
# Preview the first rows of the target schema table loaded above.
target_df.head()

Unnamed: 0,Original_Unique_Ref,Target_Tranche,Target_Level_1,Target_Level_2,Target_Level_3,Target_Level_4,Target_Complex_Type,Target_Attribute,Target_Description,Target_Mandatory__,Target_Data_Type,Target_Accepted_Values,Target_Validation,Target_Drop_Down_Metaval,Target_Unique_Ref
0,1,CLIENT,Configuration,,,,configuration,dataImportOperatorName,This is the logon name for the data import ope...,Mandatory,string,Reference Data: operator.loginName,Must be Strata Operator ID. ‘-11’,,1
1,2,CLIENT,Configuration,,,,configuration,requestUUID,The is a unique identifier for the XML message,Optional,string (8),,,,2
2,3,CLIENT,Configuration,,,,configuration,noUpdateMode,Setting to true will mean the XML message will...,Optional,boolean,,,,3
3,4,CLIENT,Configuration,,,,configuration,overrideStrataReferences,Setting to true will mean the classicOffsetRef...,Optional,integer,,,,4
4,5,CLIENT,Configuration,,,,configuration,useDvlaLookup,Specifies whether the DVLA service will be cal...,Optional,boolean,,,,5


In [41]:
def dataframe_to_string(df):
    """Converts a DataFrame to a string with column names and row values.

    The first line is ``Column Names: <labels>``; it is followed by one
    ``Row: <values>`` line per row. Labels and values are each converted with
    ``str`` and joined with ``", "``, so non-string column labels (for example
    integer labels) are supported.

    Args:
        df: The pandas DataFrame to convert.

    Returns:
        A string representation of the DataFrame; every line, including the
        last one, ends with a newline.
    """

    header = f"Column Names: {', '.join(str(column) for column in df.columns)}\n"

    # Collect the lines and join once rather than growing a string in the loop.
    row_lines = [
        f"Row: {', '.join(str(value) for value in row)}\n"
        for _, row in df.iterrows()
    ]

    return header + "".join(row_lines)


def parse_function_call(function_call):
    """Flattens a FunctionCall object into a plain dictionary.

    Args:
        function_call: Object exposing ``name`` and a mapping-like ``args``.

    Returns:
        A dictionary with two keys: ``"function_name"`` (the call's name) and
        ``"attributes"`` (a new dict holding every key/value pair of ``args``).
    """

    return {
        "function_name": function_call.name,
        # Shallow copy of the arguments into an ordinary dict.
        "attributes": {arg_name: arg_value for arg_name, arg_value in function_call.args.items()},
    }

def convert_dict_to_list_of_dicts(dict):
    """Converts a dictionary of lists and strings to a list of flat dictionaries.

    Values that are ``str`` are treated as constants and copied into every
    output dictionary. Every other value is treated as a sequence and indexed
    position by position. The number of output dictionaries is the length of
    the first non-string value.

    Args:
        dict: The input dictionary containing lists and strings. (The name
            shadows the builtin; it is kept so keyword callers keep working.)

    Returns:
        A list of dictionaries, one per position of the input lists. Returns an
        empty list when the input has no list-valued entries.

    Raises:
        IndexError: If a later list is shorter than the first one.
    """

    string_keys = [key for key, value in dict.items() if isinstance(value, str)]
    list_keys = [key for key, value in dict.items() if not isinstance(value, str)]

    # Without any list there is nothing to iterate over; previously this
    # failed with an IndexError on list_keys[0].
    if not list_keys:
        return []

    list_of_attribute_dicts = []
    for i in range(len(dict[list_keys[0]])):
        new_dict = {key: dict[key][i] for key in list_keys}
        for key in string_keys:
            new_dict[key] = dict[key]
        list_of_attribute_dicts.append(new_dict)

    return list_of_attribute_dicts

def create_df_from_target_row_df_and_list_of_dicts(list_of_attribute_dicts, test_target_df_row):
    """
    Builds a DataFrame by repeating a base frame once per attribute dictionary.

    Args:
        list_of_attribute_dicts: A list of dictionaries; each key becomes a
            column and each value that column's content.
        test_target_df_row: The base DataFrame that is copied for every dictionary.

    Returns:
        The concatenation of all the extended copies. The base DataFrame
        itself is left unmodified.
    """

    def _extend_base(attributes):
        # Work on a copy so the caller's frame never gains the extra columns.
        extended = test_target_df_row.copy()
        for column, value in attributes.items():
            extended[column] = value
        return extended

    return pd.concat([_extend_base(attributes) for attributes in list_of_attribute_dicts])

def merge_dataframes_and_string(target_df_row, source_df_row, confidence_level):
    """Joins a target frame, a source frame and a confidence value side by side.

    Args:
        target_df_row: DataFrame whose columns come first in the result.
        source_df_row: DataFrame whose columns follow the target columns.
        confidence_level: Value stored in the trailing ``Confidence_Levels`` column.

    Returns:
        A DataFrame containing the target columns, the source columns and a
        ``Confidence_Levels`` column, aligned on a fresh 0-based index.
    """
    # Drop the original indexes so the pieces line up by position.
    pieces = [
        target_df_row.reset_index(drop=True),
        source_df_row.reset_index(drop=True),
        pd.DataFrame({'Confidence_Levels': [confidence_level]}),
    ]
    return pd.concat(pieces, axis=1)


In [42]:
def dataframe_to_custom_target_string(df):
    """Renders each row of a target-schema DataFrame as one descriptive line.

    Each line has the form::

        target_field: <Level_1>[.<Level_2>][.<Level_3>][.<Level_4>].<Attribute>; [data_type: ...; ][target_field_description: ...; ]target_field_unique_ref: <ref>

    Levels 2-4 are left out when they are missing (None/NaN), blank, or the
    placeholder ``n/a``. Data type and description are left out when they are
    missing or empty.

    Args:
        df: DataFrame with the columns ``Target_Level_1`` .. ``Target_Level_4``,
            ``Target_Attribute``, ``Target_Data_Type``, ``Target_Description``
            and ``Target_Unique_Ref``.

    Returns:
        The lines for all rows, each terminated by a newline.
    """

    def _is_present(value):
        # None / NaN / pd.NA count as missing; so does an empty (falsy) value.
        if value is None or (not isinstance(value, str) and pd.isna(value)):
            return False
        return value != '' and bool(value)

    def _is_real_level(value):
        # A nesting level is only real when it holds something other than "n/a".
        if not _is_present(value):
            return False
        return str(value).strip().lower() not in ('', 'n/a')

    optional_levels = ('Target_Level_2', 'Target_Level_3', 'Target_Level_4')
    lines = []

    for _, row in df.iterrows():
        # The top level and the attribute are always part of the path.
        path = [row['Target_Level_1']]
        path.extend(row[level] for level in optional_levels if _is_real_level(row[level]))
        path.append(row['Target_Attribute'])

        row_string = "target_field: " + ".".join(str(part) for part in path) + "; "
        if _is_present(row['Target_Data_Type']):
            row_string += f"data_type: {row['Target_Data_Type']}; "
        if _is_present(row['Target_Description']):
            row_string += f"target_field_description: {row['Target_Description']}; "
        row_string += f"target_field_unique_ref: {row['Target_Unique_Ref']}"

        lines.append(f"{row_string}\n")

    return "".join(lines)

In [49]:
# Name of the queued job object to process in this run.
objectId = "target-row-86-source-groups-52-104"

storage_client = storage.Client()
queued_jobs_bucket = storage_client.bucket(queued_jobs_bucket_name)
failed_jobs_bucket = storage_client.bucket(failed_jobs_bucket_name)
blob = queued_jobs_bucket.blob(objectId)

# Stop here when the job file is gone: the download further down could not succeed.
if not blob.exists():
    print(f"File '{objectId}' not found in bucket '{queued_jobs_bucket_name}'. Job already picked from queue. Container instance completing with 204 message")
    raise FileNotFoundError(f"File '{objectId}' not found in bucket '{queued_jobs_bucket_name}'")

# The object name encodes the target row and the range of source groups.
pattern = r"^target-row-(\d{1,4})-source-groups-(\d{1,4})-(\d{1,4})$"
match = re.match(pattern, objectId)
if not match:
    # Report the pattern that is actually enforced, and stop before match.group()
    # is called on None.
    msg = f"objectId is not in the expected format: {pattern}"
    print(msg)
    raise ValueError(msg)

target_row = int(match.group(1))
source_group_start = int(match.group(2))
source_group_end = int(match.group(3))

print(f"target_row {target_row} source_group_start {source_group_start} source_group_end {source_group_end}")

# Prepare target field
# NOTE(review): target_row is used as a positional index into target_df, not as a
# Target_Unique_Ref value — confirm this matches how the job names are generated.
target_df_row = target_df.iloc[[target_row]]
target_string_row = dataframe_to_custom_target_string(target_df_row)
print("\ntarget_string_row")
print(target_string_row)


# download_as_bytes() replaces the deprecated download_as_string() alias.
contents = blob.download_as_bytes().decode('utf-8')
unmapped_source_string_groups = contents.split("\n\n")  # Split by double newlines

print("\nunmapped_source_string_groups[0]")
print(unmapped_source_string_groups[0])

target_row 86 source_group_start 52 source_group_end 104

target_string_row
target_field: Policy.ncbIndicator; data_type: boolean; target_field_description: Flag to indicate presence of previous insurance details; target_field_unique_ref: 125


unmapped_source_string_groups[0]
source_field: dbo.error_tracker_archive.notification_sent; data_type: bit; source_field_unique_ref: 788
source_field: dbo.error_tracker_archive.id; data_type: int; source_field_unique_ref: 100
source_field: dbo.error_tracker_archive.product_id; data_type: int; source_field_unique_ref: 923
source_field: dbo.error_tracker_archive.error_message; data_type: varchar; source_field_unique_ref: 567
source_field: dbo.error_tracker_archive.stack_trace; data_type: varchar; source_field_unique_ref: 1390
source_field: dbo.error_tracker_archive.error_type; data_type: varchar; source_field_unique_ref: 1389
source_field: dbo.error_tracker_archive.error_page; data_type: varchar; source_field_unique_ref: 1388
source_field: dbo.err

In [48]:
# Ad-hoc check: is the top-level object of the selected target row 'Policy'?
target_df_row['Target_Level_1'].iloc[0] == 'Policy'

True

In [50]:
# Model used for the mapping requests.
model = GenerativeModel("gemini-pro")

# Function declaration the model is asked to call with its results. The worked
# example in the description must agree with itself and with the schema below,
# because the model reads it as part of its instructions.
set_source_field_mapping_confidence_levels = generative_models.FunctionDeclaration(
    name="set_source_field_mapping_confidence_levels",
    description="""Sets the mapping confidence values for each source field for a given target field.

Here is a general example to help you understand how to use the set_source_field_mapping_confidence_levels function correctly. This is only an example to show the source and target field structures.:

Assuming you had previously decided on the following mapping confidence levels (but it is important that you come up with your own values for mapping condifence level rather than specifically using these values):
a mapping confidence level of 2 for the field with source_field_unique_ref=158
a mapping confidence level of 1 for the field with source_field_unique_ref=159
a mapping confidence level of 1 for the field with source_field_unique_ref=1290
a mapping confidence level of 1 for the field with source_field_unique_ref=579
a mapping confidence level of 1 for the field with source_field_unique_ref=638
a mapping confidence level of 1 for the field with source_field_unique_ref=970
a mapping confidence level of 1 for the field with source_field_unique_ref=3317
a mapping confidence level of 3 for the field with source_field_unique_ref=160
a mapping confidence level of 1 for the field with source_field_unique_ref=1910
a mapping confidence level of 5 for the field with source_field_unique_ref=2280

Then this function would be used to set the mapping confidence levels for each of the source fields, where your input parameter source_field_mapping_confidences would be:
source_field_mapping_confidences = [
    {'source_field_unique_ref':158,'mapping_confidence_level':'2'},
    {'source_field_unique_ref':159,'mapping_confidence_level':'1'},
    {'source_field_unique_ref':1290,'mapping_confidence_level':'1'},
    {'source_field_unique_ref':579,'mapping_confidence_level':'1'},
    {'source_field_unique_ref':638,'mapping_confidence_level':'1'},
    {'source_field_unique_ref':970,'mapping_confidence_level':'1'},
    {'source_field_unique_ref':3317,'mapping_confidence_level':'1'},
    {'source_field_unique_ref':160,'mapping_confidence_level':'3'},
    {'source_field_unique_ref':1910,'mapping_confidence_level':'1'},
    {'source_field_unique_ref':2280,'mapping_confidence_level':'5'}
]""",

    parameters={
        "type": "object",
        "properties": {
            "source_field_mapping_confidences": {
                "type": "array",
                "description": "A List of objects where each object in the list contains the source field's source_field_unique_ref and the mapping_confidence_level for that source field.",
                "items": {
                    "type": "object",
                    "properties": {
                        "source_field_unique_ref": {
                            "type": "integer",
                            "description": "The reference ID for the source field."
                        },
                        "mapping_confidence_level": {
                            # Declared as a string enum, so the description says so too.
                            "type": "string",
                            "enum": ["1", "2", "3", "4", "5"],
                            "description": "The confidence level for the mapping (a whole number between 1 and 5, passed as a string)."
                        },
                        "mapping_confidence_level_reason": {
                            "type": "string",
                            "description": "The reason why the source field should have this mapping confidence level value"
                        }
                    },
                    "required": ["source_field_unique_ref", "mapping_confidence_level"]
                }
            },
        },
        "required": ["source_field_mapping_confidences"],
    },
)

# Tool wrapper passed to generate_content so the model can call the function above.
set_source_field_mapping_confidence_levels_tool = generative_models.Tool(
    function_declarations=[set_source_field_mapping_confidence_levels]
)

In [61]:
# Work on the first group of still-unmapped source fields.
unmapped_source_string_group = unmapped_source_string_groups[0]
field_count = unmapped_source_string_group.count('source_field:')

print("************************************")
print(f"{field_count} source fields in group")
print(unmapped_source_string_group)
print("mapping to")
print(target_string_row)

# Top-level object of the target field, looked up once instead of in every placeholder.
target_level_1 = target_df_row['Target_Level_1'].iloc[0]

# What each known top-level object refers to. Any other value (for example
# "Configuration") falls back to a generic phrase, so the "only relevant to"
# sentence in the prompt is never left unfinished.
relevance_by_level = {
    'Client': "the client that has taken out a policy.",
    'Household Policy': "a household insurance or home insurance policy.",
    'Motor Policy': "a motor insurance policy, or Motorcar policy data or car insurance data.",
    'Pet': "a pet insurance policy, or the pet that is being insured against in the pet insurance policy.",
    'Policy': "a Policy that has been created or is being created by the client.",
}
target_relevance = relevance_by_level.get(target_level_1, f"{target_level_1} data.")

prompt = f"""You are Data Engineer working for an insurance company. As part of a data migration project you need to assist with mapping fields in a source data schema fields in a target data schema. Your job is to provide a mapping confidence level for how well you think each of the fields for the source schemas you see will map to the field for the target schema.
This will be used as part of an automated data migration process so your mapping confidence level describes how confident you are that the data in the source field could be directly loaded into this target field without modification and it would make logical sense and contextual sense for the data to be put into that target field.

Here is some information about the source fields:
The fields from the source schema are also custom complex nested objects. They will have two levels of nesting, for example: Contact.Preference.Method
Similar to the target field, each layer of nesting of the source fields is an important consideration for whether these source fields will map well to the target field.
Similar to the target field, you may also be given the data types of the source fields. These are the standard types (e.g. string, int, boolean, dateTime, etc.) of the lowest level of nesting for each source field. These are also an important consideration for the mapping.

Here is some information about the target field:
The field from the target schema is a custom complex nested object. It will be at a minimum one level of nesting, for example {target_level_1}.id, and at a maximum 4 layers of nesting, for example {target_level_1}.namedPerson.namedDriver.email.emailAddress.
Each layer of nesting is a vary important consideration for whether the source fields will map well to this target field, for example, consider the target fields Client.person.dateOfBirth and Policy.namedPerson.namedPerson (Motor).dateOfBirth. Although these have the same value for their lowest level of nesting in the field type (dateOfBirth), as first is referencing the date of birth of the client of the policy because its top level object is Client, and the second is referring to a named driver thats been added to a motor policy (which is not necessarily the same person), because its top level object is Policy. This example is to show you that its VERY important that you consider the ENTIRE nested structure of the target field to decide on the mapping confidence level.
In addition, you may also be given the data type of the target field. This is the standard type (e.g. string, int, boolean, dateTime, etc.) of the lowest level of nesting for the field. This is also an important consideration for the mapping.
In addition, there may also be a description of the target field. This is not present in all cases, but if it is present then please use this to help you better understand what this target field is referring to.
You may also get additional information such as the possible accepted values for this target field, any validation logic for this field, or other information. Please use this information when making your mapping decision. 

You are being shown shown multiple fields in the source schema which are here:
{unmapped_source_string_group}

And one field from the target schema which is here:
{target_string_row}

Here is some information about how you should go about this job:
Based on your knowledge of the insurance industry, home insurance, motor insurance, pets, pet insurance, and other related insurance industry concepts and data structures, you will provide a mapping confidence level for each of the source fields that describes how well you think that source field would map to the target field.
You must think very carefully about the mapping confidence level you apply for each source field, as it will be used in later process steps to implement the automated data migration pipelines, so any inaccuracies lead to very costly errors.
Remember that you need to consider EVERY NESTED LAYER of both the target field and the source fields to comprehend what kind of information they each represent, and therefore whether they are a good or bad mapping.
As the value of the top level of the nested object of the target field is {target_level_1}, this means that the target field is only relevant to 
{target_relevance}
So you should only give high confidence mapping levels to source fields that also refer to {target_level_1} data. If you think the source field is not referring specifically to {target_level_1} data, you should not give a high mapping confidence level.
You must also give a detailed reason for why you decided on that mapping confidence level.

The mapping confidence level you will apply must be a number between 1 and 5 where:
1 means there is a no chance that the source field matches the target field
2 means there is a small chance the source field matches the target field
3 means there is a good chance the source field matches the target field
4 means there is a very good chance the source field matches the target field
5 means there is a very very good chance the source field matches the target field

You should decide on a mapping confidence level for each of the source fields, then set the mapping confidence level for each field using and use the value for source_field_unique_ref for each source field to reference it with its corresponding mapping confidence level as well as the reason for why you gave that confidence level.
Then YOU MUST USE the available function set_source_field_mapping_confidence_levels in the set_source_field_mapping_confidence_levels_tool to set your mappings confidence level for each of the source fields.
YOU MUST USE THIS FUNCTION."""


************************************
11 source fields in group
source_field: dbo.error_tracker_archive.notification_sent; data_type: bit; source_field_unique_ref: 788
source_field: dbo.error_tracker_archive.id; data_type: int; source_field_unique_ref: 100
source_field: dbo.error_tracker_archive.product_id; data_type: int; source_field_unique_ref: 923
source_field: dbo.error_tracker_archive.error_message; data_type: varchar; source_field_unique_ref: 567
source_field: dbo.error_tracker_archive.stack_trace; data_type: varchar; source_field_unique_ref: 1390
source_field: dbo.error_tracker_archive.error_type; data_type: varchar; source_field_unique_ref: 1389
source_field: dbo.error_tracker_archive.error_page; data_type: varchar; source_field_unique_ref: 1388
source_field: dbo.error_tracker_archive.time_stamp; data_type: datetime; source_field_unique_ref: 678
source_field: dbo.lead_failure_reason.active; data_type: bit; source_field_unique_ref: 789
source_field: dbo.lead_failure_reason.id; d

In [None]:
# Show the fully rendered prompt for inspection before it is sent to the model.
print(prompt)

In [None]:
def _request_mapping(prompt_text):
    """Send one prompt to the model with the confidence-level tool attached."""
    return model.generate_content(
        prompt_text,
        generation_config={"temperature": 0},
        tools=[set_source_field_mapping_confidence_levels_tool],
    )


def _first_function_call(response):
    """Return the function call in the first part of the first candidate, or None if there is none."""
    candidates = response.candidates
    if not candidates or not candidates[0].content.parts:
        return None
    return candidates[0].content.parts[0].function_call or None


model_response = _request_mapping(prompt)
function_call = _first_function_call(model_response)

if not function_call:
    print("did not use fn call! retrying with a more explicit prompt")
    # Build the retry prompt separately so `prompt` is not changed and
    # re-running this cell does not keep appending to it.
    retry_prompt = prompt + """
YOU MUST USE THIS FUNCTION."""
    model_response = _request_mapping(retry_prompt)
    function_call = _first_function_call(model_response)

# Stop with a clear error rather than carrying an empty result into later cells.
if not function_call:
    raise RuntimeError("Model did not return a function call, even after retrying with a more explicit prompt")

function_call_json = parse_function_call(function_call)
attributes_dict = function_call_json["attributes"]
print(f"Received mapping response from Gemini: {attributes_dict}")

In [35]:
def recurse_proto_repeated_composite(repeated_object):
    """Converts a repeated proto container into a plain Python list.

    Nested ``RepeatedComposite`` items become lists, ``MapComposite`` items
    become dictionaries (via ``recurse_proto_marshal_to_dict``), and every
    other item is kept as it is.

    Args:
        repeated_object: Iterable of items to convert.

    Returns:
        A new list with the converted items in their original order.
    """

    def _unwrap(element):
        if isinstance(element, repeated.RepeatedComposite):
            return recurse_proto_repeated_composite(element)
        if isinstance(element, maps.MapComposite):
            return recurse_proto_marshal_to_dict(element)
        return element

    return [_unwrap(element) for element in repeated_object]

def recurse_proto_marshal_to_dict(marshal_object):
    """Converts a proto map container into a plain Python dictionary.

    Entries whose value is falsy are left out. Nested ``MapComposite`` values
    become dictionaries, ``RepeatedComposite`` values become lists (via
    ``recurse_proto_repeated_composite``), and every other value is kept as it is.

    Args:
        marshal_object: Mapping-like object exposing ``items()``.

    Returns:
        A new dictionary holding the converted, non-falsy entries.
    """
    converted = {}
    for key, value in marshal_object.items():
        # Falsy values are dropped from the result.
        if not value:
            continue
        if isinstance(value, maps.MapComposite):
            converted[key] = recurse_proto_marshal_to_dict(value)
        elif isinstance(value, repeated.RepeatedComposite):
            converted[key] = recurse_proto_repeated_composite(value)
        else:
            converted[key] = value

    return converted

# Print each returned mapping confidence entry as a plain dictionary.
for attribute in attributes_dict['source_field_mapping_confidences']:
    print(recurse_proto_marshal_to_dict(attribute))

{'mapping_confidence_level': '1', 'source_field_unique_ref': 4689.0, 'mapping_confidence_level_reason': 'The data types of the source and target fields are different. The source field is smalldatetime and the target field is dateTime. This means that the source field may not be able to store all of the values that the target field can. Additionally, the source field is named applied_date, which suggests that it stores the date that a discount was applied, while the target field is named datePurchased, which suggests that it stores the date that a policy was purchased. These two concepts are not necessarily the same, so it is unlikely that the source field will map well to the target field.'}


In [14]:
# One single-row DataFrame per mapped source field: target row + source row + confidence.
df_list_for_bq_upload = []

for mapping_confidence in attributes_dict['source_field_mapping_confidences']:
    mapping_confidence_dict = recurse_proto_marshal_to_dict(mapping_confidence)
    # The reference can arrive as a float (e.g. 4689.0); normalise it to an int
    # before it is compared and printed.
    Source_Unique_Ref = int(mapping_confidence_dict['source_field_unique_ref'])
    mapping_confidence_level_int = int(mapping_confidence_dict['mapping_confidence_level'])

    source_df_row = source_df[source_df['Source_Unique_Ref'] == Source_Unique_Ref]
    if source_df_row.empty:
        # A reference that matches no source field would otherwise produce a
        # row with empty source columns; skip it and report it.
        print(f"Skipping source_field_unique_ref {Source_Unique_Ref}: no matching row in source_df")
        continue

    mapping_output = merge_dataframes_and_string(target_df_row, source_df_row, mapping_confidence_level_int)
    df_list_for_bq_upload.append(mapping_output)

In [15]:
# Preview the first merged mapping frame.
df_list_for_bq_upload[0].head()

Unnamed: 0,Original_Unique_Ref,Target_Tranche,Target_Level_1,Target_Level_2,Target_Level_3,Target_Level_4,Target_Complex_Type,Target_Attribute,Target_Description,Target_Mandatory__,...,Source_SchemaName,Source_TableName,Source_Column_Name,Source_Data_type,Source_Max_Length,Source_precision,Source_scale,Source_is_nullable,Source_Unique_Ref,Confidence_Levels
0,12,CLIENT,Client,,,,client,clientExec,The Operator responsible for this client,Optional,...,dbo,error_tracker_archive,notification_sent,bit,1,1,0,1,788,1


In [82]:
print("Prepared the mapping as a df ready for upload to bigquery")
print(f"df_list_for_bq_upload contains {len(df_list_for_bq_upload)} dataframes. Concatenating...")
# pd.concat rejects an empty list with a generic message; raise the same
# exception type with a message that names the actual problem.
if not df_list_for_bq_upload:
    raise ValueError("df_list_for_bq_upload is empty: no mappings were prepared for upload")
df_for_bq_upload_concat = pd.concat(df_list_for_bq_upload)

Prepared the mapping as a df ready for upload to bigquery
df_list_for_bq_upload contains 9 dataframes. Concatenating...


In [None]:
# Preview up to the first 50 rows of the concatenated mapping frame.
df_for_bq_upload_concat.head(50)

In [None]:
# Print the concatenated frame with a fresh 0-based index (the frame itself is not modified).
print(df_for_bq_upload_concat.reset_index(drop=True))

In [None]:
# Destination table for the mapping rows. DatasetReference replaces the
# deprecated client.dataset() helper; the project is the one the client uses.
dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
table_ref = dataset_ref.table("mapped-test")

print("Loading group_mapping into bigquery...")

# Append to the existing table and allow new columns to be added to its schema.
job_config = bigquery.LoadJobConfig(
    schema=[],
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    schema_update_options=[bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION],
)

job = client.load_table_from_dataframe(df_for_bq_upload_concat, table_ref, job_config=job_config)
job.result()  # Wait for job completion