### Set Up Environment

In [None]:
!python3 -m venv venv
!source venv/bin/activate

In [None]:
!pip3 install "google-cloud-bigquery>=3.17"
!pip3 install "google-cloud-aiplatform>=1.38"
!pip3 install "pandas>=2.2.0"

In [None]:
import random
import time

import pandas as pd
import vertexai
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from vertexai.preview import generative_models
from vertexai.preview.generative_models import GenerativeModel

# BigQuery location of the schema tables. These are empty placeholders and
# must be filled in before the cells below are run.
project_id = ""
dataset_id = ""

# Name of the raw (as-uploaded) target schema table, and of the finalised
# target table that is created from it when it does not already exist.
raw_target_table = ""
target_table = ""

# Raw source schema tables. The wildcard is what is actually passed to
# add_unique_ref_and_create_new_table() when the finalised source table is
# created; raw_source_tables is not referenced elsewhere in the visible
# notebook.
raw_source_tables = ["source-uipetmis","source-uispet"]
raw_source_tables_wildcard = 'source*'
source_table = ""

# Initialise BQ client
client = bigquery.Client(project=project_id)

### Get Source and Target Schemas from BigQuery

To reproduce this yourself, upload the .csv files in this directory to BigQuery in your own project. Do not use the provided spreadsheets directly, as I have done some (minimal) pre-processing on them.

In [None]:
def add_unique_ref_and_create_new_table(project_id, dataset_id, raw_table, new_table, new_col):
    """Copy a raw BigQuery table into a new table with a sequential reference column.

    Reads every row of the raw table into a DataFrame, sets ``new_col`` to
    1..N in the order the rows were returned (overwriting the column if it
    already exists), and loads the result into ``new_table``. Uses the
    module-level ``client``.

    Args:
        project_id: BigQuery project ID.
        dataset_id: BigQuery dataset ID.
        raw_table: Name of the raw table to read. It is interpolated directly
            into the FROM clause, so a wildcard such as ``source*`` is passed
            through unchanged.
        new_table: Desired name of the finalised table.
        new_col: Name of the reference column for the table.

    Returns:
        The value returned by ``job.result()`` for the completed load job.
    """
    raw_query = f"""
        SELECT *  
        FROM `{project_id}.{dataset_id}.{raw_table}`
    """
    raw_df = client.query(raw_query).to_dataframe()
    raw_df[new_col] = range(1, len(raw_df) + 1)
    new_table_id = f"{project_id}.{dataset_id}.{new_table}"

    job_config = bigquery.LoadJobConfig()
    job = client.load_table_from_dataframe(raw_df, new_table_id, job_config=job_config)

    # Wait for the load to finish exactly once and hand the outcome back.
    return job.result()

In [None]:
# Source table setup: reuse the finalised source table when it exists,
# otherwise build it from the raw source tables.
source_table_ref = client.dataset(dataset_id).table(source_table)
try:
    client.get_table(source_table_ref)  # Raises NotFound if the table doesn't exist
    print(f"Source table '{source_table}' exists.")
except NotFound:
    # Only a missing table triggers creation; any other failure (credentials,
    # network, interrupt) propagates instead of being mistaken for "missing".
    print(f"Source table '{source_table}' does not exist.")
    print("creating Source table...")
    new_source_col = 'Source_Unique_Ref'
    add_unique_ref_and_create_new_table(project_id, dataset_id, raw_source_tables_wildcard, source_table, new_source_col)

source_query = f"""
    SELECT *  
    FROM `{project_id}.{dataset_id}.{source_table}`
"""
source_df = client.query(source_query).to_dataframe()
print(f"source_df length is {source_df.shape[0]}")


# Target table setup: same procedure for the target schema.
target_table_ref = client.dataset(dataset_id).table(target_table)
try:
    client.get_table(target_table_ref)  # Raises NotFound if the table doesn't exist
    print(f"Target table {target_table} exists.")
except NotFound:
    print(f"Target table {target_table} does not exist.")
    print("creating Target table...")
    new_target_col = 'Target_Unique_Ref'
    add_unique_ref_and_create_new_table(project_id, dataset_id, raw_target_table, target_table, new_target_col)

target_query = f"""
    SELECT *  
    FROM `{project_id}.{dataset_id}.{target_table}`
"""
target_df = client.query(target_query).to_dataframe()
# Report the target frame under its own name (previously mislabelled as source_df).
print(f"target_df length is {target_df.shape[0]}")

In [None]:
source_df.head()

In [None]:
target_df.head()

### Helper functions to group and format data

In [None]:
# def create_target_df_groups(df, grouping_levels):
#     """Groups a DataFrame by nested schema paths up to a specified level.

#     Args:
#         df: The DataFrame to group.
#         grouping_levels: A List of strings specifying which columns should be included in the grouping.

#     Returns:
#         A dictionary of DataFrames, where keys are the nested paths, and
#         values are DataFrames containing fields sharing that path. 
#     """

#     grouped_dfs = {}
#     # levels = ['Tranche'] + [f'Level_{i}' for i in range(1, 4)] 

#     for _, row in df.iterrows():
#         path = '.'.join(row[col] for col in grouping_levels if row[col] != 'n/a')
#         if path not in grouped_dfs:
#             grouped_dfs[path] = pd.DataFrame(columns=df.columns)  
#         grouped_dfs[path] = pd.concat([grouped_dfs[path], row.to_frame().T], ignore_index=True)

#     return grouped_dfs

def create_df_groups(df, grouping_levels):
    """Group the rows of a DataFrame by a dotted path built from selected columns.

    Args:
        df: The DataFrame to group.
        grouping_levels: List of column names whose (string) values are joined
            with '.' to form each row's group path.

    Returns:
        A dictionary of DataFrames, where keys are the dotted paths (in order
        of first appearance) and values are DataFrames, re-indexed from 0,
        containing the rows sharing that path.
    """
    # Collect the single-row frames per path first and concatenate once per
    # group, rather than re-concatenating a growing frame for every row.
    row_frames_by_path = {}
    for _, row in df.iterrows():
        path = '.'.join(row[col] for col in grouping_levels)
        row_frames_by_path.setdefault(path, []).append(row.to_frame().T)

    return {
        path: pd.concat(row_frames, ignore_index=True)
        for path, row_frames in row_frames_by_path.items()
    }

def dataframe_to_string(df):
    """Render a DataFrame as plain text: a header line plus one line per row.

    Args:
        df: The pandas DataFrame to render. Column labels must be strings.

    Returns:
        A string starting with "Column Names: ..." followed by one
        "Row: ..." line per row, every line terminated by a newline.
    """
    header_line = "Column Names: " + ', '.join(df.columns) + "\n"

    # Each row becomes its values, stringified and comma-separated.
    row_lines = [
        "Row: " + ', '.join(map(str, record)) + "\n"
        for _, record in df.iterrows()
    ]

    return header_line + ''.join(row_lines)

def chop_source_df_groups(source_df_groups, max_rows_per_group):
    """Split any oversized group into consecutive slices of bounded size.

    Groups that already fit are passed through under their original key.
    Larger groups are replaced by slices keyed "<path>_subgroup_<k>" with k
    counting from 1; the final slice holds whatever rows are left over.

    Args:
        source_df_groups: The dictionary of source dataframe groups.
        max_rows_per_group: The maximum number of rows allowed in each group.

    Returns:
        A new dictionary of dataframe groups, none longer than
        max_rows_per_group rows.
    """
    result = {}

    for group_key, frame in source_df_groups.items():
        row_total = len(frame)

        # Small enough already: keep the frame untouched.
        if row_total <= max_rows_per_group:
            result[group_key] = frame
            continue

        full_chunks, leftover = divmod(row_total, max_rows_per_group)

        for chunk_no in range(1, full_chunks + 1):
            stop = chunk_no * max_rows_per_group
            result[f"{group_key}_subgroup_{chunk_no}"] = frame.iloc[stop - max_rows_per_group:stop]

        # Rows that did not fill a whole chunk form one last subgroup.
        if leftover > 0:
            result[f"{group_key}_subgroup_{full_chunks + 1}"] = frame.iloc[-leftover:]

    return result


def merge_source_df_groups(source_df_groups, max_rows_per_group):
    """Greedily merge consecutive groups while the result stays within the limit.

    Walking the groups in dictionary order, each group absorbs the groups
    that follow it for as long as the combined row count does not exceed
    max_rows_per_group. A merged group keeps the key of its first member and
    is re-indexed from 0; a group that absorbs nothing is returned as-is.

    The input dictionary is left unmodified.

    Args:
        source_df_groups: The dictionary of source dataframe groups.
        max_rows_per_group: The maximum number of rows allowed in each group.

    Returns:
        A new dictionary of source dataframe groups with optimized sizing.
    """
    merged_source_df_groups = {}
    group_paths = list(source_df_groups)

    i = 0
    while i < len(group_paths):
        anchor_path = group_paths[i]
        combined_df = source_df_groups[anchor_path]
        i += 1

        # Absorb following groups while the combined size still fits.
        while i < len(group_paths):
            candidate_df = source_df_groups[group_paths[i]]
            if len(combined_df) + len(candidate_df) > max_rows_per_group:
                break
            combined_df = pd.concat([combined_df, candidate_df], ignore_index=True)
            i += 1

        merged_source_df_groups[anchor_path] = combined_df

    return merged_source_df_groups

def merge_small_groups(chopped_source_df_groups, small_group_threshold=3):
    """Merge small groups into the group that precedes them.

    Every group after the first whose length is at most
    small_group_threshold is appended to its predecessor (which is then
    re-indexed from 0) and removed. The first group is never merged away,
    and no upper size limit is enforced on the merged result.

    The dictionary is modified IN PLACE and the same object is returned.

    Args:
        chopped_source_df_groups: The dictionary of chopped dataframe groups.
        small_group_threshold: Largest row count still considered "small".
            Defaults to 3.

    Returns:
        The same dictionary, now containing fewer small groups.
    """
    group_paths = list(chopped_source_df_groups.keys())
    i = 1  # Start from the second group: the first has no predecessor
    while i < len(group_paths):
        current_path = group_paths[i]
        current_group_df = chopped_source_df_groups[current_path]

        if len(current_group_df) <= small_group_threshold:
            prev_path = group_paths[i - 1]
            prev_group_df = chopped_source_df_groups[prev_path]

            # Merge with the previous group
            chopped_source_df_groups[prev_path] = pd.concat([prev_group_df, current_group_df], ignore_index=True)

            # Remove the current group; do not advance, because the next
            # group has shifted into position i.
            del chopped_source_df_groups[current_path]
            group_paths.pop(i)
        else:
            i += 1

    return chopped_source_df_groups


### Prepare the subdivided groups for the source and destination schemas

This is required so that each request stays below Gemini's maximum token limits
(32K input, 2K output): https://ai.google.dev/models/gemini#model_variations

In [None]:
# Target schema: one group per full nesting path, each also rendered as text.
target_grouping_levels = ['Tranche', 'Level_1', 'Level_2', 'Level_3', 'Level_4']
target_df_groups = create_df_groups(target_df, target_grouping_levels)
target_string_groups = [
    dataframe_to_string(group_df) for group_df in target_df_groups.values()
]

print(f"Number of target schema dataframe groupings: {len(target_df_groups)}")
print(f"Number of target schema string groupings: {len(target_string_groups)}\n")
print("*************\n")

# Source schema: one group per schema/table pair, each also rendered as text.
source_grouping_levels = ['SchemaName', 'TableName']
source_df_groups = create_df_groups(source_df, source_grouping_levels)
source_string_groups = [
    dataframe_to_string(group_df) for group_df in source_df_groups.values()
]

print(f"Number of source schema dataframe groupings: {len(source_df_groups)}")
print(f"Number of source schema string groupings: {len(source_string_groups)}\n")

# Further split up the source_df_groups to make sure there is no group larger
# than maximum_fields_per_request. This prevents LLM inaccuracies when the
# number of requested field mappings is too high.
maximum_fields_per_request = 12

# NOTE(review): in the visible notebook this value is only used as a label in
# the progress messages below — confirm it is meant to match the target row
# that is selected later.
target_row_number = 42


def _report_group_sizes(stage, groups):
    """Print the number of groups and their length distribution; return the counts.

    Args:
        stage: Word describing the processing stage, used in the messages.
        groups: Dictionary of dataframe groups to summarise.

    Returns:
        Dictionary mapping group length to the number of groups of that
        length, in order of first appearance.
    """
    print(f"Row {target_row_number}: Number of {stage} source schema dataframe groupings: {len(groups)}")

    length_counts = {}
    for group_df in groups.values():
        group_length = len(group_df)
        length_counts[group_length] = length_counts.get(group_length, 0) + 1

    print(f"Row {target_row_number}: Distribution of {stage} lengths:")
    for length, count in length_counts.items():
        print(f"{count} x groups with length {length}")

    return length_counts


# Step 1: cap every group at maximum_fields_per_request rows.
chopped_source_df_groups = chop_source_df_groups(source_df_groups, maximum_fields_per_request)
chopped_length_counts = _report_group_sizes("chopped", chopped_source_df_groups)

# Step 2: pack consecutive groups together while they still fit the cap.
merged_source_df_groups = merge_source_df_groups(chopped_source_df_groups, maximum_fields_per_request)
merged_length_counts = _report_group_sizes("merged", merged_source_df_groups)

# Step 3: fold remaining small groups into their predecessor. Reported only
# after the merged statistics above, as merge_small_groups edits its argument.
combined_source_df_groups = merge_small_groups(merged_source_df_groups)
combined_length_counts = _report_group_sizes("combined", combined_source_df_groups)

# Render every group held in merged_source_df_groups as prompt-ready text.
unmapped_source_string_groups = [
    dataframe_to_string(group_df)
    for group_df in merged_source_df_groups.values()
]
print(f"Number of merged source schema string groupings: {len(unmapped_source_string_groups)}\n")

# random_number_in_source_range = random.randint(0, len(unmapped_source_string_groups)-1)
# print(f"showing randomly chosen source string group {random_number_in_source_range}:")
# random_source_string_group = unmapped_source_string_groups[random_number_in_source_range]
# print(random_source_string_group)


### Prepare Gemini

- First we create the FunctionDeclaration. This helps us get a more structured and consistent output from the LLM, which is helpful for use cases such as this one, where we are dealing with structured data. See [Function Calling](https://cloud.google.com/vertex-ai/docs/generative-ai/multimodal/function-calling)
- Then we prepare the prompt to send to Gemini. The prompt is intentionally very verbose and repetitive, and it refers directly to the declared function. Further optimisations and improvements could be made if more time were spent tuning the prompt.

In [None]:
# Model used for every mapping request in this notebook.
model = GenerativeModel("gemini-pro")

# Declaration of the "get_mappings" function the model is asked to call.
# It takes four required, index-aligned arrays: the schema, table and column
# name of each source field, plus a confidence level per field. Confidence
# levels are declared as strings restricted to "0".."10", not as numbers.
# The description texts are sent to the model as written.
get_mappings_func = generative_models.FunctionDeclaration(
  name="get_mappings",
  description="Get the mappings of source schema fields for a given target schema field. This function provides the mappings as a confidence level for each source field to the target field. A single response from this function includes mappings for each and every source field that is presented. Each seperate source field should be recorded as matching indexes in the arrays for Source_Column_Names, Source_TableNames and Source_SchemaNames, where the same index of each array describes the source field details for that field. The response MUST have array lengths for Source_Column_Names, Source_TableNames, Source_SchemaNames that are equal to the number of source fields that are shown for a mapping request. for example the values in Source_Column_Names[0] Source_TableNames[0] Source_SchemaNames[0] together describe the attributes of the first source field, with Confidence_Levels[0] being the the confidence level out of 10 for mapping this source field to the target field. Simiarly for index [1] for the second field, index [2] for the third field, and so on for each source field.",
  parameters={
      "type": "object",
      "properties": {
          "Source_SchemaNames": {
              "type": "array",
              "description": "The array of values for the field SchemaNames in the source schema that could map to the target schema with a likelihood based on the corresponding value of Confidence_Level property at the same index.",
              "items" : {
                    "type": "string"
              },
              "example": ["dbo","dbo","dbo"]
          },
          "Source_TableNames": {
              "type": "array",
              "description": "The array of values for the field TableNames in the source schema that could map to the target schema with a likelihood based on the corresponding value of Confidence_Level property at the same index.",
              "items" : {
                    "type": "string"
              },
              "example": ["user_message","CDLPolicyImportWrk","clinic"]
          },
          "Source_Column_Names": {
              "type": "array",
              "description": "The array of values for the field Column_Names in the source schema that could map to the target schema with a likelihood based on the corresponding value of Confidence_Level property at the same index.",
              "items" : {
                "type": "string"
              },
              "example": ["message_id","PremiumPrice","area_id"]
          },
          "Confidence_Levels": {
              "type": "array",
              "description": "The array of values for the confidence level out of 10 for the mappings of the source schema fields at the corresponding index. For example the array of ['3','8','2'] would mean a confidence level of 3/10 for the source>target schema mapping for the source field represented by the values in the first indexes of the Source_Column_Names, Source_TableNames and Source_SchemaNames arrays, a confidence level of 8/10 for the the source>target schema mapping for the source field represented by the values in the second indexes of the Source_Column_Names, Source_TableNames and Source_SchemaNames arrays, and so on for each of the indexes of the arrays.",
              "items" : {
                    "type": "string",
                    "enum": ["0","1","2","3","4","5","6","7","8","9","10"]
              },
              "example": ["1","2","3"]
          }
      },
      "required": [
          "Source_SchemaNames", "Source_TableNames", "Source_Column_Names", "Confidence_Levels"
      ]
  },
)

# Tool wrapper passed to model.generate_content(); it exposes only the
# get_mappings declaration.
get_mappings_tool = generative_models.Tool(
  function_declarations=[get_mappings_func]
)

### Helper functions used post LLM-response

In [None]:
def parse_function_call(function_call):
    """Turn a FunctionCall object into a plain, JSON-compatible dictionary.

    Args:
        function_call: The FunctionCall object to parse; its ``name`` and the
            key/value pairs of its ``args`` are read.

    Returns:
        A dictionary with the function name under "function_name" and a copy
        of the call arguments under "attributes".
    """
    attributes = {key: value for key, value in function_call.args.items()}
    return {
        "function_name": function_call.name,
        "attributes": attributes,
    }

def convert_dict_to_list_of_dicts(dict):
    """Converts a dictionary of lists and strings to a list of flat dictionaries.

    Values that are strings are treated as scalars and repeated in every
    output dictionary; every other value is treated as a sequence and
    indexed position by position. The number of output dictionaries is the
    length of the first sequence value, so all sequences are expected to be
    at least that long.

    Args:
        dict: The input dictionary containing lists and strings. (The
            parameter name shadows the builtin; it is kept for compatibility
            with existing callers.)

    Returns:
        A list of dictionaries, one per index of the sequence values. An
        empty list is returned when the input has no sequence values.
    """
    string_keys = []
    list_keys = []
    for key, value in dict.items():
        if isinstance(value, str):
            string_keys.append(key)
        else:
            list_keys.append(key)

    # Nothing to iterate over (e.g. an empty response): no rows to produce.
    if not list_keys:
        return []

    list_of_attribute_dicts = []
    for i in range(len(dict[list_keys[0]])):
        new_dict = {key: dict[key][i] for key in list_keys}
        new_dict.update({key: dict[key] for key in string_keys})
        list_of_attribute_dicts.append(new_dict)

    return list_of_attribute_dicts

def create_df_from_target_row_df_and_list_of_dicts(list_of_attribute_dicts, test_target_df_row):
    """
    Build a DataFrame with one copy of the target row per attribute dictionary.

    Args:
        list_of_attribute_dicts: A list of dictionaries; each key/value pair
            becomes a column assignment on that dictionary's copy of the row.
        test_target_df_row: A DataFrame holding the base columns. It is
            copied, never modified.

    Returns:
        The concatenation of all the extended copies, in input order.
    """

    def _extended_copy(attributes):
        # Work on a copy so the caller's row stays untouched.
        extended = test_target_df_row.copy()
        for column, value in attributes.items():
            extended[column] = value
        return extended

    return pd.concat([_extended_copy(attributes) for attributes in list_of_attribute_dicts])

### Iterate over all source field groups (~500) for a single target field

- First we select a single Target schema field

In [None]:
# Prepare single target field
# The double brackets keep the selection as a one-row DataFrame (not a
# Series), so it can be passed to dataframe_to_string() and copied per mapping.
test_target_df_row = target_df.iloc[[44]]
# Text rendering of the same row, embedded in every prompt.
test_target_string_row = dataframe_to_string(test_target_df_row)
# Displayed as the cell output for a quick visual check.
test_target_df_row.head()

In [None]:
# Accumulators filled by the mapping loop: the source group strings that were
# mapped successfully, and one info dict per group that failed.
# They live in their own cell, so re-running the loop cell does not reset them.
mapped_source_string_groups = []
errored_source_string_groups = []

In [None]:
# One DataFrame per successfully mapped source group, concatenated later.
df_list_for_bq_upload = []

for j, unmapped_source_string_group in enumerate(unmapped_source_string_groups):

    # dataframe_to_string() emits one "Row:" line per source field.
    field_count = unmapped_source_string_group.count('Row:')

    print("************************************")
    print(f"Attempting source field group {j}/{len(unmapped_source_string_groups)} = {j/len(unmapped_source_string_groups):.0%} ...")
    print(f"{field_count} source fields in group")

    prompt = f"""You are Data Engineer working for an insurance company. As part of a data migration project you need to assist with mapping fields in a source data schema fields in a target data schema.
    The source and desination schemas are both complex and nested.
    You will be shown 1 field in the target schema and multiple fields in the source schema.
    The mappings will not be exactly one to one: Instead of providing a one-to-one mapping for a single source schema to a single destiation schema, you will be asked to provide a confidence rating for how well you think each of the fields for the source schemas you see will map to the field for the target schema.

    The field from the target schema is described here:
    {test_target_string_row}

    The fields taken from the source schema are described here:
    {unmapped_source_string_group}

    Based on what you can see, I want you to provide a confidence level for whether any of the fields in the source schema map to the field of the target schema.
    The confidence level is a number between 0 and 10 where 0 is very unlikely and 10 is a very strong match.

    Please use the get_mappings_tool to structure your response, where each seperate field of the target schema should be described in the same index of the arrays for the Confidence_Levels, Source_Column_Names, Source_TableNames and Source_SchemaNames properties.

    You are being shown {field_count} seperate source fields, so you need to have an array length of exactly {field_count} for each of the Confidence_Levels, Source_Column_Names, Source_TableNames and Source_SchemaNames properties in the response from the get_mappings tool.

    As a general example to help you:
    assuming a field the target schema is:
    Column Names: Unique_Ref, Tranche, Level_1, Level_2, Level_3, Level_4, Complex_Type, Attribute, Description, Mandatory__, Data_Type, Accepted_Values, Validation, Drop_Down_Metaval
    Row: 382, POLICY, Policy, quotesHubLifestyleFactor, n/a, n/a, quotesHubLifestyleFactor, code, The code identifying the Lifestyle factor, Mandatory, string (30), None, None, <NA>

    and the fields for the source schema to provide a confidence level mapping for are:
    Column Names: SchemaName, TableName, Column_Name, Data_type, Max_Length, precision, scale, is_nullable
    Row: dbo, user_message, message_id, int, 4, 10, 0, 0
    Row: dbo, CDLPolicyImportWrk, PremiumPrice, money, 8, 19, 4
    Row: dbo, clinic, area_id, int, 4, 10, 0

    You should structure the response with the get_mappings tool as follows:
    Confidence_Levels = ["X", "Y", "Z"]
    Source_SchemaNames = ["dbo","dbo","dbo"]
    Source_TableNames = ["user_message","CDLPolicyImportWrk","clinic"]
    Source_Column_Names = ["message_id","PremiumPrice","area_id"]

    Where X represents your confidence out of 10 for how the sorce field dbo.user_message.message_id maps to the target field POLICY.Policy.quotesHubLifestyleFactor.quotesHubLifestyleFactor
    Where Y represents your confidence out of 10 for how the sorce field dbo.CDLPolicyImportWrk.PremiumPrice maps to the target field POLICY.Policy.quotesHubLifestyleFactor.quotesHubLifestyleFactor
    Where Z represents your confidence out of 10 for how the sorce field dbo.clinic.area_id maps to the target field POLICY.Policy.quotesHubLifestyleFactor.quotesHubLifestyleFactor

    IT IS VERY IMPORTANT THAT IF THE LENGTHS OF THE ARRAYS YOU CREATE FOR THE PARAMETERS Source_Column_Names, Source_TableNames, Source_SchemaNames AND Confidence_Levels ARE EXACTLY THE SAME AS THE NUMBER OF SOURCE FIELDS YOU ARE SHOWN.
    YOU MUST CREATE EXACTLY ONE CONFIDENCE LEVEL MAPPING FROM EACH SOURCE FIELD TO THE TARGET FIELD, AND FOLLOW THE INSTRUCTIONS EXACTLY FOR HOW TO STRUCTURE THIS INFORMATION INTO THE get_mappings TOOL.

    There is a strong chance that none of the fields provide to you will match well - remember I am only sending you a small portion of the target schema and there may be other fields that you haven't seen which map much better.
    """

    model_response_test2 = model.generate_content(
        prompt,
        generation_config={"temperature": 0},
        tools=[get_mappings_tool],
    )

    # Reset the per-group response state. Without this the error handler
    # below would report the PREVIOUS group's response when parsing fails
    # early, or raise NameError if the very first group fails.
    function_call_json = None
    attributes_dict = None

    try:
        function_call_json = parse_function_call(model_response_test2.candidates[0].content.parts[0].function_call)
        attributes_dict = function_call_json["attributes"]
        print(f"Received mapping response from Gemini: {attributes_dict}")

        # All four arrays must describe the same number of source fields.
        con1 = len(attributes_dict['Confidence_Levels']) != len(attributes_dict['Source_Column_Names'])
        con2 = len(attributes_dict['Source_Column_Names']) != len(attributes_dict['Source_TableNames'])
        con3 = len(attributes_dict['Source_TableNames']) != len(attributes_dict['Source_SchemaNames'])
        if (con1 or con2 or con3):
            print("*****UNBALANCED RESPONSE FROM GEMINI!*****")
            raise ValueError("Gemini returned arrays of unequal length")

        # ...and that number must match the fields that were actually sent.
        con4 = len(attributes_dict['Source_Column_Names']) != field_count
        if con4:
            print("*****FIELDS MAPPED BY GEMINI IS NOT EQUAL TO NUMBER OF INPUT FIELDS!*****")
            raise ValueError(
                f"Gemini mapped {len(attributes_dict['Source_Column_Names'])} fields but {field_count} were sent"
            )

        list_of_attribute_dicts = convert_dict_to_list_of_dicts(attributes_dict) #flattens the response from the LLM so we now have a list containing 1 dict per source field > destination field mapping

        if (field_count != len(list_of_attribute_dicts)):
            print("*****INCORRECT MAPPING*****")
            raise ValueError(
                f"Flattened response has {len(list_of_attribute_dicts)} mappings but {field_count} were expected"
            )

        group_mapping_output = create_df_from_target_row_df_and_list_of_dicts(list_of_attribute_dicts,test_target_df_row)
        df_list_for_bq_upload.append(group_mapping_output)
        print("Prepared the mapping as a df ready for upload to bigquery")

        mapped_source_string_groups.append(unmapped_source_string_group)

    except Exception as error:
        # Best-effort processing: record the failed group and carry on with
        # the next one instead of aborting the whole run.
        print("*******FAILED SOURCE GROUP*********")
        print(unmapped_source_string_group)
        print("*******FAILED SOURCE GROUP*********")
        print("*******ERROR*********")
        print(error)
        print("*******ERROR*********")
        print("*******GEMINI RESPONSE*********")
        print(attributes_dict)
        print("*******GEMINI RESPONSE*********")

        errored_info = {
            'unmapped_source_string_group': unmapped_source_string_group,
            'error': error,
            'function_call_json': function_call_json
        }
        errored_source_string_groups.append(errored_info)

    print(f"error count is currently {len(errored_source_string_groups)}")

In [None]:
# Combine the per-group mapping frames collected by the loop into one frame.
print(f"Loading group_mapping into bigquery...")
print(f"df_list_for_bq_upload contains {len(df_list_for_bq_upload)} dataframes. Concatenating...")
# NOTE(review): this line fails if no group was mapped successfully, because
# the list is then empty — confirm that is acceptable for this notebook.
df_for_bq_upload = pd.concat(df_list_for_bq_upload)

In [None]:
# Summary of the returned confidence column, shown as the cell output.
# NOTE(review): the function schema declares confidence levels as strings, so
# this is presumably a non-numeric summary — confirm, or cast before describing.
df_for_bq_upload['Confidence_Levels'].describe()

In [None]:
# Destination table for the mapping results; the table name is hard-coded.
dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table("mapped-test2-v2")

# Load configuration: no explicit schema is supplied, rows are appended to
# the table, and the load is allowed to add new fields to the table schema.
job_config = bigquery.LoadJobConfig(
    schema=[],
    write_disposition="WRITE_APPEND",
    schema_update_options=[bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION]
)

# Because of WRITE_APPEND, re-running this cell appends the same rows again.
job = client.load_table_from_dataframe(df_for_bq_upload, table_ref, job_config=job_config)  
job.result()  # Wait for job completion