In [1]:
import os
from google.cloud import bigquery
import pandas as pd
import json

In [2]:
credentials = os.environ["GOOGLE_APPLICATION_CREDENTIALS"] ="ETL-setup.json"

In [3]:
#load data from local machine to bigquery - staging area
def load_csv_to_bigquery(csv_path, project_id, table_name):
    dataset_name = 'staging'
    # Create a BigQuery client using your service account key file
    #credentials = service_account.Credentials.from_service_account_file(key_path)
    client = bigquery.Client(project=project_id)

    # Read the CSV file into a Pandas dataframe
    df = pd.read_csv(csv_path, encoding='ISO-8859-1')

    # Create the BigQuery dataset if it doesn't exist
    dataset_ref = client.dataset(dataset_name)
    try:
        client.get_dataset(dataset_ref)
        print("Dataset {} already exists".format(dataset_name))
    except:
        print("Creating dataset {}".format(dataset_name))
        dataset = bigquery.Dataset(dataset_ref)
        client.create_dataset(dataset)

    # Set the destination table for the data
    table_ref = dataset_ref.table(table_name)

    # Define the schema of the table
    schema = []
    for column in df.columns:
        schema.append(bigquery.SchemaField(column, 'STRING'))

    # Create the table in BigQuery
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)

    # Load the data into the table
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1
    job_config.autodetect = False # Set to True to automatically detect schema, False to use schema defined above
    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()

    print("Data uploaded to BigQuery successfully.")

In [4]:

def load_csv_to_bigquery_autodetect(csv_path, project_id, table_name):
    dataset_name = 'staging'
    # Create a BigQuery client 
    bq_client = bigquery.Client(project=project_id)

    # Create the BigQuery dataset if it doesn't exist
    dataset_ref = bq_client.dataset(dataset_name)
    try:
        bq_client.get_dataset(dataset_ref)
        print("Dataset {} already exists".format(dataset_name))
    except:
        print("Creating dataset {}".format(dataset_name))
        dataset = bigquery.Dataset(dataset_ref)
        bq_client.create_dataset(dataset)

    # Set the destination table for the data
    table_ref = dataset_ref.table(table_name)

    # Create the table in BigQuery with schema autodetection
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1
    job_config.autodetect = True # Set to True to automatically detect schema

    with open(csv_path, "rb") as source_file:
        job = bq_client.load_table_from_file(
            source_file,
            table_ref,
            job_config=job_config
        )
        job.result()  # Wait for the job to complete.

    print("Data uploaded to BigQuery successfully.")


In [5]:
def clean_bigquery_table(project_id, table_id, remove_nulls=False, remove_duplicates=False, date_columns=None, columns_to_check=None):
    """
    Clean a BigQuery table by removing null values and/or duplicates.

    Args:
        project_id (str): The Google Cloud Project ID.
        table_id (str): The BigQuery table ID.
        remove_nulls (bool, optional): Whether to remove rows with null values. Defaults to False.
        columns_to_check (list, optional): List of columns to check for null values or duplicates. Defaults to None (all columns).
        remove_duplicates (bool, optional): Whether to remove duplicate rows. Defaults to False.
        date_columns (list, optional): List of columns to convert to date format. Defaults to None.

    Returns:
        None
    """
    client = bigquery.Client()
    table_ref = client.get_table(table_id)
    table = client.get_table(table_ref)

    if columns_to_check is None:
        columns_to_check = [field.name for field in table.schema]

    sql_base = f"SELECT * FROM `{table_id}`"
    sql_conditions = []

    if remove_nulls:
        not_null_conditions = [f"{column} IS NOT NULL" for column in columns_to_check]
        sql_conditions.append(" AND ".join(not_null_conditions))

    if remove_duplicates:
        deduplicate_clause = "SELECT DISTINCT"
    else:
        deduplicate_clause = "SELECT"

    if sql_conditions:
        sql_condition = "WHERE " + " AND ".join(sql_conditions)
    else:
        sql_condition = ""

    # Handle date column transformation, and make all columns lower case
    select_columns = []
    for column in table.schema:
        if column.name in date_columns:
            select_columns.append(f"PARSE_DATE('%d-%m-%Y', REGEXP_REPLACE({column.name}, r'/', '-')) AS {column.name.lower()}")
        else:
            select_columns.append(column.name.lower())


        sql = f"{deduplicate_clause} {', '.join(select_columns)} FROM ({sql_base}) AS subquery {sql_condition}"

    # Execute the query and save the results to a new table
    new_table_id = f"{project_id}.{table_ref.dataset_id}.{table_ref.table_id}_cleaned"
    new_table_ref = client.dataset(table_ref.dataset_id).table(f"{table_ref.table_id}_cleaned")

    job_config = bigquery.QueryJobConfig(destination=new_table_ref)
    query_job = client.query(sql, job_config=job_config)
    query_job.result()

    print(f"Cleaned table saved as {new_table_id}.")


In [115]:
load_csv_to_bigquery("C:\\Users\\UX501VW\\Desktop\\BigQuery-Data-Modeling\\Dastaset\\superstore_dataset2011-2015.csv", "snappy-nomad-382716", "superstore")

Dataset staging already exists
Data uploaded to BigQuery successfully.


In [116]:
clean_bigquery_table("snappy-nomad-382716", "snappy-nomad-382716.staging.superstore", remove_nulls=True, remove_duplicates=True, date_columns=["Order_Date", "Ship_Date"], columns_to_check=["Customer_ID", "Order_Date", "Order_ID", "Product_ID"])

Cleaned table saved as snappy-nomad-382716.staging.superstore_cleaned.


In [6]:

def create_warehouse_schema_from_json(project_id, json_path):
    # Initialize BigQuery client
    client = bigquery.Client(project=project_id)

    # Load the schema information from the JSON file
    with open(json_path, 'r') as f:
        schema_info = json.load(f)

    # Create a dataset named "warehouse" (if it doesn't already exist)
    dataset_id = "warehouse"
    dataset_ref = client.dataset(dataset_id)
    try:
        client.get_dataset(dataset_ref)
    except:
        dataset = bigquery.Dataset(dataset_ref)
        dataset.location = "US"
        dataset = client.create_dataset(dataset)

    # Define fact table schema
    fact_table_name = schema_info['fact_table_name']
    fact_table_columns = [bigquery.SchemaField(field['name'], field['type'], mode=field.get('mode', 'NULLABLE')) for field in schema_info['fact_table_columns']]
    fact_table_ref = client.dataset(dataset_id).table(fact_table_name)
    fact_table = bigquery.Table(fact_table_ref, schema=fact_table_columns)
    fact_table = client.create_table(fact_table)  # API request

    # Create dimension tables
    for dimension_table_name, dimension_table_info in schema_info['dimension_tables'].items():
        dimension_table_columns = [bigquery.SchemaField(field['name'], field['type'], mode=field.get('mode', 'NULLABLE')) for field in dimension_table_info]
        dimension_table_ref = client.dataset(dataset_id).table(dimension_table_name)
        dimension_table = bigquery.Table(dimension_table_ref, schema=dimension_table_columns)
        dimension_table = client.create_table(dimension_table)  # API request

    # Create fact-dimension mapping tables
    for fact_column, dimension_map in schema_info['fact_dimension_key_map'].items():
        for dimension_column, dimension_table_name in dimension_map.items():
            mapping_table_name = f"{fact_table_name}_{dimension_table_name}_{dimension_column}"
            mapping_table_columns = [
                bigquery.SchemaField(fact_column, 'STRING', mode='REQUIRED'),
                bigquery.SchemaField(dimension_column, 'STRING', mode='REQUIRED'),
            ]
            mapping_table_ref = client.dataset(dataset_id).table(mapping_table_name)
            mapping_table = bigquery.Table(mapping_table_ref, schema=mapping_table_columns)
            mapping_table = client.create_table(mapping_table)  # API request

    print("Warehouse schema created successfully.")


In [182]:

def load_data_from_staging_to_warehouse(project_id, dataset_warehouse, dataset_staging, staging_table_id, warehouse_table_names):
    # Initialize BigQuery client
    client = bigquery.Client(project=project_id)

    # Iterate through the warehouse table names
    for warehouse_table_name in warehouse_table_names:
        # Get the schema of the warehouse table
        warehouse_table = client.get_table(f"{project_id}.{dataset_warehouse}.{warehouse_table_name}")
        warehouse_columns = [field.name for field in warehouse_table.schema]

        # Create a query to select specific columns from the staging table
        source_columns = ', '.join(warehouse_columns)
        sql = f"""
            SELECT {source_columns}
            FROM `{staging_table_id}`
        """

        # Create table references
        destination_table_ref = f"{project_id}.{dataset_warehouse}.{warehouse_table_name}"

        # Create a load job configuration
        job_config = bigquery.QueryJobConfig()
        job_config.destination = destination_table_ref
        job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

        # Run the query job
        query_job = client.query(sql, job_config=job_config)
        query_job.result()

        print(f"Data copied from {dataset_staging}.{staging_table_id} to {dataset_warehouse}.{warehouse_table_name}")


In [8]:
from google.cloud import bigquery

def load_data_from_staging_to_warehouse(project_id, dataset_warehouse, dataset_staging, staging_table_id, warehouse_table_names):
    # Initialize BigQuery client
    client = bigquery.Client(project=project_id)

    # Iterate through the warehouse table names
    for warehouse_table_name in warehouse_table_names:
        # Get the schema of the warehouse table
        warehouse_table = client.get_table(f"{project_id}.{dataset_warehouse}.{warehouse_table_name}")
        warehouse_columns = [field.name for field in warehouse_table.schema]

        # Create a query to select and cast specific columns from the staging table
        source_columns = ', '.join([f"CAST({field.name} AS {field.field_type}) AS {field.name}" for field in warehouse_table.schema])
        sql = f"""
            SELECT {source_columns}
            FROM `{project_id}.{dataset_staging}.{staging_table_id}`
        """

        # Create table references
        destination_table_ref = f"{project_id}.{dataset_warehouse}.{warehouse_table_name}"

        # Create a load job configuration
        job_config = bigquery.QueryJobConfig()
        job_config.destination = destination_table_ref
        job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

        # Run the query job
        query_job = client.query(sql, job_config=job_config)
        query_job.result()

        print(f"Data copied from {dataset_staging}.{staging_table_id} to {dataset_warehouse}.{warehouse_table_name}")


In [10]:

project_id = "snappy-nomad-382716"
dataset_warehouse = "warehouse"
dataset_staging = "staging"
staging_table_id = "superstore_cleaned"
warehouse_table_names = ["date_dim", "customer_dim", "product_dim", "sales_fact","sales_fact_customer_dim_customer_id", "sales_fact_date_dim_order_date", "sales_fact_product_dim_product_id"]
#["date_dim", "customer_dim", "product_dim", "order_fact"]

load_data_from_staging_to_warehouse(project_id, dataset_warehouse, dataset_staging, staging_table_id, warehouse_table_names)

Data copied from staging.superstore_cleaned to warehouse.date_dim
Data copied from staging.superstore_cleaned to warehouse.customer_dim
Data copied from staging.superstore_cleaned to warehouse.product_dim


BadRequest: 400 Type not found: FLOAT at [2:198]

Location: US
Job ID: 834dfd0e-dcb5-4c88-bb30-fc17a215165b


In [7]:
create_warehouse_schema_from_json("snappy-nomad-382716", "C:\\Users\\UX501VW\\Desktop\\BigQuery-Data-Modeling\\schema.json")

Warehouse schema created successfully.
