In [2]:
import os
from google.cloud import bigquery
import pandas as pd

In [3]:
# Keep the key-file path in `credentials` and publish the same value through
# the GOOGLE_APPLICATION_CREDENTIALS environment variable.
credentials = "ETL-setup.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials

In [9]:
# Load data from the local machine into BigQuery (staging area).
def load_csv_to_bigquery(csv_path, project_id, table_name, dataset_name='staging'):
    """
    Load a local CSV file into a BigQuery table whose columns are all STRING.

    The dataset is created when it does not exist yet. The destination table
    gets one STRING column per CSV column, and its contents are replaced on
    every call, so re-running the cell neither fails nor duplicates rows.

    Args:
        csv_path (str): Path of the CSV file on the local machine.
        project_id (str): The Google Cloud Project ID.
        table_name (str): Name of the destination table.
        dataset_name (str, optional): Destination dataset. Defaults to 'staging'.

    Returns:
        None

    Raises:
        Errors from pandas or the BigQuery client propagate to the caller;
        only a missing dataset is handled here (by creating it).
    """
    # Function-scope import keeps this cell self-contained.
    from google.cloud.exceptions import NotFound

    # No explicit credentials are passed; the client uses its default lookup.
    client = bigquery.Client(project=project_id)

    # Read the CSV file into a Pandas dataframe
    df = pd.read_csv(csv_path)

    # Create the BigQuery dataset if it doesn't exist. Only "not found" means
    # the dataset is missing; any other failure must surface unchanged.
    dataset_ref = bigquery.DatasetReference(project_id, dataset_name)
    try:
        client.get_dataset(dataset_ref)
        print("Dataset {} already exists".format(dataset_name))
    except NotFound:
        print("Creating dataset {}".format(dataset_name))
        client.create_dataset(bigquery.Dataset(dataset_ref))

    # Set the destination table for the data
    table_ref = dataset_ref.table(table_name)

    # Staging keeps every column as STRING
    schema = [bigquery.SchemaField(column, 'STRING') for column in df.columns]

    # exists_ok lets the cell be re-run without a "table already exists" error
    client.create_table(bigquery.Table(table_ref, schema=schema), exists_ok=True)

    # Load the data into the table
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    # Pass the schema explicitly instead of relying on the table's stored one
    job_config.schema = schema
    job_config.autodetect = False
    # Replace previous contents so repeated runs do not append duplicate rows
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    # skip_leading_rows is deliberately left unset: the client serialises the
    # dataframe without a header row, so skipping one row would silently drop
    # the first data record.
    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()

    print("Data uploaded to BigQuery successfully.")

In [13]:
# Stage the demographic CSV export.
load_csv_to_bigquery(
    csv_path="C:\\Users\\UX501VW\\Desktop\\BigQuery-Data-Modeling\\Dastaset\\demographic.csv",
    project_id="snappy-nomad-382716",
    table_name="demographic",
)

Creating dataset staging
Data uploaded to BigQuery successfully.


In [14]:
# Stage the referral CSV export.
# NOTE(review): the table name "referal" is spelled differently from the file
# name; the later cleaning cell relies on this exact spelling.
load_csv_to_bigquery(
    csv_path="C:\\Users\\UX501VW\\Desktop\\BigQuery-Data-Modeling\\Dastaset\\referral.csv",
    project_id="snappy-nomad-382716",
    table_name="referal",
)
# Stage the transactions CSV export.
load_csv_to_bigquery(
    csv_path="C:\\Users\\UX501VW\\Desktop\\BigQuery-Data-Modeling\\Dastaset\\transactions.csv",
    project_id="snappy-nomad-382716",
    table_name="transactions",
)

Dataset staging already exists
Data uploaded to BigQuery successfully.
Dataset staging already exists
Data uploaded to BigQuery successfully.


In [7]:

def _quote_column(column):
    """Backtick-quote each segment of a (possibly nested) column path."""
    # Quoting per segment keeps paths such as "address.city" working while
    # protecting names that collide with SQL reserved words.
    return ".".join("`{}`".format(part.strip("`")) for part in column.split("."))


def _build_clean_sql(table_id, columns_to_check, remove_nulls, remove_duplicates):
    """Return the SELECT statement that produces the cleaned rows of table_id."""
    select_clause = "SELECT DISTINCT *" if remove_duplicates else "SELECT *"
    sql = f"{select_clause} FROM `{table_id}`"
    # An empty column list means there is nothing to filter on; emitting a
    # bare "WHERE" would be invalid SQL.
    if remove_nulls and columns_to_check:
        not_null = " AND ".join(
            f"{_quote_column(column)} IS NOT NULL" for column in columns_to_check
        )
        sql = f"{sql} WHERE {not_null}"
    return sql


def clean_bigquery_table(project_id, table_id,remove_nulls=False,remove_duplicates=False,columns_to_check=None):
    """
    Clean a BigQuery table by removing null values and/or duplicates.

    The cleaned rows are written to `<project_id>.<dataset>.<table>_cleaned`,
    replacing any previous contents of that table so the cell can be re-run.

    Args:
        project_id (str): The Google Cloud Project ID used for the client and
            for the destination table.
        table_id (str): The fully-qualified BigQuery table ID to clean.
        remove_nulls (bool, optional): Whether to remove rows that contain a
            null in any of `columns_to_check`. Defaults to False.
        remove_duplicates (bool, optional): Whether to remove duplicate rows.
            Rows are compared on all columns (SELECT DISTINCT *). Defaults to False.
        columns_to_check (list, optional): Columns checked for null values.
            Defaults to None (all columns). It does not affect de-duplication.

    Returns:
        None
    """
    # Bind the client to the requested project, not the environment default.
    client = bigquery.Client(project=project_id)
    # One metadata lookup provides both the schema and the dataset/table ids.
    table = client.get_table(table_id)

    if columns_to_check is None:
        columns_to_check = [field.name for field in table.schema]

    sql = _build_clean_sql(table_id, columns_to_check, remove_nulls, remove_duplicates)

    # Build the destination from project_id so the printed id matches the
    # table that is actually written.
    cleaned_name = f"{table.table_id}_cleaned"
    new_table_id = f"{project_id}.{table.dataset_id}.{cleaned_name}"
    new_table_ref = bigquery.DatasetReference(project_id, table.dataset_id).table(cleaned_name)

    # Execute the query and save the results to the new table. WRITE_TRUNCATE
    # replaces earlier results instead of failing when the table already exists.
    job_config = bigquery.QueryJobConfig(
        destination=new_table_ref,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    query_job = client.query(sql, job_config=job_config)
    query_job.result()

    print(f"Cleaned table saved as {new_table_id}.")


In [8]:
# De-duplicate the staged demographic table.
clean_bigquery_table(
    project_id="snappy-nomad-382716",
    table_id='snappy-nomad-382716.staging.demographic',
    remove_duplicates=True,
)

Cleaned table saved as snappy-nomad-382716.staging.demographic_cleaned.


In [9]:
# De-duplicate the staged referal table.
clean_bigquery_table(
    project_id="snappy-nomad-382716",
    table_id='snappy-nomad-382716.staging.referal',
    remove_duplicates=True,
)

Cleaned table saved as snappy-nomad-382716.staging.referal_cleaned.
