In [15]:
import pandas as pd
from dotenv import load_dotenv
import os

In [16]:
from helpers.metrics_server import start_metrics_server, files_extracted, rows_extracted, rows_transformed, rows_validated, missing_values_detected, rows_staged, rows_processed, rows_cleaned, data_quality_issues 
from helpers.logging_utils import setup_logging

setup_logging()
load_dotenv(override=True)

start_metrics_server(port=8002)

In [6]:
import os
from scripts.extract import extract_data
from helpers.s3_utils import get_s3_client

# Initialize S3 client and other variables
s3_client = get_s3_client()
s3_bucket = os.getenv("s3_bucket")
s3_prefix = os.getenv("s3_prefix")

# Call extract_data with the necessary parameters
df = extract_data('AgencyMaster.csv', s3_client, s3_bucket, s3_prefix)
print(df.head())

2024-08-30 19:38:22,897 - root - INFO - Extracting AgencyMaster.csv from S3


   AgencyID                      AgencyName
0      2001       ADMIN FOR CHILDREN'S SVCS
1      2002       ADMIN TRIALS AND HEARINGS
2      2003             BOARD OF CORRECTION
3      2004               BOARD OF ELECTION
4      2005  BOARD OF ELECTION POLL WORKERS


In [None]:
from helpers.db_utils import redshift_engine, stage_data

engine = redshift_engine()

In [7]:
import pandas as pd

from scripts.validate import validate_and_clean_master_data, validate_and_clean_transactional_data

attributes = [
    'FiscalYear', 'PayrollNumber', 'AgencyID', 'AgencyName', 'EmployeeID', 'LastName', 'FirstName',
    'AgencyStartDate', 'WorkLocationBorough', 'TitleCode', 'TitleDescription', 'LeaveStatusasofJune30',
    'BaseSalary', 'PayBasis', 'RegularHours', 'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay'
]

df = validate_and_clean_master_data(df, attributes)

2024-08-30 19:38:27,974 - root - INFO - Validating and cleaning master data
2024-08-30 19:38:27,982 - root - INFO - Master Data Cleaning Summary: Converted non-numeric values in AgencyID to NaN.


In [8]:
df.head()

Unnamed: 0,AgencyID,AgencyName
0,2001,ADMIN FOR CHILDREN'S SVCS
1,2002,ADMIN TRIALS AND HEARINGS
2,2003,BOARD OF CORRECTION
3,2004,BOARD OF ELECTION
4,2005,BOARD OF ELECTION POLL WORKERS


In [9]:
df = extract_data('nycpayroll_2021.csv', s3_client, s3_bucket, s3_prefix)
df.head()

2024-08-30 19:38:46,547 - root - INFO - Extracting nycpayroll_2021.csv from S3


Unnamed: 0,FiscalYear,PayrollNumber,AgencyCode,AgencyName,EmployeeID,LastName,FirstName,AgencyStartDate,WorkLocationBorough,TitleCode,TitleDescription,LeaveStatusasofJune30,BaseSalary,PayBasis,RegularHours,RegularGrossPaid,OTHours,TotalOTPaid,TotalOtherPay
0,2021,996,2153,NYC HOUSING AUTHORITY,209184,MUSTACIUOLO,VITO,2/26/2018,MANHATTAN,40475,EXECUTIVE DIRECTOR,ACTIVE,258000.0,per Annum,1820,257260.3,0.0,0.0,258000.0
1,2021,996,2153,NYC HOUSING AUTHORITY,302330,RUSS,GREGORY,8/12/2019,MANHATTAN,41143,CHAIR,ACTIVE,414707.0,per Annum,1820,413518.05,0.0,0.0,500.0
2,2021,816,2129,DEPT OF HEALTH/MENTAL HYGIENE,49788,HALLAHAN,PATRICK,2/26/2018,BROOKLYN,40782,STATIONARY ENGINEER,ACTIVE,508.8,per Day,2080,132288.0,2115.25,218628.18,56616.07
3,2021,816,2129,DEPT OF HEALTH/MENTAL HYGIENE,251626,PETTIT,PATRICK,8/2/2010,MANHATTAN,40782,STATIONARY ENGINEER,ACTIVE,508.8,per Day,2080,132288.0,2152.75,218694.96,38611.82
4,2021,816,2129,DEPT OF HEALTH/MENTAL HYGIENE,364376,TELEHANY,STEPHEN,1/16/2007,QUEENS,40782,STATIONARY ENGINEER,ACTIVE,508.8,per Day,2080,132288.0,1876.25,192296.19,51160.2


In [10]:

df = validate_and_clean_transactional_data(df, attributes)
df.head()

2024-08-30 19:39:13,913 - root - INFO - Validating and cleaning transactional data


2024-08-30 19:39:14,134 - root - INFO - Transactional Data Cleaning Summary: Dropped rows with missing values in FiscalYear as it was <= 5%; Dropped rows with missing values in PayrollNumber as it was <= 5%; Dropped rows with missing values in AgencyID as it was <= 5%; Dropped rows with missing values in AgencyName as it was <= 5%; Dropped rows with missing values in EmployeeID as it was <= 5%; Dropped rows with missing values in LastName as it was <= 5%; Dropped rows with missing values in FirstName as it was <= 5%; Dropped rows with missing values in AgencyStartDate as it was <= 5%; Dropped rows with missing values in WorkLocationBorough as it was <= 5%; Dropped rows with missing values in TitleCode as it was <= 5%; Dropped rows with missing values in TitleDescription as it was <= 5%; Dropped rows with missing values in LeaveStatusasofJune30 as it was <= 5%; Dropped rows with missing values in BaseSalary as it was <= 5%; Dropped rows with missing values in PayBasis as it was <= 5%; D

Unnamed: 0,FiscalYear,PayrollNumber,AgencyID,AgencyName,EmployeeID,LastName,FirstName,AgencyStartDate,WorkLocationBorough,TitleCode,TitleDescription,LeaveStatusasofJune30,BaseSalary,PayBasis,RegularHours,RegularGrossPaid,OTHours,TotalOTPaid,TotalOtherPay
0,2021,996,2153,NYC HOUSING AUTHORITY,209184,Mustaciuolo,Vito,2018-02-26,MANHATTAN,40475,EXECUTIVE DIRECTOR,ACTIVE,258000.0,PER ANNUM,1820,257260.3,0.0,0.0,258000.0
1,2021,996,2153,NYC HOUSING AUTHORITY,302330,Russ,Gregory,2019-08-12,MANHATTAN,41143,CHAIR,ACTIVE,414707.0,PER ANNUM,1820,413518.05,0.0,0.0,500.0
2,2021,816,2129,DEPT OF HEALTH/MENTAL HYGIENE,49788,Hallahan,Patrick,2018-02-26,BROOKLYN,40782,STATIONARY ENGINEER,ACTIVE,508.8,PER DAY,2080,132288.0,2115.25,218628.18,56616.07
3,2021,816,2129,DEPT OF HEALTH/MENTAL HYGIENE,251626,Pettit,Patrick,2010-08-02,MANHATTAN,40782,STATIONARY ENGINEER,ACTIVE,508.8,PER DAY,2080,132288.0,2152.75,218694.96,38611.82
4,2021,816,2129,DEPT OF HEALTH/MENTAL HYGIENE,364376,Telehany,Stephen,2007-01-16,QUEENS,40782,STATIONARY ENGINEER,ACTIVE,508.8,PER DAY,2080,132288.0,1876.25,192296.19,51160.2


# Transformation and Ingest 

In [17]:
import pandas as pd
from dotenv import load_dotenv
import os
import logging
from helpers.logging_utils import setup_logging

from helpers.db_utils import stage_data
from scripts.transform_ingest import ensure_columns, transform_master_data, transform_transactional_data
from scripts.extract import extract_data
from helpers.s3_utils import get_s3_client
from scripts.validate import validate_and_clean_master_data, validate_and_clean_transactional_data
from helpers.db_utils import redshift_engine, stage_data 
from helpers.metrics_server import start_metrics_server, files_extracted, rows_extracted, rows_transformed, rows_validated, missing_values_detected, rows_staged, rows_processed, rows_cleaned, data_quality_issues 


setup_logging()

def transform_master_data(master_files):
    """
    Transform and stage master data from given files into dimension tables.
    
    Args:
        master_files (list of str): List of file names containing master data.
        
    Returns:
        None
    """
    # Initialize empty DataFrames with the required columns based on table schemas
    dim_employee_df = pd.DataFrame(columns=table_schemas['dim_employee'])
    dim_agency_df = pd.DataFrame(columns=table_schemas['dim_agency'])
    dim_title_df = pd.DataFrame(columns=table_schemas['dim_title'])

    # Define a mapping from file names to dimension table names and their schemas
    file_to_table_map = {
        'EmpMaster.csv': ('dim_employee', table_schemas['dim_employee']),
        'AgencyMaster.csv': ('dim_agency', table_schemas['dim_agency']),
        'TitleMaster.csv': ('dim_title', table_schemas['dim_title'])
    }

    for file_name in master_files:
        table_name, required_columns = file_to_table_map.get(file_name, (None, None))

        if table_name:
            try:
                # Extract data from the file
                df = extract_data(file_name, s3_client, s3_bucket, s3_prefix)

                # Validate and clean the master data
                df_cleaned = validate_and_clean_master_data(df, required_columns)

                # Ensure the DataFrame has all required columns
                df_cleaned = ensure_columns(df_cleaned, required_columns)

                # Append data to the appropriate dimension DataFrame
                if table_name == 'dim_employee':
                    dim_employee_df = pd.concat([
                        dim_employee_df,
                        df_cleaned.drop_duplicates()
                    ], ignore_index=True)

                elif table_name == 'dim_agency':
                    dim_agency_df = pd.concat([
                        dim_agency_df,
                        df_cleaned.drop_duplicates()
                    ], ignore_index=True)

                elif table_name == 'dim_title':
                    dim_title_df = pd.concat([
                        dim_title_df,
                        df_cleaned.drop_duplicates()
                    ], ignore_index=True)
            
            except Exception as e:
                logging.error(f"Error processing file {file_name}: {e}")
                continue

    # Stage the data into the database
    try:
        stage_data(engine, dim_employee_df, 'dim_employee')
        stage_data(engine, dim_agency_df, 'dim_agency')
        stage_data(engine, dim_title_df, 'dim_title')
    except Exception as e:
        logging.error(f"Error staging data: {e}")
        return

    total_master_rows = len(dim_employee_df) + len(dim_agency_df) + len(dim_title_df)

    # Update Prometheus metrics
    rows_transformed.set(total_master_rows)
    rows_staged.set(total_master_rows)

    # Log success message with details
    logging.info("Master data successfully transformed and staged.")
    logging.info(f" - dim_employee: {len(dim_employee_df)} rows")
    logging.info(f" - dim_agency: {len(dim_agency_df)} rows")
    logging.info(f" - dim_title: {len(dim_title_df)} rows")
    logging.info(f"Total master data staged: {total_master_rows} rows")





load_dotenv(override=True)

master_files = ['EmpMaster.csv', 'TitleMaster.csv', 'AgencyMaster.csv']

table_schemas = {
    'dim_employee': ['EmployeeID', 'FirstName', 'LastName', 'LeaveStatusasofJune30'],
    'dim_agency': ['AgencyID', 'AgencyName', 'AgencyStartDate'],
    'dim_title': ['TitleCode', 'TitleDescription'],
    'fact_payroll': ['PayrollNumber', 'EmployeeID','AgencyID', 'TitleCode','FiscalYear', 'BaseSalary', 'RegularHours',
                     'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay', 'WorkLocationBorough']
}

attributes = [
    'FiscalYear', 'PayrollNumber', 'AgencyID', 'AgencyName', 'EmployeeID', 'LastName', 'FirstName',
    'AgencyStartDate', 'WorkLocationBorough', 'TitleCode', 'TitleDescription', 'LeaveStatusasofJune30',
    'BaseSalary', 'PayBasis', 'RegularHours', 'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay'
]
dim_table_names = ['dim_employee', 'dim_title', 'dim_agency']

s3_client = get_s3_client()
engine = redshift_engine()
setup_logging()

s3_bucket = os.getenv("s3_bucket")
s3_prefix = os.getenv("s3_prefix")

transform_master_data(master_files)

In [18]:
import pandas as pd
from dotenv import load_dotenv
import os
import logging
from helpers.logging_utils import setup_logging

from helpers.db_utils import stage_data
from scripts.transform_ingest import ensure_columns, transform_master_data, transform_transactional_data
from scripts.extract import extract_data
from helpers.s3_utils import get_s3_client
from scripts.validate import validate_and_clean_master_data, validate_and_clean_transactional_data
from helpers.db_utils import redshift_engine, stage_data 
from helpers.metrics_server import start_metrics_server, files_extracted, rows_extracted, rows_transformed, rows_validated, missing_values_detected, rows_staged, rows_processed, rows_cleaned, data_quality_issues 


def transform_transactional_data(payroll_files):
    """
    Transform and stage transactional data from the given payroll files into dimension and fact tables.

    Args:
        payroll_files (list of str): List of payroll file names to process.

    Returns:
        None
    """
    # Initialize empty DataFrames with the required columns based on table schemas
    dim_employee_df = pd.DataFrame(columns=table_schemas['dim_employee'])
    dim_agency_df = pd.DataFrame(columns=table_schemas['dim_agency'])
    dim_title_df = pd.DataFrame(columns=table_schemas['dim_title'])
    fact_payroll_df = pd.DataFrame(columns=table_schemas['fact_payroll'])

    for file_name in payroll_files:
        try:
            # Extract data from the transactional files
            df = extract_data(file_name, s3_client, s3_bucket, s3_prefix)

            # Clean and validate the transactional data
            df_cleaned = validate_and_clean_transactional_data(df, attributes)

            # Update dimension DataFrames
            # For dim_employee
            dim_employee_data = df_cleaned[['EmployeeID', 'FirstName', 'LastName', 'LeaveStatusasofJune30']]
            dim_employee_df = pd.concat([
                dim_employee_df,
                ensure_columns(dim_employee_data, table_schemas['dim_employee']).drop_duplicates()
            ], ignore_index=True)
            
            # For dim_agency
            dim_agency_data = df_cleaned[['AgencyID', 'AgencyName', 'AgencyStartDate']]
            dim_agency_df = pd.concat([
                dim_agency_df,
                ensure_columns(dim_agency_data, table_schemas['dim_agency']).drop_duplicates()
            ], ignore_index=True)
            
            # For dim_title
            dim_title_data = df_cleaned[['TitleCode', 'TitleDescription']]
            dim_title_df = pd.concat([
                dim_title_df,
                ensure_columns(dim_title_data, table_schemas['dim_title']).drop_duplicates()
            ], ignore_index=True)
            
            # Prepare fact_payroll DataFrame
            fact_payroll_data = df_cleaned[['PayrollNumber', 'EmployeeID', 'AgencyID', 'TitleCode','FiscalYear', 'BaseSalary',
                                            'RegularHours', 'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay',
                                            'WorkLocationBorough']]
            fact_payroll_df = pd.concat([
                fact_payroll_df,
                ensure_columns(fact_payroll_data, table_schemas['fact_payroll']).drop_duplicates()
            ], ignore_index=True)
        
        except Exception as e:
            logging.error(f"Error processing file {file_name}: {e}")
            continue

    # Stage the data into the database
    try:
        stage_data(engine, dim_employee_df, 'dim_employee')
        stage_data(engine, dim_agency_df, 'dim_agency')
        stage_data(engine, dim_title_df, 'dim_title')
        stage_data(engine, fact_payroll_df, 'fact_payroll')
    except Exception as e:
        logging.error(f"Error staging data: {e}")
        return

    # Calculate total rows staged
    total_transactional_rows = len(fact_payroll_df)
    #total_rows = total_master_rows + total_transactional_rows

    # Update Prometheus metrics
    rows_transformed.set(len(fact_payroll_df))
    rows_staged.set(total_transactional_rows)

    # Log success message with details
    logging.info("Transactional data successfully transformed and staged.")
    logging.info(f" - fact_payroll: {total_transactional_rows} rows")
    logging.info(f"Total transactional data staged: {total_transactional_rows} rows")
    logging.info("All data successfully transformed and staged.")
    #logging.info(f" - Total master data: {total_master_rows} rows")  # Ensure total_master_rows is defined
    logging.info(f" - Total transactional data: {total_transactional_rows} rows")
    #logging.info(f"Total data staged: {total_rows} rows")


load_dotenv(override=True)

master_files = ['EmpMaster.csv', 'TitleMaster.csv', 'AgencyMaster.csv']

payroll_files = ['nycpayroll_2021.csv','nycpayroll_2020.csv']

table_schemas = {
    'dim_employee': ['EmployeeID', 'FirstName', 'LastName', 'LeaveStatusasofJune30'],
    'dim_agency': ['AgencyID', 'AgencyName', 'AgencyStartDate'],
    'dim_title': ['TitleCode', 'TitleDescription'],
    'fact_payroll': ['PayrollNumber', 'EmployeeID','AgencyID', 'TitleCode','FiscalYear', 'BaseSalary', 'RegularHours',
                     'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay', 'WorkLocationBorough']
}

attributes = [
    'FiscalYear', 'PayrollNumber', 'AgencyID', 'AgencyName', 'EmployeeID', 'LastName', 'FirstName',
    'AgencyStartDate', 'WorkLocationBorough', 'TitleCode', 'TitleDescription', 'LeaveStatusasofJune30',
    'BaseSalary', 'PayBasis', 'RegularHours', 'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay'
]
dim_table_names = ['dim_employee', 'dim_title', 'dim_agency']

s3_client = get_s3_client()
engine = redshift_engine()
setup_logging()

s3_bucket = os.getenv("s3_bucket")
s3_prefix = os.getenv("s3_prefix")


transform_transactional_data(payroll_files)

In [10]:
import pandas as pd
import os
import logging
from dotenv import load_dotenv

from helpers.logging_utils import setup_logging
from helpers.db_utils import stage_data, redshift_engine
from scripts.transform_ingest import ensure_columns, transform_master_data, transform_transactional_data
from scripts.extract import extract_data
from scripts.validate import validate_and_clean_master_data, validate_and_clean_transactional_data
from helpers.s3_utils import get_s3_client
from helpers.metrics_server import (
    start_metrics_server, files_extracted, rows_extracted, rows_transformed, rows_validated, 
    missing_values_detected, rows_staged, rows_processed, rows_cleaned, data_quality_issues
)

def transform_transactional_data(payroll_files):
    """
    Transform and stage transactional data from the given payroll files into dimension and fact tables.

    Args:
        payroll_files (list of str): List of payroll file names to process.

    Returns:
        None
    """
    # Initialize empty DataFrames with the required columns based on table schemas
    dim_employee_df = pd.DataFrame(columns=table_schemas['dim_employee'])
    dim_agency_df = pd.DataFrame(columns=table_schemas['dim_agency'])
    dim_title_df = pd.DataFrame(columns=table_schemas['dim_title'])
    fact_payroll_df = pd.DataFrame(columns=table_schemas['fact_payroll'])

    for file_name in payroll_files:
        try:
            # Extract data from the transactional files
            df = extract_data(s3_client, s3_bucket, s3_prefix, file_name)

            # Clean and validate the transactional data
            df_cleaned = validate_and_clean_transactional_data(df, attributes)

            # Update dimension DataFrames
            # For dim_employee
            dim_employee_data = df_cleaned[['EmployeeID', 'FirstName', 'LastName', 'LeaveStatusasofJune30']]
            dim_employee_df = pd.concat([
                dim_employee_df,
                ensure_columns(dim_employee_data, table_schemas['dim_employee']).drop_duplicates()
            ], ignore_index=True)
            
            # For dim_agency
            dim_agency_data = df_cleaned[['AgencyID', 'AgencyName', 'AgencyStartDate']]
            dim_agency_df = pd.concat([
                dim_agency_df,
                ensure_columns(dim_agency_data, table_schemas['dim_agency']).drop_duplicates()
            ], ignore_index=True)
            
            # For dim_title
            dim_title_data = df_cleaned[['TitleCode', 'TitleDescription']]
            dim_title_df = pd.concat([
                dim_title_df,
                ensure_columns(dim_title_data, table_schemas['dim_title']).drop_duplicates()
            ], ignore_index=True)
            
            # Prepare fact_payroll DataFrame
            fact_payroll_data = df_cleaned[['PayrollNumber', 'EmployeeID', 'AgencyID', 'TitleCode','FiscalYear', 'BaseSalary',
                                            'RegularHours', 'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay',
                                            'WorkLocationBorough']]
            fact_payroll_df = pd.concat([
                fact_payroll_df,
                ensure_columns(fact_payroll_data, table_schemas['fact_payroll']).drop_duplicates()
            ], ignore_index=True)
        
        except Exception as e:
            logging.error(f"Error processing file {file_name}: {e}")
            continue

    # Stage the data into the database
    try:
        stage_data(engine, dim_employee_df, 'dim_employee')
        stage_data(engine, dim_agency_df, 'dim_agency')
        stage_data(engine, dim_title_df, 'dim_title')
        stage_data(engine, fact_payroll_df, 'fact_payroll')
    except Exception as e:
        logging.error(f"Error staging data: {e}")
        return

    # Calculate total rows staged
    total_transactional_rows = len(fact_payroll_df)

    # Update Prometheus metrics
    rows_transformed.set(len(fact_payroll_df))
    rows_staged.set(total_transactional_rows)

    # Log success message with details
    logging.info("Transactional data successfully transformed and staged.")
    logging.info(f" - fact_payroll: {total_transactional_rows} rows")
    logging.info(f"Total transactional data staged: {total_transactional_rows} rows")
    logging.info("All data successfully transformed and staged.")


def main():
    load_dotenv(override=True)
    
    master_files = ['EmpMaster.csv', 'TitleMaster.csv', 'AgencyMaster.csv']
    payroll_files = ['nycpayroll_2021.csv','nycpayroll_2020.csv']

    global table_schemas, attributes, s3_client, engine, s3_bucket, s3_prefix
    table_schemas = {
        'dim_employee': ['EmployeeID', 'FirstName', 'LastName', 'LeaveStatusasofJune30'],
        'dim_agency': ['AgencyID', 'AgencyName', 'AgencyStartDate'],
        'dim_title': ['TitleCode', 'TitleDescription'],
        'fact_payroll': ['PayrollNumber', 'EmployeeID','AgencyID', 'TitleCode','FiscalYear', 'BaseSalary', 'RegularHours',
                        'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay', 'WorkLocationBorough']
    }

    attributes = [
        'FiscalYear', 'PayrollNumber', 'AgencyID', 'AgencyName', 'EmployeeID', 'LastName', 'FirstName',
        'AgencyStartDate', 'WorkLocationBorough', 'TitleCode', 'TitleDescription', 'LeaveStatusasofJune30',
        'BaseSalary', 'PayBasis', 'RegularHours', 'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay'
    ]

    s3_client = get_s3_client()
    engine = redshift_engine()
    setup_logging()

    s3_bucket = os.getenv("s3_bucket")
    s3_prefix = os.getenv("s3_prefix")

    # Validate environment variables
    if not s3_bucket or not s3_prefix:
        logging.error("S3_BUCKET or S3_PREFIX environment variables are not set.")
        return

    # Start processing
    transform_transactional_data(payroll_files)


if __name__ == "__main__":
    main()


ERROR:root:Failed to extract <botocore.client.S3 object at 0x11b77e190>: 'str' object has no attribute 'get_object'
ERROR:root:Error processing file nycpayroll_2021.csv: 'str' object has no attribute 'get_object'
ERROR:root:Failed to extract <botocore.client.S3 object at 0x11b77e190>: 'str' object has no attribute 'get_object'
ERROR:root:Error processing file nycpayroll_2020.csv: 'str' object has no attribute 'get_object'


In [8]:
import subprocess
import logging
import os

def dbt_trigger():
    """
    Trigger the DBT process for further transformations and loading data into the final warehouse.

    This function assumes that the DBT environment is already set up and that the necessary
    DBT commands will execute successfully.
    """
    # Set the path to the dbt project directory
    dbt_project_dir = os.path.join(os.getcwd(), 'dbt')

    try:
        # Run DBT commands from the specified directory
        result = subprocess.run(
            ['dbt', 'run'],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd=dbt_project_dir  # Set the working directory to the dbt project directory
        )
        
        logging.info("DBT run successful")
        logging.info("STDOUT:")
        logging.info(result.stdout.decode())
    except subprocess.CalledProcessError as e:
        logging.error("DBT run failed")
        logging.error("STDERR:")
        logging.error(e.stderr.decode())
        raise e

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)  # Set up logging
    dbt_trigger()


ERROR:root:DBT run failed
ERROR:root:STDERR:
ERROR:root:


CalledProcessError: Command '['dbt', 'run']' returned non-zero exit status 1.

In [9]:
from scripts.dbt_trigger import dbt_trigger

dbt_trigger()

ERROR:root:DBT run failed
ERROR:root:


CalledProcessError: Command '['dbt', 'run']' returned non-zero exit status 1.