In [3]:
import pandas as pd

In [1]:
import boto3
from dotenv import load_dotenv
import logging

import os
from io import StringIO

from helpers.metrics import start_metrics_server, files_extracted, rows_extracted, rows_transformed, rows_validated, missing_values_detected, rows_staged, fact_table_created, rows_processed, rows_cleaned, data_quality_issues 
from helpers.logging_utils import setup_logging

setup_logging()
load_dotenv(override=True)

start_metrics_server(port=8001)

OSError: [Errno 48] Address already in use

# EXTRACTION STAGE

In [19]:
import os
import pandas as pd
from helpers.s3_utils import s3_client

# File definitions
master_files = ['EmpMaster.csv', 'TitleMaster.csv', 'AgencyMaster.csv']
payroll_files = ['nycpayroll_2020.csv', 'nycpayroll_2021.csv']
master_table_names = ['DimEmployee', 'DimTitle', 'DimAgency']
dim_columns = [
    ['EmployeeID', 'LastName', 'FirstName', 'LeaveStatusasofJune30'],
    ['TitleCode', 'TitleDescription'],
    ['AgencyID', 'AgencyName', 'AgencyStartDate']
]

# AWS and S3 configuration
s3_bucket = os.getenv("s3_bucket")
s3_prefix = os.getenv("s3_prefix")
aws_region = os.getenv("aws_region")
aws_access_key_id = os.getenv("aws_access_key_id")
aws_secret_access_key = os.getenv("aws_secret_access_key")


def s3_client(aws_region, aws_access_key_id, aws_secret_access_key):
    return boto3.client(
        's3',
        region_name=aws_region,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key
    )

s3_client = s3_client(aws_region, aws_access_key_id, aws_secret_access_key)

def extract_from_s3(s3_client, s3_bucket, s3_prefix, file_name):
    try:
        logging.info(f"Extracting {file_name} from S3")
        obj = s3_client.get_object(Bucket=s3_bucket, Key=s3_prefix + file_name)
        df = pd.read_csv(StringIO(obj['Body'].read().decode('utf-8')))

        # Update metrics
        files_extracted.inc()  # Increment the count of files extracted

        return df
    except Exception as e:
        logging.error(f"Failed to extract {file_name}: {str(e)}")
        raise
    
def extract_data(file_name):
    return extract_from_s3(s3_client, s3_bucket, s3_prefix, file_name)



In [20]:
df_test = extract_data('TitleMaster.csv')
df_test.head()

2024-08-26 20:17:51,759 - root - INFO - Extracting TitleMaster.csv from S3


Unnamed: 0,TitleCode,TitleDescription
0,40001,*ADM SCHOOL SECURITY MANAGER-U
1,40002,*ADMIN SCHL SECUR MGR-MGL
2,40003,*AGENCY ATTORNEY
3,40004,*ASSISTANT ADVOCATE-PD
4,40005,*ASSOCIATE EDUCATION OFFICER


In [32]:
len(df_test)

1446

In [33]:
new_df = df_test.drop_duplicates


In [35]:
new_df

<bound method DataFrame.drop_duplicates of       TitleCode                                   TitleDescription
0         40001                     *ADM SCHOOL SECURITY MANAGER-U
1         40002                          *ADMIN SCHL SECUR MGR-MGL
2         40003                                   *AGENCY ATTORNEY
3         40004                             *ASSISTANT ADVOCATE-PD
4         40005                       *ASSOCIATE EDUCATION OFFICER
...         ...                                                ...
1441      41442            DIRECTOR OF BUREAU OF CONSUMER SERVICES
1442      41443  ASSOC ADM FOR PURCHASING MATERIALS MGT & ENVIR...
1443      41444       PUBLIC HEALTH PREVENTATIVE MEDICINE RESIDENT
1444      41445                          HOUSING ASSISTANT TRAINEE
1445      41446                        SENIOR RACKETS INVESTIGATOR

[1446 rows x 2 columns]>

# TRANSFORM AND VALIDATE

from helpers.db_utils import read_table
from helpers.alert_utils import send_urgent_email
from helpers.metrics import rows_transformed, rows_validated, missing_values_detected

def validate_and_clean_data(df, dim_col):
    logging.info(f"Validating and cleaning data")

    total_rows = len(df)
    rows_validated.set(total_rows)  # Set the number of rows being validated

    for col in dim_col:
        if col not in df.columns:
            df[col] = None

    # Check for missing values
    missing = df.isnull().sum()
    missing_values_detected.set(missing.sum())  # Set the total number of missing values detected

    missing_percentage = (missing / total_rows) * 100

    # Log changes
    changes_log = []

    # Handling based on missing value percentages
    for col, pct in missing_percentage.items():
        if pct <= 5:
            df.dropna(subset=[col], inplace=True)
            changes_log.append(f"Dropped rows with missing values in {col} as it was <= 5%")
        elif 5 < pct <= 10:
            if df[col].dtype == 'object':  # Replace with 'UNKNOWN' for strings
                df[col].fillna('UNKNOWN', inplace=True)
                changes_log.append(f"Replaced missing string values in {col} with 'UNKNOWN'")
            else:
                mean_value = df[col].mean()
                df[col].fillna(mean_value, inplace=True)
                changes_log.append(f"Replaced missing numeric values in {col} with mean: {mean_value}")
        else:
            logging.error(f"Missing values in {col} exceed 10%. Manual intervention required.")
            send_urgent_email(
                subject=f"Data Quality Issue Detected in {col}",
                body=f"High percentage of missing values in {col}: {pct}%. Immediate attention required.",
                to_email="data.engineer@example.com"
            )
            raise ValueError(f"High percentage of missing values in {col}: {pct}%")

    # Anomaly detection
    for col in df.select_dtypes(include=['number']).columns:
        # Replace negative values
        if (df[col] < 0).any():
            df.loc[df[col] < 0, col] = df[col].mean()
            changes_log.append(f"Replaced negative values in {col} with mean.")

        # Replace values greater than 2 * standard deviation
        upper_bound = df[col].mean() + 2 * df[col].std()
        if (df[col] > upper_bound).any():
            df.loc[df[col] > upper_bound, col] = df[col].mean()
            changes_log.append(f"Replaced outliers in {col} (>{2} * SD) with mean.")

    # Log all changes made to a table or a file
    logging.info("Data Cleaning Summary: " + "; ".join(changes_log))

    df.drop_duplicates(inplace=True)
    rows_transformed.set(len(df))  # Set the number of rows transformed

    return df

def transform_master_data(df, required_columns):
    return validate_and_clean_data(df, required_columns)

def transform_transactional_data(df, engine):
    df = validate_and_clean_data(df, ['EmployeeID', 'AgencyID', 'TitleCode'])

    dim_employee = read_table(engine, 'DimEmployee')
    df = pd.merge(df, dim_employee[['EmployeeID']], on='EmployeeID', how='left')

    dim_agency = read_table(engine, 'DimAgency')
    df = pd.merge(df, dim_agency[['AgencyID']], on='AgencyID', how='left')

    dim_title = read_table(engine, 'DimTitle')
    df = pd.merge(df, dim_title[['TitleCode']], on='TitleCode', how='left')

    return df


# LOADING AND INGESTION STAGE

In [17]:
from helpers.db_utils import redshift_engine
redshift_engine()

2024-08-25 15:12:29,483 - root - INFO - Successfully created Redshift engine.


Engine(redshift+psycopg2://ridwanclouds:***@payroll-workgroup.637423632863.eu-west-2.redshift-serverless.amazonaws.com:5439/payrolldb)

In [18]:
from helpers.db_utils import redshift_engine, stage_data

engine = redshift_engine()

2024-08-25 15:12:32,536 - root - INFO - Successfully created Redshift engine.


In [44]:
import pandas as pd
import logging
from helpers.metrics import rows_validated, missing_values_detected, rows_transformed, data_quality_issues

import pandas as pd
import logging

def validate_and_clean_master_data(df, dim_columns):
    logging.info("Validating and cleaning master data")

    # Record total rows before cleaning
    total_rows = len(df)
    rows_validated.set(total_rows)  # Set the number of rows being validated

    # Ensure only columns in master_columns are present, drop any extra columns
    available_columns = [col for col in dim_columns if col in df.columns]
    df = df[available_columns]

    # Initialize changes log
    changes_log = []

    # Validate and clean specific columns
    for col in df.columns:
        if col in ['EmployeeID', 'TitleCode', 'AgencyID']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            changes_log.append(f"Converted non-numeric values in {col} to NaN.")
        elif col in ['LastName', 'FirstName']:
            df[col] = df[col].str.title()  # Standardize to title case
            changes_log.append(f"Standardized {col} to title case.")

    # Check for duplicates in key columns
    key_columns = [col for col in ['EmployeeID', 'TitleCode', 'AgencyID'] if col in df.columns]
    for col in key_columns:
        if df[col].duplicated().any():
            duplicated_values = df[col][df[col].duplicated()].tolist()
            logging.error(f"Duplicate values found in {col}: {duplicated_values}")
            data_quality_issues.inc()
            df.drop_duplicates(subset=[col], keep='first', inplace=True)

    # Check for missing values
    missing = df.isnull().sum()
    missing_values_detected.set(missing.sum())  # Set the total number of missing values detected

    # Log all changes made to the master data
    logging.info("Master Data Cleaning Summary: " + "; ".join(changes_log))

    # Record the number of rows after cleaning
    rows_transformed.set(len(df))  # Set the number of rows transformed

    return df



master_columns = ['EmployeeID', 'LastName', 'FirstName', 'TitleCode', 'TitleDescription', 'AgencyID', 'AgencyName']



#df = extract_data('TitleMaster.csv')

# Validate and clean the test DataFrame
#cleaned_df = validate_and_clean_master_data(df, master_columns)
#print("\nCleaned DataFrame:")
#print(cleaned_df)

In [39]:
import pandas as pd
from helpers.alert_utils import send_urgent_email
from helpers.metrics import rows_validated, missing_values_detected, rows_transformed, data_quality_issues

def print_summary_report(df, initial_row_count, changes_log):
    # Calculate current row count
    current_row_count = len(df)

    # Calculate number of duplicates
    num_duplicates = initial_row_count - current_row_count
    
    # Calculate missing values
    missing_values = df.isnull().sum()
    total_missing_values = missing_values.sum()
    
    # Missing values by column
    missing_values_report = missing_values[missing_values > 0]

    # Count of cleaned data actions
    cleaned_actions = [action for action in changes_log if 'Replaced' in action or 'Standardized' in action]

    # Print summary report
    print("Summary Report:")
    print(f"Initial number of rows: {initial_row_count}")
    print(f"Number of rows after cleaning: {current_row_count}")
    print(f"Number of removed duplicates: {num_duplicates}")
    print(f"Total missing values detected: {total_missing_values}")

    if not missing_values_report.empty:
        print("\nMissing values by column:")
        print(missing_values_report)
    else:
        print("No missing values detected.")

    if cleaned_actions:
        print("\nData Cleaning Actions:")
        for action in cleaned_actions:
            print(f" - {action}")
    else:
        print("No specific cleaning actions were performed.")


def harmonize_columns(df):
    # Map variations to a consistent column name
    column_mapping = {
        'AgencyCode': 'AgencyID',  # Harmonize AgencyCode to AgencyID
    }
    
    # Rename columns according to the mapping
    df.rename(columns=column_mapping, inplace=True)
    
    return df

def validate_and_clean_transactional_data(df, transaction_columns):
    logging.info("Validating and cleaning transactional data")
    
    df = harmonize_columns(df)
    
    total_rows = len(df)
    rows_validated.set(total_rows)  # Set the number of rows being validated

    # Ensure all required columns are present and drop any extra columns
    df = df[transaction_columns]

    # Check for missing values
    missing = df.isnull().sum()
    missing_values_detected.set(missing.sum())  # Set the total number of missing values detected

    missing_percentage = (missing / total_rows) * 100

    changes_log = []
    rows_before_cleaning = len(df)

    # Handling missing values based on percentage
    for col, pct in missing_percentage.items():
        if pct <= 5:
            df.dropna(subset=[col], inplace=True)
            changes_log.append(f"Dropped rows with missing values in {col} as it was <= 5%")
        elif 5 < pct <= 10:
            if df[col].dtype == 'object':
                df[col].fillna('UNKNOWN', inplace=True)
                changes_log.append(f"Replaced missing string values in {col} with 'UNKNOWN'")
            else:
                mean_value = df[col].mean()
                df[col].fillna(mean_value, inplace=True)
                changes_log.append(f"Replaced missing numeric values in {col} with mean: {mean_value}")
        else:
            logging.error(f"Missing values in {col} exceed 10%. Manual intervention required.")
            send_urgent_email(
                subject=f"Data Quality Issue Detected in {col}",
                body=f"High percentage of missing values in {col}: {pct}%. Immediate attention required.",
                to_email="data.engineer@example.com"
            )
            data_quality_issues.inc()
            raise ValueError(f"High percentage of missing values in {col}: {pct}%")

    # Handle anomalies for key columns separately
    key_columns = ['EmployeeID', 'TitleCode', 'AgencyID', 'PayrollNumber']
    for col in key_columns:
        if col in df.columns:
            # Convert non-numeric values to NaN for key columns
            df[col] = pd.to_numeric(df[col], errors='coerce')
            changes_log.append(f"Converted non-numeric values in {col} to NaN.")
            # Fill missing values with NaN for key columns
            df[col].fillna(pd.NA, inplace=True)

    # Handle anomalies for measure columns
    measure_columns = ['BaseSalary', 'RegularHours', 'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay']
    for col in measure_columns:
        if col in df.columns:
            # Replace negative values with their positive equivalents
            if (df[col] < 0).any():
                df.loc[df[col] < 0, col] = df[col].abs()
                changes_log.append(f"Replaced negative values in {col} with their positive equivalents.")
            
            # Handle outliers in measure columns
            mean_value = df[col].mean()
            std_dev = df[col].std()
            upper_bound = mean_value + 2 * std_dev
            lower_bound = mean_value - 2 * std_dev

            outliers = (df[col] < lower_bound) | (df[col] > upper_bound)
            if outliers.any():
                df.loc[outliers, col] = mean_value  # Replace outliers with mean
                changes_log.append(f"Replaced outliers in {col} with mean value: {mean_value}")

    # Handle outliers in 'FiscalYear' column
    if 'FiscalYear' in df.columns:
        df['FiscalYear'] = pd.to_numeric(df['FiscalYear'], errors='coerce')
        mean_value = df['FiscalYear'].mean()
        std_dev = df['FiscalYear'].std()
        upper_bound = mean_value + 2 * std_dev
        lower_bound = mean_value - 2 * std_dev

        outliers = (df['FiscalYear'] < lower_bound) | (df['FiscalYear'] > upper_bound)
        if outliers.any():
            most_frequent_year = df['FiscalYear'].mode()[0]  # Get the most frequent year
            df.loc[outliers, 'FiscalYear'] = most_frequent_year
            changes_log.append(f"Replaced outlier FiscalYear values with most frequent year: {most_frequent_year}")

    # Standardize name columns
    if 'FirstName' in df.columns:
        df['FirstName'] = df['FirstName'].str.title()
        changes_log.append("Standardized FirstName to title case.")

    if 'LastName' in df.columns:
        df['LastName'] = df['LastName'].str.title()
        changes_log.append("Standardized LastName to title case.")

    # Standardize categorical columns
    categorical_columns = ['PayBasis', 'WorkLocationBorough']
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].str.upper()
            changes_log.append(f"Standardized {col} to uppercase.")

    # Standardize date columns
    if 'AgencyStartDate' in df.columns:
        df['AgencyStartDate'] = pd.to_datetime(df['AgencyStartDate'], errors='coerce')
        changes_log.append("Standardized AgencyStartDate to datetime format.")

    # Remove duplicate rows based on all columns
    df.drop_duplicates(inplace=True)

    logging.info("Transactional Data Cleaning Summary: " + "; ".join(changes_log))

    rows_transformed.set(len(df))  # Set the number of rows transformed

    return df

transaction_columns = [
    'FiscalYear', 'PayrollNumber', 'AgencyID', 'AgencyName', 'EmployeeID', 'LastName', 'FirstName',
    'AgencyStartDate', 'WorkLocationBorough', 'TitleCode', 'TitleDescription', 'LeaveStatusasofJune30',
    'BaseSalary', 'PayBasis', 'RegularHours', 'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay'
]

#df = extract_data('nycpayroll_2021.csv')

#initial_row_count = len(df)
# changes_log = [] 
#c_df = validate_and_clean_transactional_data(df, transaction_columns)

#print(c_df.head())
#print_summary_report(c_df, initial_row_count, changes_log)

In [40]:
def ensure_columns(df, columns):
    """Ensure the DataFrame has all the required columns, filling missing ones with NaN."""
    for col in columns:
        if col not in df.columns:
            df[col] = pd.NA
    return df[columns]


In [45]:
table_schemas = {
    'dim_employee': ['EmployeeID', 'FirstName', 'LastName', 'LeaveStatusasofJune30'],
    'dim_agency': ['AgencyID', 'AgencyName', 'AgencyStartDate'],
    'dim_title': ['TitleCode', 'TitleDescription'],
    'fact_payroll': ['PayrollNumber', 'EmployeeID', 'FiscalYear', 'BaseSalary', 'RegularHours',
                     'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay', 'WorkLocationBorough']
}

dim_columns = [
    ['EmployeeID', 'LastName', 'FirstName', 'LeaveStatusasofJune30'],  # Columns for dim_employee
    ['TitleCode', 'TitleDescription'],  # Columns for dim_title
    ['AgencyID', 'AgencyName', 'AgencyStartDate']  # Columns for dim_agency
]

master_files = ['EmpMaster.csv', 'TitleMaster.csv', 'AgencyMaster.csv']

dim_table_names = ['dim_employee', 'dim_title', 'dim_agency']


In [43]:
from helpers.db_utils import stage_data

def transform_master_data(master_files):
    # Initialize empty DataFrames with the required columns
    dim_employee_df = pd.DataFrame(columns=table_schemas['dim_employee'])
    dim_agency_df = pd.DataFrame(columns=table_schemas['dim_agency'])
    dim_title_df = pd.DataFrame(columns=table_schemas['dim_title'])

    # Define a mapping from file names to dimension table names and their schemas
    file_to_table_map = {
        'EmpMaster.csv': ('dim_employee', table_schemas['dim_employee']),
        'AgencyMaster.csv': ('dim_agency', table_schemas['dim_agency']),
        'TitleMaster.csv': ('dim_title', table_schemas['dim_title'])
    }

    for file_name in master_files:
        table_name, required_columns = file_to_table_map.get(file_name)

        if table_name:
            # Extract data from the file
            df = extract_data(file_name)

            # Validate and clean the master data
            df_cleaned = validate_and_clean_master_data(df, required_columns)

            # Ensure the DataFrame has all required columns
            df_cleaned = ensure_columns(df_cleaned, required_columns)

            # Append data to the appropriate dimension DataFrame
            if table_name == 'dim_employee':
                dim_employee_df = pd.concat([
                    dim_employee_df,
                    df_cleaned.drop_duplicates()
                ], ignore_index=True)

            elif table_name == 'dim_agency':
                dim_agency_df = pd.concat([
                    dim_agency_df,
                    df_cleaned.drop_duplicates()
                ], ignore_index=True)

            elif table_name == 'dim_title':
                dim_title_df = pd.concat([
                    dim_title_df,
                    df_cleaned.drop_duplicates()
                ], ignore_index=True)
    
    total_master_rows = len(dim_employee_df) + len(dim_agency_df) + len(dim_title_df)
    
    stage_data(engine, dim_employee_df, 'dim_employee')
    stage_data(engine, dim_agency_df, 'dim_agency')
    stage_data(engine, dim_title_df, 'dim_title')
    print(f"Master data successfully transformed and staged.")
    print(f" - dim_employee: {len(dim_employee_df)} rows")
    print(f" - dim_agency: {len(dim_agency_df)} rows")
    print(f" - dim_title: {len(dim_title_df)} rows")
    print(f"Total master data staged: {len(dim_employee_df) + len(dim_agency_df) + len(dim_title_df)} rows")

transform_master_data(master_files)

2024-08-25 16:18:24,156 - root - INFO - Extracting EmpMaster.csv from S3
2024-08-25 16:18:24,322 - root - INFO - Validating and cleaning master data
2024-08-25 16:18:24,338 - root - INFO - Master Data Cleaning Summary: Cleaned leading non-alphabetic characters from all string columns.; Converted non-numeric values in EmployeeID to NaN.; Standardized FirstName to title case.; Standardized LastName to title case.
2024-08-25 16:18:24,358 - root - INFO - Extracting TitleMaster.csv from S3
2024-08-25 16:18:24,440 - root - INFO - Validating and cleaning master data
2024-08-25 16:18:24,449 - root - INFO - Master Data Cleaning Summary: Cleaned leading non-alphabetic characters from all string columns.; Converted non-numeric values in TitleCode to NaN.
2024-08-25 16:18:24,459 - root - INFO - Extracting AgencyMaster.csv from S3
2024-08-25 16:18:24,529 - root - INFO - Validating and cleaning master data
2024-08-25 16:18:24,536 - root - INFO - Master Data Cleaning Summary: Cleaned leading non-alph

Master data successfully transformed and staged.
 - dim_employee: 1000 rows
 - dim_agency: 153 rows
 - dim_title: 1446 rows
Total master data staged: 2599 rows


In [46]:

import pandas as pd

def transform_transactional_data(payroll_files):
    # Initialize empty DataFrames for dimensions
    dim_employee_df = pd.DataFrame(columns=table_schemas['dim_employee'])
    dim_agency_df = pd.DataFrame(columns=table_schemas['dim_agency'])
    dim_title_df = pd.DataFrame(columns=table_schemas['dim_title'])
    
    fact_payroll_df = pd.DataFrame(columns=table_schemas['fact_payroll'])

    for file_name in payroll_files:
        # Extract data from the transactional files
        df = extract_from_s3(s3_client, s3_bucket, s3_prefix, file_name)

        # Clean and validate the transactional data
        df_cleaned = validate_and_clean_transactional_data(df, transaction_columns)

        # Update dimension DataFrames
        # For dim_employee
        dim_employee_data = df_cleaned[['EmployeeID', 'FirstName', 'LastName', 'LeaveStatusasofJune30']]
        dim_employee_df = pd.concat([
            dim_employee_df,
            ensure_columns(dim_employee_data, table_schemas['dim_employee']).drop_duplicates()
        ], ignore_index=True)
        
        # For dim_agency
        dim_agency_data = df_cleaned[['AgencyID', 'AgencyName', 'AgencyStartDate']]
        dim_agency_df = pd.concat([
            dim_agency_df,
            ensure_columns(dim_agency_data, table_schemas['dim_agency']).drop_duplicates()
        ], ignore_index=True)
        
        # For dim_title
        dim_title_data = df_cleaned[['TitleCode', 'TitleDescription']]
        dim_title_df = pd.concat([
            dim_title_df,
            ensure_columns(dim_title_data, table_schemas['dim_title']).drop_duplicates()
        ], ignore_index=True)
        
        # Prepare fact_payroll DataFrame
        fact_payroll_data = df_cleaned[['PayrollNumber', 'EmployeeID', 'FiscalYear', 'BaseSalary',
                                        'RegularHours', 'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay',
                                        'WorkLocationBorough']]
        fact_payroll_df = pd.concat([
            fact_payroll_df,
            ensure_columns(fact_payroll_data, table_schemas['fact_payroll']).drop_duplicates()
        ], ignore_index=True)

    
    total_transactional_rows = len(fact_payroll_df)
    
    stage_data(engine, dim_employee_df, 'dim_employee')
    stage_data(engine, dim_agency_df, 'dim_agency')
    stage_data(engine, dim_title_df, 'dim_title')
    stage_data(engine, fact_payroll_df, 'fact_payroll')
    
    print(f"Transactional data successfully transformed and staged.")
    print(f" - fact_payroll: {len(fact_payroll_df)} rows")
    print(f"Total transactional data staged: {len(fact_payroll_df)} rows")
    total_rows = total_master_rows + total_transactional_row
    print(f"All data successfully transformed and staged.")
    print(f" - Total master data: {total_master_rows} rows")
    print(f" - Total transactional data: {total_transactional_rows} rows")
    print(f"Total data staged: {total_rows} rows")
transform_transactional_data(payroll_files)

2024-08-25 17:32:20,359 - root - INFO - Extracting nycpayroll_2020.csv from S3
2024-08-25 17:32:20,565 - root - INFO - Validating and cleaning transactional data
2024-08-25 17:32:20,754 - root - INFO - Transactional Data Cleaning Summary: Dropped rows with missing values in FiscalYear as it was <= 5%; Dropped rows with missing values in PayrollNumber as it was <= 5%; Dropped rows with missing values in AgencyID as it was <= 5%; Dropped rows with missing values in AgencyName as it was <= 5%; Dropped rows with missing values in EmployeeID as it was <= 5%; Dropped rows with missing values in LastName as it was <= 5%; Dropped rows with missing values in FirstName as it was <= 5%; Dropped rows with missing values in AgencyStartDate as it was <= 5%; Dropped rows with missing values in WorkLocationBorough as it was <= 5%; Dropped rows with missing values in TitleCode as it was <= 5%; Dropped rows with missing values in TitleDescription as it was <= 5%; Dropped rows with missing values in Leav

Transactional data successfully transformed and staged.
 - fact_payroll: 201 rows
Total transactional data staged: 201 rows


NameError: name 'total_master_rows' is not defined

In [8]:
from helpers.db_utils import redshift_engine, stage_data, create_fact_table
from helpers.metrics import rows_staged,fact_table_created
from sqlalchemy import MetaData, Table, Column, Integer, String
import logging

metadata = MetaData()
engine = redshift_engine()

def load_master_data(df, table_name, engine):
    logging.info(f"Loading master data into {table_name}")
    stage_data(engine, df, table_name)
    rows_staged.set(len(df))

def load_transactional_data(df, engine):
    logging.info(f"Loading transactional data into FactPayroll")
    create_fact_table(engine, engine.metadata)
    fact_table_created.set(1)
    stage_data(engine, df, 'FactPayroll')
    rows_staged.set(len(df))


In [None]:


def ingest_master_data():


    for file_name, table_name, dim_col in zip(master_files, master_table_names, dim_columns):
        df = extract.extract_from_s3(s3_bucket, s3_prefix, file_name)
        df_transformed = transform.transform_master_data(df, dim_col)
        stage_data(df_transformed, table_name)


def ingest_transactional_data():
    logging.info(f"Ingesting transactional data")
    s3_bucket = 'your-s3-bucket-name'
    s3_prefix = 'your-folder-prefix/'
    payroll_files = ['nycpayroll_2020.csv', 'nycpayroll_2021.csv']

    create_fact_table()

    for file_name in payroll_files:
        df = extract.extract_from_s3(s3_bucket, s3_prefix, file_name)
        df_transformed = transform.transform_transactional_data(df, engine)
        stage_data(df_transformed, 'FactPayroll')