# WP-2 mdltour warning checks

<p style="color:rgb(0,162,219); font-family:Arial; font-size:16px;">Notebook Information </p>

<table style="color:rgb(88,89,91); font-family:Arial; float:left; font-size:13px; text-align:left;">
    <tr>
        <td style="color:rgb(0,90,132);font-size:13px; text-align:left;"><b>Project</b></td>
        <td style="text-align:left;">NorMITs Demand Partner </td>
    </tr>
    <tr>
        <td style="color:rgb(0,90,132);font-size:13px; text-align:left;"><b>Primary Contact Name</b></td>
        <td style="text-align:left;"> </td>
    </tr>
    <tr>
        <td style="color:rgb(0,90,132);font-size:13px; text-align:left;"><b>Primary Contact Email</b></td>
        <td style="text-align:left;"> </td>
    </tr>
    <tr>
        <td style="color:rgb(0,90,132);font-size:13px; text-align:left;"><b>Document Sensitivity</b></td>
        <td style="text-align:left;"> </td>
    </tr>
</table>

In [1]:
import pandas as pd 
import numpy as np
import os
import logging
import sys
import time
import json
from utils import ChecksLogger

In [2]:
# Module-level logger shared by every check function below; the checks call its
# info / success / warning methods and then save_logs().
logger = ChecksLogger(log_file_name='checks_log.csv')

In [110]:
# Set paths etc.

# Version of the tour model whose outputs are being checked
model_version = 'v5'
# Version of the tour model which we know has the correct schema in its
# output files, used as the reference when comparing new runs
known_correct_schema_model_version = 'v3'

# Directories
tour_dir = r'I:\NTS\outputs\tour'
imports_dir = r'I:\NTS\imports'
reports_dir = os.path.join(tour_dir, 'reports', model_version)
known_correct_schema_dir = os.path.join(tour_dir, 'reports', known_correct_schema_model_version)

# Individual output files referred to by name in the checks
mat_file = 'matrix_county_output.csv'
attr_county_file = 'attr_county_output.csv'
prod_county_file = 'prod_county_output.csv'

# Every file the tour model is expected to write to reports_dir
report_files = [attr_county_file, prod_county_file, mat_file] + [
    'nts_distr_output.csv',
    'nts_mts_output.csv',
    'nts_tour_output.csv',
]

# Lookup of expected counties, read from imports_dir
county_file = 'County_to_TAZ.csv'

# Directory the notebook is running from
script_cwd = os.getcwd()

In [180]:
# Checks to perform: set a check's flag to True to select it
checks_selection = dict(
    TM001=False,  # Tour model outputs non-zero
    TM002=False,  # Tour model outputs not negative
    TM003=False,  # NULLs/NaNs in tour model outputs
    TM004=False,
    TM005=False,  # Intrasector trips should be highest
    TM006=False,  # Input vs. output productions/attractions
    TM008=False,  # Output productions against attractions
    TM009=True,   # Commuting flow directions
    TM010=False,  # Zones/sectors in output
    TM011=False,  # Expected output files exist
    TM012=False,  # Output schemas
)

### TM011 - Check expected output files exist

Check that the specified output files have been produced in the expected location, contain data, and were last edited within a certain time window of the initial tour model script running (which is hopefully logged somewhere?) - in other words, check that these are definitely the outputs produced by the latest run. This should probably be the first check we run.


In [4]:
def check_output_files(report_files, reports_dir, time_window):
    """
    Check each expected output file exists, has data, and has been
    modified within the given time window of "now".
    
    Parameters:
    -----------
    report_files: list of strings
        List of expected tour model output files
    reports_dir: str (file path)
        Directory for expected tour model output files
    time_window: int, float or datetime.timedelta
        Maximum age of a file (time since last modification) for it to
        be considered output from the latest run. A plain number is
        interpreted as seconds.
    
    Outputs
    -----------
    None, writes outcome of checks straight to log file
    """
    checkstring = "TM011: Check output files exist"

    # The age comparison below is done in seconds, so convert a timedelta
    # (anything exposing total_seconds) rather than failing with a TypeError.
    if hasattr(time_window, 'total_seconds'):
        time_window = time_window.total_seconds()

    logger.info(checkstring, "Commencing check TM011:")

    # One reference time for the whole check so every file is judged
    # against the same "now".
    # TODO: Work out a minimum time and implement that.
    # Is there an existing run log this could be obtained from?
    current_time = time.time()

    for file in report_files:
        file_path = os.path.join(reports_dir, file)
        if not os.path.exists(file_path):
            logger.warning(checkstring, f"Warning - {file} does not exist.")
        elif os.path.getsize(file_path) <= 0:
            logger.warning(checkstring, f"Warning - {file} exists but does not contain data.")
        elif current_time - os.path.getmtime(file_path) <= time_window:
            logger.success(checkstring, f"Success - {file} exists and contains data from latest run.")
        else:
            logger.warning(checkstring, f"Warning - {file} exists and contains data, but was not produced by latest run.")

    logger.info(checkstring, "Check TM011 complete")
    logger.save_logs()

### TM012 - Check output schemas

List the expected column names and schemas in all output files, then check the outputs match these. Might end up wrapping some of the other checks into this depending on things like table shapes and how descriptive Python schemas are. Probably also wants to be quite an early check

In [5]:
def get_columns_datatypes(csv_file):
    """
    Read a CSV file and report the pandas dtype of each of its columns
    
    Parameters:
    -----------
    csv_file: str (file path)
        Path to CSV file to check
    
    Outputs:
    -----------
    columns_datatypes: dict
        Maps each column name in the CSV to the name of its dtype as a
        string
    """
    frame = pd.read_csv(csv_file)
    dtype_by_column = {}
    for column_name, column_dtype in frame.dtypes.items():
        dtype_by_column[column_name] = str(column_dtype)
    return dtype_by_column

# def process_csv_files(report_files, reports_dir, script_cwd):
def find_correct_schemas(report_files, known_correct_schema_dir, script_cwd, checkstring):
    """
    Build the expected schemas of the tour model output files from a
    reference run and write them to a JSON file
    
    Parameters:
    -----------
    report_files: list of strings
        List of expected tour model output files. Only names ending in
        '.csv' are processed
    known_correct_schema_dir: str (file path)
        Directory containing tour model output files which are known to have the correct schema
    script_cwd: str (path)
        Location the script is currently running.
        The JSON file is written to a 'checks_json_dump_temp' folder here
    checkstring: str
        String identifying the check that has called this function
    
    Outputs:
    -----------
    json_filename: str (filepath)
        Path of the JSON file holding the expected schemas. Files missing
        from the reference directory are recorded as "File not found"
    """
    schemas = {}
    csv_names = [name for name in report_files if name.endswith('.csv')]
    for name in csv_names:
        reference_path = os.path.join(known_correct_schema_dir, name)
        if os.path.exists(reference_path):
            schemas[name] = get_columns_datatypes(reference_path)
            continue

        # No reference copy of this file: record a placeholder and warn
        schemas[name] = "File not found"
        logger.warning(
            checkstring,
            f"Warning - {name} not present in directory that should contain all the expected outputs with the correct schemas."
        )
        logger.warning(
            checkstring,
            f"Are you sure {known_correct_schema_dir} is the correct directory for this?"
        )

    dumps_path = os.path.join(script_cwd, 'checks_json_dump_temp')
    if not os.path.exists(dumps_path):
        os.makedirs(dumps_path)

    json_filename = os.path.join(dumps_path, 'columns_datatypes.json')
    with open(json_filename, 'w') as json_file:
        json.dump(schemas, json_file, indent=4)

    logger.save_logs()
    return json_filename

def compare_with_json(json_filename, checkstring, folder_path, expected_files):
    """
    Compare JSON output with new CSV files to find and report any issues with schemas
    
    If the JSON file of expected schemas does not exist, an attempt is
    made to create it with find_correct_schemas (using the notebook-level
    known_correct_schema_dir and script_cwd settings) before comparing.
    
    Parameters:
    -----------
    json_filename: str (filepath)
        Path to find the json file which contains the expected schema information
    checkstring: str
        String identifying the check that has called this function
    folder_path: str (filepath)
        Directory containing the tour model outputs
    expected_files: list of strings
        Files that should exist in the tour model outputs. Used to build
        the expected schemas if the JSON file has to be created; if it is
        None or empty the notebook-level report_files list is used instead
    
    Outputs:
    -----------
    column_errors: list
        Names of files whose set of columns differs from the expected set
    datatype_errors: list
        (file name, column) tuples where a column's datatype is not as
        expected
    matching_files: list
        List of all files that match the expected schemas
    """
    column_errors = []
    datatype_errors = []
    matching_files = []

    if not os.path.isfile(json_filename):
        # If json containing expected schemas does not exist, try to create it
        logger.info(
            checkstring,
            f"Info - Specified location for json expected schema file {json_filename} did not exist, now trying to create it..."
        )
        if expected_files is not None and len(expected_files) > 0:
            files_for_schema = expected_files
        else:
            files_for_schema = report_files
        json_filename = find_correct_schemas(
            files_for_schema, known_correct_schema_dir, script_cwd, checkstring
        )

    # Only give up if the file is STILL missing; a successful creation
    # above must fall through to the comparison.
    if not os.path.isfile(json_filename):
        logger.warning(
            checkstring,
            f"Warning - Even after trying to create {json_filename} it still does not exist!"
        )
        logger.warning(
            checkstring,
            f"Warning - {checkstring} cannot be completed!"
        )
        logger.save_logs()
        return column_errors, datatype_errors, matching_files

    with open(json_filename, 'r') as f:
        json_data = json.load(f)

    for file_name, expected_columns in json_data.items():
        # find_correct_schemas stores a plain string when the reference
        # file was missing; there is nothing to compare against then.
        if not isinstance(expected_columns, dict):
            logger.warning(
                checkstring,
                f"Warning - No expected schema is available for {file_name} ({expected_columns}), so its schema cannot be checked."
            )
            continue

        file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(file_path):
            logger.warning(
                checkstring,
                f"Warning - {file_name}, which is expected to exist, is not found in the specified folder."
            )
            continue

        actual_columns = get_columns_datatypes(file_path)
        if expected_columns.keys() != actual_columns.keys():
            column_errors.append(file_name)
            continue

        # Column sets are identical here, so every expected column exists
        mismatched = [
            column for column, datatype in expected_columns.items()
            if actual_columns[column] != datatype
        ]
        if mismatched:
            datatype_errors.extend((file_name, column) for column in mismatched)
        else:
            matching_files.append(file_name)

    logger.save_logs()
    return column_errors, datatype_errors, matching_files

### TM005 - Intrasector trips should be highest

The maximum value for any given row/column in the output matrix is expected to be the intrasector cell. Allow some leeway here though - the exception is probably Air travel as a mode, and rail in some instances. Calculate the % average and flag it if it is not in line.

In [6]:
def check_intras(tour_mat, threshold):
    """
    Check if the intra-county cell is the highest for each
    mode, purpose, direction, period combination
    
    Parameters:
    -----------
    tour_mat: pandas df
        tour matrix output from the tour model. Must contain the
        columns mode, purpose, direction, period, tmz_o, tmz_d
        and trips
    threshold: float
        Proportion of the intra-county value above the
        intra-county value that the script will allow as a
        tolerance before flagging a warning
    
    Outputs:
    ----------
    None, writes warnings to log files
    """
    # Defined before the first logger call, which needs it
    checkstring = 'TM005: Intrasector trips should be highest'
    logger.info(checkstring, "Commencing check TM005:")
    
    md = tour_mat['mode'].unique()
    pr = tour_mat['purpose'].unique()
    di = tour_mat['direction'].unique()
    tp = tour_mat['period'].unique()
    og = tour_mat['tmz_o'].unique()

    n = 0

    for m in md:
        for p in pr:
            for dr in di:
                for t in tp:
                    tour_mat_filtered = tour_mat[
                        (tour_mat['mode'] == m) &
                        (tour_mat['purpose'] == p) &
                        (tour_mat['direction'] == dr) &
                        (tour_mat['period'] == t)
                    ]
                    tour_mat_filtered = tour_mat_filtered.drop(
                        columns=[
                            'mode',
                            'purpose',
                            'direction',
                            'period'
                        ])
                    for o in og:
                        # Only working 1 dimensionally to avoid duplicate warning in some cases
                        # I.e. not repeating this with all o's replaced by d's and vice versa
                        # Already generates many warnings!
                        tour_mat_o = tour_mat_filtered[
                            (tour_mat_filtered['tmz_o'] == o)
                        ]
                        tour_mat_intra = tour_mat_o[
                            (tour_mat_o['tmz_d'] == o)
                        ]['trips'].sum()

                        # Filter out cases where the mode is long distance (i.e. air and rail) or
                        # there is a small matrix total, then warn where the intrasectors are 0.
                        if (
                            m not in ('Air', 'Rail')
                            and tour_mat_o['trips'].sum() > 100
                            and tour_mat_intra <= 0
                        ):
                            logger.warning(
                                checkstring,
                                f'Warning - No {p} {m} {dr} intras in county: {o} in time period {t}'
                            )
                        elif tour_mat_intra > 0:
                            tour_mat_max = tour_mat_o['trips'].max()
                            # Only look destination by destination when some cell
                            # actually exceeds the intra-county total
                            if tour_mat_intra < tour_mat_max:
                                for d in og:
                                    d_trips = tour_mat_o[(tour_mat_o['tmz_d'] == d)]['trips'].sum()
                                    prop = d_trips / tour_mat_intra
                                    if prop > (1 + threshold):
                                        logger.warning(
                                            checkstring,
                                            f"Warning - for {p} {m} {dr} trips in time period {t}, {o} {d} trips are {prop*100:.2f}% of the intra-county trips in {o}"
                                        )

                        if n % 10000 == 0:
                            print(n) # Show progress is occurring whilst running
                        n = n + 1
    logger.info(checkstring, "Check TM005 complete")
    logger.save_logs()

### TM001 - Check tour model outputs non-zero

Check that the total in the matrix output file is > 0.
Also check that each row and column in the output matrix sums to a value > 0 (this will cover some of the checks to ensure everything is there too - the rest of this check is covered by TM010).

In [7]:
def warn_nonzero(tour_mat_filtered, m, p, dr, t, og, checkstring):
    """
    For one mode / purpose / direction / time period slice
    of the tour matrix, warn about every county whose total
    trips from it (as origin) or to it (as destination) are
    0 or less
    
    Parameters:
    -----------
    tour_mat_filtered: pandas df
        tour matrix already filtered by m, p, dr and t, to be
        filtered further here by o and d
    og: list
        List of counties to work through for o and d
    m: str
        Mode
    p:  str
        Purpose
    dr: str
        Direction (hb_fr, hb_to, nhb)
    t: int
        Time period (1-6)
    checkstring: str
        String identifying the check that has called this function
        
    Outputs:
    -----------
    None, writes warnings to log files
    """
    # Wording used in the warning for each side of the matrix
    side_wording = {'tmz_o': 'from', 'tmz_d': 'to'}

    for od_col, od_str in side_wording.items():
        zone_values = tour_mat_filtered[od_col]
        for od in og:
            od_trips = tour_mat_filtered.loc[zone_values == od, 'trips'].sum()
            # <= 0 here as we have a cell level check for values < 0.
            if od_trips <= 0:
                logger.warning(
                    checkstring,
                    f"Warning - The {p} {m} {dr} trips {od_str} {od} in time period {t}, sum to 0. At this low level, it may not be an issue, but check how often this occurs."
                )
    logger.save_logs()

In [8]:
def check_nonzero(tour_mat):
    """
    Check the matrix totals at the m, p, dr, t level
    are > 0 and also call the check at the o/d level
    beyond that
    
    Parameters:
    -----------
    tour_mat: pandas df
        tour matrix output from the tour model. Must contain
        the columns mode, purpose, direction, period, tmz_o,
        tmz_d and trips
    
    Outputs:
    ----------
    None, writes warnings to log files and prints progress
    in 10% steps
    """
    n = 0
    checkstring = 'TM001: Tour model outputs non-zero'
    logger.info(checkstring, "Commencing check TM001:")
    
    md = tour_mat['mode'].unique()
    pr = tour_mat['purpose'].unique()
    di = tour_mat['direction'].unique()
    tp = tour_mat['period'].unique()
    og = tour_mat['tmz_o'].unique()
    
    # Number of m/p/dr/t combinations, used only for progress reporting
    combos = (
        tour_mat['mode'].nunique() *
        tour_mat['purpose'].nunique() *
        tour_mat['direction'].nunique() *
        tour_mat['period'].nunique()
    )
    tenth = combos / 10
    tentot = tenth
    tencount = 0
    print('0%') # Show progress is occurring whilst running

    for m in md:
        for p in pr:
            for dr in di:
                for t in tp:
                    if n > tentot:
                        tentot = tentot + tenth
                        tencount = tencount + 10
                        print(f'{tencount}%') # Show progress is occurring whilst running
                    n = n + 1
                    tour_mat_filtered = tour_mat[
                        (tour_mat['mode'] == m) &
                        (tour_mat['purpose'] == p) &
                        (tour_mat['direction'] == dr) &
                        (tour_mat['period'] == t)
                    ]
                    tour_mat_filtered = tour_mat_filtered.drop(
                        columns=[
                            'mode',
                            'purpose',
                            'direction',
                            'period'
                        ]
                    )
                    mpdt_trips = tour_mat_filtered['trips'].sum()
                    if mpdt_trips <= 0: # <= 0 here as we have a cell level check for values < 0
                        logger.warning(
                            checkstring,
                            f"Serious Warning - The {p} {m} {dr} matrix in time period {t}, sums to 0. Is this expected?"
                        )
                        logger.save_logs()
                    else:
                        # Whole slice is non-zero, so drill down to each origin/destination
                        warn_nonzero(tour_mat_filtered, m, p, dr, t, og, checkstring)
    print('100%') # Show progress is occurring whilst running
    logger.info(checkstring, "Check TM001 complete")
    logger.save_logs()

### TM002 - Check tour model outputs not negative

Check that no cell in the matrix output is < 0

In [9]:
# Function to check for negative values and raise an error with details
def check_negative_values(tour_mat):
    """
    Check the trip totals at the m, p, dr, t, o, d
    level are not negative
    
    Parameters:
    -----------
    tour_mat: pandas df
        tour matrix output from the tour model
    
    Outputs:
    ----------
    None, writes warnings to log files
    
    Raises:
    ----------
    ValueError
        If any row has trips < 0. Every offending row is
        logged, and the logs saved, before this is raised
    """
    checkstring = 'TM002: Check tour model outputs not negative'
    logger.info(checkstring, "Commencing check TM002:")

    negative_rows = tour_mat[tour_mat['trips'] < 0]
    errors = len(negative_rows) > 0

    if errors:
        for index, row in negative_rows.iterrows():
            md = row['mode']
            pr = row['purpose']
            di = row['direction']
            tp = row['period']
            og = row['tmz_o']
            dt = row['tmz_d']
            tr = row['trips']
            logger.warning(checkstring, f"FATAL ERROR - The entry for {pr} {md} {di} trips in time period {tp}, between {og} and {dt} is {tr}, which is less than 0!")
    else:
        logger.success(checkstring, 'Success - No trips less than 0 predicted')

    # Log completion and persist the log on both paths, before any raise
    logger.info(checkstring, "Check TM002 complete")
    logger.save_logs()

    if errors:
        raise ValueError(f'Negative trips reported in {len(negative_rows)} rows. See warning log for details.')

### TM003 - Check for NULLs/NaNs in tour model outputs

Check for NULLs/NaNs in tour model outputs

In [10]:
def nullwarn(tour_mat_filtered, err_type,
             checkstring='TM003: Check for NULLs/NaNs in tour model outputs'):
    """
    Take a version of the tour matrix filtered to only
    the rows containing an error and log a warning for
    each row
    
    Parameters:
    -----------
    tour_mat_filtered: pandas df
        tour matrix output from the tour model
        filtered to only rows containing an
        error
    err_type: str
        error description
    checkstring: str, optional
        String identifying the check that has called
        this function. Defaults to the TM003 identifier
    
    Outputs:
    ----------
    None, writes warnings to log files
    """
    for index, row in tour_mat_filtered.iterrows():
        md = row['mode']
        pr = row['purpose']
        di = row['direction']
        tp = row['period']
        og = row['tmz_o']
        dt = row['tmz_d']
        logger.warning(checkstring, f"FATAL ERROR - The entry for {pr} {md} {di} trips in time period {tp}, between {og} and {dt} is {err_type}!")
    # Save once after all rows are logged rather than once per row
    logger.save_logs()

In [99]:
def check_null_values(tour_mat):
    """
    Check the trip totals at the m, p, dr, t, o, d
    level are not null, n/a or empty
    
    Parameters:
    -----------
    tour_mat: pandas df
        tour matrix output from the tour model
    
    Outputs:
    ----------
    None, writes warnings to log files
    
    Raises:
    ----------
    ValueError
        If any trips cell is null/N/A or an empty string.
        Every offending row is logged, and the logs saved,
        before this is raised
    """
    checkstring = 'TM003: Check for NULLs/NaNs in tour model outputs'
    logger.info(checkstring, "Commencing check TM003:")
    logger.save_logs()

    trips = tour_mat['trips']

    # isna() and isnull() are the same test in pandas, so missing values
    # are detected (and reported) once.
    missing_mask = trips.isna()

    # Empty strings can only occur if the column was not read as numeric
    if pd.api.types.is_numeric_dtype(trips):
        blank_mask = pd.Series(False, index=trips.index)
    else:
        blank_mask = trips.astype(str).str.strip() == ''

    errors = False
    for mask, err_type in ((missing_mask, 'NULL or N/A'), (blank_mask, 'empty')):
        bad_rows = tour_mat[mask]
        if len(bad_rows) > 0:
            errors = True
            # Rows are logged here so the check identifier is always in scope
            for index, row in bad_rows.iterrows():
                md = row['mode']
                pr = row['purpose']
                di = row['direction']
                tp = row['period']
                og = row['tmz_o']
                dt = row['tmz_d']
                logger.warning(checkstring, f"FATAL ERROR - The entry for {pr} {md} {di} trips in time period {tp}, between {og} and {dt} is {err_type}!")

    if not errors:
        logger.success(checkstring, 'Success - No Null, N/A or empty trip cells reported')

    # Log completion and persist the log on both paths, before any raise
    logger.info(checkstring, "Check TM003 complete")
    logger.save_logs()

    if errors:
        raise ValueError('Null, N/A or empty trip cells reported in tour matrix, See warning log for details.')

### TM010 - Check zones/sectors in output

Check list of zones/sectors in output against a list of zones we expect to see - will account for any zones/sectors that have been dropped entirely rather than just 0-ed (as tested for by TM001)

In [100]:
def check_zones_sectors(imports_dir, county_file, reports_dir, report_files):
    """
    Check every county is present in every output.
    Checks county names and numbers.
    Also check to see if any unspecified counties
    are present.
    
    For each output file, every county name column
    (tmz_o, tmz_d) and county number column
    (triporigua1998_b01id, tripdestua1998_b01id, taz)
    that the file contains is compared against the
    expected counties in both directions.
    
    Parameters:
    -----------
    imports_dir, str (path)
        Directory containing the county file
    county_file, str (file name)
        File containing list of expected counties, with
        columns county_de (name) and county_no (number)
    reports_dir, str (path)
        Directory containing the tour model outputs
    report_files,
        List of files in the reports directory
    
    Outputs:
    ----------
    None, writes warnings to log files
    """
    checkstring = 'TM010: Check zones/sectors in output'
    logger.info(checkstring, "Commencing check TM010:")

    expected_counties = pd.read_csv(os.path.join(imports_dir, county_file))

    county_name_cols = [
        'tmz_o',
        'tmz_d'
    ]

    county_no_cols = [
        'triporigua1998_b01id',
        'tripdestua1998_b01id',
        'taz'
    ]

    # Expected-county column -> output columns that should hold those values
    check_opts = {
        'name': ['county_de', county_name_cols],
        'number': ['county_no', county_no_cols]
    }

    for file in report_files:
        warn_file = False
        test_file_r = pd.read_csv(os.path.join(reports_dir, file))
        for expected_col, candidate_cols in check_opts.values():
            expected_values = expected_counties[[expected_col]].drop_duplicates()
            present_cols = [c for c in test_file_r.columns if c in candidate_cols]
            for column in present_cols:
                found_values = test_file_r[[column]].drop_duplicates().reset_index(drop=True)
                # indicator=True labels each value as found only in the output
                # (left_only), only in the expected list (right_only) or both,
                # so missing AND unexpected counties are both caught.
                compared = found_values.merge(
                    expected_values.rename(columns={expected_col: column}),
                    how='outer',
                    on=column,
                    indicator=True
                )
                unexpected = compared.loc[compared['_merge'] == 'left_only', column]
                missing = compared.loc[compared['_merge'] == 'right_only', column]

                for err_county in unexpected:
                    warn_file = True
                    logger.warning(
                        checkstring,
                        f"Warning - Value '{err_county}' is unexpectedly present in column '{column}' in file: {file}"
                    )
                for err_county in missing:
                    warn_file = True
                    logger.warning(
                        checkstring,
                        f"Warning - Value '{err_county}' is unexpectedly missing from column '{column}' in file: {file}"
                    )
        if not warn_file:
            logger.success(
                checkstring,
                f"Success - All and only the expected counties present in {file}"
            )
    logger.info(checkstring, "Check TM010 complete")
    logger.save_logs()

### TM008 - Check output productions against attractions

Check the total productions of hb_fr trips match the total hb_to attractions (and vice versa?) within a tolerance. We expect most people leaving home to return home within 24 hours. Some exceptions though - overnight stays for work, holidays, etc. so won't be a perfect match. Might also be worth doing a check like this specifically with car/van modes too as we expect the cars and vans to have to return "home" and not get abandoned somewhere - less of an issue with other modes as the vehicle itself is less likely to get stuck

In [101]:
def check_prod_attract(threshold, tour_mat):
    """
    For each mode, compare the total trips leaving
    each zone (county) with the total trips arriving
    at it, and warn where the percentage difference
    (relative to the trips leaving) is above the
    threshold.
    
    Parameters:
    -----------
    threshold: str
        String ending in '%' indicating the max
        percentage difference that is permitted
        between the trips originating from and
        destined for the zone. Should be
        formatted as a numerical percentage in
        string format, e.g. '5%' or '7.25%'
    tour_mat: pandas df
        tour matrix output from the tour model
    
    Outputs:
    ----------
    None, writes warnings to log files
    """
    max_pct_diff = float(threshold.strip('%'))

    checkstring = 'TM008: Check output productions against attractions'
    logger.info(checkstring, f"Commencing check TM008:")

    for mode in list(tour_mat['mode'].unique()):
        mode_trips = tour_mat[tour_mat['mode'] == mode]

        # Total trips per zone, once as origin and once as destination
        from_totals = (
            mode_trips.groupby(['tmz_o'])['trips'].sum().reset_index()
            .rename(columns={'tmz_o': 'tmz', 'trips': 'trips_o'})
        )
        to_totals = (
            mode_trips.groupby(['tmz_d'])['trips'].sum().reset_index()
            .rename(columns={'tmz_d': 'tmz', 'trips': 'trips_d'})
        )

        # Zones present on only one side get 0 trips on the other
        balance = from_totals.merge(to_totals, how='outer', on='tmz').fillna(0)
        balance['trips_%diff'] = (
            100 * abs(balance['trips_d'] - balance['trips_o']) / balance['trips_o']
        )
        imbalanced = balance[balance['trips_%diff'] > max_pct_diff]

        if len(imbalanced) == 0:
            logger.success(
                checkstring,
                f"Success - For the {mode} mode, all county's weekly trip origins and destinations balance within a threshold of {threshold}"
            )
            continue

        for _, zone_row in imbalanced.iterrows():
            county = zone_row['tmz']
            n_from = zone_row['trips_o']
            n_to = zone_row['trips_d']
            pct_diff = zone_row['trips_%diff']
            logger.warning(
                checkstring,
                f"Warning - For the {mode} mode, there were {n_from:.0f} trips from and {n_to:.0f} trips to the {county} zone. The %age difference of {pct_diff:.2f}% is greater than the allowed threshold of {threshold}"
            )
    logger.info(checkstring, f"Check TM008 complete")
    logger.save_logs()

### TM006 - Check input productions and attractions against output productions/attractions

For county, direction, purpose, mode, etc. combinations, check the input origins match/are very close to the output ones. Ditto for destinations. Currently running at the county level with a 5% tolerance.

In [169]:
def check_in_out_prod_attract(threshold,
                              prod_county_file,
                              attr_county_file,
                              reports_dir):
    """
    Compare the county level 'input' and 'output'
    totals in the production file (by tmz_o) and in
    the attraction file (by tmz_d), warning where
    the percentage difference (relative to the
    output) is above the threshold
    
    Parameters:
    -----------
    threshold: str
        String ending in '%' indicating the max
        percentage difference that is permitted
        between the inputs and outputs of the
        productions and attractions. Should be
        formatted as a numerical percentage in
        string format, e.g. '5%' or '7.25%'
    prod_county_file: str
        Name of the file containing the
        production data
    attr_county_file: str
        Name of the file containing the
        attraction data
    reports_dir: str (path)
        Directory for expected tour model output
        files
    
    Outputs:
    ----------
    None, writes warnings to log files
    """
    tolerance = float(threshold.strip('%'))

    # (label used in messages, file to read, county column to group by)
    sources = (
        ('productions', prod_county_file, 'tmz_o'),
        ('attractions', attr_county_file, 'tmz_d'),
    )

    checkstring = 'TM006: Check input productions and attractions against output productions/attractions'
    logger.info(checkstring, f"Commencing check TM006:")

    for pa, file_name, county_col in sources:
        county_totals = (
            pd.read_csv(os.path.join(reports_dir, file_name))
            .groupby(county_col)[['input', 'output']]
            .sum()
            .reset_index()
        )
        county_totals['%diff'] = (
            100 * abs(county_totals['output'] - county_totals['input']) / county_totals['output']
        )
        breaches = county_totals[county_totals['%diff'] > tolerance]

        if len(breaches) == 0:
            logger.success(
                checkstring,
                f"Success - All the {pa} for all counties have a percentage difference of less than {threshold} between the tour model inputs and outputs"
            )
            continue

        for _, county_row in breaches.iterrows():
            county = county_row[county_col]
            total_in = county_row['input']
            total_out = county_row['output']
            pct_diff = county_row['%diff']
            logger.warning(
                checkstring,
                f"Warning - The {pa} for {county} in the tour model input are {total_in:.0f} vs. output {total_out:.0f}, with a %age diff. of {pct_diff:.2f}%, which is greater than the threshold of {threshold}"
            )

    logger.info(checkstring, f"Check TM006 complete")
    logger.save_logs()

### TM009 - Check commuting flow directions

Check the commuting hb_fr home productions are highest in the AM peak and that the hb_to home attractions are highest in the PM peak

In [178]:
def check_commuting_flow_dir(threshold,
                             prod_county_file,
                             attr_county_file,
                             reports_dir):
    """
    Check commuting flows (particularly hb_fr
    productions and hb_to attractions) match
    the profile of the inputs within a
    threshold.

    For each direction the period profile of the
    inputs (percentage of input trips in each
    period, over all counties and non-Van modes)
    is compared with the period profile of the
    outputs for every county/mode combination
    (percentage of that county/mode's output
    trips in each period). Any county/mode/period
    whose output percentage differs from the
    input percentage by more than the threshold
    is logged as a warning.

    Parameters:
    -----------
    threshold: str
        String ending in '%' indicating the max
        percentage difference that is permitted
        between the input and output profiles.
        Should be formatted as a numerical
        percentage in string format, e.g. '5%'
        or '7.25%'
    prod_county_file: str
        Name of the file containing the
        production data
    attr_county_file: str
        Name of the file containing the
        attraction data
    reports_dir: str (path)
        Directory for expected tour model output
        files

    Outputs:
    ----------
    None, writes warnings to log files via the
    notebook-level `logger`
    """
    max_diff = float(threshold.strip('%'))
    # label used in messages -> (report file, direction code, zone column)
    flow_specs = {
        'origin': (prod_county_file, 'hb_fr', 'tmz_o'),
        'destination': (attr_county_file, 'hb_to', 'tmz_d')
    }

    checkstring = 'TM009: Check commuting flow directions'
    logger.info(checkstring, "Commencing check TM009:")

    for label, (file_name, direction, zone_col) in flow_specs.items():
        flow_df = pd.read_csv(os.path.join(reports_dir, file_name))
        flow_df = flow_df[
            (flow_df['purpose'] == 'Commuting') &
            (flow_df['direction'] == direction) &
            (flow_df['mode'] != 'Van') # No Van in input
        ]

        # Without any matching rows nothing can be compared; report that
        # explicitly rather than logging a vacuous success.
        if flow_df.empty:
            logger.warning(
                checkstring,
                f"Warning - No {label} {direction} commuting trips were found in {file_name}, so the period profile could not be checked"
            )
            continue

        # Input profile: % of all input trips falling in each period
        input_by_period = flow_df.groupby('period')['input'].sum()
        input_profile = (
            (100 * input_by_period / input_by_period.sum())
            .rename('input_profile')
            .reset_index()
        )

        # Output profile: % of each zone/mode's OUTPUT trips falling in each
        # period. Normalising by the output total (not the input total) keeps
        # this a true profile, so differences in overall trip totals are not
        # reported as profile differences.
        zone_mode_total = flow_df.groupby([zone_col, 'mode'])['output'].transform('sum')

        test_df = flow_df[[zone_col, 'mode', 'period']].copy()
        # A zero zone/mode total gives NaN here; NaN never exceeds the
        # threshold, so such rows are not reported by this check.
        test_df['profile'] = 100 * flow_df['output'] / zone_mode_total
        test_df = test_df.merge(input_profile, how='left', on='period')
        test_df['diff'] = (test_df['profile'] - test_df['input_profile']).abs()

        failures = test_df[test_df['diff'] > max_diff]
        if len(failures) > 0:
            for _, row in failures.iterrows():
                county = row[zone_col]
                mode = row['mode']
                period = row['period']
                diff = row['diff']
                logger.warning(
                    checkstring,
                    f"Warning - The proportion of {county} {label} {direction} {mode} commuting trips in period {period} differ from the input profile by {diff:.2f}%, which is more than the threshold of {threshold}"
                )
        else:
            logger.success(
                checkstring,
                f"Success - All the county level {label} {direction} commuting trips by period differ by less than {threshold} from the input profile"
            )
    logger.info(checkstring, "Check TM009 complete")
    logger.save_logs()

In [181]:
# CELL TO CALL ALL OF THE ABOVE CHECKS
#
# Each check only runs when its ID is switched on in `checks_selection`.
# Every check writes its findings to the shared `logger`; the print
# statements are just progress information for whoever is running the
# notebook.

# Several of the checks use this file.
# Read it in once here to save a little time.
tour_mat = pd.read_csv(os.path.join(reports_dir, mat_file))

checkid = 'TM011'
if checks_selection[checkid]:
    # Check TM011 - Check expected output files exist
    time_window = time.time() # Confirm time frame
    check_output_files(report_files, reports_dir, time_window)
    print('Check TM011 complete and logged') # For info when running

checkid = 'TM012'
if checks_selection[checkid]:
    # Check TM012 - Check output schemas
    tm012 = 'TM012: Check output schemas'
    logger.info(tm012, "Commencing check TM012:")
    json_file = find_correct_schemas(report_files, known_correct_schema_dir, script_cwd, tm012)
    column_errors, datatype_errors, matching_files = compare_with_json(json_file, tm012, reports_dir, report_files)

    if column_errors:
        for i in column_errors:
            logger.warning(
                tm012,
                f"Warning - Column errors found in {i}"
            )

    if datatype_errors:
        for file_name, column in datatype_errors:
            logger.warning(
                tm012,
                f"Warning - Data type errors found in {file_name}: {column}"
            )

    if matching_files:
        for i in matching_files:
            # Prefix spelt "Success" to match the other checks' log messages
            logger.success(
                tm012,
                f"Success - {i} matches the expected columns and data types"
            )

    logger.info(tm012, "Check TM012 complete")
    logger.save_logs()
    print('Check TM012 complete and logged') # For info when running

checkid = 'TM005'
if checks_selection[checkid]:
    # Check TM005 - Intrasector trips should be highest
    print('Check TM005 commencing. This can take a while to run') # For info when running
    check_intras(tour_mat, 0.1)
    print('Check TM005 complete and logged') # For info when running

checkid = 'TM001'
if checks_selection[checkid]:
    # Check TM001 - Check tour model outputs non-zero
    print('Check TM001 commencing. This can take a while to run') # For info when running
    check_nonzero(tour_mat)
    print('Check TM001 complete and logged') # For info when running

checkid = 'TM002'
if checks_selection[checkid]:
    # Check TM002 - Check tour model outputs not negative
    try:
        check_negative_values(tour_mat)
    except ValueError as e:
        print(e)
    print('Check TM002 complete and logged') # For info when running

checkid = 'TM003'
if checks_selection[checkid]:
    # Check TM003 - Check for NULLs/NaNs in tour model outputs
    try:
        check_null_values(tour_mat)
    except ValueError as e:
        print(e)
    print('Check TM003 complete and logged') # For info when running

checkid = 'TM010'
if checks_selection[checkid]:
    # Check TM010 - Check zones/sectors in output
    print('Commencing check TM010:') # For info when running
    check_zones_sectors(imports_dir, county_file, reports_dir, report_files)
    print('Check TM010 complete and logged') # For info when running

checkid = 'TM008'
if checks_selection[checkid]:
    # Check TM008 - Check output productions against attractions
    print('Commencing check TM008:') # For info when running
    threshold = '5%'
    check_prod_attract(threshold, tour_mat)
    print('Check TM008 complete and logged') # For info when running

checkid = 'TM006'
if checks_selection[checkid]:
    # Check TM006 - Check input productions and attractions against
    # output productions/attractions
    print('Commencing check TM006:') # For info when running
    threshold = '5%'
    check_in_out_prod_attract(threshold,
                              prod_county_file,
                              attr_county_file,
                              reports_dir)
    print('Check TM006 complete and logged') # For info when running

checkid = 'TM009'
if checks_selection[checkid]:
    # Check TM009 - Check commuting flow directions
    print('Commencing check TM009:') # For info when running
    threshold = '10%'
    # Must call the commuting flow check here, not the TM006 function
    check_commuting_flow_dir(threshold,
                             prod_county_file,
                             attr_county_file,
                             reports_dir)
    print('Check TM009 complete and logged') # For info when running

Commencing check TM009:
Check TM009 complete and logged
