# WP-2 mdltour warning checks

<p style="color:rgb(0,162,219); font-family:Arial; font-size:16px;">Notebook Information </p>

<table style="color:rgb(88,89,91); font-family:Arial; float:left; font-size:13px; text-align:left;">
    <tr>
        <td style="color:rgb(0,90,132);font-size:13px; text-align:left;"><b>Project</b></td>
        <td style="text-align:left;">NorMITs Demand Partner </td>
    </tr>
    <tr>
        <td style="color:rgb(0,90,132);font-size:13px; text-align:left;"><b>Primary Contact Name</b></td>
        <td style="text-align:left;"> </td>
    </tr>
    <tr>
        <td style="color:rgb(0,90,132);font-size:13px; text-align:left;"><b>Primary Contact Email</b></td>
        <td style="text-align:left;"> </td>
    </tr>
    <tr>
        <td style="color:rgb(0,90,132);font-size:13px; text-align:left;"><b>Document Sensitivity</b></td>
        <td style="text-align:left;"> </td>
    </tr>
</table>

In [1]:
import pandas as pd 
import numpy as np
import os
import logging
import sys
import time
import json
from utils import ChecksLogger

In [2]:
logger = ChecksLogger(log_file_name='checks_log.csv')

In [3]:
# Set paths etc.
model_version = 'v5'
tour_dir = r'I:\NTS\outputs\tour'
reports_dir = os.path.join(tour_dir, 'reports', model_version)
report_files = ['attr_county_output.csv', 
                'matrix_county_output.csv', 
                'nts_distr_output.csv', 
                'nts_mts_output.csv', 
                'nts_tour_output.csv', 
                'prod_county_output.csv']

In [4]:
# ALL CONTENT WITHIN THIS CELL TO BE DELETED EVENTUALLY
# CHECK IT IS NOT CALLED ANYWHERE BEFORE DOING SO THOUGH
# Should be replaced by things in cells above it

# Define the expected output files and directory
output_files = ['TourModelAnalysisCombinedInputs.csv',
                'County_to_TAZ.csv',
                'STB-Correspondence.xlsx', 
                'attr_county_output.csv', 
                'matrix_county_output.csv', 
                'nts_distr_output.csv', 
                'nts_mts_output.csv', 
                'nts_tour_output.csv', 
                'prod_county_output.csv']
output_dir = reports_dir #r'C:\Users\Elucidate\Documents\Git\lp-nts-processing\python\mdltour_warning_checks'

### TM011 - Check expected output files exist

Check specified output files have been produced in expected location and contain data, have been last edited within a certain time window of the initial tour model script running (which is hopefully logged somewhere?) - in other words, checking these are definitely the outputs produced by the latest run. Probably wants to be the first thing we check


In [5]:
time_window = time.time() # Confirm time frame
time_window

1719416310.2545857

In [6]:
# # Function to check if a file exists, has a data, and has been modified within the time window
# def check_output_files(output_files,output_dir, time_window):
#     for file in output_files:
#         file_path = os.path.join(output_dir,file)
#         #check if file exist
#         if os.path.exists(file_path):
#             #check if file has data
#             if os.path.getsize(file_path) > 0:
#                  #check if file has been modified within the time window
#                  mod_time = os.path.getmtime(file_path)
#                  current_time = time.time()
#                  if current_time - mod_time <= time_window:
#                      print(f"{file} has been produced, contains data, and produced by the latest run.")
#                  else:
#                      print(f"Warning: {file} exists and contains data, but was not produced by the latest run.")
#             else:
#                 print(f"Warning: {file} exists but does not contain data.")
#         else:
#             print(f"Warning: {file} does not exist")

In [7]:
## RM EXAMPLE - THIS IS HOW YOU CAN INCLUDE THE LOGGER FUNCTION IN THE ABOVE ##

# Function to check if a file exists, has a data, and has been modified within the time window
def check_output_files(output_files,output_dir, time_window):
    """
    TM011: Check if a file exists, has data, and has been modified within time window.
    Args: 
        output_files: List of expected output files
        output_dir: Directory for expected output files
        time_window: Acceptable time window to be considered output from latest run
    """
    for file in output_files:
        file_path = os.path.join(reports_dir,file)
        #check if file exist
        if os.path.exists(file_path):
            #check if file has data
            if os.path.getsize(file_path) > 0:
                #check if file has been modified within the time window
                mod_time = os.path.getmtime(file_path)
                current_time = time.time()
                if current_time - mod_time <= time_window:
                    print('here')
                    logger.success(f"TM011: Check output files exist", f"Success - {file} exists and contains data from latest run.")
                else:
                    print('there')
                    logger.warning(f"TM011: Check output files exist", f"Warning - {file} exists and contains data, but was not produced by latest run.")
            else:
                print('where')
                logger.warning(f"TM011: Check output files exist", f"Warning - {file} exists but does not contain data.")
        else:
            print('so')
            logger.warning(f"TM011: Check output files exist", f"Warning - {file} does not exist.")
    logger.save_logs()
    print('logged')

In [8]:
check_output_files(output_files,output_dir, time_window)

so
so
so
here
here
here
here
here
here
logged


### TM012 - Check output schemas

List the expected column names and schemas in all output files, then check the outputs match these. Might end up wrapping some of the other checks into this depending on things like table shapes and how descriptive Python schemas are. Probably also wants to be quite an early check

In [6]:
# Function to get columns and data types of a CSV file

def get_columns_datatypes(csv_file):
    df = pd.read_csv(csv_file)
    columns_datatypes = {column: str(df[column].dtype) for column in df.columns}
    return columns_datatypes

# Function to process each CSV file in a folder
def process_csv_files(folder_path):
    csv_files = ['attr_county_output.csv', 'matrix_county_output.csv', 'nts_distr_output.csv', 'nts_mts_output.csv', 'nts_tour_output.csv', 'prod_county_output.csv']
    result = {}
    for csv_file in csv_files:
        file_path = os.path.join(folder_path, csv_file)
        if os.path.exists(file_path):
            result[csv_file] = get_columns_datatypes(file_path)
        else:
            result[csv_file] = "File not found"
    return result

# Folder containing CSV files
folder_path = r'C:\Users\Elucidate\Documents\Git\lp-nts-processing\python\mdltour_warning_checks\output'

# Process CSV files and save results to JSON
result = process_csv_files(folder_path)
with open('columns_datatypes.json', 'w') as json_file:
    json.dump(result, json_file, indent=4)
    

In [7]:
# Function to get columns and data types of a CSV file
def get_columns_datatypes(csv_file):
    df = pd.read_csv(csv_file)
    columns_datatypes = {column: str(df[column].dtype) for column in df.columns}
    return columns_datatypes

# Function to compare JSON output with new CSV files
def compare_with_json(json_file, folder_path, expected_files):
    with open(json_file, 'r') as f:
        json_data = json.load(f)
    
    column_errors = []
    datatype_errors = []
    matching_files = []
    
    for file_name, expected_columns in json_data.items():
        if file_name not in expected_files:
            continue
        
        file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"{file_name} not found in the specified folder.")
        
        actual_columns = get_columns_datatypes(file_path)
        
        if expected_columns.keys() != actual_columns.keys():
            column_errors.append(file_name)
        else:
            if all(column in actual_columns and actual_columns[column] == datatype
                   for column, datatype in expected_columns.items()):
                matching_files.append(file_name)
            else:
                for column, datatype in expected_columns.items():
                    if column not in actual_columns or actual_columns[column] != datatype:
                        datatype_errors.append((file_name, column))
    
    return column_errors, datatype_errors, matching_files

# Folder containing newly produced CSV files
new_folder_path = r'C:\Users\Elucidate\Documents\Git\lp-nts-processing\python\mdltour_warning_checks'

# JSON file containing the expected columns and data types
json_file = r'C:\Users\Elucidate\Documents\Git\lp-nts-processing\python\mdltour_warning_checks\columns_datatypes.json'

# Compare new CSV files with JSON output
try:
    column_errors, datatype_errors, matching_files = compare_with_json(json_file, new_folder_path, output_files)
    
    if column_errors:
        for i in column_errors:
            print("Warning: Column errors found in ", i)
    
    if datatype_errors:
        print("Warning: Data type errors found in")
        for file_name, column in datatype_errors:
            print(f"{file_name}: {column}")
    
    if matching_files:
        for i in matching_files:
            print(i, " file matches the expected columns and data types")
    
    if not column_errors and not datatype_errors and not matching_files:
        for i in output_files:
            print(i, " file matches the expected columns and data types.")
except FileNotFoundError as e:
    print("Error:", e)


attr_county_output.csv  file matches the expected columns and data types
matrix_county_output.csv  file matches the expected columns and data types
nts_mts_output.csv  file matches the expected columns and data types
nts_tour_output.csv  file matches the expected columns and data types
prod_county_output.csv  file matches the expected columns and data types


### TM005 - Intrasector trips should be highest

The maximum value for any given row/column in the output matrix is expected to be the intrasector cell. Allow some leeway here though - exception is probably Air travel as a mode, and rail in some instances. Calculate % average, flag if not inline.

In [8]:
df = pd.read_excel(r'C:\Users\Elucidate\Documents\Git\lp-nts-processing\python\mdltour_warning_checks\Tour Model Analysis v2.0.xlsm', sheet_name = 'Raw_Data', usecols='H:U',skiprows = 3)
counties = df['county_origin'].unique()
#df[County_ID_origin] = df[County_ID_origin].astype('Int64')
#df[County_ID_destination] = df[County_ID_destination].astype('Int64')
df.head()

Unnamed: 0,SOURCE,County_ID_origin,County_ID_destination,county_origin,county_destination,mode_name,purpose_name,direction,period,trips,SectorID_origin,Sector_origin,SectorID_destination,Sector_destination
0,NTEM7_prod,1.0,,West Yorkshire,,Walk,Commuting,hb_fr,1,194024.6,13.0,West Yorkshire,,
1,NTEM7_prod,1.0,,West Yorkshire,,Walk,Commuting,nhb,1,12258.35,13.0,West Yorkshire,,
2,NTEM7_prod,1.0,,West Yorkshire,,Walk,Commuting,hb_to,1,12364.7,13.0,West Yorkshire,,
3,NTEM7_prod,1.0,,West Yorkshire,,Walk,Commuting,nhb,1,0.0,13.0,West Yorkshire,,
4,NTEM7_prod,1.0,,West Yorkshire,,Walk,Commuting,hb_fr,2,74716.55,13.0,West Yorkshire,,


In [9]:
# Create a pivot table
pivot_table = pd.pivot_table(df, 
                             values='trips', 
                             index=['County_ID_origin','county_origin'], 
                             columns=['County_ID_destination','county_destination'], 
                             aggfunc='sum', 
                             fill_value=0)

#Pivot table with formatting
pd.options.display.float_format = '{:,.2f}'.format


# Display the pivot table
print("Pivot Table:")
pivot_table

Pivot Table:


Unnamed: 0_level_0,County_ID_destination,1.00,2.00,3.00,4.00,5.00,6.00,7.00,8.00,9.00,10.00,...,46.00,47.00,48.00,49.00,50.00,51.00,52.00,53.00,54.00,55.00
Unnamed: 0_level_1,county_destination,West Yorkshire,South Yorkshire,North Yorkshire,Humberside,Derbyshire,Nottinghamshire,Lincolnshire,Leicestershire,Northamptonshire,Cambridgeshire,...,Mid Wales,South West Wales,South East Wales,North Wales,Staffordshire,Shropshire,Hereford & Worcester,Warwickshire,West Midlands county,Bristol
County_ID_origin,county_origin,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1.0,West Yorkshire,70507614.35,929796.68,1471474.68,179900.81,63375.54,50842.22,29722.15,35166.77,6567.07,12940.09,...,984.02,2267.58,2096.57,21247.73,20115.36,4460.48,223.29,9788.19,46909.57,2252.06
2.0,South Yorkshire,889359.5,40567706.76,178443.49,294520.84,862840.43,413438.11,95111.39,41351.84,7467.4,8549.94,...,2368.11,1996.1,3323.68,10076.22,15495.34,4928.01,219.36,10304.79,31941.7,1786.11
3.0,North Yorkshire,1347381.6,152431.91,24598105.15,547432.86,18904.15,19058.15,16067.0,10067.39,4493.42,5606.73,...,1199.78,460.95,434.17,6537.3,7507.26,1034.66,172.84,1647.49,9983.65,1069.11
4.0,Humberside,187546.14,287333.07,594511.94,28468098.56,27473.88,43715.06,449152.64,28653.41,4635.91,6520.19,...,0.0,1185.16,1895.19,1031.63,6229.49,2283.28,411.08,2605.58,9277.65,182.54
5.0,Derbyshire,63766.66,765757.09,21599.69,24709.34,29187328.68,1879548.34,56042.17,594001.32,21343.42,6421.76,...,999.01,1389.46,1943.04,5380.28,723544.12,9613.23,736.14,33084.86,125745.07,2128.91
6.0,Nottinghamshire,55272.14,425503.23,24616.42,50742.46,2078208.11,32359217.61,421043.34,481686.23,9679.7,28000.57,...,1106.27,329.33,1513.31,4578.26,62981.48,7608.19,832.6,24034.85,104903.54,2188.18
7.0,Lincolnshire,28481.64,90217.62,16597.79,446619.5,67498.87,401802.26,21487861.76,299844.24,72742.62,689779.87,...,255.03,324.03,130.78,2690.17,12637.14,3431.44,2356.09,5703.25,33627.75,456.62
8.0,Leicestershire,26948.29,43911.57,15103.63,17007.28,629703.18,451830.95,287472.35,31798641.44,278077.4,83310.55,...,763.2,778.45,5803.58,2239.59,201564.26,11088.9,1296.95,566303.7,380308.21,4979.16
9.0,Northamptonshire,4986.38,9608.96,5251.63,6001.26,30272.43,13665.39,94111.94,352457.39,21533415.76,329628.49,...,2199.05,0.0,1298.45,2838.68,14461.01,8724.73,2810.11,204405.48,130253.03,1893.19
10.0,Cambridgeshire,10936.18,11049.74,7446.37,6061.91,12233.36,30672.24,763856.44,99315.96,305809.81,25195365.12,...,1068.17,2907.26,4116.8,1271.42,8905.38,1431.54,196.0,6428.31,19714.62,1486.27


In [10]:
# Check for inter-county trips greater than intra-county trips
warnings = []

for origin in pivot_table.index:
    origin_id, origin_name = origin
    intra_county_trip = pivot_table.loc[origin, (origin_id, origin_name)]
    
    for destination in pivot_table.columns:
        dest_id, dest_name = destination
        
        if origin_id != dest_id:
            inter_county = pivot_table.loc[origin, destination]
            if inter_county > intra_county_trip:
                percentage = (inter_county/intra_county_trip)*100 #Calculate percentage of error
                warnings.append(f"Warning: Inter-county trip from {origin_name} to {dest_name} is greater than intra-county trip from {origin_name} to {origin_name} by {percentage:.2f}%.")

# Print warnings if any
if warnings:
    print("\nWarnings:")
    for warning in warnings:
        print(warning)
else:
    print("\nNo warnings.")





### TM001 - Check tour model outputs non-zero

Check that the total in the matrix output file is > 0
Also check each row and column in the output matrix sums to a > 0 value (will cover some of the checks to ensure everything is there too - rest of this check covered by TM010)

In [11]:
dest_col_sums = pivot_table.sum(axis=0)
origin_col_sums = pivot_table.sum(axis=1)
    
try:
    if (dest_col_sums <= 0).any():
        probl_col = pivot_table.columns[dest_col_sums <= 0]
        raise ValueError(f'No value in destination column: {probl_col}')
        
    if (origin_col_sums <= 0).any():
        probl_row = pivot_table.index[origin_col_sums <= 0]
        raise ValueError(f'No value in origin row: {probl_row}')
        
    print('All columns and rows sum to greater than 0')

except ValueError as e:
    print(f'Error: {e}, Column sum: {dest_col_sums}, Row sum: {origin_col_sums}') 

All columns and rows sum to greater than 0


### TM002 - Check tour model outputs not negative

Check that no cell in the matrix output is < 0

In [12]:
# Function to check for negative values and raise an error with details
def check_negative_values(pivot_table):
    errors = []
    for r in pivot_table.index:
        for c in pivot_table.columns:
            if pivot_table.at[r, c] < 0:
                origin_id, origin_name = r
                destination_id, destination_name = c
                errors.append(f"Negative value found at Origin: {origin_name} (ID: {origin_id}), Destination: {destination_name} (ID: {destination_id})")

    if errors:
        raise ValueError("Errors found:\n" + "\n".join(errors))

# Run the function to check for negative values
try:
    check_negative_values(pivot_table)
    print('No negative value in model output')
except ValueError as e:
    print(e)


No negative value in model output


### TM003 - Check for NULLs/NaNs in tour model outputs

Check for NULLs/NaNs in tour model outputs

In [13]:
def check_null_values(pivot_table):
    errors = []
    for r in pivot_table.index:
        for c in pivot_table.columns:
            if pd.isnull(pivot_table.at[r, c]):
                origin_id, origin_name = r
                destination_id, destination_name = c
                errors.append(f"Null value found at Origin: {origin_name} (ID: {origin_id}), Destination: {destination_name} (ID: {destination_id})")

    if errors:
        raise ValueError("Errors found:\n" + "\n".join(errors))

# Run the function to check for null values
try:
    check_null_values(pivot_table)
    print('No null value in model output')
except ValueError as e:
    print(e)

No null value in model output


In [14]:
def check_null_values(pivot_table):
    errors = []
    for r in pivot_table.index:
        for c in pivot_table.columns:
            if pd.isna(pivot_table.at[r, c]):
                origin_id, origin_name = r
                destination_id, destination_name = c
                errors.append(f"NaN value found at Origin: {origin_name} (ID: {origin_id}), Destination: {destination_name} (ID: {destination_id})")

    if errors:
        raise ValueError("Errors found:\n" + "\n".join(errors))

# Run the function to check for NaN values
try:
    check_null_values(pivot_table)
    print('No NaN value in model output')
except ValueError as e:
    print(e)

No NaN value in model output


### TM010 - Check zones/sectors in output

Check list of zones/sectors in output against a list of zones we expect to see - will account for any zones/sectors that have been dropped entirely rather than just 0-ed (as tested for by TM001)