In [1]:
# Raw string on purpose: the Windows path below is full of backslashes, and in a
# normal string literal sequences such as \522, \40, \02 and \01 are read as
# octal escapes, which silently corrupts the path text.
r"""
This script builds on P:\GBCBA\HandT\CQ\Projects\5227104-NorMITs Demand 2024-ADDY4067\40 Technical\02 TourModel\Develop Tour Model\Rail Coverage Analysis\01_processing\ProcessMatrix\JoinMatrixToGeography_v0.2.ipynb
It is designed to ultimately obtain a furnessed version of the rail output matrix based on the rail ticketing data proportions
Versioning to be handled by Git (hopefully) once I've spoken to Rachel about where to put it
"""

"\nThis script builds on P:\\GBCBA\\HandT\\CQ\\ProjectsŒ7104-NorMITs Demand 2024-ADDY4067  Technical\x02 TourModel\\Develop Tour Model\\Rail Coverage Analysis\x01_processing\\ProcessMatrix\\JoinMatrixToGeography_v0.2.ipynb\nIt is designed to ultimately obtain a furnessed version of the rail output matrix based on the rail ticketing data proportions\nVersioning to be handled by Git (hopefully) once I've spoken to Rachel about where to put it\n"

## Imports

In [None]:
# Existing packages
import pandas as pd
import os
import numpy as np

# TfN packages
# from caf.distribute import furness

## Import data from files

In [None]:
# Folder locations: the raw inputs and the output of the GIS join
inputs_dir = r'P:\GBCBA\HandT\CQ\Projects\5227104-NorMITs Demand 2024-ADDY4067\40 Technical\02 TourModel\Develop Tour Model\Rail Coverage Analysis\00_inputs'
gis_dir = r'P:\GBCBA\HandT\CQ\Projects\5227104-NorMITs Demand 2024-ADDY4067\40 Technical\02 TourModel\Develop Tour Model\Rail Coverage Analysis\01_processing\GIS\GISjoinOutput'

# Names of the individual input files
odm_file = 'ODM_for_rdm_2022-23.csv'
msoa_county_file = 'msoa11cd_correspondence.csv'
stn_geo_file = 'station_attributes_on_TfN_geography.csv'
sector_file = 'bespoke_sectors_v1.1.csv'
lrtu_file = 'lrt0101.csv'

# Resolve the full path of every input up front
# (only the station geography file lives in the GIS folder)
odm_path = os.path.join(inputs_dir, odm_file)
msoa_county_path = os.path.join(inputs_dir, msoa_county_file)
stn_geo_path = os.path.join(gis_dir, stn_geo_file)
sector_path = os.path.join(inputs_dir, sector_file)
lrtu_path = os.path.join(inputs_dir, lrtu_file)

# Read each input into its own dataframe
odm_in_df = pd.read_csv(odm_path)
msoa_county_in_df = pd.read_csv(msoa_county_path)
stn_geo_in_df = pd.read_csv(stn_geo_path)
sector_in_df = pd.read_csv(sector_path)
# The first 7 lines of the light rail file are skipped before the header row is read
lrtu_in_df = pd.read_csv(lrtu_path, skiprows=7)

# Tidy the light rail column names: keep only the text before the first '['
# and trim any surrounding whitespace
lrtu_col_parts = lrtu_in_df.columns.str.split('[')
lrtu_in_df.columns = lrtu_col_parts.str[0].str.strip()

## Other inputs
Some manual inputs that set values later in the process

In [None]:
# --- Light Rail, Tramway and Underground (LRTU) inputs ---

# Year for which to extract the Light Rail, Tramway and Underground data
lrtu_year_in = 2023

# Proportion of trips on the London Underground, London Trams and Docklands
# Light Railway that are considered to be "unique" (i.e. not double counted
# with another rail mode)
lrtu_london_scale_in = 0.25

# Proportion of trips on Light Rail, Tramway and Underground systems outside
# of London that are considered to be "unique" (i.e. not double counted with
# another rail mode)
lrtu_nonlondon_scale_in = 0.5

# Sector in which each Light Rail, Tramway or Underground system in GB is
# located. Held at sector level because some of these systems cross county
# borders.
lrtu_systems_in = {
    'Docklands Light Railway': 'London',
    'London Trams': 'London',
    'Nottingham Express Transit': 'East Midlands North',
    'West Midlands Metro': 'West Midlands South',
    'Sheffield Supertram': 'South Yorkshire',
    'Tyne and Wear Metro': 'Tyne and Wear',
    'Manchester Metrolink': 'Greater Manchester',
    'Blackpool Tramway': 'Lancashire',
    'Edinburgh Trams': 'Scotland',
    'London Underground': 'London',
    'Glasgow Subway': 'Scotland',
}

## Functions to process rail ticketing/journey data

In [None]:
def process_lrtu_data(lrtu_df, lrtu_year, lrtu_systems, lrtu_london_scale, lrtu_nonlondon_scale):
    """
    Parameters
    ----------
    lrtu_df: pandas df
        Light Rail, Tramway and Underground annual journey data by system as
        read in by this script
    lrtu_year
    """
    # Process to account for odd formatting of source
    lrtu_df = lrtu_df.dropna(axis=1, how='all')
    lrtu_df = lrtu_df.dropna(axis=0, how='all')
    lrtu_df = lrtu_df.rename(columns={'Financial year ending March': 'Year'})
    lrtu_df['Year'] = lrtu_df['Year'].astype(int)

    # Select data we are interested in and reformat to a system-based index
    lrtu_df = lrtu_df.loc[lrtu_df['Year'] == lrtu_year]
    lrtu_df = lrtu_df.set_index(['Year'])
    lrtu_df = lrtu_df.transpose().reset_index()
    lrtu_df = lrtu_df.rename_axis(None, axis=1)
    lrtu_df = lrtu_df.rename(
        columns={'index': 'System', lrtu_year: 'Yearly Journeys'})

    # Convert yearly journeys to absolutes (and make sure they are numeric!)
    # Note this bit will fall over if you pick a year before all systems
    #   were returning data (i.e. some cells are '[w]')
    lrtu_df['Yearly Journeys'] = lrtu_df['Yearly Journeys'].astype(str)
    lrtu_df['Yearly Journeys'] = lrtu_df['Yearly Journeys'].str.replace(
        ',', '')
    lrtu_df['Yearly Journeys'] = lrtu_df['Yearly Journeys'].astype(float) * 10 # Just clear float to minimise rounding error risk
    lrtu_df['Yearly Journeys'] = lrtu_df['Yearly Journeys'].astype(int)
    lrtu_df['Yearly Journeys'] = lrtu_df['Yearly Journeys'] * 100000 # Not 1 million as we've times by 10 about to get out of float

    # Apply sectors to data
    lrtu_df['Sector'] = lrtu_df['System'].map(lrtu_systems)
    lrtu_df = lrtu_df.dropna(axis=0) # Drop rows where system name is not found (expected to be some total rows like all of GB)
    if lrtu_df.shape[0] != len(lrtu_systems):
        print('WARNING: The systems you have specified sectors for and the systems in the input file do not match!')
    lrtu_df = lrtu_df.groupby(
        ['Sector'])['Yearly Journeys'].sum().reset_index()

    # Apply scaling factors to account for overlap with other rail modes (e.g. national rail, other light rail systems)
    lrtu_df['Yearly Journeys'] = lrtu_df['Yearly Journeys'] * np.where(
        lrtu_df['Sector'] == 'London', lrtu_london_scale, lrtu_nonlondon_scale)
    lrtu_df['Yearly Journeys'] = lrtu_df['Yearly Journeys'].astype(int)