In [1]:
"""
This script builds on P:\GBCBA\HandT\CQ\Projects\5227104-NorMITs Demand 2024-ADDY4067\40 Technical\02 TourModel\Develop Tour Model\Rail Coverage Analysis\01_processing\ProcessMatrix\JoinMatrixToGeography_v0.2.ipynb
It is designed to ultimately obtain a furnessed version of the rail output matrix based on the rail ticketing data proportions
Versioning to be handled by Git (hopefully) once I've spoken to Rachel about where to put it
"""

"\nThis script builds on P:\\GBCBA\\HandT\\CQ\\ProjectsŒ7104-NorMITs Demand 2024-ADDY4067  Technical\x02 TourModel\\Develop Tour Model\\Rail Coverage Analysis\x01_processing\\ProcessMatrix\\JoinMatrixToGeography_v0.2.ipynb\nIt is designed to ultimately obtain a furnessed version of the rail output matrix based on the rail ticketing data proportions\nVersioning to be handled by Git (hopefully) once I've spoken to Rachel about where to put it\n"

## Imports

In [7]:
# Existing packages
import pandas as pd
import os
import numpy as np
from datetime import datetime

# TfN packages
# import sys
# # caution: path[0] is reserved for script path (or '' in REPL)
# sys.path.insert(1, r'C:\Users\Jimny\Documents\GitHub\caf.distribute\src')

# from caf.distribute import furness

## Import data in files

In [3]:
# Folders holding the input files
inputs_dir = r'I:\NTS\imports\tour_adjust_imports'
msoa_dir = r'I:\NTS\imports'

# Names of the individual input files
odm_file = 'ODM_for_rdm_2022-23.csv'
msoa_county_file = 'msoa11cd_correspondence.csv'
stn_geo_file = 'station_attributes_on_TfN_geography.csv'
sector_file = 'bespoke_sectors_v1.1.csv'
lrtu_file = 'lrt0101.csv'

# Read every input into its own DataFrame.
# Only the MSOA correspondence lives in msoa_dir; everything else is in inputs_dir
odm_in_df = pd.read_csv(os.path.join(inputs_dir, odm_file))
msoa_county_in_df = pd.read_csv(os.path.join(msoa_dir, msoa_county_file))
stn_geo_in_df = pd.read_csv(os.path.join(inputs_dir, stn_geo_file))
sector_in_df = pd.read_csv(os.path.join(inputs_dir, sector_file))

# The light rail file has 7 rows above its header row, which are skipped
lrtu_in_df = pd.read_csv(os.path.join(inputs_dir, lrtu_file), skiprows=7)
# Tidy the column names: keep only the text before any '[' and trim the
# surrounding whitespace
lrtu_in_df.columns = [col.split('[')[0].strip() for col in lrtu_in_df.columns]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Other inputs
Some manual inputs that set values later in the process

In [4]:
# --- Light Rail, Tramway and Underground (LRTU) inputs ---

# Year for which to extract the Light Rail, Tramway and Underground data
lrtu_year_in = 2023
# Proportion of trips on the London Underground, London Trams and Docklands
# Light Railway that are considered to be "unique" (i.e. not double counted
# with another rail mode)
lrtu_london_scale_in = 0.25
# Proportion of trips on Light Rail, Tramway and Underground systems outside
# of London that are considered to be "unique" (i.e. not double counted with
# another rail mode)
lrtu_nonlondon_scale_in = 0.5

# Sector in which each Light Rail, Tramway or Underground system in GB is
# located. Sectors are used (rather than counties) because some of these
# systems cross county borders
lrtu_systems_in = {
    'Docklands Light Railway': 'London',
    'London Trams': 'London',
    'Nottingham Express Transit': 'East Midlands North',
    'West Midlands Metro': 'West Midlands South',
    'Sheffield Supertram': 'South Yorkshire',
    'Tyne and Wear Metro': 'Tyne and Wear',
    'Manchester Metrolink': 'Greater Manchester',
    'Blackpool Tramway': 'Lancashire',
    'Edinburgh Trams': 'Scotland',
    'London Underground': 'London',
    'Glasgow Subway': 'Scotland',
}

# --- County infill for stations missed by the MSOA correspondence ---
#
# Some stations sit outside the MSOA shapefile, so the GIS join gives them
# no MSOA and hence no county. They are listed in a table so that further
# rows can be added if the station shapefile is updated with new stations.
#
# Station (National Location Code) and why the GIS join fails:
#  - Blackfriars (5112)        - in the middle of the Thames
#  - Portsmouth Harbour (5540) - in the harbour
#  - Ryde Pier Head (5541)     - quite far out to sea
#
# County each station is allocated to:
#  - Blackfriars        -> Inner London (County 17)
#  - Portsmouth Harbour -> Hampshire (County 35)
#  - Ryde Pier Head     -> Hampshire (County 35)
stn_county_infill_df = pd.DataFrame({
    'National Location Code': [5112, 5540, 5541],
    'county': [17, 35, 35],
    'county_nm': ['Inner London', 'Hampshire', 'Hampshire'],
})

## Functions to process rail ticketing/journey data

In [6]:
def process_lrtu_data(lrtu_df, lrtu_year, lrtu_systems, lrtu_london_scale, lrtu_nonlondon_scale):
    """
    Estimate, for one financial year, the annual number of "unique" Light
    Rail, Tramway and Underground journeys in each sector that has such a
    system.

    The input checks only print warnings; processing continues regardless.
    The input dataframe is not modified.

    Parameters
    ----------
    lrtu_df: pandas df
        Light Rail, Tramway and Underground annual journey data by system as
        read in by this script. Must contain a 'Financial year ending March'
        column plus one column per system. Journey values are treated as
        millions of journeys and may be strings with thousands separators
    lrtu_year: int
        Year for which to extract the Light Rail, Tramway and Underground data
        It is the year in which the financial year ends
        It should match the year for which the national rail odm is downloaded
    lrtu_systems: dict
        Dictionary relating each Light Rail, Tramway or Underground system in
        GB to the sector in which it is located
    lrtu_london_scale: Float
        Expected range 0.0 to 1.0
        Proportion of trips on the London Underground, London Trams and
        Docklands Light Railway that are considered to be "unique" (i.e. not
        double counted with another rail mode)
    lrtu_nonlondon_scale: Float
        Expected range 0.0 to 1.0
        Proportion of trips on Light Rail, Tramway and Underground systems
        outside of London that are considered to be "unique" (i.e. not double
        counted with another rail mode)

    Returns
    ----------
    lrtu_df: pandas df
        Columns 'Sector' and 'Yearly Journeys' (64-bit integer).
        For the selected year, the estimate of the number of "unique" (i.e. not
        double counted with another rail mode) journeys by Light Rail, Tramway
        and Underground for the sectors in which such systems are located.
        This is an annual total
    """

    # Basic logic checks on inputs.
    # The type is tested first so that a non-integer year is reported by the
    # warning rather than raising on the range comparison
    yearnow = datetime.now().year
    if not isinstance(lrtu_year, int) or not 2013 < lrtu_year <= yearnow:
        print('WARNING: Unexpected input year for Light Rail, Tramway and Underground data')
        print(f'Expected an interger year between 2014 and {yearnow}')
        print(f'Instead, got {lrtu_year}')
    if not 0 < lrtu_london_scale <= 1:
        print('WARNING: London scaling factor expected to be greater than 0, less the or equal to 1')
        print(f'Instead got London scaling factor of {lrtu_london_scale}')
    if not 0 < lrtu_nonlondon_scale <= 1:
        print('WARNING: Outside London scaling factor expected to be greater than 0, less the or equal to 1')
        print(f'Instead got outside London scaling factor of {lrtu_nonlondon_scale}')

    # Process to account for odd formatting of source: remove the columns and
    # rows that are completely empty, then tidy up the year column.
    # assign() is used instead of item assignment so pandas never sees a write
    # to a possible slice of the caller's dataframe
    lrtu_df = lrtu_df.dropna(axis=1, how='all').dropna(axis=0, how='all')
    lrtu_df = lrtu_df.rename(columns={'Financial year ending March': 'Year'})
    lrtu_df = lrtu_df.assign(Year=lrtu_df['Year'].astype(int))

    # Select data we are interested in and reformat to a system-based index
    # (one row per system, with the selected year's journeys as a column)
    lrtu_df = lrtu_df.loc[lrtu_df['Year'] == lrtu_year]
    lrtu_df = lrtu_df.set_index(['Year'])
    lrtu_df = lrtu_df.transpose().reset_index()
    lrtu_df = lrtu_df.rename_axis(None, axis=1)
    lrtu_df = lrtu_df.rename(
        columns={'index': 'System', lrtu_year: 'Yearly Journeys'})

    # Apply sectors to data and drop rows where the system name is not found
    # (expected to be some total rows like all of GB). This is done before the
    # numeric conversion so that only the systems we keep need to be numeric
    lrtu_df = lrtu_df.assign(Sector=lrtu_df['System'].map(lrtu_systems))
    lrtu_df = lrtu_df.dropna(axis=0, subset=['Sector'])
    if lrtu_df.shape[0] != len(lrtu_systems):
        print('WARNING: The systems you have specified sectors for and the systems in the input file do not match!')

    # Convert yearly journeys to absolutes (and make sure they are numeric!)
    # Note this bit will fall over if you pick a year before all systems
    #   were returning data (i.e. some cells are '[w]')
    # The values are multiplied by 10 as floats, cast to integer, and only then
    # scaled by 100,000 (x10 x100,000 = x1 million) so that the large
    # multiplication is done in integer arithmetic.
    # int64 is requested explicitly because a plain int cast can be 32-bit on
    # Windows, which leaves little headroom for counts of this size
    journeys = lrtu_df['Yearly Journeys'].astype(str).str.replace(
        ',', '', regex=False)
    journeys = (journeys.astype(float) * 10).astype('int64') * 100000
    lrtu_df = lrtu_df.assign(**{'Yearly Journeys': journeys})

    # Total the journeys of all systems that share a sector
    lrtu_df = lrtu_df.groupby(
        ['Sector'])['Yearly Journeys'].sum().reset_index()

    # Apply scaling factors to account for overlap with other rail modes
    # (e.g. national rail, other light rail systems)
    # The cast back to integer truncates any fractional journeys
    scale = np.where(
        lrtu_df['Sector'] == 'London', lrtu_london_scale, lrtu_nonlondon_scale)
    lrtu_df['Yearly Journeys'] = (
        lrtu_df['Yearly Journeys'] * scale).astype('int64')

    return lrtu_df

In [None]:
def process_station_geography(odm_df, msoa_county_df, stn_geo_df, stn_infill_df):
    """
    Parameters
    ----------
    
    
    Returns
    ----------
    
    """
    
    # Cut down to just the columns of interest
    # Note nlc (National Location Code) is a unique numerical code for each
    # station
    odm_df = odm_df[['origin_nlc',
                     'origin_station_name',
                     'destination_nlc',
                     'destination_station_name',
                     'journeys']]
    msoa_county_df = msoa_county_df[['msoa11cd',
                                     'county',
                                     'county_nm']]
    stn_geo_df = stn_geo_df[['National Location Code', 'msoa11cd']]
    
    # Assign counties to stations that are allocated MSOAs by the geospatial
    # processing
    stn_geo_df = stn_geo_df.merge(msoa_county_df, how='left', on='msoa11cd')
    stn_geo_df = stn_geo_df.drop(columns=['msoa11cd'], axis=1)
    
    # Add on the stations that exist outside of the MSOA shapefile
    # Drop rows containing nulls
    if stn_geo_df[stn_geo_df.isnull().any(axis=1)].shape == stn_county_infill_df.shape:
        # We are infilling something the same size as the NULL rows,
        # which we want to do
        # Drop the NULL rows, then append the replacements
        stn_geo_df = stn_geo_df.dropna(how='any', axis=0)
        stn_geo_df = pd.concat([stn_geo_df, stn_county_infill_df])
        stn_geo_df.reset_index(inplace=True, drop=True)
    else:
        print('WARNING: The NULL infilling table you are trying to append is not the same dimensions as the NULL rows in the table')
        print('Operation therefore not attempted and NULL rows are still in place')
    
    # Rename the National Location Code to make it a bit less unweildly
    stn_geo_df = stn_geo_df.rename(columns={'National Location Code': 'nlc'})