In [1]:
"""
Script to process NTS data to the point where it can be input to regression modelling
Based on build_attractions_6_v0.3 (or any build_attractions R script - I hope!)
Adds in SOC classification, which the R script does not have
Note this will need aggregating with Pythonic versions of other elements of the script
Then testing, then converting to TfN's coding style
v0.1: ART, 02/05/2024 - Originated
v0.2: ART, 09/05/2024 - Moving all filtering clauses to after the expansion factor calculation
v0.3: ART, 09/05/2024 - Adding check on person weighting in NTS data
v0.4: ART, 09/05/2024 - Implementing weighting of population by the w2 weight prior to the expansion factor process
                      - Removing the testing of weighting from v0.3
v0.5: ART, 10/05/2024 - Merges UA 830 into UA 320 by picking up the revised correspondence from TfN
TRANSFER TO GIT AND RENAMED FROM: NTS_Data_Processing_v0.5.ipynb
                              TO: nts_data_processing.ipynb
                              ART, 23/05/2024 - QA beyond this point in Git commit messages
"""

"\nScript to process nts data to point were it can be input to regression modelling\nBased on build_attractions_6_v0.3 (or any build_attractions R script - I hope!)\nAdds in SOC classification, which the R script does not have\nNote this will need aggregating with Pythonic versions of other elements of the script\nThen testing, then converting to TfN's coding style\nv0.1: ART, 02/05/2024 - Originated\nv0.2: ART, 09/05/2024 - Moving all filtering clauses to after the expansion factor calculation\nv0.3: ART, 09/05/2024 - Adding check on weighting person weighting in NTS data\nv0.4: ART, 09/05/2024 - Implementing weighting of population by the w2 weight prior to the expansion factor process\n                      - Removing the testing of weighting from v0.3\nv0.5: ART, 10/05/2024 - Merges UA 830 into UA 320 by picking up the revised correspondence from TfN\n"

In [2]:
import pandas as pd

In [3]:
# Run configuration.
# These values should probably be supplied by the function call in the final script once this
# notebook is integrated in the TfN scripts. NOTE(review): none of them is referenced by the
# cells visible in this notebook yet.

# Inclusion flags - presumably one per trip direction (hb_fr / hb_to / nhb); all enabled
incl_hb_fr, incl_hb_to, incl_nhb_trips = True, True, True

# Selection key -> list of SOC codes ('S1': 1-4, 'S2': 1-3)
soc_selection = {'S1': list(range(1, 5)), 'S2': list(range(1, 4))}

# Purpose label 'P<n>' -> purpose number n, for n = 1..8 (currently an identity mapping)
purpose_aggregation = {f'P{purpose}': purpose for purpose in range(1, 9)}

In [4]:
# Read in NTS Classified Build Data.
# low_memory=False makes pandas parse each column in a single pass, so its dtype is inferred
# from the whole file rather than chunk by chunk (chunked inference can leave a column holding
# a mixture of types).
# NOTE(review): the saved output of this cell contains a warning fragment that looks like
# pandas' mixed-type DtypeWarning - confirm, and ideally replace this with an explicit dtype=
# mapping for the affected columns.
nts_in = pd.read_csv('I:/NTS/classified builds/cb_tfn_v13.csv', low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
# Load the land use (LU) population data - the 2018 file from the iter3e outputs folder
land_use_in = pd.read_csv(
    filepath_or_buffer='I:/NorMITs Land Use/base_land_use/iter3e/outputs/land_use_2018_pop.csv'
)

In [6]:
# Load the MSOA -> GOR correspondence table
msoa_to_gor_in = pd.read_csv(
    filepath_or_buffer='I:/Data/Zone Translations/msoa_to_gor_correspondence.csv'
)

In [7]:
# Load the MSOA -> UA correspondence table.
# It is used later both to correct the UA codes in the NTS data ('ua' -> 'ua1998') and,
# together with the MSOA -> GOR table, to build the UA -> GOR lookup.
msoa_to_ua_in = pd.read_csv(
    filepath_or_buffer='I:/NTS/imports/msoa11cd_correspondence.csv'
)

In [8]:
# The classified build uses the wrong UA codes (so it still includes UA 830). Translate the
# trip destination UA to the code from the correspondence table before anything else uses it.

# Lookup: UA code as found in the classified build ('ua') -> replacement code ('ua1998'),
# with the replacement named after the NTS column it will stand in for
ua_fix = (
    msoa_to_ua_in[['ua', 'ua1998']]
    .drop_duplicates()
    .rename(columns={'ua1998': 'tripdestua1998_b01id'})
)

# Swap the original destination UA column for the looked-up one.
# Left join: NTS rows whose UA code is not in the lookup are kept with a missing destination UA.
nts = (
    nts_in
    .rename(columns={'tripdestua1998_b01id': 'ua'})
    .merge(ua_fix, how='left', on='ua')
    .drop(columns='ua')
)

In [9]:
# Attach a GOR name to every NTS record, based on the household GOR code.
# This step might be avoidable in the final version integrated with the rest of TfN's scripts.

# GOR code (1..11) -> lower-case GOR name; list position + 1 is the code
gorNum2Name = dict(enumerate(
    [
        "north east",
        "north west",
        "yorkshire & the humber",
        "east midlands",
        "west midlands",
        "east of england",
        "london",
        "south east",
        "south west",
        "wales",
        "scotland",
    ],
    start=1,
))

# Codes that are not in the mapping become NaN
nts['gor'] = nts['hholdgor_b02id'].map(gorNum2Name)

In [10]:
# Derive GOR level expansion factors: land use population / w2-weighted NTS sample population

# --- Land use population by GOR ---
land_use = land_use_in.loc[:, ['msoa_zone_id', 'people']]
msoa_to_gor = msoa_to_gor_in.loc[:, ['msoa_zone_id', 'gor']]
pop_by_gor = (
    land_use
    .merge(msoa_to_gor, how='left', on='msoa_zone_id')
    .groupby(['gor'])['people']
    .sum()
    .reset_index()
)

# --- NTS sample population by GOR ---
# The w2 weight has to be applied before the expansion factor is calculated, so:
#   1. count the distinct individuals for every (gor, w2) combination,
#   2. multiply each count by its w2 value,
#   3. total the weighted counts per GOR.
nts_sample_stats = (
    nts
    .groupby(['gor', 'w2'])['individualid']
    .nunique()
    .reset_index(name='people_unweighted')
    .assign(people_sample=lambda counts: counts['people_unweighted'] * counts['w2'])
    .groupby(['gor'])[['people_sample']]
    .sum()
    .reset_index()
)

# --- Expansion factor per GOR ---
# Left join keeps every GOR present in the NTS sample; a GOR without a land use population
# ends up with a missing expansion factor.
gor_expand_factors = (
    nts_sample_stats
    .merge(pop_by_gor, how='left', on='gor')
    .assign(expansion_factor=lambda merged: merged['people'] / merged['people_sample'])
    [['gor', 'expansion_factor']]
)

In [11]:
# Show the expansion factor calculated for each GOR as a visual sanity check
display(gor_expand_factors)

Unnamed: 0,gor,expansion_factor
0,east midlands,183.123133
1,east of england,182.858847
2,london,194.472076
3,north east,176.085675
4,north west,178.9312
5,scotland,306.828139
6,south east,184.24148
7,south west,181.572298
8,wales,302.231794
9,west midlands,183.869518


In [12]:
# Create UA to GOR lookup
msoa_to_ua = msoa_to_ua_in[['msoa11cd', 'ua1998']].rename(
    columns={'msoa11cd': 'msoa_zone_id', 'ua1998': 'ua_1998_zone_id'})

# Keep only complete (UA, GOR) pairs.
# An outer join would also emit rows for MSOAs found in just one of the two tables, i.e. rows
# with a missing UA or a missing GOR. A UA that had both a real GOR row and a missing-GOR row
# would then duplicate every trip record for that UA when the lookup is left-joined onto the
# NTS trips. The inner join plus dropna() removes those incomplete pairs; UAs without any GOR
# still come out of a later left join with a missing GOR, exactly as before.
# NOTE(review): a UA whose MSOAs fall in more than one GOR would still appear more than once
# here - confirm the correspondence is one GOR per UA.
ua_to_gor = (
    msoa_to_gor
    .merge(msoa_to_ua, on='msoa_zone_id', how='inner')[['ua_1998_zone_id', 'gor']]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [13]:
# NTS trips by direction, purpose, SOC and destination UA, expanded with the GOR level factors.
#
# Steps:
#   1. keep only the columns needed;
#   2. filter out records with a missing/negative destination UA or an unassigned purpose
#      (anything outside 1-8);
#   3. total the trips per direction / purpose / soc / destination UA;
#   4. attach the GOR of the destination UA, then the expansion factor of that GOR;
#   5. expanded_trips = trips * expansion_factor.
#
# NOTE(review): the expansion factors are calculated from the household GOR of the NTS sample
# but are attached here via the GOR of the trip destination UA - confirm this is intended.
ua_trips_agg = (
    nts
    .loc[:, ['direction', 'purpose', 'soc', 'tripdestua1998_b01id', 'trips']]
    .loc[lambda trips: (trips['tripdestua1998_b01id'] >= 0)
                       & (trips['purpose'].isin(range(1, 9)))]
    .groupby(['direction', 'purpose', 'soc', 'tripdestua1998_b01id'])['trips']
    .sum()
    .reset_index()
    .dropna()
    .rename(columns={'tripdestua1998_b01id': 'ua_1998_zone_id'})
    .merge(ua_to_gor, how='left', on='ua_1998_zone_id')
    .merge(gor_expand_factors, how='left', on='gor')
    .assign(expanded_trips=lambda trips: trips['trips'] * trips['expansion_factor'])
)

display(ua_trips_agg)

Unnamed: 0,direction,purpose,soc,ua_1998_zone_id,trips,gor,expansion_factor,expanded_trips
0,hb_fr,1,1,101.0,527.979567,south west,181.572298,95866.463444
1,hb_fr,1,1,102.0,2613.888164,south west,181.572298,474609.681434
2,hb_fr,1,1,103.0,578.325544,south west,181.572298,105007.898131
3,hb_fr,1,1,104.0,891.402777,south west,181.572298,161854.050993
4,hb_fr,1,1,110.0,1229.325861,east of england,182.858847,224793.109693
...,...,...,...,...,...,...,...,...
13642,nhb,8,4,732.0,73.843450,scotland,306.828139,22657.248396
13643,nhb,8,4,800.0,844.356530,london,194.472076,164203.767160
13644,nhb,8,4,810.0,9.677480,east of england,182.858847,1769.612775
13645,nhb,8,4,820.0,93.890139,east of england,182.858847,17168.642517


In [15]:
# Save the processed trips so they can be used for testing.
# There is no v0.3 output file: v0.3 of this notebook did not change the process that produced
# the v0.2 output.
# The DataFrame index is written as well (to_csv default), as an unnamed first column.
ua_trips_agg.to_csv(
    path_or_buf='I:/NTS/NorMITs WP1/Built Attractions/PythonTesting/NTS_Data_Processed_v0.5.csv'
)