# Spatial Interaction

The notebook for handling spatial interaction for the urban simulation module.

## Overview

The structure of this notebook is as follows.

- Preparation of required libraries and data and defining functions for spatial interaction models
- Predicting the parameters by estimating the total flows using the original data

## Preparation

First, we will prepare the required libraries and load the data.

In [1]:
# load libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats
from math import sqrt

In [2]:
# load data

data_path = os.path.join('data', 'london_flows.csv')
od_tube_df = pd.read_csv(data_path)


In [44]:
od_tube_df

Unnamed: 0,station_origin,station_destination,flows,population,jobs,distance
0,Abbey Road,Bank and Monument,0,599,78549,8131.525097
1,Abbey Road,Beckton,1,599,442,8510.121774
2,Abbey Road,Blackwall,3,599,665,3775.448872
3,Abbey Road,Canary Wharf,1,599,58772,5086.514220
4,Abbey Road,Canning Town,37,599,15428,2228.923167
...,...,...,...,...,...,...
61469,Woolwich Arsenal,Tower Gateway,127,7892,3342,13401.795549
61470,Woolwich Arsenal,West Ham,608,7892,5487,8701.454361
61471,Woolwich Arsenal,West India Quay,6,7892,400,9536.720451
61472,Woolwich Arsenal,West Silvertown,81,7892,893,5355.248554


In [45]:
# create matrix table of observed

od_matrix = od_tube_df.pivot_table(
    values = 'flows', index = 'station_origin', 
    columns = 'station_destination', aggfunc = 'sum', margins = True
)

od_matrix

station_destination,Abbey Road,Acton Central,Acton Town,Aldgate,Aldgate East,All Saints,Alperton,Amersham,Anerley,Angel,...,Wimbledon,Wimbledon Park,Wood Green,Wood Lane,Wood Street,Woodford,Woodgrange Park,Woodside Park,Woolwich Arsenal,All
station_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey Road,,,,,,,,,,,...,,,,,,,,,32.0,599
Acton Central,,,,,,,,,,,...,,,,,,,0.0,,,1224
Acton Town,,,,3.0,17.0,,35.0,0.0,,11.0,...,77.0,3.0,6.0,9.0,,0.0,,0.0,,3745
Aldgate,,,0.0,,0.0,,,0.0,,17.0,...,0.0,,4.0,8.0,,0.0,,0.0,,2886
Aldgate East,,,2.0,0.0,,,0.0,0.0,,20.0,...,24.0,0.0,0.0,12.0,,1.0,,1.0,,3172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Woodford,,,2.0,5.0,47.0,,,,,22.0,...,2.0,,1.0,,,,,,,4868
Woodgrange Park,,0.0,,,,,,,,,...,,,,,,,,,,530
Woodside Park,,,1.0,26.0,11.0,,0.0,,,59.0,...,0.0,,0.0,,,,,,,3093
Woolwich Arsenal,20.0,,,,,7.0,,,,,...,,,,,,,,,,7892


### Defining Spatial Interaction Functions

Define the functions that run the spatial interaction model. This section will be considered later.


In [5]:
# define the spatial interaction models

def spatial_interaction(
        df: pd.DataFrame,
        subset = 'all',
        orig_field = 'station_origin',
        dest_field = 'station_destination',
        Oi_field = 'population',
        Dj_field = 'jobs',
        cij_field = 'distance',
        actual = 'flows',
        cost_function = 'pow'
):
    """
    Runs the models of spatial interaction and annotates results to the original dataframe.
    Returns the original dataframe with results annotated, along with the regression model for each of the spatial interaction model.
    Requires statsmodels.api as sm, statsmodels.formula.api as smf

    Parameters
    ----------
    df : pd.DataFrame
        pandas DataFrame that includes the data for OD analysis
    subset : list
        list of names of boroughs used for analysis, or 'all' (default) to use the whole dataset
    orig_field : str
        the name of column for origin
    dest_field : str
        the name of column for destination
    Oi_field : str
        the name of column for origin statistic
    Dj_field : str
        the name of column for destination statistic
    cij_field : str
        the name of column for distance statistic
    actual : str
        the name of column for the actual value observed
    cost_function : str
        string showing which cost function to use. 'exp' for negative exponential, 'pow' for inverse power

    Returns
    -------
    return_df
        a dataframe with annotated data
        adds the following 4 columns to the original dataframe:
        'unconstrained_est', 
        'origin_constrained_est', 
        'destination_constrained_est', 
        'doubly_constrained_est'

    models
        a dictionary of Generalized Linear Model wrapper objects, including 4 models indexed as:
        'unconstrained', 
        'origin_constrained', 
        'destination_constrained', 
        'doubly_constrained'
    }       

    """

    # create new dataframe with only the required rows and columns
    columns = [orig_field, dest_field, Oi_field, Dj_field, cij_field, actual]
    new_df = df[columns].copy().reset_index().drop(columns = 'index')
 
    # get rid of the internal flows for now
    new_df = new_df[new_df[orig_field] != new_df[dest_field]].copy()

    # subset if specified
    if (subset != 'all'):
        new_df = new_df[
            (new_df[orig_field].isin(subset)) & 
            (new_df[dest_field].isin(subset))
        ].copy()

    # get the log of origin and destination
    new_df['log_Oi'] = np.log(new_df[Oi_field])
    new_df['log_Dj'] = np.log(new_df[Dj_field])

    # get log of cost function
    # the inverse power cij ** (-beta), logged as -beta * np.log(cij)
    new_df['pow_cost'] = np.log(new_df[cij_field])

    # the negative exponential exp(-beta * cij), logged as -beta * cij
    new_df['exp_cost'] = new_df[cij_field]          

    # create formulas
    formulas = []

    for c in ['pow_cost', 'exp_cost']:
        formulas.extend(
            [
                f'{actual} ~ log_Oi + log_Dj + {c}',
                f'{actual} ~ {orig_field} + log_Dj + {c} -1',
                f'{actual} ~ log_Oi + {dest_field} + {c} -1',
                f'{actual} ~ {orig_field} + {dest_field} + {c} -1'
            ]
        ) 
    
    # run regression models
    models = []    
    
    for f in formulas:
        models.append(
            smf.glm(formula = f, data = new_df, family = sm.families.Poisson()).fit()
        )
        new_df[f'results']



    # ----- unconstrained model -----
        
    # create formula for unconstrained model
    formula_unconstrained = f'{actual} ~ log_Oi + log_Dj + log_cost'

    # run regression
    unco_sim = smf.glm(
        formula = formula_unconstrained,
        data = new_df,
        family = sm.families.Poisson()
    ).fit()

    # assign the parameter values
    K_unconstrained = unco_sim.params['Intercept']
    alpha_unconstrained = unco_sim.params['log_Oi']
    gamma_unconstrained = unco_sim.params['log_Dj']
    beta_unconstrained = -unco_sim.params['log_cost']

    # calculated the unconstrained value
    new_df['unconstrained_est'] = round(
        np.exp(
            (alpha_unconstrained * new_df['log_Oi'])
            + (gamma_unconstrained * new_df['log_Dj']) 
            - (beta_unconstrained * new_df['log_cost'])
            + K_unconstrained
        ), 0).astype(int)
    
    # append column to the returning dataframe
    columns.append('unconstrained_est')

    # ----- Origin Constrained Model -----

    # create formula for origin constrained model
    formula_origin_constrained = f'Total ~ {orig_field} + log_Dj + log_cost -1'

    orig_sim = smf.glm(
        formula = formula_origin_constrained,
        data = new_df,
        family = sm.families.Poisson()
    ).fit()

    # assign parameter values
    alpha_i_orco = pd.DataFrame(orig_sim.params).reset_index().rename(columns = {0:'alpha_i', 'index': 'coef'})
    gamma_orco = orig_sim.params['log_Dj']
    beta_orco = -orig_sim.params['log_cost']

    # fix indeces
    to_repl = ["(" + orig_field + ")\[", "(" + dest_field + ")\[", "\]"]
    for x in to_repl:
        alpha_i_orco['coef'] = alpha_i_orco['coef'].str.replace(x, '', regex = True)

    # join with original dataframe
    new_df = new_df.merge(alpha_i_orco, left_on = orig_field, right_on = 'coef', how = 'left').drop(columns = ['coef'])

    # calculated the origin-constrained estimated value
    new_df['origin_constrained_est'] = round(
        np.exp(
            new_df['alpha_i'] 
            + (gamma_orco * new_df['log_Dj']) 
            - (beta_orco * new_df['log_cost'])
        ), 0).astype(int)
    
    # append column to the returning dataframe
    columns.append('origin_constrained_est')

    # ----- Destination Constrained Model -----

    # create formula for destination constrained model
    formula_dest_constrained = f'Total ~ log_Oi + {dest_field} + log_cost -1'

    dest_sim = smf.glm(
        formula = formula_dest_constrained,
        data = new_df,
        family = sm.families.Poisson()
    ).fit()

    # assign parameter values
    alpha_deco = dest_sim.params['log_Oi']
    gamma_j_deco = pd.DataFrame(dest_sim.params).reset_index().rename(columns = {0:'gamma_j', 'index': 'coef'})
    beta_deco = -dest_sim.params['log_cost']

    # fix indeces
    for x in to_repl:
        gamma_j_deco['coef'] = gamma_j_deco['coef'].str.replace(x, '', regex = True)

    # join with original dataframe
    new_df = new_df.merge(gamma_j_deco, left_on = dest_field, right_on = 'coef', how = 'left').drop(columns = ['coef'])

    # calculated the origin-constrained estimated value
    new_df['destination_constrained_est'] = round(
        np.exp(
            (alpha_deco * new_df['log_Oi']) 
            + new_df['gamma_j'] 
            - (beta_deco * new_df['log_cost'])
        ), 0).astype(int)
    
    # append column to the returning dataframe
    columns.append('destination_constrained_est')

    # ----- Doubly Constrained Model -----

    # create formula for doubly constrained model
    formula_double_constrained = f'Total ~ {orig_field} + {dest_field} + log_cost -1'

    double_sim = smf.glm(
        formula = formula_double_constrained,
        data = new_df,
        family = sm.families.Poisson()
    ).fit()

    # assign parameter values
    coefs_dbl = pd.DataFrame(double_sim.params).reset_index().rename(columns = {0:'value', 'index': 'coef'})
    alpha_i_dbl = coefs_dbl[coefs_dbl.coef.str.startswith(orig_field)].rename(columns = {'value': 'alpha_i_dbl'})
    gamma_j_dbl = coefs_dbl[coefs_dbl.coef.str.startswith(dest_field)].rename(columns = {'value': 'gamma_j_dbl'})
    beta = -double_sim.params['log_cost']

    # calculated the origin-constrained estimated value
    new_df['doubly_constrained_est'] = np.round(double_sim.mu, 0)
    
    # append column to the returning dataframe
    columns.append('doubly_constrained_est')
    # create returning dataframe
    return_df = new_df[columns].copy()

    # create a dictionary of returning models
    models = {
        'unconstrained': unco_sim,
        'origin_constrained': orig_sim,
        'destination_constrained': dest_sim,
        'doubly_constrained': double_sim
    }

    # return dataframe and summaries for each model
    return return_df, models

In [46]:
# define the goodness-of-fit models

# R-squared
def CalcRSquared(observed, estimated):
    """Calculate the r^2 from a series of observed and estimated target values
    inputs:
    Observed: Series of actual observed values
    estimated: Series of predicted values"""
    
    r, p = scipy.stats.pearsonr(observed, estimated)
    R2 = r **2
    
    return R2


In [47]:
od_tube_df[od_tube_df.station_origin == 'Abbey Road']

Unnamed: 0,station_origin,station_destination,flows,population,jobs,distance
0,Abbey Road,Bank and Monument,0,599,78549,8131.525097
1,Abbey Road,Beckton,1,599,442,8510.121774
2,Abbey Road,Blackwall,3,599,665,3775.448872
3,Abbey Road,Canary Wharf,1,599,58772,5086.51422
4,Abbey Road,Canning Town,37,599,15428,2228.923167
5,Abbey Road,Crossharbour,1,599,1208,6686.47556
6,Abbey Road,Custom House,0,599,845,3824.85563
7,Abbey Road,Cutty Sark,2,599,1748,8503.898909
8,Abbey Road,Cyprus,7,599,850,6532.099618
9,Abbey Road,Devons Road,1,599,611,3958.324171


## Running the doubly constrained spatial interaction model

I will run the doubly constrained spatial interaction model to consider the spatial decay.

In [48]:
od_df_new = od_tube_df[od_tube_df['station_origin'] != od_tube_df['station_destination']].copy()

# calculate cost functions

# inverse power cij ** (-beta), logged as -beta * np.log(cij)
od_df_new['pow_cost'] = np.log(od_df_new['distance'])
# the negative exponential exp(-beta * cij), logged as -beta * cij
od_df_new['exp_cost'] = od_df_new['distance']   

# do the calculation for doubly constrained models - to calculate the -beta and the optimal exponential relationship
formula_pow = 'flows ~ station_origin + station_destination + pow_cost -1'
formula_exp = 'flows ~ station_origin + station_destination + exp_cost -1'

dbl_pow_model = smf.glm(
    formula = formula_pow,
    data = od_df_new,
    family = sm.families.Poisson()
).fit()

od_df_new['pred_pow'] = dbl_pow_model.mu

dbl_exp_model = smf.glm(
    formula = formula_exp,
    data = od_df_new,
    family = sm.families.Poisson()
).fit()

od_df_new['pred_exp'] = dbl_exp_model.mu



In [95]:
dbl_exp_model.summary()

0,1,2,3
Dep. Variable:,flows,No. Observations:,61456.0
Model:,GLM,Df Residuals:,60658.0
Model Family:,Poisson,Df Model:,797.0
Link Function:,Log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-851050.0
Date:,"Mon, 12 Feb 2024",Deviance:,1529900.0
Time:,01:13:20,Pearson chi2:,2020000.0
No. Iterations:,27,Pseudo R-squ. (CS):,1.0
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
station_origin[Abbey Road],1.3541,0.068,20.016,0.000,1.222,1.487
station_origin[Acton Central],2.9653,0.061,48.422,0.000,2.845,3.085
station_origin[Acton Town],2.7807,0.056,49.310,0.000,2.670,2.891
station_origin[Aldgate],1.5372,0.057,26.942,0.000,1.425,1.649
station_origin[Aldgate East],1.6255,0.057,28.628,0.000,1.514,1.737
station_origin[All Saints],1.2836,0.065,19.669,0.000,1.156,1.411
station_origin[Alperton],2.7155,0.059,45.686,0.000,2.599,2.832
station_origin[Amersham],5.2245,0.063,83.195,0.000,5.101,5.348
station_origin[Anerley],3.1015,0.067,46.035,0.000,2.969,3.234


In [49]:
print(f"beta for power model: {dbl_pow_model.params['pow_cost']}")
print(f"beta for exp model: {dbl_exp_model.params['exp_cost']}")

beta_power = -dbl_pow_model.params['pow_cost']
beta_exp = -dbl_exp_model.params['exp_cost']

beta for power model: -0.9096317604932787
beta for exp model: -0.0001543696921559684


In [50]:
od_df_new

Unnamed: 0,station_origin,station_destination,flows,population,jobs,distance,pow_cost,exp_cost,pred_pow,pred_exp
0,Abbey Road,Bank and Monument,0,599,78549,8131.525097,9.003504,8131.525097,54.840694,76.846663
1,Abbey Road,Beckton,1,599,442,8510.121774,9.049012,8510.121774,2.061926,2.677506
2,Abbey Road,Blackwall,3,599,665,3775.448872,8.236275,3775.448872,2.697948,3.665771
3,Abbey Road,Canary Wharf,1,599,58772,5086.514220,8.534348,5086.514220,76.517784,111.884860
4,Abbey Road,Canning Town,37,599,15428,2228.923167,7.709274,2228.923167,45.398702,47.768459
...,...,...,...,...,...,...,...,...,...,...
61469,Woolwich Arsenal,Tower Gateway,127,7892,3342,13401.795549,9.503144,13401.795549,256.673280,185.497591
61470,Woolwich Arsenal,West Ham,608,7892,5487,8701.454361,9.071245,8701.454361,152.447409,176.151791
61471,Woolwich Arsenal,West India Quay,6,7892,400,9536.720451,9.162905,9536.720451,19.538537,26.589485
61472,Woolwich Arsenal,West Silvertown,81,7892,893,5355.248554,8.585832,5355.248554,115.297649,131.895731


In [51]:
# save the logged cost function with calibrated beta

od_df_new['beta_dist_pow'] = -beta_power * od_df_new['pow_cost']
od_df_new['beta_dist_exp'] = -beta_exp * od_df_new['exp_cost']


In [39]:
od_df_new.to_csv('data/london_flows_pred1.csv')

In [52]:
pred_exp_matrix = od_df_new.pivot_table(
    values = 'pred_exp', index = 'station_origin', 
    columns = 'station_destination', aggfunc = 'sum', margins = True
)

pred_exp_matrix

station_destination,Abbey Road,Acton Central,Acton Town,Aldgate,Aldgate East,All Saints,Alperton,Amersham,Anerley,Angel,...,Wimbledon,Wimbledon Park,Wood Green,Wood Lane,Wood Street,Woodford,Woodgrange Park,Woodside Park,Woolwich Arsenal,All
station_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey Road,,,,,,,,,,,...,,,,,,,,,30.622014,5.990000e+02
Acton Central,,,,,,,,,,,...,,,,,,,0.469541,,,1.224000e+03
Acton Town,,,,10.886281,9.791210,,16.650312,0.070891,,12.193996,...,40.470717,3.999578,2.137517,18.521361,,0.216578,,0.866650,,3.745000e+03
Aldgate,,,1.436803,,32.113611,,,0.006153,,23.242135,...,6.912031,,3.070407,2.092522,,0.708506,,0.966589,,2.886000e+03
Aldgate East,,,1.511900,37.571466,,,0.369446,0.006473,,24.450375,...,7.273300,0.718794,3.230022,2.201301,,0.966044,,1.016837,,3.172000e+03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Woodford,,,1.538469,38.132867,44.441013,,,,,25.378139,...,7.401117,,7.019473,,,,,,,4.868000e+03
Woodgrange Park,,1.531378,,,,,,,,,...,,,,,,,,,,5.300000e+02
Woodside Park,,,2.019784,17.068105,15.347081,,0.493552,,,25.327956,...,9.716580,,4.439803,,,,,,,3.093000e+03
Woolwich Arsenal,27.585104,,,,,28.731167,,,,,...,,,,,,,,,,7.892000e+03


In [41]:
print(f"Exponential: {CalcRSquared(od_df_new['flows'], od_df_new['pred_exp'])}")
print(f"Power: {CalcRSquared(od_df_new['flows'], od_df_new['pred_pow'])}")


Exponential: 0.4979027747356171
Power: 0.4077121206134792


### Result of Calibration

The exponential model seems to have a better fit, with the beta value being $\beta = 0.00015436969215638512$

Further analysis will be done using this parameter.

Things to consider - should we use the doubly constrained model to calibrate, or the singly constrained?


In [72]:
# load data with the above saved

od_df_new = pd.read_csv('data/london_flows_pred1.csv')


### Calibrating using the doubly constrained model

Using the cost function calibrated by the doubly constrained model, now consider the singly constrained model for further calibration.

In [73]:
od_df_new = od_df_new[(od_df_new['population'] != 0) & (od_df_new['jobs'] != 0)]

In [99]:
# origin constrained model, taking into account the beta value

# calculate the flow and the beta values
od_df_new['flow_beta_dist'] = od_df_new['flows'] * np.exp(od_df_new['beta_dist_exp'])

formula_orig_constrained2 = 'flow_beta_dist ~ station_origin + np.log(jobs) -1' 

orig_model2= smf.glm(
    formula = formula_orig_constrained2,
    data = od_df_new,
    family = sm.families.Poisson()    
).fit()

In [98]:
formula_orig_constrained_offset = 'flows ~ station_origin + np.log(jobs) -1' 

orig_model2_offset = smf.glm(
    formula = formula_orig_constrained2,
    data = od_df_new,
    family = sm.families.Poisson(),
    offset = od_df_new['beta_dist_exp']    
).fit()

In [100]:
# save the above results back into the data frame

od_df_new['flow_pred_model2'] = orig_model2.mu
od_df_new['flow_pred_model2_offset'] = orig_model2.mu


In [108]:
# get the parameters

Ai_offset = pd.DataFrame(orig_model2_offset.params[:-1]).reset_index().rename(columns = {'index': 'name', 0: 'Ai_model2_offset'})
gamma = orig_model2_offset.params['np.log(jobs)']

In [103]:
# remove annotations

replace = ['station_origin[', ']']

for r in replace:
    Ai_offset['name'] = Ai_offset['name'].str.replace(r, '')

Ai_offset

Unnamed: 0,name,Ai_model2_offset
0,Abbey Road,-2.865649
1,Acton Central,-1.908231
2,Acton Town,-2.633229
3,Aldgate,-3.287137
4,Aldgate East,-3.274695
...,...,...
393,Wood Street,-1.462466
394,Woodford,-2.203368
395,Woodgrange Park,-1.783360
396,Woodside Park,-2.758853


In [104]:
# merge A_i with original dataframe

od_df_merged = od_df_new.merge(Ai_offset, how = 'left', left_on = 'station_origin', right_on = 'name').drop(columns = 'name').copy()

od_df_merged

Unnamed: 0.1,Unnamed: 0,station_origin,station_destination,flows,population,jobs,distance,pow_cost,exp_cost,pred_pow,pred_exp,beta_dist_pow,beta_dist_exp,flow_beta_dist,flow_pred_model2,flow_pred_model2_offset,Ai_model2_offset
0,0,Abbey Road,Bank and Monument,0,599,78549,8131.525097,9.003504,8131.525097,54.840694,76.846663,-8.189873,-1.255261,0.000000,119.376960,119.376960,-2.865649
1,1,Abbey Road,Beckton,1,599,442,8510.121774,9.049012,8510.121774,2.061926,2.677506,-8.231268,-1.313705,0.268822,1.410516,1.410516,-2.865649
2,2,Abbey Road,Blackwall,3,599,665,3775.448872,8.236275,3775.448872,2.697948,3.665771,-7.491977,-0.582815,1.674974,2.001578,2.001578,-2.865649
3,3,Abbey Road,Canary Wharf,1,599,58772,5086.514220,8.534348,5086.514220,76.517784,111.884860,-7.763114,-0.785204,0.456027,93.108679,93.108679,-2.865649
4,4,Abbey Road,Canning Town,37,599,15428,2228.923167,7.709274,2228.923167,45.398702,47.768459,-7.012600,-0.344078,26.228320,29.601437,29.601437,-2.865649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61408,61469,Woolwich Arsenal,Tower Gateway,127,7892,3342,13401.795549,9.503144,13401.795549,256.673280,185.497591,-8.644362,-2.068831,16.044338,33.685281,33.685281,-0.549793
61409,61470,Woolwich Arsenal,West Ham,608,7892,5487,8701.454361,9.071245,8701.454361,152.447409,176.151791,-8.251493,-1.343241,158.687054,51.514808,51.514808,-0.549793
61410,61471,Woolwich Arsenal,West India Quay,6,7892,400,9536.720451,9.162905,9536.720451,19.538537,26.589485,-8.334869,-1.472181,1.376548,5.464156,5.464156,-0.549793
61411,61472,Woolwich Arsenal,West Silvertown,81,7892,893,5355.248554,8.585832,5355.248554,115.297649,131.895731,-7.809946,-0.826688,35.437163,10.873394,10.873394,-0.549793


In [109]:
# calculate the way we should be doing it

od_df_merged['flow_pred_model2o_value'] = np.exp(od_df_merged['Ai_model2_offset']  + gamma * np.log(od_df_merged['jobs']) - od_df_merged['beta_dist_exp'])

In [110]:
od_df_merged.pivot_table(values = 'flow_pred_model2o_value', index = 'station_origin', columns = 'station_destination', aggfunc = 'sum', margins = True)

station_destination,Abbey Road,Acton Central,Acton Town,Aldgate,Aldgate East,All Saints,Alperton,Amersham,Anerley,Angel,...,Wimbledon,Wimbledon Park,Wood Green,Wood Lane,Wood Street,Woodford,Woodgrange Park,Woodside Park,Woolwich Arsenal,All
station_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey Road,,,,,,,,,,,...,,,,,,,,,102.451571,2.153981e+03
Acton Central,,,,,,,,,,,...,,,,,,,484.374129,,,4.960614e+04
Acton Town,,,,638.700679,672.231295,,18.384271,1283.668919,,548.116827,...,4.512994e+02,63.586404,645.137662,39.258862,,1070.437886,,482.214881,,1.186469e+05
Aldgate,,,133.337591,,30.735591,,,2217.814343,,43.123974,...,3.962558e+02,,67.350631,52.109417,,49.069177,,64.836302,,3.968131e+04
Aldgate East,,,140.146814,30.693813,,,137.419966,2331.696977,,45.338349,...,4.164916e+02,58.682114,70.809022,54.785186,,39.802625,,68.165584,,3.238935e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Woodford,,,3744.356462,822.184463,667.825385,,,,,1187.548058,...,1.112757e+04,,885.827869,,,,,,,5.028579e+05
Woodgrange Park,,1243.201656,,,,,,,,,...,,,,,,,,,,3.905025e+04
Woodside Park,,,930.960421,599.588948,631.235281,,912.846644,,,388.401599,...,2.766650e+03,,457.151627,,,,,,,1.652824e+05
Woolwich Arsenal,164.062899,,,,,222.509967,,,,,...,,,,,,,,,,5.409344e+04


In [107]:
od_df_merged.pivot_table(values = 'flows', index = 'station_origin', columns = 'station_destination', aggfunc = 'sum', margins = True)

station_destination,Abbey Road,Acton Central,Acton Town,Aldgate,Aldgate East,All Saints,Alperton,Amersham,Anerley,Angel,...,Wimbledon,Wimbledon Park,Wood Green,Wood Lane,Wood Street,Woodford,Woodgrange Park,Woodside Park,Woolwich Arsenal,All
station_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey Road,,,,,,,,,,,...,,,,,,,,,32.0,599
Acton Central,,,,,,,,,,,...,,,,,,,0.0,,,1224
Acton Town,,,,3.0,17.0,,35.0,0.0,,11.0,...,77.0,3.0,6.0,9.0,,0.0,,0.0,,3745
Aldgate,,,0.0,,0.0,,,0.0,,17.0,...,0.0,,4.0,8.0,,0.0,,0.0,,2886
Aldgate East,,,2.0,0.0,,,0.0,0.0,,20.0,...,24.0,0.0,0.0,12.0,,1.0,,1.0,,3172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Woodford,,,2.0,5.0,47.0,,,,,22.0,...,2.0,,1.0,,,,,,,4868
Woodgrange Park,,0.0,,,,,,,,,...,,,,,,,,,,530
Woodside Park,,,1.0,26.0,11.0,,0.0,,,59.0,...,0.0,,0.0,,,,,,,3093
Woolwich Arsenal,20.0,,,,,7.0,,,,,...,,,,,,,,,,7892


## Singly Constrained model

So appently the first way didn't work, although to calibrate the beta value this should be the ideal way to do it.


In [112]:
# origin constrained model without calibrating beta

formula_orig_constrained = 'flows ~ station_origin + np.log(jobs) + exp_cost -1'

orig_model1 = smf.glm(
    formula = formula_orig_constrained,
    data = od_df_new,
    family = sm.families.Poisson()
).fit()

In [121]:
# do the singly constrained model without calibrating the beta value

beta_single = -orig_model1.params['exp_cost']
gamma_single = orig_model1.params['np.log(jobs)']
Ai_single = pd.DataFrame(orig_model1.params[:-2]).reset_index().rename(columns = {'index': 'name', 0:'Ai_single'})

In [127]:
replace = ['station_origin[', ']']

for r in replace:
    Ai_single['name'] = Ai_single['name'].str.replace(r, '')

In [128]:
Ai_single

Unnamed: 0,name,Ai_single
0,Abbey Road,-2.914322
1,Acton Central,-1.162092
2,Acton Town,-1.613081
3,Aldgate,-2.943047
4,Aldgate East,-2.854752
...,...,...
393,Wood Street,-0.942621
394,Woodford,-0.633605
395,Woodgrange Park,-0.896422
396,Woodside Park,-1.149110


In [129]:
od_df_new

Unnamed: 0.1,Unnamed: 0,station_origin,station_destination,flows,population,jobs,distance,pow_cost,exp_cost,pred_pow,pred_exp,beta_dist_pow,beta_dist_exp,flow_beta_dist,flow_pred_model2,flow_pred_model2_offset
0,0,Abbey Road,Bank and Monument,0,599,78549,8131.525097,9.003504,8131.525097,54.840694,76.846663,-8.189873,-1.255261,0.000000,119.376960,119.376960
1,1,Abbey Road,Beckton,1,599,442,8510.121774,9.049012,8510.121774,2.061926,2.677506,-8.231268,-1.313705,0.268822,1.410516,1.410516
2,2,Abbey Road,Blackwall,3,599,665,3775.448872,8.236275,3775.448872,2.697948,3.665771,-7.491977,-0.582815,1.674974,2.001578,2.001578
3,3,Abbey Road,Canary Wharf,1,599,58772,5086.514220,8.534348,5086.514220,76.517784,111.884860,-7.763114,-0.785204,0.456027,93.108679,93.108679
4,4,Abbey Road,Canning Town,37,599,15428,2228.923167,7.709274,2228.923167,45.398702,47.768459,-7.012600,-0.344078,26.228320,29.601437,29.601437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61451,61469,Woolwich Arsenal,Tower Gateway,127,7892,3342,13401.795549,9.503144,13401.795549,256.673280,185.497591,-8.644362,-2.068831,16.044338,33.685281,33.685281
61452,61470,Woolwich Arsenal,West Ham,608,7892,5487,8701.454361,9.071245,8701.454361,152.447409,176.151791,-8.251493,-1.343241,158.687054,51.514808,51.514808
61453,61471,Woolwich Arsenal,West India Quay,6,7892,400,9536.720451,9.162905,9536.720451,19.538537,26.589485,-8.334869,-1.472181,1.376548,5.464156,5.464156
61454,61472,Woolwich Arsenal,West Silvertown,81,7892,893,5355.248554,8.585832,5355.248554,115.297649,131.895731,-7.809946,-0.826688,35.437163,10.873394,10.873394


In [130]:
od_df_merged2 = od_df_new.merge(Ai_single, left_on = 'station_origin', right_on = 'name', how = 'left').drop(columns = 'name').copy()

In [132]:
od_df_merged2['pred_singly'] = orig_model1.mu

In [136]:
# pivot table of prediction
od_df_merged2.pivot_table(values = 'pred_singly', index = 'station_origin', columns = 'station_destination', aggfunc = 'sum', margins = True)

station_destination,Abbey Road,Acton Central,Acton Town,Aldgate,Aldgate East,All Saints,Alperton,Amersham,Anerley,Angel,...,Wimbledon,Wimbledon Park,Wood Green,Wood Lane,Wood Street,Woodford,Woodgrange Park,Woodside Park,Woolwich Arsenal,All
station_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey Road,,,,,,,,,,,...,,,,,,,,,7.476459,5.990000e+02
Acton Central,,,,,,,,,,,...,,,,,,,0.330341,,,1.224000e+03
Acton Town,,,,12.911555,12.622334,,13.676987,0.042334,,15.949025,...,13.335965,2.897455,1.918004,19.609826,,0.225785,,0.539138,,3.745000e+03
Aldgate,,,1.316174,,37.256255,,,0.003402,,27.473160,...,2.097498,,2.495388,2.046833,,0.664719,,0.545704,,2.886000e+03
Aldgate East,,,1.385347,40.112840,,,0.284141,0.003580,,28.909365,...,2.207735,0.479666,2.625838,2.153834,,0.904761,,0.574232,,3.172000e+03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Woodford,,,1.419594,40.998999,51.830424,,,,,30.212414,...,2.262311,,5.712650,,,,,,,4.868000e+03
Woodgrange Park,,1.012373,,,,,,,,,...,,,,,,,,,,5.300000e+02
Woodside Park,,,1.943810,19.300828,18.863474,,0.398684,,,31.515747,...,3.097719,,3.790032,,,,,,,3.093000e+03
Woolwich Arsenal,33.672341,,,,,36.082694,,,,,...,,,,,,,,,,7.892000e+03


In [137]:
# compare with original
od_df_merged2.pivot_table(values = 'flows', index = 'station_origin', columns = 'station_destination', aggfunc = 'sum', margins = True)

station_destination,Abbey Road,Acton Central,Acton Town,Aldgate,Aldgate East,All Saints,Alperton,Amersham,Anerley,Angel,...,Wimbledon,Wimbledon Park,Wood Green,Wood Lane,Wood Street,Woodford,Woodgrange Park,Woodside Park,Woolwich Arsenal,All
station_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey Road,,,,,,,,,,,...,,,,,,,,,32.0,599
Acton Central,,,,,,,,,,,...,,,,,,,0.0,,,1224
Acton Town,,,,3.0,17.0,,35.0,0.0,,11.0,...,77.0,3.0,6.0,9.0,,0.0,,0.0,,3745
Aldgate,,,0.0,,0.0,,,0.0,,17.0,...,0.0,,4.0,8.0,,0.0,,0.0,,2886
Aldgate East,,,2.0,0.0,,,0.0,0.0,,20.0,...,24.0,0.0,0.0,12.0,,1.0,,1.0,,3172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Woodford,,,2.0,5.0,47.0,,,,,22.0,...,2.0,,1.0,,,,,,,4868
Woodgrange Park,,0.0,,,,,,,,,...,,,,,,,,,,530
Woodside Park,,,1.0,26.0,11.0,,0.0,,,59.0,...,0.0,,0.0,,,,,,,3093
Woolwich Arsenal,20.0,,,,,7.0,,,,,...,,,,,,,,,,7892


In [143]:
np.exp(od_df_merged2['Ai_single'] + gamma_single * np.log(od_df_merged2['jobs']) - beta_single * od_df_merged2['exp_cost'])

0         77.685360
1          1.465951
2          4.121373
3         99.484532
4         56.125688
            ...    
61408     98.835050
61409    295.253340
61410     35.953316
61411    125.112902
61412     79.691588
Length: 61413, dtype: float64

In [144]:
beta_single

0.0001531661934631571