## Data Matching - VOA

This notebook carries out the data matching process necessary to geo-reference Valuation Office Agency (VOA) Non-domestic rating data with building polygons. Refer to Section 3.5 of the Dissertation document to review the methodology carried out below. 

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import glob
import matplotlib 
import re
%matplotlib inline

In [2]:
# Read geojson file of buildings 
buildings= gpd.read_file("Data/BuildingData/FinalBuildings.geojson")

In [None]:
cols=['record_type',
      'ref',
      'UARN',
      'billing_authority_code',
      'firm_name',
      'number_name',
      'subst3',
      'subst2',
      'subst1',
      'street',
      'town',
      'postal_district',
      'county',
      'postcode',
      'scheme_reference',
      'primary_description',
      'totalarea',
      'subtotal',
      'totalvalue',
      'adoptedrv',
      'listyear',
      'baname',
      'baref',
      'voref',
      'from_date',
      'to_date',
      'SCAT_code',
      'uom',
      'unadjusted_price']

### !!! Reading VOA data

VOA data was downloaded from here: https://voaratinglists.blob.core.windows.net/downloads/uk-englandwales-ndr-2017-summaryvaluations-compiled-epoch-0026-baseline-csv.zip

As this datafile is very large, it has not been included in the GitHub repository

In [3]:
# Read VOA ratings - this was obtained from the Valuation Office Agency https://voaratinglists.blob.core.windows.net/html/rlidata.htm
voa= pd.read_csv('Data/BuildingData/voa/VOA_summary_valuations.csv',sep='*')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
voa.head()

Unnamed: 0,01,21045502000,12314845000,0335,The Occupier,SUITE 4 3RD FLR AT EASTHAMPSTEAD HOUSE,Unnamed: 6,Unnamed: 7,Unnamed: 8,TOWN SQUARE,...,8400,2017,Bracknell Forest,00277000103007/39T,33281154281,16-SEP-2019,Unnamed: 25,203,NIA,140.00
0,2,1,Third,Office,45.00,187.49,8437.0,,,,...,,,,,,,,,,
1,1,21045503000,12314846000,0335,The Occupier,SUITE 5 3RD FLR AT EASTHAMPSTEAD HOUSE,,,,TOWN SQUARE,...,12000.0,2017.0,Bracknell Forest,00277000103007/40T,33281150000.0,16-SEP-2019,,203.0,NIA,130.0
2,2,1,Third,Office,70.00,174.10,12187.0,,,,...,,,,,,,,,,
3,1,21045618000,12314847000,0335,The Occupier,SUITE 6 3RD FLR AT EASTHAMPSTEAD HOUSE,,,,TOWN SQUARE,...,3900.0,2017.0,Bracknell Forest,00277000103007/41T,33281150000.0,16-SEP-2019,,203.0,NIA,140.0
4,2,1,Third,Office,21.00,187.49,3937.0,,,,...,,,,,,,,,,


In [5]:
#rename columns
def rename_voa(df):
    # rename column headers
    d = {'old': df.columns}
    df_dict = pd.DataFrame(data=d)
    df_dict.set_index('old', inplace=True)

    df_dict['new']=['record_type',
      'ref',
      'UARN',
      'billing_authority_code',
      'firm_name',
      'number_name',
      'subst3',
      'subst2',
      'subst1',
      'street',
      'town',
      'postal_district',
      'county',
      'postcode',
      'scheme_reference',
      'primary_description',
      'totalarea',
      'subtotal',
      'totalvalue',
      'adoptedrv',
      'listyear',
      'baname',
      'baref',
      'voref',
      'from_date',
      'to_date',
      'SCAT_code',
      'uom',
      'unadjusted_price']

    # convert dataframe to dictionary
    col_dict = df_dict['new'].to_dict()
    df.rename(columns = col_dict, inplace=True)
    
    return df, df_dict

In [6]:
# rename column headers using function defined above
df, df_dict = rename_voa(voa) 

In [7]:
#separate out the two different record types - we only need first
voa=df[df.record_type == 1]

In [8]:
# drop columns that aren't relevant
voa=voa.drop(columns=['record_type',
      'scheme_reference',
      'subtotal',
      'totalvalue',
      'adoptedrv',
      'listyear',
      'baname',
      'baref',
      'voref',
      'from_date',
      'to_date',
      'uom',
      'unadjusted_price'])


In [10]:
#select only records in westminster
voa_westminster=voa[voa.billing_authority_code == '5990']

In [11]:
#replace nans with empty space
voa_westminster = voa_westminster.replace(np.nan, '', regex=True)


In [12]:
#join addresses
voa_westminster['LINE_ADDRESS'] = voa_westminster[['number_name', 'subst1', 'subst2','subst3','street']].agg(' '.join, axis=1)
voa_westminster['LINE_ADDRESS']=voa_westminster['LINE_ADDRESS'].str.strip()

In [14]:
#now read the AddressBase files 
path = "./Data/AddressBase" # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename,  index_col=None, header=0)
    li.append(df)


AddressBase = pd.concat(li, axis=0, ignore_index=True)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
#Simplify with what we want 
AddressBase=AddressBase[["UPRN","TOID","SINGLE_LINE_ADDRESS","SUB_BUILDING","BUILDING_NAME","BUILDING_NUMBER","STREET_NAME","POSTCODE"]]

In [16]:
#remove commas 
AddressBase["SINGLE_LINE_ADDRESS"]= AddressBase["SINGLE_LINE_ADDRESS"].str.replace(",", "")

In [17]:
#Now we want to match postcodes 
VOA_UPRN= pd.merge(voa_westminster,AddressBase,how='inner', left_on='postcode', right_on='POSTCODE')

In [18]:
#replace double spaces
VOA_UPRN['SINGLE_LINE_ADDRESS'] = VOA_UPRN['SINGLE_LINE_ADDRESS'].str.replace('\s+', ' ', regex=True)
VOA_UPRN['LINE_ADDRESS'] = VOA_UPRN['LINE_ADDRESS'].str.replace('\s+', ' ', regex=True)
VOA_UPRN['LINE_ADDRESS'] =VOA_UPRN['LINE_ADDRESS'].str.replace('&','AND')

In [19]:
#Define words that need to be replaced
repl = {'1ST' : 'FIRST',
       '2ND':'SECOND',
       '3RD':'THIRD',
       '4TH':'FOURTH',
        '5TH':'FIFTH',
        '6TH':'SIXTH',
        '7TH':'SEVENTH',
        '8TH':'EIGHTH',
        '9TH':'NINTH',
       'FLR':'FLOOR',
       'FLRS':'FLOOR',
        'HSE':'HOUSE',
        'GND':'GROUND',
       'GRND':'GROUND',
       'GD':'GROUND',
       'GRD':'GROUND',
       'MEZZ':'MEZZANINE',
       'BST':'BASEMENT'}

repl = {rf'\b{k}\b': v for k, v in repl.items()}

VOA_UPRN['LINE_ADDRESS'] =VOA_UPRN['LINE_ADDRESS'].replace(repl, regex=True)

In [20]:
VOA_UPRN["VOA_address"]=VOA_UPRN["LINE_ADDRESS"].str.split(" ")
VOA_UPRN["UPRN_address"]=VOA_UPRN["SINGLE_LINE_ADDRESS"].str.split(" ")
VOA_UPRN['voa_number']=VOA_UPRN["number_name"].str.split(" ").str[-1]

In [21]:
#remove the last 3 elements in the single line address
VOA_UPRN['UPRN_address'] = VOA_UPRN['UPRN_address'].str[:-3]

In [25]:
#calculate number of common words
row=VOA_UPRN.shape[0]
sets=[]

for i in range (0,row):
    test=set(VOA_UPRN['VOA_address'].iloc[i])&set(VOA_UPRN['UPRN_address'].iloc[i])
    sets.append(test)

In [26]:
VOA_UPRN["sets"]=sets


In [27]:
VOA_UPRN['coefficient']=VOA_UPRN['sets'].str.len()/VOA_UPRN['UPRN_address'].str.len()

In [28]:
VOA_UPRN['coefficient2']=VOA_UPRN['sets'].str.len()/VOA_UPRN['VOA_address'].str.len()

In [29]:
#Group by and get the maximum coefficient for each VOA rating
Group= VOA_UPRN.groupby(['ref'], sort=False)['coefficient'].max()

#Make a dataframe
Group=Group.to_frame().reset_index()

In [30]:
idx = VOA_UPRN.groupby(['ref'])['coefficient'].transform(max) == VOA_UPRN['coefficient']

Group=VOA_UPRN[idx]

In [31]:
#get the ones that match street numbers
Number_match=Group[Group.voa_number == Group.BUILDING_NUMBER]

In [32]:
idx = Number_match.groupby(['UPRN'])['coefficient'].transform(max) == Number_match['coefficient']

test=Number_match[idx]


In [33]:
test['length']=test['sets'].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [34]:
idx = test.groupby(['UPRN'])['length'].transform(max) == test['length']

test=test[idx]

id2=test.groupby(['ref'])['length'].transform(max) == test['length']

test=test[id2]


In [35]:
final_voa_westminster = test

In [37]:
#merge with geometry data
#merge with original certificate data
voa_matched_final=pd.merge(buildings,final_voa_westminster,how='inner', left_on='fid',right_on='TOID')

In [38]:
#get building use categories -- Obtained from https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/449629/Special_category__primary_description__property_types_and_sector_mappings.xls
scat_codes=pd.read_csv('Data/BuildingData/SCAT_Codes.csv')

In [58]:
scat_codes.head()

Unnamed: 0,SCat,SCat_Suffix,SCat_Desc,Primary,Primary_Desc,Broad Property 1,Detailed Property 1,Broad Property 2,Sector,Sub-sector
0,3,G,Advertising Right,CA,Advertising Right & Premises,Commercial,Advertising rights,Other properties,Other,OTHER
1,3,G,Advertising Right,CA1,Advertising Station & Premises,Commercial,Advertising rights,Other properties,Other,OTHER
2,18,G,ATMs,CX,Commercial (Unclassified),Commercial,Other commercial,Other properties,Other,OTHER - RETAIL
3,19,G,Auction Rooms,CX,Commercial (Unclassified),Commercial,Other commercial,Other properties,Other,OTHER - RETAIL
4,21,G,Banks/Insurance/Building Society Offices & Oth...,CO,Offices & Premises,Commercial,Offices,Offices,Retail,RETAIL - FINANCIAL & PROFESSIONAL SERVICES


In [39]:
#select columns we want
scat_codes=scat_codes[['SCat','Detailed Property 1']]

In [40]:
#remove duplicates
scat_codes.drop_duplicates('SCat', inplace = True)

In [41]:
# now match this with voa matched
voa_matched_final=pd.merge(voa_matched_final,scat_codes,how='left',left_on='SCAT_code',right_on='SCat')

In [43]:
#rename column
voa_matched_final = voa_matched_final.rename(columns={'Detailed Property 1': 'dp1'})

In [64]:
voa_matched_final['Detailed Property 1'].unique()

array(['Shops', 'Warehouses & stores',
       'Other educational, training and cultural', 'Medical facilities',
       'Offices', 'Local government offices', 'Pubs & wine bars',
       'Restaurants & cafes', 'Garages & petrol stations',
       'Community centres & halls', 'Advertising rights',
       'Factories, mills & workshops', 'Universities ',
       'Other industrial', 'Private schools & colleges',
       'Other commercial', 'Cinemas, theatres etc.', 'Car parks',
       'Other properties', 'Other leisure', 'Hotels etc.',
       'Hostels & homes', 'Sports centres & stadia',
       'Local authority schools & colleges', 'Libraries and museums',
       'Bus stations, moorings etc.', 'Police stations & courts'],
      dtype=object)

In [44]:
#create a dictionary now, mapping out Detailed property description to the building use categories used by the ND-Need methodology - Deatils here:https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/936797/ND-NEED.pdf#page=39&zoom=100,57,94

use_mapping = {'Shops': 'Shops', 
               'Warehouses & stores': 'Warehouses',
               'Other educational, training and cultural': 'Education',
               'Medical facilities': 'Health',
                'Offices': 'Offices',
               'Local government offices': 'Offices',
               'Pubs & wine bars': 'Hospitality',
               'Restaurants & cafes':'Hospitality',
               'Garages & petrol stations':'Other',
               'Community centres & halls':'Arts, Community and Leisure',
                'Advertising rights': 'Other',
               'Factories, mills & workshops':'Factories',
               'Universities':'Education',
               'Other industrial':'Other',
               'Private schools & colleges':'Education',
               'Other commercial':'Shops',
               'Cinemas, theatres etc.':'Arts, Community and Leisure',
               'Car parks':'Other',
               'Other properties':'Other',
               'Other leisure':'Arts, Community and Leisure',
               'Hotels etc.':'Hospitality',
               'Hostels & homes':'Hospitality',
               'Sports centres & stadia':'Arts, Community and Leisure',
               'Local authority schools & colleges':'Education',
               'Libraries and museums':'Arts, Community and Leisure',
               'Bus stations, moorings etc.':'Other',
               'Police stations & courts':'Emergency Services'
                }


voa_matched_final['building_use'] = voa_matched_final.dp1.map(use_mapping)

In [45]:
#finally drop any duplicate VOA records
voa_matched_final.drop_duplicates('ref', inplace = True)

In [46]:
# drop columns that aren't relevant
voa_matched_final=voa_matched_final.drop(columns=['sets',
      'UPRN_address',
      'VOA_address',
      'coefficient',
      'coefficient2',
      'length',
      'SCat'])

In [47]:
#write as geojson
voa_matched_final.to_file("Data/BuildingData/buildings_VOA.geojson", driver='GeoJSON')
