## Data Matching - DEC

This notebook carries out the data matching process necessary to geo-reference Display Energy Certificate (DEC) data with building polygons. Refer to Section 3.5 of the Dissertation document to review the methodology carried out below. 

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import glob
import matplotlib 
%matplotlib inline

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [6]:
# Read geojson file of buildings 
buildings= gpd.read_file("Data/BuildingData/FinalBuildings.geojson")


In [7]:
buildings.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 51586 entries, 0 to 51585
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   fid                  51586 non-null  object  
 1   featurecode          51586 non-null  int64   
 2   version              51586 non-null  int64   
 3   versiondate          51586 non-null  object  
 4   theme                51586 non-null  object  
 5   calculatedareavalue  51586 non-null  object  
 6   changedate           51586 non-null  object  
 7   reasonforchange      51586 non-null  object  
 8   descriptivegroup     51586 non-null  object  
 9   descriptiveterm      51586 non-null  object  
 10  make                 51586 non-null  object  
 11  physicallevel        51586 non-null  int64   
 12  physicalpresence     51586 non-null  object  
 13  poly_broken          51586 non-null  object  
 14  NUMPOINTS            51586 non-null  float64 
 15  geometry   

In [8]:
#Remove unwanted columns
#colums to be deleted: 
delete=["poly_broken","physicalpresence","physicallevel","make","descriptiveterm","descriptivegroup","reasonforchange","changedate","versiondate","theme"]

#drop those columns
buildings.drop(delete, axis=1, inplace=True)

In [9]:
#!! now read the AddressBase files 
path = "./Data/AddressBase" # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename,  index_col=None, header=0)
    li.append(df)


AddressBase = pd.concat(li, axis=0, ignore_index=True)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
#Let's try to join UPRN and buildings by TOID

AddressBaseSimple=AddressBase[['TOID','UPRN']]

BuildingsUPRN=pd.merge(buildings, AddressBaseSimple, how='left', left_on='fid',right_on='TOID')

#Remove NA values
BuildingsUPRN.dropna(inplace=True)

In [12]:
#Now try to flatten database and get UPRNS into a list
BuildingsUPRN.head(10)

#convert UPRN to string to match
BuildingsUPRN["UPRN"] = BuildingsUPRN["UPRN"].astype(np.int64)
BuildingsUPRN["UPRN"] = BuildingsUPRN["UPRN"].astype(str)

In [13]:
#creates a list of UPRNs for each fid
fidUPRN = BuildingsUPRN.groupby(['fid'])['UPRN'].apply(' , '.join).reset_index()

In [14]:

#Simplify with what we want 
AddressBase=AddressBase[["UPRN","SINGLE_LINE_ADDRESS","SUB_BUILDING","BUILDING_NAME","BUILDING_NUMBER","STREET_NAME","POSTCODE"]]



## MATCH FULL DEC DATASET

In [15]:
#read full epc dataset - Downloaded fro westminster from here: https://epc.opendatacommunities.org/

DEC= pd.read_csv('Data/EnergyData/DEC_westminster/certificates.csv')

DEC.head()


Unnamed: 0,LMK_KEY,ADDRESS1,ADDRESS2,ADDRESS3,POSTCODE,BUILDING_REFERENCE_NUMBER,CURRENT_OPERATIONAL_RATING,YR1_OPERATIONAL_RATING,YR2_OPERATIONAL_RATING,OPERATIONAL_RATING_BAND,ELECTRIC_CO2,HEATING_CO2,RENEWABLES_CO2,PROPERTY_TYPE,INSPECTION_DATE,LOCAL_AUTHORITY,CONSTITUENCY,COUNTY,LODGEMENT_DATE,MAIN_BENCHMARK,MAIN_HEATING_FUEL,OTHER_FUEL,SPECIAL_ENERGY_USES,RENEWABLE_SOURCES,TOTAL_FLOOR_AREA,ANNUAL_THERMAL_FUEL_USAGE,TYPICAL_THERMAL_FUEL_USAGE,ANNUAL_ELECTRICAL_FUEL_USAGE,TYPICAL_ELECTRICAL_FUEL_USAGE,RENEWABLES_FUEL_THERMAL,RENEWABLES_ELECTRICAL,YR1_ELECTRICITY_CO2,YR2_ELECTRICITY_CO2,YR1_HEATING_CO2,YR2_HEATING_CO2,YR1_RENEWABLES_CO2,YR2_RENEWABLES_CO2,AIRCON_PRESENT,AIRCON_KW_RATING,ESTIMATED_AIRCON_KW_RATING,AC_INSPECTION_COMMISSIONED,BUILDING_ENVIRONMENT,BUILDING_CATEGORY,ADDRESS,LOCAL_AUTHORITY_LABEL,CONSTITUENCY_LABEL,POSTTOWN,NOMINATED_DATE,OR_ASSESSMENT_END_DATE,LODGEMENT_DATETIME,OCCUPANCY_LEVEL
0,56019860062013032514282749030111,MAIN BUILDING,Westminster Adult Education Service,"Amberley Road Centre, Amberley Road",W9 2JJ,143861960001,99.0,101.0,102.0,D,210.0,76.0,0.0,University Campus,2011-07-09,E09000033,E14001036,Greater London Authority,2013-03-25,University Campus,Natural Gas,,,,3342.0,117.0,219.0,114.0,80.0,0.0,0.0,215.0,295.0,84.0,106.0,0.0,0.0,No,,,5.0,Heating and Natural Ventilation,S4;,"MAIN BUILDING, Westminster Adult Education Ser...",Westminster,Westminster North,LONDON,2012-07-10,2012-07-03,2013-03-25 14:28:27,Standard Occupancy
1,26484680062012030912251924070120,,National Audit Office,157-197 Buckingham Palace Road,SW1W 9SP,225383410000,110.0,104.0,,E,1714.0,43.0,0.0,General Office; Restaurant,2012-03-09,E09000033,E14000639,Greater London Authority,2012-03-09,General Office,Natural Gas,,,,20178.5,11.0,115.0,154.0,103.0,0.0,0.0,1668.0,,56.0,,0.0,,Yes,,3.0,5.0,Air Conditioning,C1; H1;,"National Audit Office, 157-197 Buckingham Pala...",Westminster,Cities of London and Westminster,LONDON,2012-03-31,2012-01-31,2012-03-09 12:25:19,Standard Occupancy
2,8664750962017113012480995290120,,Soho Fire Station,126-128 Shaftesbury Avenue,W1D 5ET,290855580000,65.0,,,C,47.0,61.0,0.0,Emergency Services,2017-11-29,E09000033,E14000639,Greater London Authority,2017-11-30,Emergency Services,Natural Gas,,,,1565.0,,,,,,,,,,,,,No,,,4.0,Heating and Natural Ventilation,S9;,"Soho Fire Station, 126-128 Shaftesbury Avenue",Westminster,Cities of London and Westminster,LONDON,2017-11-30,2017-09-02,2017-11-30 12:48:09,Standard Occupancy
3,5227060962018122012490377920140,,St. Marys Hospital Medical School,Norfolk Place,W2 1PG,477360730000,111.0,111.0,117.0,E,2167.0,635.0,0.0,Laboratory Or Operating Theatre,2018-12-19,E09000033,E14000639,Greater London Authority,2018-12-20,Laboratory Or Operating Theatre,Natural Gas,,,,16720.0,196.0,171.0,236.0,213.0,0.0,0.0,1823.0,1842.0,950.0,1066.0,0.0,0.0,Yes,,3.0,5.0,Air Conditioning,S10;,"St. Marys Hospital Medical School, Norfolk Place",Westminster,Cities of London and Westminster,LONDON,2018-12-11,2018-11-30,2018-12-20 12:49:03,Extended Occupancy
4,120427960022017061210281420060141,MAIN BUILDING,St. Gabriel's School,Churchill Gardens,SW1V 3AG,421361060001,70.0,53.0,45.0,C,14.0,37.0,0.0,Schools And Seasonal Public Buildings,2017-06-09,E09000033,E14000639,Greater London Authority,2017-06-12,Schools And Seasonal Public Buildings,District Heating,,,,1470.0,119.0,137.0,17.0,40.0,0.0,0.0,4.0,2.0,34.0,40.0,0.0,0.0,No,,,4.0,Heating and Natural Ventilation,S3;,"MAIN BUILDING, St. Gabriel's School, Churchill...",Westminster,Cities of London and Westminster,LONDON,2017-06-30,2017-04-01,2017-06-12 10:28:14,Standard Occupancy


In [16]:
import numpy as np
#clean up string data

DEC_ob = DEC.select_dtypes(['object'])
DEC[DEC_ob.columns] = DEC[DEC_ob.columns].replace(np.nan, '', regex=True)

DEC.head()

Unnamed: 0,LMK_KEY,ADDRESS1,ADDRESS2,ADDRESS3,POSTCODE,BUILDING_REFERENCE_NUMBER,CURRENT_OPERATIONAL_RATING,YR1_OPERATIONAL_RATING,YR2_OPERATIONAL_RATING,OPERATIONAL_RATING_BAND,ELECTRIC_CO2,HEATING_CO2,RENEWABLES_CO2,PROPERTY_TYPE,INSPECTION_DATE,LOCAL_AUTHORITY,CONSTITUENCY,COUNTY,LODGEMENT_DATE,MAIN_BENCHMARK,MAIN_HEATING_FUEL,OTHER_FUEL,SPECIAL_ENERGY_USES,RENEWABLE_SOURCES,TOTAL_FLOOR_AREA,ANNUAL_THERMAL_FUEL_USAGE,TYPICAL_THERMAL_FUEL_USAGE,ANNUAL_ELECTRICAL_FUEL_USAGE,TYPICAL_ELECTRICAL_FUEL_USAGE,RENEWABLES_FUEL_THERMAL,RENEWABLES_ELECTRICAL,YR1_ELECTRICITY_CO2,YR2_ELECTRICITY_CO2,YR1_HEATING_CO2,YR2_HEATING_CO2,YR1_RENEWABLES_CO2,YR2_RENEWABLES_CO2,AIRCON_PRESENT,AIRCON_KW_RATING,ESTIMATED_AIRCON_KW_RATING,AC_INSPECTION_COMMISSIONED,BUILDING_ENVIRONMENT,BUILDING_CATEGORY,ADDRESS,LOCAL_AUTHORITY_LABEL,CONSTITUENCY_LABEL,POSTTOWN,NOMINATED_DATE,OR_ASSESSMENT_END_DATE,LODGEMENT_DATETIME,OCCUPANCY_LEVEL
0,56019860062013032514282749030111,MAIN BUILDING,Westminster Adult Education Service,"Amberley Road Centre, Amberley Road",W9 2JJ,143861960001,99.0,101.0,102.0,D,210.0,76.0,0.0,University Campus,2011-07-09,E09000033,E14001036,Greater London Authority,2013-03-25,University Campus,Natural Gas,,,,3342.0,117.0,219.0,114.0,80.0,0.0,0.0,215.0,295.0,84.0,106.0,0.0,0.0,No,,,5.0,Heating and Natural Ventilation,S4;,"MAIN BUILDING, Westminster Adult Education Ser...",Westminster,Westminster North,LONDON,2012-07-10,2012-07-03,2013-03-25 14:28:27,Standard Occupancy
1,26484680062012030912251924070120,,National Audit Office,157-197 Buckingham Palace Road,SW1W 9SP,225383410000,110.0,104.0,,E,1714.0,43.0,0.0,General Office; Restaurant,2012-03-09,E09000033,E14000639,Greater London Authority,2012-03-09,General Office,Natural Gas,,,,20178.5,11.0,115.0,154.0,103.0,0.0,0.0,1668.0,,56.0,,0.0,,Yes,,3.0,5.0,Air Conditioning,C1; H1;,"National Audit Office, 157-197 Buckingham Pala...",Westminster,Cities of London and Westminster,LONDON,2012-03-31,2012-01-31,2012-03-09 12:25:19,Standard Occupancy
2,8664750962017113012480995290120,,Soho Fire Station,126-128 Shaftesbury Avenue,W1D 5ET,290855580000,65.0,,,C,47.0,61.0,0.0,Emergency Services,2017-11-29,E09000033,E14000639,Greater London Authority,2017-11-30,Emergency Services,Natural Gas,,,,1565.0,,,,,,,,,,,,,No,,,4.0,Heating and Natural Ventilation,S9;,"Soho Fire Station, 126-128 Shaftesbury Avenue",Westminster,Cities of London and Westminster,LONDON,2017-11-30,2017-09-02,2017-11-30 12:48:09,Standard Occupancy
3,5227060962018122012490377920140,,St. Marys Hospital Medical School,Norfolk Place,W2 1PG,477360730000,111.0,111.0,117.0,E,2167.0,635.0,0.0,Laboratory Or Operating Theatre,2018-12-19,E09000033,E14000639,Greater London Authority,2018-12-20,Laboratory Or Operating Theatre,Natural Gas,,,,16720.0,196.0,171.0,236.0,213.0,0.0,0.0,1823.0,1842.0,950.0,1066.0,0.0,0.0,Yes,,3.0,5.0,Air Conditioning,S10;,"St. Marys Hospital Medical School, Norfolk Place",Westminster,Cities of London and Westminster,LONDON,2018-12-11,2018-11-30,2018-12-20 12:49:03,Extended Occupancy
4,120427960022017061210281420060141,MAIN BUILDING,St. Gabriel's School,Churchill Gardens,SW1V 3AG,421361060001,70.0,53.0,45.0,C,14.0,37.0,0.0,Schools And Seasonal Public Buildings,2017-06-09,E09000033,E14000639,Greater London Authority,2017-06-12,Schools And Seasonal Public Buildings,District Heating,,,,1470.0,119.0,137.0,17.0,40.0,0.0,0.0,4.0,2.0,34.0,40.0,0.0,0.0,No,,,4.0,Heating and Natural Ventilation,S3;,"MAIN BUILDING, St. Gabriel's School, Churchill...",Westminster,Cities of London and Westminster,LONDON,2017-06-30,2017-04-01,2017-06-12 10:28:14,Standard Occupancy


In [17]:
#we need to deal with properties where more than one certificate exists - we want the latest property
DEC=DEC.sort_values('INSPECTION_DATE').groupby('BUILDING_REFERENCE_NUMBER').tail(1)

In [18]:
#replace comma with space
AddressBase["SINGLE_LINE_ADDRESS"]= AddressBase["SINGLE_LINE_ADDRESS"].str.replace(",", "")

In [19]:
#convert EPC addresses into string and remove any extra spaces on either side of string

DEC['ADDRESS1'] = DEC['ADDRESS1'].astype(str).str.upper()
DEC['ADDRESS2'] = DEC['ADDRESS2'].astype(str).str.upper()
DEC['ADDRESS3'] = DEC['ADDRESS3'].astype(str).str.upper()

DEC_ob = DEC.select_dtypes(['object'])
DEC[DEC_ob.columns] = DEC_ob.apply(lambda x: x.str.strip())

In [20]:
#join addresses
DEC['LINE_ADDRESS'] = DEC[['ADDRESS1', 'ADDRESS2', 'ADDRESS3']].agg(' '.join, axis=1)
DEC['LINE_ADDRESS']=DEC['LINE_ADDRESS'].str.replace(',','')
DEC['LINE_ADDRESS']=DEC['LINE_ADDRESS'].str.strip()

In [21]:
#Now we want to match postcodes 
DEC_UPRN= pd.merge(DEC,AddressBase,how='inner', on='POSTCODE')

In [22]:
#replace double spaces
DEC_UPRN['SINGLE_LINE_ADDRESS'] = DEC_UPRN['SINGLE_LINE_ADDRESS'].str.replace('\s+', ' ', regex=True)

In [23]:
#split address into words
DEC_UPRN["DEC_address"]=DEC_UPRN["LINE_ADDRESS"].str.split(" ")
DEC_UPRN["UPRN_address"]=DEC_UPRN["SINGLE_LINE_ADDRESS"].str.split(" ")  

In [24]:
#remove the last 3 elements in the single line address
DEC_UPRN['UPRN_address'] = DEC_UPRN['UPRN_address'].str[:-3]

In [25]:
#count number of words in common
row=DEC_UPRN.shape[0]
sets=[]

for i in range (0,row):
    test=set(DEC_UPRN['DEC_address'].iloc[i])&set(DEC_UPRN['UPRN_address'].iloc[i])
    sets.append(test)

In [26]:
DEC_UPRN["sets"]=sets


In [27]:
DEC_UPRN.head()

Unnamed: 0,LMK_KEY,ADDRESS1,ADDRESS2,ADDRESS3,POSTCODE,BUILDING_REFERENCE_NUMBER,CURRENT_OPERATIONAL_RATING,YR1_OPERATIONAL_RATING,YR2_OPERATIONAL_RATING,OPERATIONAL_RATING_BAND,ELECTRIC_CO2,HEATING_CO2,RENEWABLES_CO2,PROPERTY_TYPE,INSPECTION_DATE,LOCAL_AUTHORITY,CONSTITUENCY,COUNTY,LODGEMENT_DATE,MAIN_BENCHMARK,MAIN_HEATING_FUEL,OTHER_FUEL,SPECIAL_ENERGY_USES,RENEWABLE_SOURCES,TOTAL_FLOOR_AREA,ANNUAL_THERMAL_FUEL_USAGE,TYPICAL_THERMAL_FUEL_USAGE,ANNUAL_ELECTRICAL_FUEL_USAGE,TYPICAL_ELECTRICAL_FUEL_USAGE,RENEWABLES_FUEL_THERMAL,RENEWABLES_ELECTRICAL,YR1_ELECTRICITY_CO2,YR2_ELECTRICITY_CO2,YR1_HEATING_CO2,YR2_HEATING_CO2,YR1_RENEWABLES_CO2,YR2_RENEWABLES_CO2,AIRCON_PRESENT,AIRCON_KW_RATING,ESTIMATED_AIRCON_KW_RATING,AC_INSPECTION_COMMISSIONED,BUILDING_ENVIRONMENT,BUILDING_CATEGORY,ADDRESS,LOCAL_AUTHORITY_LABEL,CONSTITUENCY_LABEL,POSTTOWN,NOMINATED_DATE,OR_ASSESSMENT_END_DATE,LODGEMENT_DATETIME,OCCUPANCY_LEVEL,LINE_ADDRESS,UPRN,SINGLE_LINE_ADDRESS,SUB_BUILDING,BUILDING_NAME,BUILDING_NUMBER,STREET_NAME,DEC_address,UPRN_address,sets
0,7528090022008121712371638030210,,UNIVERSITY COLLEGE LONDON NHS FOUNDATION TRUST,140 HAMPSTEAD ROAD,W1W 6DN,136793870000,0.0,,,A,338.0,1.0,0.0,Information not provided,2008-07-02,E09000033,E14000639,Greater London Authority,2008-12-17,CAN'T FIND IN DIRECTORY,Natural Gas,,,,6300.0,0.0,204.0,97.0,65.0,0.0,0.0,,,,,,,,,,,Mixed-mode with Natural Ventilation,C1;,University College London NHS Foundation Trust...,Westminster,Cities of London and Westminster,LONDON,2008-07-02,2008-07-01,2008-12-17 12:37:16,,UNIVERSITY COLLEGE LONDON NHS FOUNDATION TRUST...,100023469087,JOHN ASTOR HOUSE 3-11 FOLEY STREET LONDON W1W 6DN,,JOHN ASTOR HOUSE,3-11,FOLEY STREET,"[UNIVERSITY, COLLEGE, LONDON, NHS, FOUNDATION,...","[JOHN, ASTOR, HOUSE, 3-11, FOLEY, STREET]",{}
1,6840524012008121712373308000592,,CENTRAL AND NORTH WEST LONDON NHS FOUNDATION T...,7A WOODFIELD ROAD,W9 2BA,527748190000,86.0,,,D,280.0,187.0,0.0,Information not provided,2008-07-02,E09000033,E14001036,Greater London Authority,2008-12-17,CAN'T FIND IN DIRECTORY,Natural Gas,,,,3890.0,253.0,348.0,131.0,135.0,0.0,0.0,,,,,,,,,,,Mixed-mode with Natural Ventilation,C1;,Central and North West London NHS Foundation T...,Westminster,Westminster North,LONDON,2008-07-02,2008-07-01,2008-12-17 12:37:33,,CENTRAL AND NORTH WEST LONDON NHS FOUNDATION T...,10033556886,FLAT 1 42 WOODFIELD ROAD LONDON W9 2BA,FLAT 1,,42,WOODFIELD ROAD,"[CENTRAL, AND, NORTH, WEST, LONDON, NHS, FOUND...","[FLAT, 1, 42, WOODFIELD, ROAD]","{ROAD, WOODFIELD}"
2,6840524012008121712373308000592,,CENTRAL AND NORTH WEST LONDON NHS FOUNDATION T...,7A WOODFIELD ROAD,W9 2BA,527748190000,86.0,,,D,280.0,187.0,0.0,Information not provided,2008-07-02,E09000033,E14001036,Greater London Authority,2008-12-17,CAN'T FIND IN DIRECTORY,Natural Gas,,,,3890.0,253.0,348.0,131.0,135.0,0.0,0.0,,,,,,,,,,,Mixed-mode with Natural Ventilation,C1;,Central and North West London NHS Foundation T...,Westminster,Westminster North,LONDON,2008-07-02,2008-07-01,2008-12-17 12:37:33,,CENTRAL AND NORTH WEST LONDON NHS FOUNDATION T...,10033556887,FLAT 2 42 WOODFIELD ROAD LONDON W9 2BA,FLAT 2,,42,WOODFIELD ROAD,"[CENTRAL, AND, NORTH, WEST, LONDON, NHS, FOUND...","[FLAT, 2, 42, WOODFIELD, ROAD]","{ROAD, WOODFIELD}"
3,6840524012008121712373308000592,,CENTRAL AND NORTH WEST LONDON NHS FOUNDATION T...,7A WOODFIELD ROAD,W9 2BA,527748190000,86.0,,,D,280.0,187.0,0.0,Information not provided,2008-07-02,E09000033,E14001036,Greater London Authority,2008-12-17,CAN'T FIND IN DIRECTORY,Natural Gas,,,,3890.0,253.0,348.0,131.0,135.0,0.0,0.0,,,,,,,,,,,Mixed-mode with Natural Ventilation,C1;,Central and North West London NHS Foundation T...,Westminster,Westminster North,LONDON,2008-07-02,2008-07-01,2008-12-17 12:37:33,,CENTRAL AND NORTH WEST LONDON NHS FOUNDATION T...,100022821522,UNION TAVERN 45 WOODFIELD ROAD LONDON W9 2BA,,,45,WOODFIELD ROAD,"[CENTRAL, AND, NORTH, WEST, LONDON, NHS, FOUND...","[UNION, TAVERN, 45, WOODFIELD, ROAD]","{ROAD, WOODFIELD}"
4,6840524012008121712373308000592,,CENTRAL AND NORTH WEST LONDON NHS FOUNDATION T...,7A WOODFIELD ROAD,W9 2BA,527748190000,86.0,,,D,280.0,187.0,0.0,Information not provided,2008-07-02,E09000033,E14001036,Greater London Authority,2008-12-17,CAN'T FIND IN DIRECTORY,Natural Gas,,,,3890.0,253.0,348.0,131.0,135.0,0.0,0.0,,,,,,,,,,,Mixed-mode with Natural Ventilation,C1;,Central and North West London NHS Foundation T...,Westminster,Westminster North,LONDON,2008-07-02,2008-07-01,2008-12-17 12:37:33,,CENTRAL AND NORTH WEST LONDON NHS FOUNDATION T...,10033618054,43 WOODFIELD ROAD LONDON W9 2BA,,,43,WOODFIELD ROAD,"[CENTRAL, AND, NORTH, WEST, LONDON, NHS, FOUND...","[43, WOODFIELD, ROAD]","{ROAD, WOODFIELD}"


In [29]:
#calculate string similarity
DEC_UPRN['coefficient']=DEC_UPRN['sets'].str.len()/DEC_UPRN['DEC_address'].str.len()

In [30]:
#Group by and get the maximum coefficient for each DEC certificate
Group= DEC_UPRN.groupby(['BUILDING_REFERENCE_NUMBER'], sort=False)['coefficient'].max()

#Make a dataframe
Group=Group.to_frame().reset_index()

In [31]:
idx = DEC_UPRN.groupby(['BUILDING_REFERENCE_NUMBER'])['coefficient'].transform(max) == DEC_UPRN['coefficient']

Group=DEC_UPRN[idx]

In [32]:
Final_match=Group.drop_duplicates(subset=['BUILDING_REFERENCE_NUMBER'], keep='last')

In [33]:
#filter out ones that clearly don't have a match
Final_match = Final_match[Final_match['coefficient'] >= 0.5]

In [34]:
Final_match.head(100)

Unnamed: 0,LMK_KEY,ADDRESS1,ADDRESS2,ADDRESS3,POSTCODE,BUILDING_REFERENCE_NUMBER,CURRENT_OPERATIONAL_RATING,YR1_OPERATIONAL_RATING,YR2_OPERATIONAL_RATING,OPERATIONAL_RATING_BAND,ELECTRIC_CO2,HEATING_CO2,RENEWABLES_CO2,PROPERTY_TYPE,INSPECTION_DATE,LOCAL_AUTHORITY,CONSTITUENCY,COUNTY,LODGEMENT_DATE,MAIN_BENCHMARK,MAIN_HEATING_FUEL,OTHER_FUEL,SPECIAL_ENERGY_USES,RENEWABLE_SOURCES,TOTAL_FLOOR_AREA,ANNUAL_THERMAL_FUEL_USAGE,TYPICAL_THERMAL_FUEL_USAGE,ANNUAL_ELECTRICAL_FUEL_USAGE,TYPICAL_ELECTRICAL_FUEL_USAGE,RENEWABLES_FUEL_THERMAL,RENEWABLES_ELECTRICAL,YR1_ELECTRICITY_CO2,YR2_ELECTRICITY_CO2,YR1_HEATING_CO2,YR2_HEATING_CO2,YR1_RENEWABLES_CO2,YR2_RENEWABLES_CO2,AIRCON_PRESENT,AIRCON_KW_RATING,ESTIMATED_AIRCON_KW_RATING,AC_INSPECTION_COMMISSIONED,BUILDING_ENVIRONMENT,BUILDING_CATEGORY,ADDRESS,LOCAL_AUTHORITY_LABEL,CONSTITUENCY_LABEL,POSTTOWN,NOMINATED_DATE,OR_ASSESSMENT_END_DATE,LODGEMENT_DATETIME,OCCUPANCY_LEVEL,LINE_ADDRESS,UPRN,SINGLE_LINE_ADDRESS,SUB_BUILDING,BUILDING_NAME,BUILDING_NUMBER,STREET_NAME,DEC_address,UPRN_address,sets,coefficient
214,6953940002009010509352379109000,,H M REVENUE & CUSTOMS,11 BELGRAVE ROAD,SW1V 1TU,703434170000,200.0,,,G,1096.0,316.0,,Information not provided,2008-10-01,E09000033,E14000639,Greater London Authority,2009-01-05,General office,Oil,,,,5073.75,323.0,161.0,393.0,197.0,0.0,0.0,,,,,,,,,,,Heating and Natural Ventilation,C1;,"H M Revenue & Customs, 11 Belgrave Road",Westminster,Cities of London and Westminster,LONDON,2008-10-01,2008-07-31,2009-01-05 09:35:23,,H M REVENUE & CUSTOMS 11 BELGRAVE ROAD,10092014722,H M REVENUE & CUSTOMS H M REVENUE & CUSTOMS LO...,H M REVENUE & CUSTOMS LONDON REGION LEARNING S...,,11,BELGRAVE ROAD,"[H, M, REVENUE, &, CUSTOMS, 11, BELGRAVE, ROAD]","[H, M, REVENUE, &, CUSTOMS, H, M, REVENUE, &, ...","{ROAD, CUSTOMS, H, 11, BELGRAVE, &, REVENUE, M}",1.0
328,20720819912009102614301401900840,,33 GREYCOAT STREET,,SW1P 2QF,807191640000,200.0,151.0,,G,208.0,90.0,,General office,2009-10-19,E09000033,E14000639,Greater London Authority,2009-10-26,General office,Grid Supplied Electricity,,,,2004.0,0.0,0.0,0.0,0.0,0.0,0.0,187.0,,38.0,,,,Yes,,2.0,5.0,Air Conditioning,C1;,33 Greycoat Street,Westminster,Cities of London and Westminster,LONDON,2009-11-01,2009-08-31,2009-10-26 14:30:14,,33 GREYCOAT STREET,100023338432,33 GREYCOAT STREET LONDON SW1P 2QF,,,33,GREYCOAT STREET,"[33, GREYCOAT, STREET]","[33, GREYCOAT, STREET]","{33, GREYCOAT, STREET}",1.0
386,46080610912014022512404903909437,,PARTNERSHIPS FOR SCHOOLS,33 GREYCOAT STREET,SW1P 2QF,471203430000,139.0,142.0,190.0,F,227.0,0.0,0.0,General Office,2012-12-19,E09000033,E14000639,Greater London Authority,2014-02-25,General Office,Grid Supplied Electricity,,,,2147.0,0.0,122.0,192.0,95.0,0.0,0.0,226.0,301.0,0.0,0.0,0.0,0.0,Yes,,2.0,5.0,Air Conditioning,C1;,"Partnerships for Schools, 33 Greycoat Street",Westminster,Cities of London and Westminster,LONDON,2014-02-28,2013-11-30,2014-02-25 12:40:49,Standard Occupancy,PARTNERSHIPS FOR SCHOOLS 33 GREYCOAT STREET,100023338432,33 GREYCOAT STREET LONDON SW1P 2QF,,,33,GREYCOAT STREET,"[PARTNERSHIPS, FOR, SCHOOLS, 33, GREYCOAT, STR...","[33, GREYCOAT, STREET]","{33, GREYCOAT, STREET}",0.5
402,717370242008100113191259209000,,THE RENT SERVICE,5 WELBECK STREET,W1G 9YQ,321497250000,102.0,,,E,127.0,105.0,,General office,2008-10-01,E09000033,E14000639,Greater London Authority,2008-10-01,General office,Natural Gas,,Server Room; Not metered,,3078.0,176.0,95.0,75.0,110.0,0.0,0.0,,,,,,,,,,,Mixed-mode with Mechanical Ventilation,C1; C1; C1; C1; C1; C1; C1; C1; C1; C1;,"The Rent Service, 5 Welbeck Street",Westminster,Cities of London and Westminster,LONDON,,2008-08-26,2008-10-01 13:19:12,,THE RENT SERVICE 5 WELBECK STREET,200002817438,BLACKWOOD CAPITAL 5 WELBECK STREET LONDON W1G 9YQ,,,5,WELBECK STREET,"[THE, RENT, SERVICE, 5, WELBECK, STREET]","[BLACKWOOD, CAPITAL, 5, WELBECK, STREET]","{5, STREET, WELBECK}",0.5
404,9766099012009033110524400900821,,THE LISTER HOSPITAL,CHELSEA BRIDGE ROAD,SW1W 8RH,813390420000,352.0,,,G,1756.0,894.0,,Hospital - clinical and research,2008-10-01,E09000033,E14000639,Greater London Authority,2009-03-31,Hospital - clinical and research,Natural Gas,,Not applicable,Not applicable,6000.0,768.0,391.0,532.0,90.0,0.0,0.0,,,,,,,,,,,Air Conditioning,S6;,"The Lister Hospital, Chelsea Bridge Road",Westminster,Cities of London and Westminster,LONDON,2008-10-01,2008-08-31,2009-03-31 10:52:44,,THE LISTER HOSPITAL CHELSEA BRIDGE ROAD,100023345418,THE LISTER HOSPITAL CHELSEA BRIDGE ROAD LONDON...,,,,CHELSEA BRIDGE ROAD,"[THE, LISTER, HOSPITAL, CHELSEA, BRIDGE, ROAD]","[THE, LISTER, HOSPITAL, CHELSEA, BRIDGE, ROAD]","{CHELSEA, ROAD, LISTER, HOSPITAL, THE, BRIDGE}",1.0
407,5813692752008121815000811950757,COLUMBIA HOUSE,LONDON SCHOOL OF ECONOMICS & POLITICAL SCIENCE,HOUGHTON STREET,WC2A 2AE,770421250015,155.0,,,G,300.0,88.0,,General office,2008-10-31,E09000033,E14000639,Greater London Authority,2008-12-18,General office,Natural Gas,,Not applicable,Not applicable,3392.0,134.0,112.0,161.0,95.0,0.0,0.0,,,,,,,,,,,Heating and Natural Ventilation,C1;,"COLUMBIA HOUSE, London School of Economics & P...",Westminster,Cities of London and Westminster,LONDON,2008-10-01,2008-07-31,2008-12-18 15:00:08,,COLUMBIA HOUSE LONDON SCHOOL OF ECONOMICS & PO...,100023430221,LONDON SCHOOL OF ECONOMICS & POLITICAL SCIENCE...,,,,HOUGHTON STREET,"[COLUMBIA, HOUSE, LONDON, SCHOOL, OF, ECONOMIC...","[LONDON, SCHOOL, OF, ECONOMICS, &, POLITICAL, ...","{SCHOOL, OF, POLITICAL, LONDON, &, ECONOMICS, ...",0.818182
408,5800610702009102810231258209200,,LONDON SCHOOL OF ECONOMICS & POLITICAL SCIENCE,HOUGHTON STREET,WC2A 2AE,770421250000,51.0,86.0,,C,858.0,254.0,,Cultural activities,2009-10-28,E09000033,E14000639,Greater London Authority,2009-10-28,Cultural activities,Natural Gas,,,,21641.0,61.0,234.0,72.0,101.0,0.0,0.0,1441.0,,310.0,,,,Yes,,2.0,3.0,Mixed-mode with Mechanical Ventilation,H4;,London School of Economics & Political Science...,Westminster,Cities of London and Westminster,LONDON,2009-10-28,2009-08-31,2009-10-28 10:23:12,,LONDON SCHOOL OF ECONOMICS & POLITICAL SCIENCE...,100023430221,LONDON SCHOOL OF ECONOMICS & POLITICAL SCIENCE...,,,,HOUGHTON STREET,"[LONDON, SCHOOL, OF, ECONOMICS, &, POLITICAL, ...","[LONDON, SCHOOL, OF, ECONOMICS, &, POLITICAL, ...","{SCHOOL, OF, POLITICAL, LONDON, &, ECONOMICS, ...",1.0
409,5801352012009110509150211930757,ST PHILIPS,LONDON SCHOOL OF ECONOMICS & POLITICAL SCIENCE,HOUGHTON STREET,WC2A 2AE,770421250013,69.0,74.0,,C,171.0,71.0,,University campus,2009-11-05,E09000033,E14000639,Greater London Authority,2009-11-05,University campus,Natural Gas,,,,3194.0,114.0,268.0,97.0,106.0,0.0,0.0,130.0,,72.0,,,,No,,,4.0,Heating and Natural Ventilation,S4;,"ST PHILIPS, London School of Economics & Polit...",Westminster,Cities of London and Westminster,LONDON,2009-11-05,2009-08-31,2009-11-05 09:15:02,,ST PHILIPS LONDON SCHOOL OF ECONOMICS & POLITI...,100023430221,LONDON SCHOOL OF ECONOMICS & POLITICAL SCIENCE...,,,,HOUGHTON STREET,"[ST, PHILIPS, LONDON, SCHOOL, OF, ECONOMICS, &...","[LONDON, SCHOOL, OF, ECONOMICS, &, POLITICAL, ...","{SCHOOL, OF, POLITICAL, LONDON, &, ECONOMICS, ...",0.818182
410,5813310742013021920120256270020,EAST BUILDING,LONDON SCHOOL OF ECONOMICS & POLITICAL SCIENCE,HOUGHTON STREET,WC2A 2AE,770421250007,47.0,62.0,100.0,B,136.0,117.0,0.0,University Campus,2013-02-06,E09000033,E14000639,Greater London Authority,2013-02-19,University Campus,Natural Gas,,,,4556.0,132.0,269.0,54.0,118.0,0.0,0.0,143.0,213.0,106.0,200.0,0.0,0.0,Yes,,2.0,1.0,Heating and Natural Ventilation,S4;,"EAST BUILDING, London School of Economics & Po...",Westminster,Cities of London and Westminster,LONDON,2012-12-17,2012-09-30,2013-02-19 20:12:02,Extended Occupancy,EAST BUILDING LONDON SCHOOL OF ECONOMICS & POL...,100023430221,LONDON SCHOOL OF ECONOMICS & POLITICAL SCIENCE...,,,,HOUGHTON STREET,"[EAST, BUILDING, LONDON, SCHOOL, OF, ECONOMICS...","[LONDON, SCHOOL, OF, ECONOMICS, &, POLITICAL, ...","{SCHOOL, OF, POLITICAL, LONDON, &, ECONOMICS, ...",0.818182
411,5813120062013021916102972020672,CLARE MARKET,LONDON SCHOOL OF ECONOMICS & POLITICAL SCIENCE,HOUGHTON STREET,WC2A 2AE,770421250002,47.0,62.0,,B,63.0,54.0,0.0,University Campus,2013-02-06,E09000033,E14000639,Greater London Authority,2013-02-19,University Campus,Natural Gas,,,,2115.0,132.0,269.0,54.0,118.0,0.0,0.0,67.0,,49.0,,0.0,,Yes,,2.0,1.0,Heating and Natural Ventilation,S4;,"CLARE MARKET, London School of Economics & Pol...",Westminster,Cities of London and Westminster,LONDON,2012-12-17,2012-09-30,2013-02-19 16:10:29,Extended Occupancy,CLARE MARKET LONDON SCHOOL OF ECONOMICS & POLI...,100023430221,LONDON SCHOOL OF ECONOMICS & POLITICAL SCIENCE...,,,,HOUGHTON STREET,"[CLARE, MARKET, LONDON, SCHOOL, OF, ECONOMICS,...","[LONDON, SCHOOL, OF, ECONOMICS, &, POLITICAL, ...","{SCHOOL, OF, POLITICAL, LONDON, &, ECONOMICS, ...",0.818182


In [35]:
#remove duplicates
Final_match.drop_duplicates('BUILDING_REFERENCE_NUMBER', inplace = True)

Final_match.shape

(429, 62)

In [36]:
#extract UPRN and Building Reference number
matched_DEC_UPRN= Final_match[['BUILDING_REFERENCE_NUMBER','UPRN']]

In [37]:
matched_DEC_UPRN.head()

Unnamed: 0,BUILDING_REFERENCE_NUMBER,UPRN
214,703434170000,10092014722
328,807191640000,100023338432
386,471203430000,100023338432
402,321497250000,200002817438
404,813390420000,100023345418


In [38]:
matched_DEC_UPRN["UPRN"] = matched_DEC_UPRN["UPRN"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [39]:
#Now match this to the UPRN building polygons
DEC_Buildings_matched=pd.merge(BuildingsUPRN,matched_DEC_UPRN,how='inner', on='UPRN')

In [40]:
#merge with original certificate data
DEC_Buildings_matched_final=pd.merge(DEC_Buildings_matched,DEC,how='inner', on='BUILDING_REFERENCE_NUMBER')

In [41]:
#export as geojson
DEC_Buildings_matched_final.to_file("Data/BuildingData/buildings_DEC.geojson", driver='GeoJSON')