In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import requests

busy = pd.read_parquet("../processed_data/london_busy.parquet")
busy

Unnamed: 0,name,group,geometry,latitude,longitude
0,Tesco Express,shop,b'\x01\x01\x00\x00\x00tb\x0f\xedc\x05\xc0\xbf(...,51.511122,-0.125165
1,,shop,b'\x01\x01\x00\x00\x00H\xb5\x99T~c\xc7\xbf*\x0...,51.550055,-0.182724
2,Tesco Express,shop,b'\x01\x01\x00\x00\x00\xee\xc0\xef\x95/\x1e\xc...,51.562078,-0.149359
3,Tesco Express,shop,"b'\x01\x01\x00\x00\x00""\xf7\x19\xbcT\x11\xc1\x...",51.545341,-0.133341
4,Chris Dry Cleaners,shop,b'\x01\x01\x00\x00\x00\x9d~\xf5]\xb6B\xb3\xbf\...,51.551313,-0.075237
...,...,...,...,...,...
91888,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0b\x00...,51.515215,-0.168673
91889,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\t\x00\x...,51.413759,-0.077673
91890,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x08\x00...,51.494106,-0.236990
91891,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0c\x00...,51.409700,-0.149445


In [3]:
# Count total rows
initial_rows = len(busy)
print(initial_rows, 'total number of initial rows')

# Missing Longitude analysis
missing_long = busy[busy['longitude'].isna()]
num_missing_long = len(missing_long)
print(num_missing_long, 'number of rows that have Longitude missing (unidentifiable)')
print(num_missing_long / initial_rows * 100, '% percentage of rows that are missing Longitude')

# Rows with Longitude present
print(initial_rows - num_missing_long, 'number of rows that contain Longitude')

# Missing Latitude analysis
missing_lat = busy[busy['latitude'].isna()]
num_missing_lat = len(missing_lat)
print(num_missing_lat, 'number of rows that have Latitude missing (unidentifiable)')
print(num_missing_lat / initial_rows * 100, '% percentage of rows that are missing Latitude')

# Rows with Latitude present
print(initial_rows - num_missing_lat, 'number of rows that contain Latitude')

# Check rows missing both Latitude and Longitude
missing_both = busy[busy['latitude'].isna() & busy['longitude'].isna()]
num_missing_both = len(missing_both)
print(num_missing_both, 'number of rows missing both Latitude and Longitude')

# Verify if all missing values are in the same rows
if num_missing_both == num_missing_lat == num_missing_long:
    print("✅ All missing Latitude and Longitude values are from the same rows.")
else:
    print("⚠️ Latitude and Longitude missing values are not from the exact same rows.")


91893 total number of initial rows
0 number of rows that have Longitude missing (unidentifiable)
0.0 % percentage of rows that are missing Longitude
91893 number of rows that contain Longitude
0 number of rows that have Latitude missing (unidentifiable)
0.0 % percentage of rows that are missing Latitude
91893 number of rows that contain Latitude
0 number of rows missing both Latitude and Longitude
✅ All missing Latitude and Longitude values are from the same rows.


In [5]:
# Missing name analysis
missing_name = busy[busy['name'].isna()]
num_missing_name = len(missing_name)
print(num_missing_name, 'number of rows that have name missing')
print(num_missing_name / initial_rows * 100, '% percentage of rows that are missing name')

# Rows with Latitude present
print(initial_rows - num_missing_name, 'number of rows that contain name')

20613 number of rows that have name missing
22.431523619862233 % percentage of rows that are missing name
71280 number of rows that contain name


In [3]:
lsoa_boundaries = gpd.read_file('../processed_data/LSOA boundaries.geojson')

In [7]:
lsoa_boundaries

Unnamed: 0,FID,LSOA21CD,LSOA21NM,LSOA21NMW,BNG_E,BNG_N,LAT,LONG,GlobalID,geometry
0,1,E01000001,City of London 001A,,532123,181632,51.51817,-0.097150,3478c558-3297-4e2b-979e-e29dd9ff3bf5,"POLYGON ((-0.09474 51.5206, -0.09546 51.51544,..."
1,2,E01000002,City of London 001B,,532480,181715,51.51883,-0.091970,f2072109-b1ae-426c-b166-083cc32f1789,"POLYGON ((-0.0881 51.51941, -0.09546 51.51544,..."
2,3,E01000003,City of London 001C,,532239,182033,51.52174,-0.095330,a9009c33-9b6b-4230-ba62-fc3264806de4,"POLYGON ((-0.09453 51.52205, -0.09274 51.52139..."
3,4,E01000005,City of London 001E,,533581,181283,51.51469,-0.076280,86aee0aa-079f-4f92-8f9d-5773824f4945,"POLYGON ((-0.07589 51.5159, -0.07394 51.51445,..."
4,5,E01000006,Barking and Dagenham 016A,,544994,184274,51.53875,0.089317,c33f1f5b-6b15-47a1-b046-b6a148a9f6d3,"POLYGON ((0.09328 51.53787, 0.09363 51.53767, ..."
...,...,...,...,...,...,...,...,...,...,...
35667,35668,W01002036,Vale of Glamorgan 005G,Bro Morgannwg 005G,317939,172435,51.44494,-3.182180,f79ed9c6-8220-49d9-9a69-e80e9f0007cb,"POLYGON ((-3.17711 51.44702, -3.17619 51.44261..."
35668,35669,W01002037,Vale of Glamorgan 005H,Bro Morgannwg 005H,318527,172406,51.44476,-3.173710,fdd8e9dc-8504-4955-aa8d-e4cff563ed76,"POLYGON ((-3.16647 51.44662, -3.1718 51.44303,..."
35669,35670,W01002038,Vale of Glamorgan 014G,Bro Morgannwg 014G,306491,167360,51.39754,-3.345520,7b56d7b1-48e6-4883-a46e-3bd00345bc8c,"POLYGON ((-3.34342 51.3898, -3.34926 51.38719,..."
35670,35671,W01002039,Vale of Glamorgan 014H,Bro Morgannwg 014H,306564,166023,51.38553,-3.344120,36d00276-9970-4f4b-9322-5b89fdacf0c3,"POLYGON ((-3.33578 51.38279, -3.33868 51.38189..."


In [30]:
def get_lsoa_from_coordinates(latitude, longitude):
    """
    Retrieves the LSOA code for a given latitude and longitude using the Postcodes.io API.
    Now handles the list response from the API.
    """
    base_url = "https://api.postcodes.io/postcodes"
    params = {"lat": latitude, "lon": longitude}
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
        if data and data['status'] == 200 and 'result' in data and isinstance(data['result'], list) and len(data['result']) > 0 and 'lsoa' in data['result'][0]:
            return data['result'][0]['lsoa']
        else:
            print(f"Error: Unexpected API response - {data}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")
        return None

# Test the function with an example coordinate
example_latitude = 51.5074
example_longitude = 0.1278
lsoa_code = get_lsoa_from_coordinates(example_latitude, example_longitude)
print(f"The LSOA code for ({example_latitude}, {example_longitude}) is: {lsoa_code}")

The LSOA code for (51.5074, 0.1278) is: Bexley 001C


In [16]:
def name_to_lsoa(name):
    """
    Converts an LSOA name or a Series of LSOA names to LSOA codes.

    Args:
        name (str or pd.Series): The LSOA name(s) to convert.

    Returns:
        pd.Series: A Series containing the corresponding LSOA codes.  Returns NaN for names not found.
    """
    if isinstance(name, pd.Series):
        # Vectorized lookup for a Series of names
        return name.map(lambda x: lsoa_boundaries.loc[lsoa_boundaries['LSOA21NM'] == x, 'LSOA21CD'].iloc[0] if x in lsoa_boundaries['LSOA21NM'].values else pd.NA)
    else:
        # Handle a single name (for completeness, though the main use case is a Series)
        result = lsoa_boundaries.loc[lsoa_boundaries['LSOA21NM'] == name, 'LSOA21CD']
        if not result.empty:
            return result.iloc[0]  # Return the first element if found
        else:
            return pd.NA  # Return NaN if not found

# Test:
example_name = 'City of London 001A'
code = name_to_lsoa(example_name)
code

'E01000001'

In [8]:
import pandas as pd
import requests
import time
import json
import os  # For checking if a file exists

def batch_get_lsoa_from_coordinates(coordinates):
    """
    Retrieves LSOA codes for a list of (latitude, longitude) tuples using Postcodes.io batch endpoint.
    Handles the nested 'result' structure in the API response.
    """
    base_url = "https://api.postcodes.io/postcodes"
    payload = {"geolocations": []}
    for lat, lon in coordinates:
        payload["geolocations"].append({"latitude": lat, "longitude": lon})

    try:
        response = requests.post(base_url, json=payload)
        response.raise_for_status()
        data = response.json()
        if data and data.get('status') == 200 and 'result' in data and isinstance(data['result'], list):
            lsoa_codes = {}
            for item in data['result']:
                if isinstance(item, dict) and 'query' in item and 'result' in item and isinstance(item['result'], list) and len(item['result']) > 0 and isinstance(item['result'][0], dict) and 'lsoa' in item['result'][0]:
                    lsoa_codes[(item['query']['latitude'], item['query']['longitude'])] = item['result'][0]['lsoa']
                else:
                    print(f"Warning: Unexpected item structure in API result: {item}")
            return lsoa_codes
        else:
            print(f"Error: Unexpected batch API response - {data}")
            return {}
    except requests.exceptions.RequestException as e:
        print(f"Error during batch API request: {e}")
        return {}
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON response: {e}, Response text: {response.text if 'response' in locals() else 'No response'}")
        return {}

def get_lsoa_in_batches_with_save(df, batch_size=100, output_filename='busy_with_lsoa.csv', save_interval=5):
    """
    Processes the DataFrame in batches to get LSOA codes and saves the progress periodically.

    Args:
        df (pd.DataFrame): The input DataFrame with 'latitude' and 'longitude' columns.
        batch_size (int): The number of rows to process in each batch.
        output_filename (str): The name of the CSV file to save the progress to.
        save_interval (int): Save the DataFrame after processing this many batches.
    """
    lsoa_codes = {}
    start_row = 0

    # Check if a previous save file exists and load it
    if os.path.exists(output_filename):
        try:
            saved_df = pd.read_csv(output_filename)
            if 'lsoa_code' in saved_df.columns and len(saved_df) == len(df):
                print(f"Resuming from saved file: {output_filename}")

            else:
                print(f"Warning: Saved file {output_filename} is incomplete or has incorrect structure. Starting from scratch.")
        except Exception as e:
            print(f"Error loading saved file {output_filename}: {e}. Starting from scratch.")

    df['lsoa_code'] = None  # Initialize the 'lsoa_code' column

    num_batches = (len(df) + batch_size - 1) // batch_size
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, len(df))
        batch_df = df.iloc[start_index:end_index]
        coordinates_batch = batch_df[['latitude', 'longitude']].values.tolist()
        batch_results = batch_get_lsoa_from_coordinates(coordinates_batch)

        for index, row in batch_df.iterrows():
            lsoa_codes[(row['latitude'], row['longitude'])] = batch_results.get((row['latitude'], row['longitude']))
            df.loc[index, 'lsoa_code'] = batch_results.get((row['latitude'], row['longitude']))

        print(f"Processed batch {batch_num + 1}/{num_batches}")
        time.sleep(0.2)  # Respect rate limits

        # Save intermittently
        if (batch_num + 1) % save_interval == 0:
            print(f"Saving progress to {output_filename} after batch {batch_num + 1}")
            df.to_csv(output_filename, index=False)

    # Save the final result
    print(f"Saving final result to {output_filename}")
    df.to_csv(output_filename, index=False)
    return df

# Assuming your DataFrame is named 'busy'
busy_with_lsoa = get_lsoa_in_batches_with_save(busy, batch_size=100, output_filename='busy_with_lsoa.csv', save_interval=10)
print(busy_with_lsoa.head())

Resuming from saved file: busy_with_lsoa.csv
Processed batch 1/919
Processed batch 2/919
Processed batch 3/919
Processed batch 4/919
Processed batch 5/919
Processed batch 6/919
Processed batch 7/919
Processed batch 8/919
Processed batch 9/919
Processed batch 10/919
Saving progress to busy_with_lsoa.csv after batch 10
Processed batch 11/919
Processed batch 12/919
Processed batch 13/919
Processed batch 14/919
Processed batch 15/919
Processed batch 16/919
Processed batch 17/919
Processed batch 18/919
Processed batch 19/919
Processed batch 20/919
Saving progress to busy_with_lsoa.csv after batch 20
Processed batch 21/919
Processed batch 22/919
Processed batch 23/919
Processed batch 24/919
Processed batch 25/919
Processed batch 26/919
Processed batch 27/919
Processed batch 28/919
Processed batch 29/919
Processed batch 30/919
Saving progress to busy_with_lsoa.csv after batch 30
Processed batch 31/919
Processed batch 32/919
Processed batch 33/919
Processed batch 34/919
Processed batch 35/919


In [32]:
busy_with_lsoa = pd.read_csv('busy_with_lsoa.csv')
busy_with_lsoa

Unnamed: 0,name,group,geometry,latitude,longitude,lsoa_code
0,Tesco Express,shop,b'\x01\x01\x00\x00\x00tb\x0f\xedc\x05\xc0\xbf(...,51.511122,-0.125165,Westminster 018A
1,,shop,b'\x01\x01\x00\x00\x00H\xb5\x99T~c\xc7\xbf*\x0...,51.550055,-0.182724,Camden 010D
2,Tesco Express,shop,b'\x01\x01\x00\x00\x00\xee\xc0\xef\x95/\x1e\xc...,51.562078,-0.149359,Camden 001C
3,Tesco Express,shop,"b'\x01\x01\x00\x00\x00""\xf7\x19\xbcT\x11\xc1\x...",51.545341,-0.133341,Camden 015B
4,Chris Dry Cleaners,shop,b'\x01\x01\x00\x00\x00\x9d~\xf5]\xb6B\xb3\xbf\...,51.551313,-0.075237,Hackney 014B
...,...,...,...,...,...,...
91888,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0b\x00...,51.515215,-0.168673,Westminster 015B
91889,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\t\x00\x...,51.413759,-0.077673,Croydon 001B
91890,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x08\x00...,51.494106,-0.236990,Hammersmith and Fulham 011A
91891,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0c\x00...,51.409700,-0.149445,Merton 014D


In [33]:
# Convert name to LSOA code
updated_lsoa_codes = busy_with_lsoa['lsoa_code'].apply(name_to_lsoa)

# Print the result
updated_lsoa_codes

0        E01004734
1        E01000972
2        E01000909
3             <NA>
4        E01001747
           ...    
91888    E01004683
91889    E01001138
91890    E01001928
91891    E01003412
91892    E01002412
Name: lsoa_code, Length: 91893, dtype: object

In [34]:
# Add the new LSOA codes
busy_with_lsoa['lsoa_code'] = updated_lsoa_codes
busy_with_lsoa

Unnamed: 0,name,group,geometry,latitude,longitude,lsoa_code
0,Tesco Express,shop,b'\x01\x01\x00\x00\x00tb\x0f\xedc\x05\xc0\xbf(...,51.511122,-0.125165,E01004734
1,,shop,b'\x01\x01\x00\x00\x00H\xb5\x99T~c\xc7\xbf*\x0...,51.550055,-0.182724,E01000972
2,Tesco Express,shop,b'\x01\x01\x00\x00\x00\xee\xc0\xef\x95/\x1e\xc...,51.562078,-0.149359,E01000909
3,Tesco Express,shop,"b'\x01\x01\x00\x00\x00""\xf7\x19\xbcT\x11\xc1\x...",51.545341,-0.133341,
4,Chris Dry Cleaners,shop,b'\x01\x01\x00\x00\x00\x9d~\xf5]\xb6B\xb3\xbf\...,51.551313,-0.075237,E01001747
...,...,...,...,...,...,...
91888,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0b\x00...,51.515215,-0.168673,E01004683
91889,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\t\x00\x...,51.413759,-0.077673,E01001138
91890,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x08\x00...,51.494106,-0.236990,E01001928
91891,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0c\x00...,51.409700,-0.149445,E01003412


In [35]:
# see how many missing LSOA codes we have
missing_lsoa_count = updated_lsoa_codes.isna().sum()

# Calculate the total number of records
total_records = len(updated_lsoa_codes)

# Calculate the percentage of missing LSOA codes
missing_lsoa_percentage = (missing_lsoa_count / total_records) * 100

print(missing_lsoa_count)
print(missing_lsoa_percentage)

14867
16.178599022776492


In [36]:
busy_with_lsoa = busy_with_lsoa.dropna(subset=['lsoa_code'])
busy_with_lsoa = busy_with_lsoa.reset_index(drop=True)
busy_with_lsoa

Unnamed: 0,name,group,geometry,latitude,longitude,lsoa_code
0,Tesco Express,shop,b'\x01\x01\x00\x00\x00tb\x0f\xedc\x05\xc0\xbf(...,51.511122,-0.125165,E01004734
1,,shop,b'\x01\x01\x00\x00\x00H\xb5\x99T~c\xc7\xbf*\x0...,51.550055,-0.182724,E01000972
2,Tesco Express,shop,b'\x01\x01\x00\x00\x00\xee\xc0\xef\x95/\x1e\xc...,51.562078,-0.149359,E01000909
3,Chris Dry Cleaners,shop,b'\x01\x01\x00\x00\x00\x9d~\xf5]\xb6B\xb3\xbf\...,51.551313,-0.075237,E01001747
4,M&S Foodhall,shop,b'\x01\x01\x00\x00\x00yMY\x97d.\xc2\xbf\x1b\xb...,51.537810,-0.142041,E01000863
...,...,...,...,...,...,...
77021,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0b\x00...,51.515215,-0.168673,E01004683
77022,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\t\x00\x...,51.413759,-0.077673,E01001138
77023,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x08\x00...,51.494106,-0.236990,E01001928
77024,,parking,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x0c\x00...,51.409700,-0.149445,E01003412


In [37]:
output_file_path = '../processed_data/busy_with_lsoa.parquet'
busy_with_lsoa.to_parquet(output_file_path)