# ETL to transform London postcodes to database schema

This ETL is written in pandas because it runs only once and scalability is not a concern (few new postcodes are added to the city).

The following process complements the database for *fareApp* by enriching the information around properties, in this case with regard to public transportation.

## Initialisation

In [1]:
### Import modules

import json
import os
import warnings
from urllib.parse import quote

import boto3
import pandas as pd
from pykml import parser

# Silence every warning for the rest of the session (no category or module
# filter is given), so library warnings will not show up in the cell outputs.
warnings.filterwarnings('ignore')

In [2]:
### Constants

# Folder holding the raw input files (postcodes CSV, stations KML).
ORIGINAL_PATH = './OriginalData/'

# Folder receiving the processed tables as parquet files.
PARQUET_PATH = './SchemaReadyData_parquet/'

In [3]:
### Database credentials

# Connection settings come from the environment so that no secret is stored
# in the notebook. A variable that is not set is read as None.
DB_HOST = os.environ.get('DB_HOST')
DB_PORT = os.environ.get('DB_PORT')
DB_NAME = os.environ.get('DB_NAME')
DB_USERNAME = os.environ.get('DB_USERNAME')
DB_PASSWORD = os.environ.get('DB_PASSWORD')


def build_db_url(username, password, host, port, name):
    """Assemble the SQLAlchemy URL for PostgreSQL through psycopg2.

    The username and password are percent-encoded, because characters such
    as '@', ':' or '/' would otherwise be read as URL delimiters and yield a
    wrong or unparsable connection string.

    Parameters
    ----------
    username, password : credentials; rendered with str() before encoding,
        so a missing (None) value still shows up as 'None' in the URL.
    host, port, name : server host, port and database name, inserted as is.

    Returns
    -------
    str
        URL of the form postgresql+psycopg2://user:password@host:port/name.
    """
    # safe='' also encodes '/', which quote() leaves untouched by default.
    encoded_username = quote(str(username), safe='')
    encoded_password = quote(str(password), safe='')

    return f'postgresql+psycopg2://{encoded_username}:{encoded_password}@{host}:{port}/{name}'


DB_URL = build_db_url(DB_USERNAME, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME)

## Retrieve data

In [4]:
### Read postcodes files

# Load the raw postcode export from the original data folder.
postcodes_csv = ORIGINAL_PATH + 'London postcodes.csv'
df_postcodes = pd.read_csv(postcodes_csv)

# Display the first record to get a feel for the available columns
df_postcodes.iloc[0]

Postcode                                                                    BR1 1AA
In Use?                                                                         Yes
Latitude                                                                  51.401546
Longitude                                                                  0.015415
Easting                                                                      540291
Northing                                                                     168873
Grid Ref                                                                   TQ402688
County                                                               Greater London
District                                                                    Bromley
Ward                                                                   Bromley Town
District Code                                                             E09000006
Ward Code                                                                 E0

In [5]:
### Read TfL stations file
# Credit to https://gist.github.com/nishadhka/3ba801ca980da5b76004631c1935f604

filename = 'stations.kml'
kml_path = ORIGINAL_PATH + filename

# Parse the KML and keep only its <Document> node, which is what the
# station processing further down iterates over.
with open(kml_path) as kml_file:
    kml_root = parser.parse(kml_file).getroot()
    kml_data = kml_root.Document

Clearly, the file contains several pieces of information about each postcode. However, at this point only the postcode, region, borough and median coordinates are of interest.

In [6]:
### Bring neighbourhood table to cross reference

# The neighbourhoods table is read from the 'listings' schema; the merge step
# further down uses it to attach an id to every postcode.
try:
    df_neighbourhoods = pd.read_sql_table('neighbourhoods', DB_URL, schema = 'listings')
except Exception:
    # Without this table the following cells cannot run, so report the
    # problem and re-raise to show the real cause instead of continuing
    # with `df_neighbourhoods` undefined.
    print('Oh oh, something happened mate. Try again.')
    raise
else:
    print('Table retrieved successfully.')

Table retrieved successfully.


## Process postcode data

In [7]:
### Retrieve certain features only

# Columns of the raw postcode file that are carried into the next steps.
SLICE_COLUMNS = [
    'Postcode',
    'In Use?',
    'Latitude',
    'Longitude',
    'District',
    'Constituency',
    'Postcode area',
    'Postcode district',
]

df_postcodes_sliced = df_postcodes.loc[:, SLICE_COLUMNS]

In [8]:
### Merge with neighbourhoods to retrieve ID

# Left join keeps every postcode. `validate` makes pandas raise a MergeError
# if a neighbourhood name appears more than once on the right side, which
# would otherwise silently duplicate postcode rows.
df_postcodes_merged = pd.merge(df_postcodes_sliced,
                               df_neighbourhoods,
                               how = 'left',
                               left_on = 'District',
                               right_on = 'neighbourhood',
                               validate = 'many_to_one')

# Check all rows contain a non-null id (vectorised count instead of the
# Python-level sum over every element)
null_ids = int(df_postcodes_merged['id'].isnull().sum())
print(f'Rows without id: {null_ids}')

# Drop the neighbourhood field (it duplicates 'District') and rename
# id to neighbourhood_id
df_postcodes_merged = (df_postcodes_merged
                       .drop(columns = ['neighbourhood'])
                       .rename(columns = {'id': 'neighbourhood_id'}))

Rows without id: 0


In [9]:
### Process data

# Mapping used to turn the 'In Use?' text flag into a boolean
USE_MAP = {'Yes': True,
           'No': False}

# Final column names; 'index' is the column created by reset_index below
RENAME = {'Postcode': 'postcode',
          'In Use?': 'active',
          'Latitude': 'latitude',
          'Longitude': 'longitude',
          'District': 'district',
          'Constituency': 'constituency',
          'Postcode area': 'postcode_area',
          'Postcode district': 'postcode_district',
          'index': 'id'}

df_postcodes_process = (
    df_postcodes_merged
    # URL-friendly postcode plus boolean usage flag
    .assign(postcode_url = lambda frame: frame['Postcode'].str.replace(' ', '+'),
            **{'In Use?': lambda frame: frame['In Use?'].map(USE_MAP)})
    # Turn the positional index into a 1-based id column
    .reset_index(drop = False)
    .assign(index = lambda frame: frame['index'] + 1)
    .rename(columns = RENAME)
)

In [16]:
### Insert into dataset

# Note: with if_exists='append', re-running this cell inserts the same rows
# into city.postcodes again.
try:
    df_postcodes_process.to_sql('postcodes',
                                DB_URL,
                                schema = 'city',
                                if_exists = 'append',
                                index = False,
                                method = 'multi')
except Exception:
    # Report the failure, then re-raise so the real error is visible and the
    # backup cells are not run as if the load had succeeded.
    print('Oh oh, something happened mate inserting rows. Try again.')
    raise
else:
    print('Successfully inserted rows into target table')

Successfully inserted rows into target table


## Process stations data

In [11]:
### Build dataframe from KML file

# Walk the placemarks once, collecting the stripped name, description and
# raw coordinate text of every station.
station_name = []
station_description = []
station_point = []

for placemark in kml_data.Placemark:
    station_name.append(placemark.name.text.strip())
    station_description.append(placemark.description.text.strip())
    station_point.append(placemark.Point.coordinates.text.strip())

# Create dataframe
stations_data = {
    'name': station_name,
    'description': station_description,
    'point': station_point,
}

df_stations = pd.DataFrame(stations_data)

In [12]:
### Process dataframe

# Unwrap point: the text is comma separated, longitude first and latitude
# second. Only the first two components are kept, so a point with or without
# a third (altitude) component is handled the same way.
df_station_point = (df_stations['point']
                    .str.split(',', expand = True)
                    .iloc[:, :2]
                    .rename(columns = {0: 'longitude', 1: 'latitude'}))

# Concatenate coordinates in place of the raw point text
df_stations_process = pd.concat([df_stations.drop(columns = ['point']),
                                 df_station_point],
                                axis = 1)

# Cast datatypes
df_stations_process['latitude'] = pd.to_numeric(df_stations_process['latitude'])
df_stations_process['longitude'] = pd.to_numeric(df_stations_process['longitude'])

# Create index: 1-based id derived from the row position
df_stations_process = df_stations_process.reset_index(drop = False)
df_stations_process['index'] += 1
df_stations_process = df_stations_process.rename(columns = {'index': 'id'})

df_stations_process.head()

Unnamed: 0,id,name,description,longitude,latitude
0,1,Acton Town Station,"Acton Town Station, London Underground Ltd., G...",-0.280251,51.50275
1,2,Aldgate Station,"Aldgate Station, London Underground Ltd., Aldg...",-0.075614,51.514272
2,3,Aldgate East Station,"Aldgate East Station, London Underground Ltd.,...",-0.072287,51.515233
3,4,Alperton Station,"Alperton Station, London Underground Ltd., Eal...",-0.299487,51.540695
4,5,Amersham Station,"Amersham Station, Stn Approach, Amersham, Buck...",-0.607479,51.67415


In [13]:
### Insert into dataset

# Note: with if_exists='append', re-running this cell inserts the same rows
# into city.tfl_stations again.
try:
    df_stations_process.to_sql('tfl_stations',
                               DB_URL,
                               schema = 'city',
                               if_exists = 'append',
                               index = False,
                               method = 'multi')
except Exception:
    # Report the failure, then re-raise so the real error is visible and the
    # backup cells are not run as if the load had succeeded.
    print('Oh oh, something happened mate inserting rows. Try again.')
    raise
else:
    print('Successfully inserted rows into target table')

Successfully inserted rows into target table


## Save local and S3 backups

In [14]:
### Local backup

# Make sure the output folder exists (e.g. on a fresh checkout); to_parquet
# does not create missing directories.
os.makedirs(PARQUET_PATH, exist_ok = True)

# Store both processed tables as parquet, without the pandas index
df_postcodes_process.to_parquet(PARQUET_PATH + 'postcodes.parquet', index = False)
df_stations_process.to_parquet(PARQUET_PATH + 'tfl_stations.parquet', index = False)

In [15]:
### Save original data to S3

s3_client = boto3.client("s3")
s3_parquet_bucket_name = os.environ.get('AWS_BUCKET_PARQUET')
s3_original_bucket_name = os.environ.get('AWS_BUCKET_CSV')

# Each local folder is copied, file by file, into its own bucket: original
# files first, parquet files second. The object key is the bare file name.
upload_targets = [(ORIGINAL_PATH, s3_original_bucket_name),
                  (PARQUET_PATH, s3_parquet_bucket_name)]

for folder, bucket_name in upload_targets:
    for entry in os.listdir(folder):
        # Sub-directories are skipped; only regular files are uploaded
        if os.path.isfile(os.path.join(folder, entry)):
            s3_client.upload_file(folder + entry, bucket_name, entry)