In [1]:
from pyspark.sql import SparkSession, functions as fs


# Local Spark session for this notebook; "local[*]" runs Spark in-process.
session = (
    SparkSession.builder
    .appName("UkMakerSpaces")
    .master("local[*]")
    .getOrCreate()
)

# Makerspace directory; the first CSV row is treated as the header.
makerSpace = (
    session.read
    .option("header", "true")
    .csv("../input_data/uk-makerspaces-identifiable-data.csv")
)

# Postcode lookup table. A single trailing space is appended to every PostCode
# value. NOTE(review): presumably so that a prefix match on "AB1 " cannot also
# match a full postcode such as "AB11 5BN" -- confirm against the join below.
postcode_with_trailing_space = fs.concat_ws("", fs.col("PostCode"), fs.lit(" "))
postCode = (
    session.read
    .option("header", "true")
    .csv("../input_data/uk-postcode.csv")
    .withColumn("PostCode", postcode_with_trailing_space)
)

print("=== Print 20 records of makerspace table ===")
# show() with no arguments prints the first 20 rows.
makerSpace.select("Name of makerspace", "Postcode").show()

=== Print 20 records of makerspace table ===
+--------------------+--------+
|  Name of makerspace|Postcode|
+--------------------+--------+
|        Hub Workshop|SE15 3SN|
|Nottingham Hacksp...| NG3 1JH|
|         Farset Labs|BT12 5GH|
|       Medway Makers| ME4 3JE|
|             fizzPop|  B5 5SR|
|South London Make...|SE24 9AA|
|Create Space London | HA9 6DE|
|          FounderHub|CF10 1DY|
|  LuneLab Makerspace| LA2 6ND|
|            The Shed| CT2 7NF|
|      Build Brighton| BN2 4AB|
|           Makespace| CB2 1RX|
|   Swansea Hackspace| SA1 1DP|
|57North (previous...|AB11 5BN|
|        BEC Fab Lab |CA13 0HT|
|   Dundee MakerSpace| DD1 4QB|
|                EPIK| CT3 4GP|
|Fab Lab Nerve Centre|BT48 6HJ|
|  fablab@strathclyde|  G1 1XJ|
|MakerspaceFY1 (Bl...| FY1 4DY|
+--------------------+--------+
only showing top 20 rows



In [2]:
print("=== Print 20 records of postcode table ===")
# Preview only the two columns used later: the (space-suffixed) postcode
# prefix and the region it maps to. show() defaults to 20 rows.
postcode_preview = postCode.select("PostCode", "Region")
postcode_preview.show()

=== Print 20 records of postcode table ===
+--------+-------------+
|PostCode|       Region|
+--------+-------------+
|    AB1 |     Aberdeen|
|    AB2 |     Aberdeen|
|    AB3 |     Aberdeen|
|    AB4 |     Aberdeen|
|    AB5 |     Aberdeen|
|    AB9 |     Aberdeen|
|   AB10 |     Aberdeen|
|   AB11 |     Aberdeen|
|   AB12 |     Aberdeen|
|   AB13 |     Aberdeen|
|   AB14 |     Aberdeen|
|   AB15 |     Aberdeen|
|   AB16 |     Aberdeen|
|   AB21 |     Aberdeen|
|   AB22 |     Aberdeen|
|   AB23 |     Aberdeen|
|   AB24 |     Aberdeen|
|   AB25 |     Aberdeen|
|   AB30 |Aberdeenshire|
|   AB31 |Aberdeenshire|
+--------+-------------+
only showing top 20 rows



In [5]:
# Attach a Region to each makerspace by prefix-matching its full postcode
# against the postcode table's (space-suffixed) "PostCode" column.
# The lookup column is referenced by its exact name, "PostCode", rather than
# "Postcode": the differently-cased spelling only resolves while
# spark.sql.caseSensitive is left at its default (false).
postcode_prefix_match = makerSpace["Postcode"].startswith(postCode["PostCode"])

# left_outer keeps makerspaces that match no postcode row; they are counted
# under a null Region in the summary below.
joined = makerSpace.join(postCode, postcode_prefix_match, "left_outer")

print("=== Group by Region ===")
# Count makerspaces per region, largest first; 200 rows is enough headroom to
# print every region rather than only the default first 20.
joined.groupBy("Region").count().orderBy("count", ascending=False).show(200)

=== Group by Region ===
+--------------------+-----+
|              Region|count|
+--------------------+-----+
|             Cardiff|    3|
|       Tower Hamlets|    3|
|          Manchester|    3|
|             Glasgow|    3|
|             Lambeth|    2|
|              Oxford|    2|
|           Southwark|    2|
|            Aberdeen|    2|
|             Bristol|    2|
|           Liverpool|    2|
|           Sheffield|    2|
|   Brighton and Hove|    2|
|              Camden|    2|
|             Belfast|    2|
|               Leeds|    2|
|                null|    2|
|          Sunderland|    1|
|       Staffordshire|    1|
|    Scottish Borders|    1|
|            Coventry|    1|
|            Bradford|    1|
|             Swindon|    1|
|          Wandsworth|    1|
|          Eastbourne|    1|
|              Dudley|    1|
|          Birmingham|    1|
|Cheshire West and...|    1|
|              Exeter|    1|
|   Barrow-in-Furness|    1|
|           Cambridge|    1|
|              Dund

In [7]:
# Shut down the Spark session created at the top of the notebook.
session.stop()