In [1]:
from pyspark.sql import SparkSession, Row, Column
from pyspark.sql.functions import sqrt
from collections import OrderedDict

# Obtain the Spark session used by every later cell (getOrCreate hands back
# an already-running session when there is one instead of starting another).
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

In [2]:
# Load the joined transaction/customer CSV.
#   header      -> the first line supplies the column names
#   inferschema -> let Spark infer column types from the data
#   mode        -> DROPMALFORMED: rows that fail to parse are discarded
transactions = (
    spark.read
    .option("header", "true")
    .option("inferschema", "true")
    .option("mode", "DROPMALFORMED")
    .csv("transaction_customer_join.csv")
)

# Cell output: how many partitions the loaded data is split into.
transactions.rdd.getNumPartitions()

7

In [5]:
# Bring every (latitude, longitude) pair where both values are present back to
# the driver as a list of rows; the next cell iterates over this list.
latlon = (
    transactions
    .select("latitude", "longitude")
    .filter(
        transactions.latitude.isNotNull()
        & transactions.longitude.isNotNull()
    )
    .collect()
)

In [6]:
# Bounding box of the collected coordinates.
# Each bound falls back to 0 when `latlon` is empty, so the four names are
# always defined before they are printed.
maxlat = max((lat for lat, _ in latlon), default=0)
minlat = min((lat for lat, _ in latlon), default=0)
maxlon = max((lon for _, lon in latlon), default=0)
minlon = min((lon for _, lon in latlon), default=0)

print(f"Maximum Latitude:  {maxlat}")
print(f"Minimum Latitude:  {minlat}")
print(f"Maximum Longitude:  {maxlon}")
print(f"Minimum Longitude:  {minlon}")

Maximum Latitude:  43.834977324
Minimum Latitude:  43.5929446362
Maximum Longitude:  -79.1434143924
Minimum Longitude:  -79.617313153


In [7]:
import math
EARTH_RADIUS = 6371 * 1000.0  # sphere radius used for distances, in metres


def haversine(lat1, long1, lat2, long2, radius=EARTH_RADIUS):
    """Return the great-circle distance between two points on a sphere.

    The coordinates are given in decimal DEGREES; they are converted to
    radians internally before the haversine formula is applied.

    Args:
        lat1, long1: latitude and longitude of the first point, in degrees.
        lat2, long2: latitude and longitude of the second point, in degrees.
        radius: radius of the sphere. The result is expressed in the same
            unit; the default, EARTH_RADIUS, is in metres.

    Returns:
        The distance along the sphere's surface as a float.
    """
    lat1_rad, long1_rad = math.radians(lat1), math.radians(long1)
    lat2_rad, long2_rad = math.radians(lat2), math.radians(long2)

    delta_lat = lat2_rad - lat1_rad
    delta_long = long2_rad - long1_rad

    # Haversine of the central angle between the two points.
    a = (math.sin(delta_lat / 2)**2) + (math.cos(lat1_rad) * math.cos(lat2_rad) * (math.sin(delta_long / 2)**2))

    # For (near-)antipodal points floating-point rounding can push `a` a hair
    # above 1, which would make sqrt(1 - a) raise "math domain error".
    # Only the upper bound is clamped; NaN inputs still propagate as NaN.
    if a > 1.0:
        a = 1.0

    # Central angle in radians; atan2 stays well-conditioned for small angles.
    c = 2.0 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return radius * c

In [8]:
# Distance in metres between the (max latitude, max longitude) corner and the
# (min latitude, min longitude) corner of the coordinate bounding box.
haversine(maxlat, maxlon, minlat, minlon)

46636.696769443086

In [10]:
# NOTE(review): presumably a sanity check -- assumes roughly 111 km per degree
# of latitude, so 1.5/111 degrees should come out close to 1500 m; confirm intent.
haversine(1.5/111, 0, 0, 0)

1502.6341438453885