# Lab in Data Science: Final Project

Pierre Fouche, Matthias Leroy and Raphaël Steinmann

## Imports

In [23]:
%matplotlib inline
import matplotlib.pylab as plt
plt.rcParams['figure.figsize'] = (10,6)
plt.rcParams['font.size'] = 18
plt.style.use('fivethirtyeight')

In [24]:
import getpass
import pyspark
from datetime import datetime
from pyspark.sql import SparkSession
import pyspark.sql.functions as functions
import math

## Initialize the `SparkSession`

In [25]:
conf = pyspark.conf.SparkConf()
conf.setMaster('yarn')
conf.setAppName('project-{0}'.format(getpass.getuser()))
conf.set('spark.executor.memory', '4g')
conf.set('spark.executor.instances', '6')
conf.set('spark.port.maxRetries', '100')
sc = pyspark.SparkContext.getOrCreate(conf)
conf = sc.getConf()
sc

In [None]:
# init spark session
spark = SparkSession(sc)

## Loading the data

In [64]:
# load full data
#df = spark.read.load('/datasets/project/istdaten/*/*', format='csv', header='true', inferSchema='true', sep=';')
# load sample data
df = spark.read.load('/datasets/project/istdaten/2017/10', format='csv', header='true', inferSchema='true', sep=';')

In [66]:
df.count()

25561479

In [120]:
# load metadata
raw_metadata = spark.read.load('/datasets/project/metadata', format='com.databricks.spark.csv', header='false', sep='\\t')

Let's clean the metadata DF:

In [132]:
# remove multiple spaces
metadata = raw_metadata.withColumn('_c0', functions.regexp_replace(raw_metadata._c0, '\s+', ' '))
# split into columns
for (name, index) in [('station_ID',0), ('long',1), ('lat',2), ('height',3), ('name',5)]:
    metadata = metadata.withColumn(name, functions.split(metadata._c0, ' ')[index])
# remove useless column
metadata = metadata.drop('_c0')

In [133]:
metadata.show(5)

+----------+---------+---------+------+----------+
|station_ID|     long|      lat|height|      name|
+----------+---------+---------+------+----------+
|   0000002|26.074412|44.446770|     0| Bucuresti|
|   0000003| 1.811446|50.901549|     0|    Calais|
|   0000004| 1.075329|51.284212|     0|Canterbury|
|   0000005|-3.543547|50.729172|     0|    Exeter|
|   0000007| 9.733756|46.922368|   744|  Fideris,|
+----------+---------+---------+------+----------+
only showing top 5 rows



## Data Processing
We will use the SBB data limited around the Zurich area. We will focus on all the stops within 10km of the Zurich train station.

In [20]:
def distance(long1, lat1, long2, lat2):
    """
    Compute the distance in kms between two locations
    given their coordinates (longitude, latitude)
    """
    # convert decimal degrees to radians 
    long1, long2, lat1, lat2 = [radians(x) for x in [long1, long2, lat1, lat2]]
    
    r = 6371 # earth radius
    # haversine formula
    return 2*r*asin(sqrt(hav(lat2-lat1)+cos(lat1)*cos(lat2)*hav(long2-long1)))

def hav(x):
    """ haversine function """"
    return (1-cos(x))/2

In [21]:
lyon = (45.7597, 4.8422)
paris = (48.8567, 2.3508)
distance(lyon[0], lyon[1], paris[0], paris[1])

441.42016756394304