# Lab in Data Science: Final Project

Pierre Fouche, Matthias Leroy and Raphaël Steinmann

## Imports

In [1]:
%matplotlib inline
import matplotlib.pylab as plt
plt.rcParams['figure.figsize'] = (10,6)
plt.rcParams['font.size'] = 18
plt.style.use('fivethirtyeight')

In [2]:
import getpass
import pyspark
from datetime import datetime
from pyspark.sql import SparkSession
import pyspark.sql.functions as functions
import math
import helpers

# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules (such as the local `helpers` module) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Initialize the `SparkSession`

In [3]:
# Build the Spark configuration as one fluent chain (each setter returns the conf):
# run on YARN, name the application after the current user, and request
# 6 executors with 4g of memory each; allow up to 100 port-binding retries.
conf = (
    pyspark.conf.SparkConf()
    .setMaster('yarn')
    .setAppName('project-{0}'.format(getpass.getuser()))
    .set('spark.executor.memory', '4g')
    .set('spark.executor.instances', '6')
    .set('spark.port.maxRetries', '100')
)
# Reuse the already-running context if there is one, otherwise start it with `conf`.
sc = pyspark.SparkContext.getOrCreate(conf)
# Rebind `conf` to the configuration reported by the context itself.
conf = sc.getConf()
# Last expression of the cell: display the SparkContext summary.
sc

In [4]:
# Init the Spark session through the builder, which is the documented entry point.
# `getOrCreate()` attaches to the SparkContext that is already running and returns
# the active session when one exists, so re-running this cell does not stack up
# new sessions the way calling the `SparkSession(sc)` constructor directly does.
spark = SparkSession.builder.getOrCreate()

## Loading the data

In [5]:
# Read the semicolon-separated istdaten CSV files, taking the column names from the
# header row and letting Spark infer the column types.
# Only the October 2017 sample is loaded here; to work on the full data, point the
# path at '/datasets/project/istdaten/*/*' instead.
df = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true', sep=';')
    .load('/datasets/project/istdaten/2017/10')
)

In [6]:
# Number of rows in the loaded sample (a Spark action: this runs a job over the data).
df.count()

25561479

In [7]:
# Read the raw station metadata without a header row. The cleaning step further down
# parses each record out of the single `_c0` column produced by this read.
raw_metadata = (
    spark.read
    .format('com.databricks.spark.csv')
    .options(header='false', sep='\\t')
    .load('/datasets/project/metadata')
)

## Data Processing

### Cleaning metadata
First, let's clean the metadata dataframe:

In [8]:
# Collapse every run of whitespace into a single space so that the leading
# numeric fields can be split on ' ' reliably.
# The pattern is a raw string: '\s' inside a plain literal is an invalid escape sequence.
metadata = raw_metadata.withColumn('_c0', functions.regexp_replace(raw_metadata._c0, r'\s+', ' '))
# The stop name is the text that follows the '%' marker; trim it so the name does not
# keep the space that separates it from the marker.
metadata = metadata.withColumn('name', functions.trim(functions.split(metadata._c0, '%')[1]))
# The first four space-separated fields are, in order: station id, longitude,
# latitude and height; cast each one to its proper type.
for (name, index, type_) in [('station_ID',0, 'int'), ('long',1, 'double'), ('lat',2, 'double'), ('height',3, 'int')]:
    metadata = metadata.withColumn(name, functions.split(metadata._c0, ' ')[index].cast(type_))
# The raw line is no longer needed once every field has been extracted.
metadata = metadata.drop('_c0')

In [9]:
# Preview the first 5 cleaned rows to check the extracted columns.
metadata.show(5)

+-----------------+----------+---------+---------+------+
|             name|station_ID|     long|      lat|height|
+-----------------+----------+---------+---------+------+
|        Bucuresti|         2|26.074412| 44.44677|     0|
|           Calais|         3| 1.811446|50.901549|     0|
|       Canterbury|         4| 1.075329|51.284212|     0|
|           Exeter|         5|-3.543547|50.729172|     0|
| Fideris, Bahnhof|         7| 9.733756|46.922368|   744|
+-----------------+----------+---------+---------+------+
only showing top 5 rows



We will use the SBB data restricted to the Zurich area, focusing on all the stops within 10 km of Zurich main station (Zürich HB). Let's get rid of all the stations that are farther away:

In [10]:
# Total number of stations in the metadata before any distance filtering.
metadata.count()

25935

In [11]:
# List up to 10 stations whose name contains 'Zürich', without truncating the columns,
# to look up the entry of the main station (Zürich HB).
metadata.filter(metadata.name.contains('Zürich')).show(10, False)

+-----------------------------+----------+--------+---------+------+
|name                         |station_ID|long    |lat      |height|
+-----------------------------+----------+--------+---------+------+
| Rickenbach AG, Zürichstrasse|8502555   |8.395711|47.263635|390   |
| Zürich, Goldbrunnenplatz    |8502572   |8.513918|47.370293|421   |
| Zürich HB                   |8503000   |8.540192|47.378177|408   |
| Zürich Altstetten           |8503001   |8.48894 |47.391481|399   |
| Zürich Stadelhofen          |8503003   |8.548466|47.366611|411   |
| Zürich Tiefenbrunnen        |8503004   |8.561372|47.350124|408   |
| Zürich Oerlikon             |8503006   |8.544115|47.411529|442   |
| Zürich Seebach              |8503007   |8.544636|47.418747|442   |
| Zürich Affoltern            |8503008   |8.508565|47.420913|456   |
| Zürich Wollishofen          |8503009   |8.533588|47.34744 |409   |
+-----------------------------+----------+--------+---------+------+
only showing top 10 rows



In [12]:
# Coordinates of Zürich main train station; they match the `Zürich HB` row of the
# metadata (lat 47.378177, long 8.540192) rounded to four decimals.
lat_zurich = 47.3782
long_zurich = 8.5402

# NOTE(review): the two commented-out lines below look like abandoned attempts at doing
# the distance filter on the Spark side; the filter is actually applied on a pandas copy
# of `metadata` in a later cell. Consider deleting them.
#metadata = metadata.withColumn('dist_from_zurich', helpers.distance(metadata.long, metadata.lat, long_zurich, lat_zurich))

#metadata = metadata.rdd.filter(lambda x: distance(x.long, x.lat, long_zurich, lat_zurich) < 10)

In [34]:
# Bring the station metadata to the driver as a pandas dataframe, attach each stop's
# distance to Zürich HB as computed by `helpers.distance`, and keep only the stops
# whose distance is below 10.
# NOTE(review): the distance is presumably in km, matching the 10 km radius stated in
# the text; confirm against `helpers.distance`.
pandas_df = (
    metadata.toPandas()
    .assign(
        distance_to_zh=lambda stops: stops.apply(
            lambda stop: helpers.distance(stop['long'], stop['lat'], long_zurich, lat_zurich),
            axis=1,
        )
    )
    .loc[lambda stops: stops['distance_to_zh'] < 10]
)

In [35]:
# Sanity check: the largest remaining distance must stay below the threshold of 10.
pandas_df.distance_to_zh.max()

9.9833358343820819