---

**Assignment 3: Mobility Tracking (Connected Mobility Basics, Winter 2019/20)**

Felix Schorer | Haoyang Sun | Simon Zachau

---

In [None]:
%pip install numpy matplotlib scapy[basic] tqdm tabulate

# 1. Introduction

We collected Wi-Fi probe requests with a Raspberry Pi and 2 antennas on different commutes around Munich in order to analyse them. Specifically, we look at the number of packets and their MAC addresses relative to time, and therefore also to location. At the outset, we asked ourselves whether we could track devices over multiple days along a route, and whether we can infer from the data when the train/subway stops.

# 2. Routes

## 2.1 Route from Kaufering to Garching Forschungszentrum

We measured the route from Kaufering to Garching Forschungszentrum on 3 different days each before Christmas and after Christmas. It involves the train until Munich Central Station, the S-Bahn from Munich Central Station to Marienplatz, and the U6 from Marienplatz to Garching Forschungszentrum. The fact that our measurements involve multiple days, but are conducted at 7:00 AM consistently every morning, gives us the chance to compare the results.

![route from Kaufering to Garching Forschungszentrum](./illustrations/kaufering_garchingforschungzentrum.png)

In [None]:
from tabulate import tabulate
from captures import commutes


# One row per commute capture: its description and its start_date.
table = [[capture.description, capture.start_date] for capture in commutes]
print(tabulate(table, headers=['Description', 'Start Date/Time']))

## 2.2 Round Trip Routes of U2 and U6

On the 19th of December we measured one round trip on each of the U2 and the U6 at noon. Both lines meet in the city center but cross different areas of Munich. Different demographics and destinations might enable different observations.

![route of U2 and U6](./illustrations/u2u6.png)

In [None]:
from tabulate import tabulate
from captures import u2_to_messestadt, u6_roundtrip


# Same two-column layout, with one row for each of the two subway captures.
table = [[capture.description, capture.start_date]
         for capture in (u2_to_messestadt, u6_roundtrip)]
print(tabulate(table, headers=['Description', 'Start Date/Time']))

## 2.3 Route from Munich Central Station to Dortmund Central Station

On the 24th of December we measured a trip in the ICE from Munich Central Station to Dortmund Central Station. We might be able to analyse different devices and device usages compared to subway lines in the city center.

![route from Munich Central Station to Dortmund Central Station](./illustrations/munich_dortmund.png)

In [None]:
from tabulate import tabulate
from captures import munich_to_dortmund


# Single-row table for the long-distance capture.
table = [[capture.description, capture.start_date]
         for capture in (munich_to_dortmund,)]
print(tabulate(table, headers=['Description', 'Start Date/Time']))

# 3. Analysis

## 3.1 Number of Probes Per Time Interval

In [None]:
import random
import numpy as np
from tqdm import tqdm
from scapy.all import PcapReader
from scapy.layers.dot11 import Dot11ProbeReq


def accumulate(file, *accumulators):
    """Pass every packet read from ``file`` to each accumulator, in order.

    Args:
        file: Path handed to ``PcapReader``; also used as the progress-bar
            description.
        *accumulators: Callables invoked as ``accumulator(packet)`` once per
            packet.

    Returns:
        The tuple of accumulators that was passed in, so callers can unpack
        the (now filled) objects.
    """
    with PcapReader(file) as capture:
        with tqdm(unit='packets', desc=file) as progress:
            for pkt in capture:
                for consume in accumulators:
                    consume(pkt)
                # One progress tick per packet, after all accumulators saw it.
                progress.update()
    return accumulators


class TimeSeriesAccumulator(list):
    """List of ``(timestamp, mac)`` tuples, one per probe request it is called with.

    Instances are callable with a single packet, which lets them be used as
    an accumulator callback; packets that are not probe requests are ignored.
    """

    # Field layout of the structured array built by ``as_numpy_array``.
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy (1.24); ``np.float64`` is the dtype it resolved to.
    # NOTE: NumPy fixed-width byte strings drop trailing NUL bytes when an
    # element is read back, so a MAC ending in 0x00 comes back shorter than
    # 6 bytes.
    TYPE = ('timestamp', np.float64), ('mac', np.bytes_, 6)

    def __call__(self, packet):
        """Record ``packet.time`` and the raw bytes of ``packet.addr2``.

        Nothing is recorded unless the packet has a ``Dot11ProbeReq`` layer.
        """
        if not packet.haslayer(Dot11ProbeReq):
            return
        # addr2 is a colon-separated hex string; store it as 6 raw bytes.
        mac = bytes.fromhex(packet.addr2.replace(':', ''))
        self.append((packet.time, mac))

    def as_numpy_array(self):
        """Return the collected tuples as a structured array with fields ``TYPE``."""
        return np.array(self, dtype=np.dtype(list(self.TYPE)))


class RandomProbeSampler(list):
    """List that keeps a random subset of the probe requests it is called with."""

    # Chance with which any single probe request is kept.
    PROBABILITY = 0.001

    def __call__(self, packet):
        """Append ``packet`` if it is a probe request and the random draw succeeds."""
        if not packet.haslayer(Dot11ProbeReq):
            return
        # The draw happens only for probe requests, as in a short-circuit `and`.
        keep = random.random() <= self.PROBABILITY
        if keep:
            self.append(packet)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt


def plot_line_chart(data, x_label=None, y_label=None, x_scale='linear', y_scale='linear'):
    """Draw several line series into one new 18x6 figure.

    Args:
        data: Iterable of ``(xs, ys, legend)`` triples, one per line.
        x_label: Text for the x axis.
        y_label: Text for the y axis.
        x_scale: Scale name passed to ``plt.xscale``.
        y_scale: Scale name passed to ``plt.yscale``.
    """
    plt.figure(figsize=(18, 6))
    series_names = []
    for series in data:
        x_values, y_values, name = series
        plt.plot(x_values, y_values)
        series_names.append(name)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.xscale(x_scale)
    plt.yscale(y_scale)
    # Legend entries are matched to the lines by plotting order.
    plt.legend(series_names)

In [None]:
import math


def duration(timestamps):
    """Return the span between the smallest and the largest timestamp."""
    earliest = np.amin(timestamps)
    latest = np.amax(timestamps)
    return latest - earliest


def frequency_count(timestamps, time_interval):
    """Count how many timestamps fall into each consecutive time interval.

    Intervals are ``[k * time_interval, (k + 1) * time_interval)`` measured
    from the smallest timestamp.

    Args:
        timestamps: 1-D sequence of numeric timestamps (any order).
        time_interval: Width of one interval, in the unit of ``timestamps``.

    Returns:
        ``(time, counts)``: ``time[k]`` is the start offset of interval ``k``
        and ``counts[k]`` (float) the number of timestamps inside it. Both
        are empty when ``timestamps`` is empty.
    """
    timestamps = np.asarray(timestamps, dtype=np.float64)
    if timestamps.size == 0:
        return np.arange(0) * time_interval, np.zeros(0)

    rel_timestamps = timestamps - np.amin(timestamps)
    bins = np.floor(rel_timestamps / time_interval).astype(np.intp)
    # Size the result from the largest bin index actually used. Using
    # ceil(span / interval) is one short whenever the span is an exact
    # multiple of the interval (including a span of zero).
    number_of_intervals = int(bins.max()) + 1
    counts = np.bincount(bins, minlength=number_of_intervals).astype(np.float64)
    time = np.arange(number_of_intervals) * time_interval
    return time, counts

### Route from Kaufering to Garching Forschungszentrum

In [None]:
from captures import commutes, make_legend


# Read every commute capture once and keep two results per commute at
# notebook scope, so later cells can reuse them without re-reading the files.
commute_time_series = dict()  # commute -> structured array returned by TimeSeriesAccumulator.as_numpy_array()
commute_samples = dict()  # commute -> RandomProbeSampler holding the sampled probe request packets

for commute in commutes:
    # accumulate() returns the accumulators in the order they were passed in.
    features, samples = accumulate(commute.filename, TimeSeriesAccumulator(), RandomProbeSampler())
    commute_time_series[commute] = features.as_numpy_array()
    commute_samples[commute] = samples

In [None]:
# Width of one counting interval; the y-axis label below reports it with an "s" suffix.
TIME_INTERVAL = 120

# One (xs, ys, legend) triple per commute, the shape plot_line_chart expects.
plots = []

for commute in commutes:
    timestamps = commute_time_series[commute][:]['timestamp']
    xs, ys = frequency_count(timestamps, TIME_INTERVAL)
    plots.append((xs, ys, make_legend(commute)))

# All commutes share one figure so their curves can be compared directly.
plot_line_chart(plots, x_label='time (s)', y_label='number of probes per {}s'.format(TIME_INTERVAL))

#### Observations
- The trip on 12/18/2019 (green line) ends early because of the battery running out.
- The way the number of packets per 100s changes over time is consistent in relative terms.
- The number of packets per 100s starts at about 500 in Kaufering, grows to about 2000 packets per 100s by Central Station, 
  peaks multiple times around the city center, and decreases towards the destination Garching Forschungszentrum.
- Note: these figures are quoted per 100s, whereas the plot above uses `TIME_INTERVAL = 120`; they should be re-checked against the current plot.

### Round Trip Routes of U2 and U6

TODO

### Route from Munich Central Station to Dortmund Central Station

In [None]:
from captures import munich_to_dortmund, make_legend


# capture -> structured array returned by TimeSeriesAccumulator.as_numpy_array()
munich_dortmund_time_series = dict()

# NOTE: this rebinds the notebook-level names `features` and `samples`;
# the sampled packets are not stored anywhere else in this cell.
features, samples = accumulate(munich_to_dortmund.filename, TimeSeriesAccumulator(), RandomProbeSampler())
munich_dortmund_time_series[munich_to_dortmund] = features.as_numpy_array()

In [None]:
# Width of one counting interval; the y-axis label below reports it with an "s" suffix.
TIME_INTERVAL = 120

timestamps = munich_dortmund_time_series[munich_to_dortmund][:]['timestamp']
xs, ys = frequency_count(timestamps, TIME_INTERVAL)
# Single (xs, ys, legend) triple; wrapped in a list below because
# plot_line_chart iterates over a collection of such triples.
plot = (xs, ys, make_legend(munich_to_dortmund))

plot_line_chart([plot], x_label='time (s)', y_label='number of probes per {}s'.format(TIME_INTERVAL))

#### Observations
Todo

## 3.2 Inter-Contact Times of MAC Addresses

In [None]:
from collections import defaultdict


def compute_inter_contact_times(probes):
    """Return, sorted ascending, the first-to-last-seen span of every MAC.

    Args:
        probes: Iterable of records that support ``probe['mac']`` and
            ``probe['timestamp']``.

    Returns:
        A sorted list with one entry per distinct MAC: its largest timestamp
        minus its smallest timestamp (0 for a MAC seen only once).
    """
    first_seen = {}  # mac -> smallest timestamp so far
    last_seen = {}  # mac -> largest timestamp so far
    for probe in probes:
        mac = probe['mac']
        seen_at = probe['timestamp']
        if mac not in first_seen:
            first_seen[mac] = seen_at
            last_seen[mac] = seen_at
            continue
        if seen_at < first_seen[mac]:
            first_seen[mac] = seen_at
        if seen_at > last_seen[mac]:
            last_seen[mac] = seen_at
    spans = [last_seen[mac] - first_seen[mac] for mac in first_seen]
    spans.sort()
    return spans

def compute_cdf(values):
    """Compute the empirical CDF of an ascending-sorted sequence.

    Args:
        values: Sized sequence of values sorted in ascending order; equal
            values must be adjacent.

    Returns:
        ``(xs, ys)``: ``xs`` holds each distinct value once, and ``ys[i]`` is
        the fraction of all values that are ``<= xs[i]``. The last point is
        always at 1.0. Both lists are empty for empty input.
    """
    total = len(values)
    xs, ys = [], []
    for seen, value in enumerate(values, start=1):
        # Dividing the running count avoids the drift of adding 1 / total
        # repeatedly, and every distinct value, including the last, gets a point.
        fraction = seen / total
        if xs and xs[-1] == value:
            ys[-1] = fraction
        else:
            xs.append(value)
            ys.append(fraction)
    return xs, ys

In [None]:
# Collect one CDF curve per capture as an (xs, ys, legend) triple and draw
# them all into a single figure with a logarithmic x axis.
plots = []

# Route from Kaufering to Garching Forschungszentrum
for commute in commutes:
    inter_contact_times = compute_inter_contact_times(commute_time_series[commute])
    xs, ys = compute_cdf(inter_contact_times)
    plots.append((xs, ys, make_legend(commute)))

# TODO: Round Trip Routes of U2 and U6

# Route from Munich Central Station to Dortmund Central Station
inter_contact_times = compute_inter_contact_times(munich_dortmund_time_series[munich_to_dortmund])
xs, ys = compute_cdf(inter_contact_times)
plots.append((xs, ys, make_legend(munich_to_dortmund)))

# Plot all
# NOTE(review): compute_cdf yields cumulative fractions in [0, 1], not
# percentages, although the y axis is labelled 'per cent'.
plot_line_chart(plots, x_label='time (s)', y_label='per cent', x_scale='log')

#### Observations
- Almost 90% of MAC addresses are seen for less than 0.1 seconds.
  - This is probably due to MAC address randomization.
- The phone carried by the person conducting the measurement does not perform MAC address randomization.

## 3.3 Vendors

In [None]:
!wget http://standards-oui.ieee.org/oui/oui.csv

In [None]:
import csv


# Build the lookup table: 3-byte address prefix -> vendor name.
# newline='' is what the csv module requires for files handed to csv.reader
# (otherwise newlines inside quoted fields are mis-parsed). The explicit
# encoding keeps parsing independent of the platform default.
# NOTE(review): assumes oui.csv is UTF-8 encoded - confirm against the download.
with open('oui.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    header = next(reader)  # consume the column-title row so it is not parsed as data
    # Column 1 is parsed as hex bytes (the key), column 2 is kept as the name.
    vendors = {bytes.fromhex(row[1]): row[2] for row in reader}


def lookup_vendor(mac):
    """Return the vendor registered for the first three bytes of ``mac``.

    Prefixes missing from the ``vendors`` table are reported as
    ``'<randomized>'``.
    """
    prefix = mac[:3]
    try:
        return vendors[prefix]
    except KeyError:
        return '<randomized>'

In [None]:
def vendor_frequency_count(probes, time_interval, top=10):
    """Count probes per vendor and time interval for the most frequent vendors.

    Args:
        probes: Non-empty structured array with ``'timestamp'`` and ``'mac'``
            fields, one record per probe request.
        time_interval: Width of one interval, in the unit of the timestamps.
        top: Number of most frequent vendors that get their own row; all
            remaining vendors are summed into one extra ``'<other>'`` row.

    Returns:
        ``(time, counts, labels)``: ``time[k]`` is the start offset of
        interval ``k``, ``counts[r, k]`` the number of probes of the vendor
        with rank ``r`` in interval ``k`` (last row is ``'<other>'``), and
        ``labels`` the vendor names in row order followed by ``'<other>'``.

    Raises:
        ValueError: If ``probes`` is empty.
    """
    # Local import keeps the function usable regardless of which notebook
    # cells have been run before it.
    from collections import Counter

    # Resolve each probe's vendor once, and rank vendors using the probes
    # passed in, so every capture gets its own top list.
    vendor_names = [lookup_vendor(mac) for mac in probes['mac']]
    total = Counter(vendor_names)
    ranks = {vendor: index for index, (vendor, _) in enumerate(total.most_common(top))}
    # With fewer than `top` distinct vendors the '<other>' row directly
    # follows the ranked rows, which keeps rows and labels aligned.
    other_row = len(ranks)

    timestamps = probes['timestamp']
    rel_timestamps = timestamps - np.amin(timestamps)
    bins = np.floor(rel_timestamps / time_interval).astype(np.intp)
    # Size the result from the largest bin index actually used. Using
    # ceil(span / interval) is one short whenever the span is an exact
    # multiple of the interval.
    number_of_intervals = int(bins.max()) + 1

    counts = np.zeros((other_row + 1, number_of_intervals))
    for vendor, column in zip(vendor_names, bins):
        counts[ranks.get(vendor, other_row), column] += 1
    time = np.arange(number_of_intervals) * time_interval
    return time, counts, [*ranks.keys(), '<other>']


def vendor_distribution(*args, **kwargs):
    """Return vendor counts normalised to per-interval shares.

    Accepts the same arguments as ``vendor_frequency_count`` and returns the
    same ``(time, counts, labels)`` triple, with every column of ``counts``
    divided by its column sum. Columns without any probe stay all-zero.
    """
    time, counts, labels = vendor_frequency_count(*args, **kwargs)
    totals = np.sum(counts, axis=0)
    # An interval without probes has only zero counts. Dividing by 1 instead
    # of 0 keeps that column at 0 rather than producing NaN and a warning.
    totals[totals == 0] = 1
    return time, counts / totals, labels

In [None]:
def plot_stack_plot(data, x_label=None, y_label=None, title=None):
    """Draw one stacked-area chart into a new 18x6 figure.

    Args:
        data: Triple ``(xs, yss, labels)`` passed on to ``plt.stackplot``.
        x_label: Text for the x axis.
        y_label: Text for the y axis.
        title: Figure title.
    """
    plt.figure(figsize=(18, 6))
    time_axis, layers, layer_labels = data
    plt.stackplot(time_axis, layers, labels=layer_labels)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # Legend entries come from the labels given to stackplot above.
    plt.legend()
    

### Route from Kaufering to Garching Forschungszentrum

In [None]:
from collections import Counter
# from tabulate import tabulate


# Tally probes per vendor for each commute. The printing below is disabled,
# so the tallies are currently computed but not displayed.
# NOTE: `macs` and `counter` stay bound at notebook scope after the loop,
# holding the values of the last commute.
for commute in commutes:
    macs = commute_time_series[commute][:]['mac']
    counter = Counter(lookup_vendor(mac) for mac in macs)
#     print(make_legend(commute))
#     print(tabulate(counter.most_common(10), headers=['vendor', 'count']))
#     print('\n\n')

In [None]:
# One stacked vendor-share chart per commute.
for commute in commutes:
    distribution = vendor_distribution(commute_time_series[commute], TIME_INTERVAL, top=10)
    plot_stack_plot(
        distribution,
        x_label='time (s)',
        y_label='percentage of probes per {}s'.format(TIME_INTERVAL),
        title=make_legend(commute),
    )

#### Observations
- The majority of MAC addresses are randomized.
- The Raspberry Pi is sending probe requests as well.
- The person conducting the measurements owns a OnePlus phone.

### Round Trip Routes of U2 and U6

TODO

### Route from Munich Central Station to Dortmund Central Station

In [None]:
from collections import Counter
# from tabulate import tabulate


# Tally probes per vendor for the Munich-Dortmund capture. The printing below
# is disabled, so the tally is currently computed but not displayed.
# NOTE: this rebinds the notebook-level names `macs` and `counter`.
macs = munich_dortmund_time_series[munich_to_dortmund][:]['mac']
counter = Counter(lookup_vendor(mac) for mac in macs)
#     print(make_legend(munich_to_dortmund))
#     print(tabulate(counter.most_common(10), headers=['vendor', 'count']))
#     print('\n\n')

In [None]:
# Stacked vendor-share chart for the Munich-Dortmund capture.
distribution = vendor_distribution(munich_dortmund_time_series[munich_to_dortmund], TIME_INTERVAL, top=10)
plot_stack_plot(
    distribution,
    x_label='time (s)',
    y_label='percentage of probes per {}s'.format(TIME_INTERVAL),
    title=make_legend(munich_to_dortmund),
)

#### Observations
Todo