In [None]:
import os, psycopg2, folium, pandas as pd, numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt, matplotlib.colors as mcolors
# Global plot styling for the whole notebook.
# NOTE(review): `az` is not imported anywhere in this cell, so this line raises
# NameError on a fresh kernel. Presumably ArviZ (`import arviz as az`) is
# intended — confirm and add the import next to the other imports.
az.style.use("arviz-whitegrid")
# Default figure size (inches) and resolution.
plt.rcParams["figure.figsize"] = [20, 6]
plt.rcParams["figure.dpi"] = 100
# Font sizes for tick labels, axis labels and legends.
plt.rcParams["xtick.labelsize"] = 20
plt.rcParams["ytick.labelsize"] = 20
plt.rcParams["axes.labelsize"] = 20
plt.rcParams["legend.fontsize"] = 20
import warnings


# db connection

In [None]:
from sqlalchemy import create_engine 
from sqlalchemy.orm import sessionmaker
# database credentials
# Read from the environment so secrets never have to be typed into (and saved
# with) the notebook. The empty-string fallback keeps the previous default.
from urllib.parse import quote

db_usr = os.environ.get('DB_USER', '')  # your database user name
db_pwd = os.environ.get('DB_PWD', '')   # your database password

# database login
host, port, db = 'nc-health-data-prod.cluster-ccsgl7rk4urn.eu-central-1.rds.amazonaws.com', 5432, 'master'
# Percent-encode user and password: characters such as '@', ':' or '/' would
# otherwise be parsed as URL delimiters and break the connection string.
engine = create_engine(
    'postgresql://' + quote(db_usr, safe='') + ':' + quote(db_pwd, safe='')
    + '@' + host + ':' + str(port) + '/' + db
)
# ORM session bound to the engine, plus the raw connection that the
# pd.read_sql_query calls below use.
Session = sessionmaker(bind=engine)
session = Session()
conn = engine.connect()

# data

## activity before and after mass event location visit

In [None]:
# Ping count per (day, did): rows of euro_stuttgart joined to
# ex_corona_sdkv6_2024_27 on day and did, summing the length of stime_arr.
query = """
    select ex."day", ex.did, sum(array_length(stime_arr, 1)) as nping
    from euro_stuttgart as raf
    join ex_corona_sdkv6_2024_27 as ex
    on ex."day" = raf."day" and ex.did = raf.did
    group by 1, 2
"""
# Load the result and order it so the busiest device-days come first.
data_pingcnt = (
    pd.read_sql_query(query, conn)
    .sort_values(by='nping', ascending=False)
)
data_pingcnt

In [None]:
# Individual pings (timestamp + lon/lat) of every device-day that appears in
# euro_stuttgart, taken from the 2024_25 and 2024_27 tables.
#   t1    : ping count per (day, did) for the matching device-days
#   pings : stime_arr / tile_arr unnested into one row per ping
#   traj  : ping geometry = tile8togeo(tl8) tagged SRID 32632, translated by the
#           grid cell's (minx, miny), reprojected to 3857
#           (tile8togeo is a database function not shown here)
# The final SELECT reprojects to 4326 and returns lon/lat.
query = """
WITH t1 AS (
    SELECT
        ex."day",
        ex.did,
        SUM(array_length(stime_arr, 1)) AS nping
    FROM euro_stuttgart AS raf
    JOIN (
        SELECT * FROM ex_corona_sdkv6_2024_25
        UNION ALL
        SELECT * FROM ex_corona_sdkv6_2024_27
    ) AS ex
        ON ex."day" = raf."day" AND ex.did = raf.did
    GROUP BY ex."day", ex.did
),
pings AS (
    SELECT
        ex."day",
        ex.did,
        ex.tile_id,
        UNNEST(stime_arr) AS stime,
        UNNEST(tile_arr) AS tl8
    FROM (
        SELECT * FROM ex_corona_sdkv6_2024_25
        UNION ALL
        SELECT * FROM ex_corona_sdkv6_2024_27
    ) AS ex
    JOIN t1
        ON t1.did = ex.did AND t1."day" = ex."day"
),
traj AS (
    SELECT
        p."day",
        p.did,
        p.stime,
        p.tl8,
        ST_Transform(
            ST_Translate(
                ST_SetSRID(tile8togeo(p.tl8), 32632),
                tx.minx,
                tx.miny
            ),
            3857
        ) AS geopoint
    FROM pings AS p
    JOIN txc_dt_grid_1000m AS tx
        ON p.tile_id = tx.tile_id
)
SELECT
    "day",
    did,
    stime,
    ST_X(ST_Transform(geopoint, 4326)) AS ping_lon,
    ST_Y(ST_Transform(geopoint, 4326)) AS ping_lat
FROM traj
"""

data_traj = pd.read_sql_query(query, conn)

# Position of the next ping of the *same* device on the *same* day.
# The query has no ORDER BY and returns all devices interleaved, so a plain
# shift(-1) over the raw result would pair pings of unrelated devices. Shift
# within each (did, day) group in time order instead; assigning the result
# back aligns on the index, so the row order of data_traj is left untouched.
# The last ping of each device-day gets NaN.
_traj_ordered = data_traj.sort_values(['did', 'day', 'stime'])
_traj_next = _traj_ordered.groupby(['did', 'day'])[['ping_lon', 'ping_lat']].shift(-1)
data_traj['ping_lon_next'] = _traj_next['ping_lon']
data_traj['ping_lat_next'] = _traj_next['ping_lat']
data_traj


In [None]:
# Display the trajectory table.
data_traj

In [None]:
# Write the trajectory table to a CSV file in the working directory (row index omitted).
data_traj.to_csv('Stuttgart_dids_match_pings.csv', index=False)

In [None]:
# Display the trajectory table again after the CSV export.
data_traj

In [None]:
# Parse the timestamp columns.
data_traj['stime'] = pd.to_datetime(data_traj['stime'])
data_traj['day'] = pd.to_datetime(data_traj['day'])
# Make the device key unique per day by appending the day to the device id.
# The untouched id is kept in 'did_raw' the first time this cell runs, so that
# re-running the cell rebuilds the key from it instead of appending the day a
# second time.
if 'did_raw' not in data_traj.columns:
    data_traj['did_raw'] = data_traj['did']
data_traj['did'] = data_traj['did_raw'].astype(str) + data_traj['day'].astype(str)


In [None]:
# Display the trajectory table with parsed timestamps and the per-day device key.
data_traj

In [None]:

# 'stime' must already be a datetime column here (pd.Grouper with a frequency
# requires it).
# Sort by did and stime: first() below takes the earliest row of each bin only
# because of this ordering.
data_traj = data_traj.sort_values(['did', 'stime'])

# downsample: one ping every minute per did
# Grouping by (did, 1-minute bin) directly replaces
# groupby('did').apply(resample(...).first()), which relied on the grouping
# column being passed into apply (deprecated since pandas 2.2) and ran one
# Python-level resample per device. Bins without a position are dropped.
# Same rows and columns as before; only the column order differs
# ('did' now comes first).
data_traj_1min = (
    data_traj
    .groupby(['did', pd.Grouper(key='stime', freq='1min')])
    .first()
    .dropna(subset=['ping_lon', 'ping_lat'])
    .reset_index()
)


In [None]:
# Display the 1-minute downsampled trajectories.
data_traj_1min

In [None]:
# downsample: one ping every five minutes per did
# Grouping by (did, 5-minute bin) directly replaces
# groupby('did').apply(resample(...).first()), which relied on the grouping
# column being passed into apply (deprecated since pandas 2.2). first() keeps
# the first non-null value of every column in row order, so data_traj is
# expected to be sorted by did and stime already. Bins without a position are
# dropped. Same rows and columns as before; 'did' is now the first column.
data_traj_5min = (
    data_traj
    .groupby(['did', pd.Grouper(key='stime', freq='5min')])
    .first()
    .dropna(subset=['ping_lon', 'ping_lat'])  # keep only bins with a valid ping
    .reset_index()
)



## Maps

In [None]:
from folium import Map, PolyLine
from matplotlib import pyplot as plt
from matplotlib import colors as mcolors
from folium.plugins import FloatImage
import datetime

import base64
from folium.plugins import FloatImage

# map centered at Stuttgart
m = Map(location=[48.7758, 9.1829], zoom_start=8)


# Colour scale: hour of day (0 to 24) mapped onto the 'gist_rainbow' colormap.
vmin = 0
vmax = 24
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = plt.get_cmap('gist_rainbow')

# One line segment per pair of consecutive pings of each device, coloured by the
# (integer) hour of the segment's first ping.
# NOTE(review): this iterates the full-resolution data_traj, not data_traj_1min;
# that creates one PolyLine per raw ping — confirm this is intended.
for did, df in data_traj.sort_values(['did', 'stime']).groupby('did'):
    # Pull the columns out once instead of doing a .loc lookup per row.
    points = df[['ping_lat', 'ping_lon']].values.tolist()
    for start, end, stime in zip(points, points[1:], df['stime']):
        hour_value = stime.hour
        color = mcolors.to_hex(cmap(norm(hour_value)))
        folium.PolyLine(
            [start, end],
            color=color,
            weight=4,
            opacity=0.7
        ).add_to(m)


# Create and save the colorbar as an image
fig, ax = plt.subplots(figsize=(6, 0.5))
fig.subplots_adjust(bottom=0.5)
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
cb = plt.colorbar(sm, cax=ax, orientation='horizontal')
# Format ticks as "HH:MM" (previously the raw float tick values were used as
# labels, despite the stated intent of clock-style labels).
tick_locs = cb.get_ticks()
tick_labels = [f"{int(ts):02d}:{int(round((ts % 1) * 60)):02d}" for ts in tick_locs]
cb.set_ticks(tick_locs)
cb.set_ticklabels(tick_labels)
plt.title("Hour → Color Mapping")
plt.savefig("colorbar.png", bbox_inches='tight', dpi=150)
plt.close()

# Embed the PNG as a base64 data URI so the map does not depend on the file.
with open("colorbar.png", "rb") as f:
    encoded = base64.b64encode(f.read()).decode('utf-8')
data_uri = f"data:image/png;base64,{encoded}"

# Add the colorbar image as a floating legend to the folium map
FloatImage(data_uri, bottom=5, left=10).add_to(m)
m

In [None]:
import base64
from folium import Map, PolyLine, CircleMarker
from matplotlib import pyplot as plt
from matplotlib import colors as mcolors
from folium.plugins import FloatImage
import datetime
import folium

# Map centred on Stuttgart.
m = Map(location=[48.7758, 9.1829], zoom_start=10)

# Restrict both downsampled tables to one device-day, in time order.
did_target = "F608E0B9121E5A3518814E0952082E3A2024-06-19"
df_did = (
    data_traj_1min.loc[data_traj_1min.did == did_target]
    .sort_values('stime')
    .reset_index(drop=True)
)
df_did_5min = (
    data_traj_5min.loc[data_traj_5min.did == did_target]
    .sort_values('stime')
    .reset_index(drop=True)
)

# Colour scale: hour of day (0 to 24) on the 'gist_rainbow' colormap.
vmin, vmax = 0, 24
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = plt.get_cmap('gist_rainbow')

# One segment per pair of consecutive 1-minute pings, coloured by the integer
# hour of the segment's first ping.
track = df_did[['ping_lat', 'ping_lon']].values.tolist()
for seg_start, seg_end, stime in zip(track, track[1:], df_did['stime']):
    PolyLine(
        [seg_start, seg_end],
        color=mcolors.to_hex(cmap(norm(stime.hour))),
        weight=4,
        opacity=0.7
    ).add_to(m)

# Small black dots for the 5-minute samples.
for point in df_did_5min[['ping_lat', 'ping_lon']].values.tolist():
    CircleMarker(
        location=point,
        radius=1,
        color='black',
        fill=True,
        fill_color='black',
        fill_opacity=0.5
    ).add_to(m)

# Read colorbar.png from the working directory (it must already exist) and
# embed it as a base64 data URI.
with open("colorbar.png", "rb") as f:
    encoded = base64.b64encode(f.read()).decode('utf-8')
data_uri = f"data:image/png;base64,{encoded}"

# Floating legend in the lower-left area of the map.
FloatImage(data_uri, bottom=5, left=10).add_to(m)

m


## Heatmap per hour

In [None]:
import folium
from folium.plugins import HeatMap
import pandas as pd

# Make sure 'stime' is a datetime column and derive the hour of day from it.
data_traj['stime'] = pd.to_datetime(data_traj['stime'])
data_traj['hour'] = data_traj['stime'].dt.hour

# Pings recorded during hour 19.
data_hour = data_traj.loc[data_traj['hour'].eq(19)]

# Base map with the same Stuttgart centre as the other maps.
m = folium.Map(location=[48.7758, 9.1829], zoom_start=6)

# Heat layer built from the [lat, lon] pairs of the selected pings.
heat_data = data_hour.loc[:, ['ping_lat', 'ping_lon']].values.tolist()
HeatMap(heat_data, name='hour', radius=6, blur=5).add_to(m)

m



For the heatmaps we use the complete data of the two matches:

In [None]:
import folium
from folium.plugins import HeatMap
import pandas as pd
import os
# Make sure 'stime' is a datetime column (no-op if it already is).
data_traj['stime'] = pd.to_datetime(data_traj['stime'])

# Hour of day (0-23) of every ping; stored as a new column on data_traj.
data_traj['hour'] = data_traj['stime'].dt.hour


# Distinct hours that actually occur in the data, in ascending order.
hours_sorted = sorted(data_traj['hour'].unique())

# Output directory for the per-hour HTML maps; created if missing.
html_dir = 'hourly_html_maps'
os.makedirs(html_dir, exist_ok=True)

def create_heat_map_for_hour(hour_val, data, center=(48.7758, 9.1829),
                             zoom_start=13, radius=6, blur=5):
    """Build a folium heat map of all pings recorded in one hour of the day.

    Parameters
    ----------
    hour_val : int
        Hour to select; compared against the 'hour' column of ``data``.
    data : pandas.DataFrame
        Must contain the columns 'hour', 'ping_lat' and 'ping_lon'.
    center : sequence of two floats, optional
        (lat, lon) the map is centred on. Defaults to the Stuttgart
        coordinates used throughout this notebook.
    zoom_start : int, optional
        Initial zoom level of the map.
    radius, blur : int, optional
        Passed through to ``folium.plugins.HeatMap``.

    Returns
    -------
    folium.Map
        A new map with a single heat layer (empty if no ping matches).
    """
    # filter data for the given hour
    df_hour = data[data['hour'] == hour_val]

    # create folium map
    m = folium.Map(location=list(center), zoom_start=zoom_start)
    heat_data = df_hour[['ping_lat', 'ping_lon']].values.tolist()

    # add heat map layer
    HeatMap(heat_data, radius=radius, blur=blur).add_to(m)
    return m

# Render and save one HTML heat map for every hour present in the data.
for hr in hours_sorted:
    outfile = os.path.join(html_dir, f'heatmap_hour_{hr}.html')
    create_heat_map_for_hour(hr, data_traj).save(outfile)
    print(f'Saved: {outfile}')


In [None]:
from selenium import webdriver
import time
import os
import re

# Function to extract hour from filenames like "heatmap_hour_13.html"
def extract_hour(filename):
    """Return the hour encoded in a map filename, or -1 if there is none.

    Accepts both "heatmap_hour_{HOUR}.html" and "heatmap_hour_{HOUR}.png".
    This notebook defines extract_hour in several cells; accepting both
    extensions keeps the sort key correct whichever definition ran last.
    The dot before the extension is escaped so it only matches a literal '.'.
    """
    match = re.search(r'heatmap_hour_(\d+)\.(?:html|png)', filename)
    return int(match.group(1)) if match else -1

from pathlib import Path

# HTML maps to capture, ordered numerically by hour.
html_files = sorted([f for f in os.listdir(html_dir) if f.endswith('.html')],
                    key=extract_hour)

png_dir = 'hourly_png_maps'
os.makedirs(png_dir, exist_ok=True)

# Use Firefox; make sure geckodriver is in your PATH
driver = webdriver.Firefox()
try:
    # Fix the viewport once, before any page is loaded, so every map is laid
    # out at the final size instead of being resized just before the screenshot.
    driver.set_window_size(1200, 900)

    for html_file in html_files:
        file_path = os.path.join(html_dir, html_file)
        # as_uri() builds a correctly escaped file:// URL on every platform.
        driver.get(Path(file_path).resolve().as_uri())

        # Allow the map to load
        time.sleep(2)

        png_file = os.path.join(png_dir, html_file.replace('.html', '.png'))
        driver.save_screenshot(png_file)
        print(f'Saved screenshot: {png_file}')
finally:
    # Always shut the browser down, even if a page fails to load.
    driver.quit()


In [None]:
import os
import re
from PIL import Image, ImageDraw, ImageFont
import imageio

# Source folder of the raw PNG maps and target folder for the annotated copies;
# the target folder is created if it does not exist yet.
input_dir, annotated_dir = 'hourly_png_maps', 'hourly_png_annotated'
os.makedirs(annotated_dir, exist_ok=True)

def extract_hour(filename):
    """
    Return the hour encoded in a map filename, or -1 if there is none.

    Accepts both "heatmap_hour_{HOUR}.png" and "heatmap_hour_{HOUR}.html".
    This notebook defines extract_hour in several cells; accepting both
    extensions keeps the sort key correct whichever definition ran last.
    """
    match = re.search(r'heatmap_hour_(\d+)\.(?:html|png)', filename)
    return int(match.group(1)) if match else -1

# Get PNG files sorted numerically by hour
png_files = sorted([f for f in os.listdir(input_dir) if f.endswith('.png')],
                   key=extract_hour)

# Load the label font once (it is the same for every frame);
# fall back to PIL's built-in font if Arial is not available.
try:
    font = ImageFont.truetype("arial.ttf", 40)
except IOError:
    font = ImageFont.load_default()

# Position for the text (with padding)
position = (10, 10)
# A black copy of the text is drawn at these offsets first to act as a shadow.
shadow_color = "black"
shadow_offsets = [(1, 1), (-1, -1), (1, -1), (-1, 1)]

# Annotate each image with the hour
for png_file in png_files:
    hour = extract_hour(png_file)
    text = f"Hour: {hour:02d}"
    annotated_path = os.path.join(annotated_dir, png_file)

    # The context manager closes the source file once the annotated copy is saved.
    with Image.open(os.path.join(input_dir, png_file)) as img:
        draw = ImageDraw.Draw(img)

        for dx, dy in shadow_offsets:
            draw.text((position[0] + dx, position[1] + dy), text, font=font, fill=shadow_color)

        # Draw the text in white
        draw.text(position, text, font=font, fill="white")

        # Save the annotated image
        img.save(annotated_path)
    print(f"Annotated and saved: {annotated_path}")

# Assemble annotated images into an animated GIF
annotated_files = sorted([f for f in os.listdir(annotated_dir) if f.endswith('.png')],
                          key=extract_hour)
images = [imageio.imread(os.path.join(annotated_dir, f)) for f in annotated_files]

gif_filename = 'density_map_hourly_annotated.gif'
# NOTE(review): intended as 1 second per frame; the unit of `duration` depends
# on the installed imageio version — confirm.
imageio.mimsave(gif_filename, images, duration=1.0)
print(f"GIF created: {gif_filename}")



In [None]:
from PIL import Image
import os
import re

def extract_hour(filename):
    """
    Return the hour encoded in a map filename, or -1 if there is none.

    Accepts both "heatmap_hour_{HOUR}.png" and "heatmap_hour_{HOUR}.html".
    This notebook defines extract_hour in several cells; accepting both
    extensions keeps the sort key correct whichever definition ran last.
    """
    match = re.search(r'heatmap_hour_(\d+)\.(?:html|png)', filename)
    return int(match.group(1)) if match else -1

# Annotated PNG frames, ordered numerically by the hour in their filename.
annotated_dir = 'hourly_png_annotated'
annotated_files = sorted(
    (name for name in os.listdir(annotated_dir) if name.endswith('.png')),
    key=extract_hour
)

# Convert every frame to palette mode ('P') with an adaptive palette and
# Floyd-Steinberg dithering before writing the GIF.
frames = [
    Image.open(os.path.join(annotated_dir, name)).convert(
        'P', palette=Image.ADAPTIVE, dither=Image.FLOYDSTEINBERG
    )
    for name in annotated_files
]

# The first frame writes the file and the remaining frames are appended:
# 1000 ms per frame, looping forever.
gif_filename = 'density_map_hourly_annotated_near.gif'
frames[0].save(
    gif_filename,
    save_all=True,
    append_images=frames[1:],
    duration=1000,
    loop=0,
    optimize=True,
)
print(f"Optimized GIF created: {gif_filename}")


## Activities before and after (Clustering and Events)

## Examples to calibrate

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from scipy.stats import mode

# --- Clustering Class Definition ---
class ClusteringLocs:
    """DBSCAN clustering of a set of (lon, lat) points.

    The points are stacked into an (n, 2) array and clustered with
    ``sklearn.cluster.DBSCAN`` using plain Euclidean distance on the raw
    coordinates. With two points or fewer no model is fitted and every
    accessor returns an empty result.

    Parameters
    ----------
    lon_ar, lat_ar : sequences of float
        Coordinates of the points, paired by position.
    eps : float
        DBSCAN neighbourhood radius, in the same units as the coordinates.
    min_samples : int
        DBSCAN ``min_samples``.
    """

    def __init__(self, lon_ar, lat_ar, eps=0.001, min_samples=5):
        self.eps = eps
        self.min_samples = min_samples
        self.array = np.array(list(zip(lon_ar, lat_ar)))
        # Fitted DBSCAN model, or None when there are too few points to cluster.
        self.db = None
        if len(self.array) > 2:
            self.db = DBSCAN(eps=self.eps, min_samples=self.min_samples).fit(self.array)

    def labels(self):
        """Return the per-point cluster labels (-1 = noise); empty if not fitted."""
        if self.db is None:
            return np.array([])
        return self.db.labels_

    def num_clus(self):
        """Return the number of clusters found, not counting the noise label -1."""
        if self.db is None:
            return 0
        return len(set(self.db.labels_) - {-1})

    def clus_centroids(self):
        """Return one representative point per cluster as ``(lons, lats)``.

        The representative is the per-coordinate mode of the cluster's points
        (most frequent longitude and most frequent latitude, taken
        independently). Clusters are returned in ascending label order, so the
        result is deterministic; iterating a ``set`` of labels, as before,
        gives no ordering guarantee.
        """
        if self.db is None:
            return np.array([]), np.array([])

        point_labels = self.labels()
        centroids = []
        # np.unique yields the labels sorted ascending.
        for label in np.unique(point_labels):
            if label == -1:  # Exclude noise points
                continue
            cluster_points = self.array[point_labels == label]
            centroids.append(list(mode(cluster_points, axis=0, keepdims=True).mode[0]))

        centroids = np.array(centroids)
        if centroids.ndim == 1:
            # No cluster at all: np.array([]) is one-dimensional.
            return centroids, np.array([])
        return centroids[:, 0], centroids[:, 1]

# --- Create DataFrame with did and centroids_ar ---
# Cluster the 1-minute pings of every device-day separately. data_traj_1min
# needs the columns 'did', 'ping_lon' and 'ping_lat'.
# groupby visits each device's rows once instead of re-filtering the whole table
# per device; sort=False keeps the devices in order of first appearance, as
# .unique() did. The loop variable is not called df_did, so it does not
# overwrite the frame the map cells use.
results = []
for did, did_pings in data_traj_1min.groupby('did', sort=False):
    clusterer = ClusteringLocs(
        did_pings['ping_lon'].values,
        did_pings['ping_lat'].values,
        eps=0.001,
        min_samples=4,
    )

    if clusterer.num_clus() > 0:
        cent_x, cent_y = clusterer.clus_centroids()
        # Combine the separate x and y arrays into a list of (lon, lat) tuples.
        centroids = list(zip(cent_x, cent_y))
    else:
        centroids = []

    results.append({'did': did, 'centroids_ar': centroids})

# Explicit columns keep the frame usable downstream even if no device was found.
did_centroids_ar = pd.DataFrame(results, columns=['did', 'centroids_ar'])
print(did_centroids_ar)


In [None]:

# filter to just one device
# (The map itself is created further down in this cell; the extra
# `m = Map(...)` that used to sit here was overwritten before use and relied on
# `Map` having been imported by an earlier cell.)
did_target = "050CC00BB91751B47DD1E88117830D162024-06-19"
df_did = data_traj_1min[data_traj_1min.did == did_target].sort_values('stime').reset_index(drop=True)
df_did_5min = data_traj_5min[data_traj_5min.did == did_target].sort_values('stime').reset_index(drop=True)


import folium
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import numpy as np
from sklearn.cluster import DBSCAN
from scipy.stats import mode

# Base map centred on Stuttgart.
m = folium.Map(location=[48.7758, 9.1829], zoom_start=11)

# Colour scale: hour of day (0 to 24) on the 'gist_rainbow' colormap.
vmin, vmax = 0, 24
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = plt.get_cmap('gist_rainbow')

# Trajectory: one segment per pair of consecutive 1-minute pings, coloured by
# the integer hour of the segment's first ping.
track = df_did[['ping_lat', 'ping_lon']].values.tolist()
for seg_start, seg_end, stime in zip(track, track[1:], df_did['stime']):
    folium.PolyLine(
        [seg_start, seg_end],
        color=mcolors.to_hex(cmap(norm(stime.hour))),
        weight=4,
        opacity=0.7
    ).add_to(m)

# 5-minute samples as faint dots; the popup shows the device key and ping time.
for lat, lon, stime in zip(df_did_5min.ping_lat, df_did_5min.ping_lon, df_did_5min.stime):
    folium.CircleMarker(
        location=[lat, lon],
        radius=1,
        color='black',
        fill=True,
        fill_color='black',
        fill_opacity=0.1,
        popup=f"{did_target} - {stime.strftime('%H:%M:%S')}"
    ).add_to(m)

# Cluster centroids of the target device, stored as (lon, lat) tuples.
clusters = []
row = did_centroids_ar[did_centroids_ar['did'] == did_target]
if not row.empty:
    centroids = row.iloc[0]['centroids_ar']
    print(len(centroids))
    print(centroids)
    clusters.extend(centroids)

# Larger black markers for the centroids.
for lon, lat in clusters:
    folium.CircleMarker(
        location=(lat, lon),  # folium expects [lat, lon]
        radius=5,
        color='black',
        fill=True,
        fill_color='black'
    ).add_to(m)


m  # last expression of the cell: renders the map


In [None]:
# Centroid list of the row labelled 5, returned as a one-element Series.
did_centroids_ar.loc[[5], 'centroids_ar']

## City and Amenities

In [None]:
import requests

def get_osm_amenities(lat, lon, radius=50, timeout=60):
    """Query the Overpass API for OSM objects tagged `amenity` near a point.

    Parameters
    ----------
    lat, lon : float
        Centre of the search.
    radius : int or float, optional
        Value passed to Overpass' ``around`` filter.
    timeout : float, optional
        Seconds to wait for the HTTP response. Without it a stalled request
        would block the notebook indefinitely.

    Returns
    -------
    list of dict
        One dict per returned element with the keys 'name' ('unknown' if the
        element has no name tag), 'type' (value of the amenity tag) and 'city'
        (value of the addr:city tag, or None).

    Raises
    ------
    requests.HTTPError
        If Overpass answers with an error status. Failing here is clearer than
        a JSON decoding error on a non-JSON error body.
    """
    # HTTPS: the request body contains the coordinates of tracked devices.
    overpass_url = "https://overpass-api.de/api/interpreter"
    query = f"""
    [out:json];
    (
      node(around:{radius},{lat},{lon})[amenity];
      way(around:{radius},{lat},{lon})[amenity];
      relation(around:{radius},{lat},{lon})[amenity];
    );
    out center;
    """
    response = requests.post(overpass_url, data=query, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    results = []
    for element in data.get('elements', []):
        tags = element.get('tags', {})
        results.append({
            'name': tags.get('name', 'unknown'),
            'type': tags.get('amenity'),
            # City name from the address tags, if the element carries one.
            'city': tags.get('addr:city'),
        })

    return results

# Look up nearby amenities for every centroid of every device.
# osm_info maps did -> list (one entry per centroid) of amenity lists.
osm_info = {}

for idx, row in did_centroids_ar.iterrows():
    print(idx)
    centroids = row['centroids_ar']  # list of (lon, lat) tuples
    print(centroids)
    # Centroids are stored as (lon, lat); the lookup takes (lat, lon).
    osm_info[row['did']] = [
        get_osm_amenities(lat, lon, radius=50)
        for lon, lat in centroids
    ]

print(osm_info)


In [None]:
# Helper: one amenity type per centroid.
def extract_types(amenities_lists, expected_length):
    """
    Reduce a list of per-centroid amenity lists to one type per centroid.

    For every sublist, the value under 'type' of the first dictionary that has
    such a key is taken; an empty sublist, or one in which no dictionary has a
    'type' key, yields "unknown".

    The result is padded with "unknown" up to expected_length. It is never
    truncated, so it can be longer than expected_length. An empty or missing
    amenities_lists yields expected_length times "unknown".
    """
    types = [
        next((entry['type'] for entry in group if 'type' in entry), "unknown")
        if group else "unknown"
        for group in (amenities_lists or [])
    ]
    # Multiplying a list by a non-positive number adds nothing, so this only pads.
    types.extend(["unknown"] * (expected_length - len(types)))
    return types

# New column 'types': one amenity type per centroid of each device.
# osm_info maps did -> list of amenity lists (one per centroid); devices
# missing from it get "unknown" for every centroid.
did_centroids_ar['types'] = [
    extract_types(osm_info.get(did, []), len(centroids))
    for did, centroids in zip(did_centroids_ar['did'], did_centroids_ar['centroids_ar'])
]

print(did_centroids_ar[['did', 'types']])



In [None]:
import requests

def reverse_geocode(lat, lon, timeout=10):
    """Reverse-geocode a point with the public Nominatim service.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the point.
    timeout : float, optional
        Seconds to wait for the HTTP response. Without it a stalled request
        would block the notebook indefinitely.

    Returns
    -------
    dict
        Keys are kept as callers expect them, although two names do not match
        their content:
          - "display_name": the response's "type" field
          - "amenity":      the response's "class" field
          - "city":         address city, else town, else village (may be None)
          - "osm_id":       the response's "osm_id" field
        Fields missing from the response are None.

    Raises
    ------
    requests.HTTPError
        If Nominatim answers with an error status.
    """
    url = "https://nominatim.openstreetmap.org/reverse"
    params = {
        'lat': lat,
        'lon': lon,
        'format': 'json',
        'addressdetails': 1,
        # Higher zoom = more specific (building-level).
        # NOTE(review): the documented maximum appears to be 18 — confirm that
        # 20 is accepted as intended.
        'zoom': 20,
    }
    headers = {
        'User-Agent': 'my-geo-app/1.0 (alrinconh@gmail.com)'  # Required by Nominatim usage policy
    }

    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    address = data.get("address", {})

    return {
        "display_name": data.get("type"),
        "amenity": data.get("class"),
        "city": address.get("city") or address.get("town") or address.get("village"),
        "osm_id": data.get("osm_id"),
    }




In [None]:
# Example call for a single coordinate; the returned dict is displayed as the cell output.
reverse_geocode(48.412119250927006,9.066829043028424)

In [None]:
import time

# Map each device id (did) to one (city, class, type, osm_id) tuple per centroid,
# taken from reverse_geocode's "city", "amenity", "display_name" and "osm_id"
# entries. Missing values become "unknown".
nominatim = {}
_result_keys = ("city", "amenity", "display_name", "osm_id")

for _, row in did_centroids_ar.iterrows():
    entries = []

    # Centroids are stored as (lon, lat); reverse_geocode takes (lat, lon).
    for lon, lat in row['centroids_ar']:
        result = reverse_geocode(lat, lon)
        entries.append(tuple(
            "unknown" if result.get(key) is None else result.get(key)
            for key in _result_keys
        ))
        # The public Nominatim service allows at most one request per second.
        time.sleep(1.0)

    nominatim[row['did']] = entries



In [None]:
# Display the did -> geocoding tuples dictionary.
nominatim

In [None]:
# New column 'nominatim': the geocoding tuples of each device
# (an empty list for devices that are missing from the dictionary).
did_centroids_ar['nominatim'] = did_centroids_ar['did'].map(
    lambda did: nominatim.get(did, [])
)

# Check the result:
print(did_centroids_ar[['did', 'nominatim']])

In [None]:
# Number of centroids (clusters) found for each device.
did_centroids_ar['len'] = did_centroids_ar['centroids_ar'].map(len)

In [None]:
# Display the per-device table built so far.
did_centroids_ar

In [None]:
#did_centroids_ar.to_csv('11_Sttutgart_before_after_german_matches.csv', index=False)

In [None]:
# filter to just one device
did_target = "0D6F298CF70CFF26F8C44C50660009362024-06-19"
df_did = data_traj_1min[data_traj_1min.did == did_target].sort_values('stime').reset_index(drop=True)
df_did_5min = data_traj_5min[data_traj_5min.did == did_target].sort_values('stime').reset_index(drop=True)

m = folium.Map(location=[48.7758, 9.1829], zoom_start=8)

# full-day normalization
# The bounds are derived from the first ping as pandas Timestamps, so that they
# go through the same .timestamp() conversion as the pings themselves. Building
# them with datetime.datetime.combine(...) made .timestamp() interpret the naive
# value in the machine's local time zone, while pandas interprets naive
# Timestamps as UTC; the whole colour scale was shifted by the local UTC offset.
t0 = df_did.stime.iloc[0].normalize()  # midnight of the first ping's day
t1 = t0 + pd.Timedelta(hours=23, minutes=59, seconds=59)
day = t0.date()
vmin = t0.timestamp()
vmax = t1.timestamp()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = plt.get_cmap('gist_rainbow')

# Cluster centroids of the target device, stored as (lon, lat) tuples.
clusters = []
row = did_centroids_ar[did_centroids_ar['did'] == did_target]
if not row.empty:
    centroids = row.iloc[0]['centroids_ar']
    print(len(centroids))
    print(centroids)
    clusters.extend(centroids)

# Trajectory: one segment per pair of consecutive 1-minute pings, coloured by
# the time of day of the segment's first ping.
track = df_did[['ping_lat', 'ping_lon']].values.tolist()
for seg_start, seg_end, stime in zip(track, track[1:], df_did['stime']):
    color = mcolors.to_hex(cmap(norm(stime.timestamp())))
    folium.PolyLine(
        [seg_start, seg_end],
        color=color,
        weight=4,
        opacity=0.7
    ).add_to(m)

# Larger black markers for the cluster centroids.
for lon, lat in clusters:
    folium.CircleMarker(
        location=(lat, lon),  # folium expects [lat, lon]
        radius=5,
        color='black',
        fill=True,
        fill_color='black',
    ).add_to(m)

# 5-minute samples as faint dots; the popup shows the device key and ping time.
for lat, lon, stime in zip(df_did_5min.ping_lat, df_did_5min.ping_lon, df_did_5min.stime):
    folium.CircleMarker(
        location=[lat, lon],
        radius=1,
        color='black',
        fill=True,
        fill_color='black',
        fill_opacity=0.1,
        popup=f"{did_target} - {stime.strftime('%H:%M:%S')}"
    ).add_to(m)

# Save or display the map
#m.save("map.html")
m  # last expression of the cell: renders the map

In [None]:
# Geocoding tuples of the row labelled 4.
did_centroids_ar['nominatim'][4]


In [None]:
# Amenity types of the row labelled 4.
did_centroids_ar['types'][4]


In [None]:
# Helper function to extract all nominatim entries after the stadium target.
def extract_after(nom_list, target=('Stuttgart', 'leisure', 'stadium', 3869991)):
    """
    Return every entry of nom_list that follows the last occurrence of target.

    Parameters
    ----------
    nom_list : list of tuple or None
        Nominatim entries of one device, in order.
    target : tuple, optional
        Entry to look for, compared with ==. Defaults to the stadium entry
        ('Stuttgart', 'leisure', 'stadium', 3869991).

    Returns
    -------
    list
        The entries after the last occurrence of target. Empty if nom_list is
        empty or None, if target does not occur, or if target is the last entry.
    """
    if not nom_list:
        return []
    # Scan from the end: the first hit is the last occurrence.
    for idx in range(len(nom_list) - 1, -1, -1):
        if nom_list[idx] == target:
            return nom_list[idx + 1:]
    return []

def extract_origin(nom_list):
    """
    City (element 0) of the first nominatim entry; None for an empty or missing list.
    """
    return nom_list[0][0] if nom_list else None

def extract_final(nom_list):
    """
    City (element 0) of the last nominatim entry; None for an empty or missing list.
    """
    return nom_list[-1][0] if nom_list else None

# Derive three columns from each device's nominatim entries: the entries after
# the stadium, the city of the first entry and the city of the last entry.
for column, extractor in (
    ('after', extract_after),
    ('origin', extract_origin),
    ('final', extract_final),
):
    did_centroids_ar[column] = did_centroids_ar['nominatim'].apply(extractor)

print(did_centroids_ar[['did', 'after', 'origin', 'final']])


In [None]:
# Display the per-device table including the new after/origin/final columns.
did_centroids_ar

In [None]:
# Share of rows where origin != final and final is 'Stuttgart'.
# The printed value is the fraction of all rows, not a count, so the label says
# "Share". An empty table yields NaN instead of a ZeroDivisionError.
ends_in_stuttgart = (
    (did_centroids_ar['origin'] != did_centroids_ar['final'])
    & (did_centroids_ar['final'] == 'Stuttgart')
)
count = int(ends_in_stuttgart.sum())
share = count / len(did_centroids_ar) if len(did_centroids_ar) else float('nan')
print("Share of rows with origin different from final and final 'Stuttgart':", share)


In [None]:
# Share of rows where origin != final (no restriction on the final city).
# The printed value is the fraction of all rows, not a count, so the label says
# "Share". An empty table yields NaN instead of a ZeroDivisionError.
origin_differs = did_centroids_ar['origin'] != did_centroids_ar['final']
count = int(origin_differs.sum())
share = count / len(did_centroids_ar) if len(did_centroids_ar) else float('nan')
print("Share of rows with origin different from final:", share)


In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import itertools

# Flatten the "after" column from did_centroids_ar into one list of tuples.
# Each row in did_centroids_ar['after'] is a list of tuples.
all_after = list(itertools.chain.from_iterable(did_centroids_ar['after']))

# Every tuple must have at least four elements (city, type, sub-type, id);
# checking only the first one would let a shorter tuple fail further down.
if all_after and all(len(t) >= 4 for t in all_after):
    # Create counters for each tuple element.
    counter_city    = Counter(t[0] for t in all_after)
    counter_type    = Counter(t[1] for t in all_after)
    counter_subtype = Counter(t[2] for t in all_after)
    counter_id      = Counter(t[3] for t in all_after)
else:
    print("The 'after' column does not contain tuples of the expected length.")
    # Four separate (empty) counters rather than four names for one object.
    counter_city, counter_type, counter_subtype, counter_id = (
        Counter(), Counter(), Counter(), Counter()
    )

# Plot histograms for each tuple element in a 2x2 grid.
fig, axs = plt.subplots(2, 2, figsize=(15, 10))

# (axes, counter, title, x tick label rotation) for each panel.
panels = [
    (axs[0, 0], counter_city,    "Histogram of City (First Entry)",     45),
    (axs[0, 1], counter_type,    "Histogram of Second Entry (Type)",    45),
    (axs[1, 0], counter_subtype, "Histogram of Third Entry (Sub-type)", 90),
    (axs[1, 1], counter_id,      "Histogram of Fourth Entry (ID)",      45),
]

for ax, counter, title, rotation in panels:
    # Labels are converted to strings so every key is plotted as a category.
    # Numeric OSM ids would otherwise be placed on a numeric axis, where the
    # bars are far too narrow to see, and mixing ids with "unknown" would
    # combine numbers and strings on one axis.
    labels = [str(key) for key in counter]
    ax.bar(labels, list(counter.values()))
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=rotation)

plt.tight_layout()
plt.show()
