In [1]:
import pandas as pd
# Load the combined data export that every later cell works on.
initial_data = pd.read_csv(filepath_or_buffer='combined_data_21_01_2025.csv')

In [2]:
# Number of rows recorded per device
device_counts = initial_data.groupby('device_id').size()

# A device is valid once it contributed at least 10 rows
valid_device_ids = device_counts.loc[device_counts.ge(10)].index

# Discard every row that belongs to a device below that threshold
initial_data = initial_data.loc[initial_data['device_id'].isin(valid_device_ids)]

In [3]:
import pandas as pd
from geopy.geocoders import Nominatim
from shapely.geometry import Point
from shapely.ops import transform
import pyproj
from shapely import wkt

# Geocode the building address
def get_coordinates(address):
    """Look up ``address`` with Nominatim and return ``(latitude, longitude)``.

    Args:
        address: Free-text address passed to the geocoder.

    Returns:
        tuple: ``(latitude, longitude)`` of the geocoder's match.

    Raises:
        ValueError: If the geocoder returns no match for ``address``.
    """
    match = Nominatim(user_agent="Buffer_Creation").geocode(address)
    if not match:
        raise ValueError(f"Address '{address}' not found.")
    return match.latitude, match.longitude

# Functions to filter out rows within the buffer
def utm_epsg_for_location(lon, lat):
    """Return the EPSG code of the WGS84 / UTM zone that contains a point.

    Args:
        lon: Longitude in decimal degrees (-180 to 180).
        lat: Latitude in decimal degrees; its sign selects the hemisphere.

    Returns:
        int: ``32600 + zone`` on the northern hemisphere, ``32700 + zone`` on
        the southern one (e.g. 32632 for Münster).
    """
    # UTM zones are 6 degrees wide and numbered 1..60 starting at 180°W;
    # clamp so that lon == 180 maps to zone 60 instead of a non-existent 61.
    zone = min(max(int((lon + 180.0) // 6) + 1, 1), 60)
    return (32600 if lat >= 0 else 32700) + zone


def filter_within_buffer(initial_data_w, address, radius_m, utm_epsg=None):
    """Remove all rows whose geometry lies inside a circular buffer around an address.

    The address is geocoded, projected to a metre-based UTM CRS and buffered by
    ``radius_m``. Each row's WKT geometry is parsed, projected to the same CRS
    and tested against the buffer.

    Args:
        initial_data_w: DataFrame with a 'geometry' column holding WKT strings.
            Assumes lon/lat coordinates in EPSG:4326 — TODO confirm against the
            data export. The frame is NOT modified.
        address: Address of the building, passed to ``get_coordinates``.
        radius_m: Buffer radius in metres.
        utm_epsg: Optional EPSG code of the metre-based CRS to use. Defaults to
            the UTM zone of the geocoded address (32632 for Münster).

    Returns:
        DataFrame: A new frame with the rows outside the buffer. Its 'geometry'
        column holds the parsed Shapely geometries; rows whose geometry is not
        a valid WKT string are kept and carry ``None``.

    Raises:
        ValueError: Propagated from ``get_coordinates`` when the address is
            not found.
    """
    # Get the building's coordinates (latitude, longitude)
    building_lat, building_lon = get_coordinates(address)

    # Pick the metre-based projection matching the building's location unless
    # the caller forces a specific one.
    if utm_epsg is None:
        utm_epsg = utm_epsg_for_location(building_lon, building_lat)

    # always_xy=True: coordinates are passed as (lon, lat), hence Point(lon, lat)
    transformer = pyproj.Transformer.from_crs(
        pyproj.CRS('EPSG:4326'), pyproj.CRS(f'EPSG:{utm_epsg}'), always_xy=True
    )
    building_point_utm = transform(transformer.transform, Point(building_lon, building_lat))

    # Buffer in metres (UTM coordinates are metres)
    building_buffer = building_point_utm.buffer(radius_m)

    def safe_wkt_load(x):
        """Parse a WKT string; return None for non-strings and invalid WKT."""
        if not isinstance(x, str):
            return None
        try:
            return wkt.loads(x)
        except Exception:  # invalid WKT; do not swallow KeyboardInterrupt/SystemExit
            return None

    # Parse into local objects instead of writing back into the caller's frame:
    # overwriting 'geometry' in place made a second call on the same frame
    # treat every (already parsed) geometry as invalid.
    geometries = pd.Series(
        [safe_wkt_load(value) for value in initial_data_w['geometry']],
        index=initial_data_w.index,
        dtype=object,
    )
    inside_buffer = pd.Series(
        [
            geom is not None
            and bool(building_buffer.contains(transform(transformer.transform, geom)))
            for geom in geometries
        ],
        index=initial_data_w.index,
        dtype=bool,
    )

    # Keep everything that is NOT inside the buffer (unparseable rows included)
    outside_buffer = ~inside_buffer
    filtered_data = initial_data_w.loc[outside_buffer].copy()
    filtered_data['geometry'] = geometries.loc[outside_buffer]

    return filtered_data

# Example usage: remove all measurements recorded right next to the building
address = "Von-Steuben-Straße 21, 48143 Münster"
# Hand an independent (deep) copy to the filter so `initial_data` stays as loaded
initial_data_w = initial_data.copy()

# Drop the rows inside the 15 m buffer; the rows outside of it are kept
atrai_bike_data = filter_within_buffer(initial_data_w, address, 15)

# View the filtered data
#print(atrai_bike_data)

In [4]:
# Keep only the columns that are relevant for the particulate-matter analysis
relevant_columns = [
    'createdAt', 'Rel. Humidity',
    'Finedust PM1', 'Finedust PM2.5', 'Finedust PM4', 'Finedust PM10',
    'geometry', 'device_id', 'lng', 'lat',
]
atrai_bike_data_PM = atrai_bike_data[relevant_columns]

In [5]:
import pandas as pd
import numpy as np

# Keep only rows with a known relative humidity of at most 75
rel_humidity = atrai_bike_data_PM['Rel. Humidity']
PM_data_filtered = atrai_bike_data_PM[rel_humidity.notna() & rel_humidity.le(75)]

def replace_outliers_with_nan_by_device(PM_data_no_outliers, column):
    """Replace per-device IQR outliers of one column with NaN.

    For every 'device_id' the quartiles Q1 and Q3 of ``column`` are computed;
    values outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` become NaN. Values that
    are already NaN stay NaN.

    Args:
        PM_data_no_outliers: DataFrame with a 'device_id' column and the
            numeric column ``column``. The frame is not modified.
        column: Name of the column to screen for outliers.

    Returns:
        DataFrame: A new frame in the original row order with all columns
        (including 'device_id') preserved. Rows without a device id are
        dropped, as grouping by 'device_id' cannot assign them to a device.
    """
    # groupby() skips rows with a missing key; drop them explicitly so the
    # result contains exactly the rows that belong to some device.
    result = PM_data_no_outliers.loc[PM_data_no_outliers['device_id'].notna()].copy()
    if result.empty:
        return result

    # Select the column before transforming: applying a function to whole
    # groups (grouping column included) is deprecated in pandas and would
    # eventually drop 'device_id' from the result.
    per_device = result.groupby('device_id')[column]
    q1 = per_device.transform(lambda values: values.quantile(0.25))
    q3 = per_device.transform(lambda values: values.quantile(0.75))
    iqr = q3 - q1

    # Row-aligned bounds allow one vectorised comparison instead of a
    # Python-level lambda per value.
    within_bounds = result[column].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    result[column] = result[column].where(within_bounds)

    return result

# PM columns that are screened for outliers
pm_columns = ['Finedust PM1', 'Finedust PM2.5', 'Finedust PM4', 'Finedust PM10']

# Work on a deep copy so that PM_data_filtered itself is not modified
PM_data_no_outliers = PM_data_filtered.copy(deep=True)

# Replace the per-device outliers of every PM column, one column at a time
for column in pm_columns:
    PM_data_no_outliers = replace_outliers_with_nan_by_device(PM_data_no_outliers, column)

# Ensure the PM columns are of type float64 after replacing outliers
for column in pm_columns:
    PM_data_no_outliers[column] = PM_data_no_outliers[column].astype('float64')


In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Discard every row that has a missing value in any column.
# Later cells read this reduced PM_data_filtered.
PM_data_filtered = PM_data_filtered.dropna()

# One boxplot panel per PM fraction on a 2x2 grid
plt.figure(figsize=(12, 10))
for panel, pm_column in enumerate(
    ['Finedust PM1', 'Finedust PM2.5', 'Finedust PM4', 'Finedust PM10'], start=1
):
    plt.subplot(2, 2, panel)  # 2 rows, 2 columns, panel-th subplot
    sns.boxplot(data=PM_data_filtered, x='device_id', y=pm_column)
    plt.title(f'{pm_column} Concentration by Device ID')
    plt.xlabel('Device ID')
    plt.ylabel(pm_column)
    plt.xticks([])  # hide the per-device tick labels
    #plt.ylim(0,80)

# Adjust layout to avoid overlap of labels
plt.tight_layout()
# NOTE(review): the figure is closed without savefig()/show(), so it is neither
# written to disk nor displayed — confirm this is intended.
plt.close()

In [7]:
import folium
from folium.plugins import HeatMap
import pandas as pd

# PM2.5 heatmap of the outlier-cleaned data: needs 'lat', 'lng' and the PM2.5 value,
# rows missing any of the three are skipped
pm25_data_heatmap = PM_data_no_outliers[['lat', 'lng', 'Finedust PM2.5']].dropna(
    subset=['lat', 'lng', 'Finedust PM2.5']
)

# Map centred on a fixed Münster coordinate
m_PM25 = folium.Map(location=[51.9607, 7.6261], zoom_start=13)

# Heatmap rows: [latitude, longitude, PM2.5 value]
heat_data_25 = pm25_data_heatmap[['lat', 'lng', 'Finedust PM2.5']].to_numpy()

# Add the heatmap layer and write the map to disk
HeatMap(heat_data_25, radius=15, blur=15).add_to(m_PM25)

m_PM25.save("PM2.5 heatmap.html")

In [8]:
import folium
from folium.plugins import HeatMap
import pandas as pd

# PM_data_filtered is expected to provide 'lat', 'lng', 'Finedust PM2.5' and 'createdAt'

# Parse 'createdAt' once, then derive the month number and the clock time from it
created_at = pd.to_datetime(PM_data_filtered['createdAt'])
PM_data_filtered['createdAt'] = created_at
PM_data_filtered['month'] = created_at.dt.month
PM_data_filtered['time_of_day'] = created_at.dt.time

# Map a month number to its season
def get_season(month):
    """Return the season name for a month number.

    Months 3-5 are 'Spring', 6-8 'Summer', 9-11 'Autumn'; every other value
    (including 12, 1 and 2) is reported as 'Winter'.

    NOTE(review): this is the northern-hemisphere convention — confirm it is
    intended for data recorded elsewhere.
    """
    season_months = (
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Autumn', (9, 10, 11)),
    )
    for season, months in season_months:
        if month in months:
            return season
    return 'Winter'

# Derive the 'season' column from each row's month number
PM_data_filtered['season'] = PM_data_filtered['month'].map(get_season)

# Define the default time-of-day window (both bounds inclusive)
start_time = pd.to_datetime("16:00", format="%H:%M").time()  # Start time (16:00 / 4:00 PM)
end_time = pd.to_datetime("18:00", format="%H:%M").time()  # End time (18:00 / 6:00 PM)

# Function to filter data based on both season and time
def filter_data(time_filter=True, season_filter=True, selected_season='Autumn',
                time_range=(start_time, end_time), data=None):
    """Select heatmap rows by time of day and/or season.

    Args:
        time_filter: If True, keep only rows whose 'time_of_day' lies within
            ``time_range`` (inclusive on both ends).
        season_filter: If True, keep only rows whose 'season' equals
            ``selected_season``.
        selected_season: Season name to keep when ``season_filter`` is on.
        time_range: ``(start, end)`` pair of ``datetime.time`` values. The
            default is captured when the function is defined, so rebinding
            the module-level ``start_time`` / ``end_time`` later (other cells
            reuse these names for Timestamps) does not affect it.
        data: DataFrame with 'time_of_day', 'season', 'lat', 'lng' and
            'Finedust PM2.5' columns. Defaults to the module-level
            ``PM_data_filtered``.

    Returns:
        DataFrame: Columns 'lat', 'lng' and 'Finedust PM2.5' of the selected
        rows, without rows that have a missing value in any of the three.
    """
    filtered_data = PM_data_filtered if data is None else data

    # Apply the time filter if enabled
    if time_filter:
        range_start, range_end = time_range
        time_of_day = filtered_data['time_of_day']
        filtered_data = filtered_data[(time_of_day >= range_start) & (time_of_day <= range_end)]

    # Apply the season filter if enabled
    if season_filter:
        filtered_data = filtered_data[filtered_data['season'] == selected_season]

    heatmap_columns = ['lat', 'lng', 'Finedust PM2.5']
    return filtered_data[heatmap_columns].dropna(subset=heatmap_columns)

# Toggle the two flags to filter by time, by season, or by both
data_filtered = filter_data(time_filter=True, season_filter=True)

# Map centred on a fixed Münster coordinate
m_PM25_combined = folium.Map(location=[51.9607, 7.6261], zoom_start=13)

# Heatmap rows: [latitude, longitude, PM2.5 value]
heat_data_combined = data_filtered[['lat', 'lng', 'Finedust PM2.5']].to_numpy()

# Add the heatmap layer to the map
HeatMap(heat_data_combined, radius=15, blur=15).add_to(m_PM25_combined)

# Write the map to an HTML file
m_PM25_combined.save("Variable_PM2.5_heatmap.html")

In [9]:
import folium
from folium.plugins import HeatMap
import pandas as pd

# PM10 heatmap of the outlier-cleaned data: needs 'lat', 'lng' and the PM10 value,
# rows missing any of the three are skipped
pm10_data_heatmap = PM_data_no_outliers[['lat', 'lng', 'Finedust PM10']].dropna(
    subset=['lat', 'lng', 'Finedust PM10']
)

# Map centred on a fixed Münster coordinate
m_PM10 = folium.Map(location=[51.9607, 7.6261], zoom_start=13)

# Heatmap rows: [latitude, longitude, PM10 value]
heat_data_10 = pm10_data_heatmap[['lat', 'lng', 'Finedust PM10']].to_numpy()

# Add the heatmap layer and write the map to disk
HeatMap(heat_data_10, radius=15, blur=15).add_to(m_PM10)

m_PM10.save("PM10 heatmap.html")

In [10]:
#import time
#from sklearn.cluster import DBSCAN
#from geopy.geocoders import Nominatim
#import pandas as pd
#import numpy as np

#PM_city_clusters = PM_data_filtered.copy()
#PM_city_clusters = PM_data_no_outliers.copy() #ohne Ausreißer
#PM_city_clusters = PM_city_clusters.dropna()

#PM_city_clusters['lat'] = PM_city_clusters['lat'].astype('float32')
#PM_city_clusters['lng'] = PM_city_clusters['lng'].astype('float32')
#coords = np.radians(PM_city_clusters[['lat', 'lng']])

# Convert degrees to radians (required for haversine metric)
#coords = np.radians(PM_city_clusters[['lat', 'lng']])

# Apply DBSCAN clustering with haversine metric
#clusterer = DBSCAN(eps=0.001, min_samples=10, metric='haversine')  # Adjust eps as needed # eps=0.001 ~ 6,370 m
#PM_city_clusters['cluster'] = clusterer.fit_predict(coords)

# Initialize geolocator
#geolocator = Nominatim(user_agent="sensebox bike")

# Function to get city name
#def get_city(lat, lon):
#    time.sleep(1)
#    location = geolocator.reverse((lat, lon), exactly_one=True)
#    return location.raw['address'].get('city', 'Unknown')

# Reverse geocode one point per cluster
#cluster_centroids = PM_city_clusters.groupby('cluster')[['lat', 'lng']].first().reset_index()
#cluster_centroids['city'] = cluster_centroids.apply(lambda row: get_city(row['lat'], row['lng']), axis=1)

# Map the city names back to the original DataFrame
#PM_cities = PM_city_clusters.merge(cluster_centroids[['cluster', 'city']], on='cluster', how='left')
#PM_cities['city'] = PM_cities['city'].replace('São Paulo', 'Sao Paulo')

In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Approximate bounding box of Münster (example values)
min_lat, max_lat = 51.90, 52.10  # latitude range
min_lon, max_lon = 7.50, 7.75    # longitude range

# Rows whose coordinates fall inside the box (bounds inclusive)
muenster_data = PM_data_filtered[
    PM_data_filtered['lat'].between(min_lat, max_lat)
    & PM_data_filtered['lng'].between(min_lon, max_lon)
]

# One boxplot panel per PM fraction on a 2x2 grid, all on the same 0-80 y-range
plt.figure(figsize=(12, 10))
for panel, pm_label in enumerate(['PM1', 'PM2.5', 'PM4', 'PM10'], start=1):
    plt.subplot(2, 2, panel)  # 2 rows, 2 columns, panel-th subplot
    sns.boxplot(data=muenster_data, x='device_id', y=f'Finedust {pm_label}')
    plt.title(f'{pm_label} Concentration by Device ID, Münster')
    plt.xlabel('Devices')
    plt.ylabel(pm_label)
    plt.xticks([])  # hide the per-device tick labels
    plt.ylim(0, 80)

# Adjust layout to avoid overlap of labels
plt.tight_layout()

plt.savefig('PM_Münster.png')
plt.close()

In [12]:
import folium
from folium.plugins import HeatMap
import numpy as np

# Coordinates and PM2.5 values of the Münster rows, without incomplete rows
heatmap_data_muenster_PM25 = muenster_data[['lat', 'lng', 'Finedust PM2.5']].dropna()

# 95th percentile of the PM2.5 values, used as the normalisation reference
percentile_95 = np.percentile(heatmap_data_muenster_PM25['Finedust PM2.5'], 95)

# Scale by the 95th percentile; everything above it is capped at 1
heatmap_data_muenster_PM25['normalized_PM2.5'] = (
    heatmap_data_muenster_PM25['Finedust PM2.5'].div(percentile_95).clip(upper=1)
)

# Folium map centred around Münster
m_muenster_PM25 = folium.Map(location=[51.9607, 7.6261], zoom_start=13)

# Heatmap layer built from the normalised PM2.5 values
heat_rows_muenster = heatmap_data_muenster_PM25[['lat', 'lng', 'normalized_PM2.5']].to_numpy().tolist()
HeatMap(
    data=heat_rows_muenster,
    radius=15,  # Adjust the radius for the desired heatmap spread
    blur=15,
    max_zoom=15,
).add_to(m_muenster_PM25)

m_muenster_PM25.save("Münster_PM25_hm.html")

In [13]:
import pandas as pd
import matplotlib.pyplot as plt

muenster_data_diurnal = muenster_data.copy()

# Ensure 'createdAt' is in datetime format
muenster_data_diurnal['createdAt'] = pd.to_datetime(muenster_data_diurnal['createdAt'])
# NOTE(review): the clock time is taken from 'createdAt' as stored; if the
# timestamps are in UTC the cycle is shifted against local time — confirm.

# Step 1: Keep only the clock time (hours and minutes, ignoring the date).
# Parsing it back places every measurement on the same dummy date (1900-01-01).
clock_time = pd.to_datetime(
    muenster_data_diurnal['createdAt'].dt.strftime('%H:%M'), format='%H:%M'
)

# Step 2: Round the time to the nearest 30 minutes.
# Times from 23:45 on round up to 00:00 of the *next* dummy day, which lies
# outside the 00:00-23:30 range below and used to be dropped silently; wrap
# them back onto the 00:00 slot of the dummy date instead.
rounded_time = clock_time.dt.round('30min')
muenster_data_diurnal['time_30min'] = (
    clock_time.dt.normalize() + (rounded_time - rounded_time.dt.normalize())
)

# Step 3: Group by the rounded time and calculate the mean PM2.5 concentration
diurnal_cycle = muenster_data_diurnal.groupby('time_30min')['Finedust PM2.5'].mean()

# Step 4: Create a full range of 30-minute intervals from 00:00 to 23:30
start_time = pd.to_datetime('00:00', format='%H:%M')
end_time = pd.to_datetime('23:30', format='%H:%M')
date_range = pd.date_range(start=start_time, end=end_time, freq='30min')

# Step 5: Reindex the diurnal_cycle to include all the 30-minute intervals;
# intervals without measurements become NaN
diurnal_cycle_full = diurnal_cycle.reindex(date_range)

# Plot the diurnal cycle as a line graph with markers
plt.figure(figsize=(10, 6))
plt.plot(diurnal_cycle_full.index, diurnal_cycle_full.values, marker='o', linestyle='-', color='skyblue')

# Format the plot
plt.title("Diurnal Cycle of PM2.5 Concentrations, Münster")
plt.xlabel("Time of Day")
plt.ylabel("Average PM2.5 Concentration")
plt.grid(True, linestyle='--', alpha=0.7)
plt.xticks(ticks=diurnal_cycle_full.index, labels=diurnal_cycle_full.index.strftime('%H:%M'), rotation=45)
plt.tight_layout()

plt.savefig('Diurnal_PM2.5_Münster.png')
plt.close()

In [14]:
import pandas as pd
import matplotlib.pyplot as plt

# Independent copy of the Münster rows with parsed timestamps
muenster_data_monthly = muenster_data.copy()
monthly_timestamps = pd.to_datetime(muenster_data_monthly['createdAt'])
muenster_data_monthly['createdAt'] = monthly_timestamps

# Year-month period of every row, used as the grouping key
muenster_data_monthly['month'] = monthly_timestamps.dt.to_period('M')

# Mean PM2.5 concentration per month
muenster_monthly_averages = muenster_data_monthly.groupby('month')['Finedust PM2.5'].mean()

# Plotting needs timestamps rather than periods on the index
muenster_monthly_averages.index = muenster_monthly_averages.index.to_timestamp()

# Line graph of the monthly averages
plt.figure(figsize=(10, 6))
plt.plot(
    muenster_monthly_averages.index,
    muenster_monthly_averages.values,
    marker='o',
    linestyle='-',
    color='skyblue',
)

# Titles, axis labels, grid and rotated tick labels
plt.title("Monthly Average PM2.5 Concentrations, Münster")
plt.xlabel("Month")
plt.ylabel("Average PM2.5 Concentration")
plt.grid(True, linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
#plt.legend()
plt.tight_layout()

plt.savefig('Monthly_PM2.5_Münster.png')
plt.close()

In [15]:
# Bounding box of São Paulo
min_lat, max_lat = -24.0, -23.3
min_lon, max_lon = -46.825, -46.365

# Rows whose coordinates fall inside the box (bounds inclusive)
sao_paulo_data = PM_data_filtered[
    PM_data_filtered['lat'].between(min_lat, max_lat)
    & PM_data_filtered['lng'].between(min_lon, max_lon)
]

# One boxplot panel per PM fraction on a 2x2 grid
plt.figure(figsize=(12, 10))
for panel, pm_label in enumerate(['PM1', 'PM2.5', 'PM4', 'PM10'], start=1):
    plt.subplot(2, 2, panel)  # 2 rows, 2 columns, panel-th subplot
    sns.boxplot(data=sao_paulo_data, x='device_id', y=f'Finedust {pm_label}')
    plt.title(f'{pm_label} Concentration by Device ID, Sao Paulo')
    plt.xlabel('Device ID')
    plt.ylabel(pm_label)
    plt.xticks(rotation=90)  # device ids are shown, rotated to fit
    #plt.ylim(0,80)

# Adjust layout to avoid overlap of labels
plt.tight_layout()

plt.savefig('PM_Sao_Paulo.png')
plt.close()

In [16]:
import pandas as pd
import matplotlib.pyplot as plt

sao_paulo_data_diurnal = sao_paulo_data.copy()

# Exclude the device "Giro+bikeAtrai" from the diurnal analysis
sao_paulo_data_diurnal = sao_paulo_data_diurnal[sao_paulo_data_diurnal['device_id'] != "Giro+bikeAtrai"]

# Ensure 'createdAt' is in datetime format
sao_paulo_data_diurnal['createdAt'] = pd.to_datetime(sao_paulo_data_diurnal['createdAt'])
# NOTE(review): the clock time is taken from 'createdAt' as stored; if the
# timestamps are in UTC the cycle is shifted against local time — confirm.

# Step 1: Keep only the clock time (hours and minutes, ignoring the date).
# Parsing it back places every measurement on the same dummy date (1900-01-01).
clock_time_sp = pd.to_datetime(
    sao_paulo_data_diurnal['createdAt'].dt.strftime('%H:%M'), format='%H:%M'
)

# Step 2: Round the time to the nearest 30 minutes.
# Times from 23:45 on round up to 00:00 of the *next* dummy day, which lies
# outside the 00:00-23:30 range below and used to be dropped silently; wrap
# them back onto the 00:00 slot of the dummy date instead.
rounded_time_sp = clock_time_sp.dt.round('30min')
sao_paulo_data_diurnal['time_30min'] = (
    clock_time_sp.dt.normalize() + (rounded_time_sp - rounded_time_sp.dt.normalize())
)

# Step 3: Group by the rounded time and calculate the mean PM2.5 concentration
diurnal_cycle_sp = sao_paulo_data_diurnal.groupby('time_30min')['Finedust PM2.5'].mean()

# Step 4: Create a full range of 30-minute intervals from 00:00 to 23:30
start_time = pd.to_datetime('00:00', format='%H:%M')
end_time = pd.to_datetime('23:30', format='%H:%M')
date_range = pd.date_range(start=start_time, end=end_time, freq='30min')

# Step 5: Reindex the diurnal_cycle to include all the 30-minute intervals;
# intervals without measurements become NaN
diurnal_cycle_full_sp = diurnal_cycle_sp.reindex(date_range)

# Plot the diurnal cycle as a line graph with markers
plt.figure(figsize=(10, 6))
plt.plot(diurnal_cycle_full_sp.index, diurnal_cycle_full_sp.values, marker='o', linestyle='-', color='skyblue')

# Format the plot
plt.title("Diurnal Cycle of PM2.5 Concentrations, Sao Paulo")
plt.xlabel("Time of Day")
plt.ylabel("Average PM2.5 Concentration")
plt.grid(True, linestyle='--', alpha=0.7)
plt.xticks(ticks=diurnal_cycle_full_sp.index, labels=diurnal_cycle_full_sp.index.strftime('%H:%M'), rotation=45)

plt.tight_layout()
# NOTE(review): the file name spells "Paolo"; kept unchanged so existing
# references to the output file keep working.
plt.savefig('Diurnal_PM2.5_Sao_Paolo.png')
plt.close()

In [17]:
# Load the two reference-station exports (traffic station and background station)
traffic_station, background_station = (
    pd.read_csv(csv_path)
    for csv_path in ('Daten_Weseler_Strasse_Verkehr.csv', 'Daten_Geist_Background.csv')
)

In [18]:
# Apply the same preparation to both station frames:
#   - parse 'datum_beginn' as datetime
#   - add a 'month' column holding the year and month as 'YYYY-MM'
#   - convert the station's measurement column to numbers (unparseable entries become NaN)
for station, value_column in (
    (background_station, 'MSGE LUFT ONLINE'),
    (traffic_station, 'VMS2 LUFT ONLINE'),
):
    station['datum_beginn'] = pd.to_datetime(station['datum_beginn'])
    station['month'] = station['datum_beginn'].dt.strftime('%Y-%m')
    station[value_column] = pd.to_numeric(station[value_column], errors='coerce')

In [19]:
# Round the monthly means to whole numbers, name them 'monthly averages' and
# turn the 'month' index into a regular column
muenster_comparison = (
    muenster_monthly_averages
    .round()
    .rename('monthly averages')
    .reset_index()
)

In [20]:
# Bring the 'month' column of all three frames to datetime
for frame in (background_station, traffic_station, muenster_comparison):
    frame['month'] = pd.to_datetime(frame['month'])

# Months that occur in all three datasets
common_months_all = (
    set(background_station['month'])
    & set(traffic_station['month'])
    & set(muenster_comparison['month'])
)

# Restrict each frame to the common months and order it by month
background_station_filtered = (
    background_station[background_station['month'].isin(common_months_all)]
    .sort_values(by='month')
)
traffic_station_filtered = (
    traffic_station[traffic_station['month'].isin(common_months_all)]
    .sort_values(by='month')
)
muenster_monthly_averages_filtered = (
    muenster_comparison[muenster_comparison['month'].isin(common_months_all)]
    .sort_values(by='month')
)

# Plot all three datasets into one figure.
# NOTE(review): the station values are plotted as read from the CSVs; if those
# files hold more than one row per month they are not averaged here — confirm.
plt.figure(figsize=(12, 7))
for frame, value_column, marker, label, linestyle in (
    (background_station_filtered, 'MSGE LUFT ONLINE', 'o', 'Background Station', '-'),
    (traffic_station_filtered, 'VMS2 LUFT ONLINE', 'x', 'Traffic Station', '--'),
    (muenster_monthly_averages_filtered, 'monthly averages', 's', 'senseBox:bike data', '-.'),
):
    plt.plot(frame['month'], frame[value_column], marker=marker, label=label, linestyle=linestyle)

plt.xticks(rotation=45)
plt.xlabel('Year and Month')
plt.ylabel('Average Monthly PM2.5 Concentrations (µg/m³)')
plt.title('Comparing PM2.5 in Münster')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.savefig('Comparison PM2.5 in Münster.png')
plt.close()