In [None]:
import pandas as pd
import random
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Plot layout: apply seaborn's "darkgrid" style to the figures in this notebook
sns.set(style="darkgrid")

## Reducing the number of loop detectors 
For this part, we work with the largest dataset: cleaned, but not yet aggregated by averaging. To make sure pandas imports it correctly, we specify the dtype of each column explicitly.

In [2]:
# Load the processed (cleaned, not yet averaged) data

# Columns that pandas should parse as dates
parse_dates = ['timestamp']

# Explicit dtype for every column, so pandas does not have to infer them
dtype_dict = {
    'day': 'str', 'interval': 'int32', 'detid': 'str',
    'flow': 'float32', 'occ': 'float32', 'error': 'float32',
    'city': 'category', 'speed': 'float32',
    'length': 'float64', 'pos': 'float64',
    'fclass': 'category', 'road': 'str', 'limit': 'str',
    'lanes': 'float64', 'linkid': 'float64',
    'long': 'float64', 'lat': 'float64',
}

data = pd.read_csv(
    'C:/DTU/Speciale/Kode/Speciale/data/processed/merged_data.csv',
    dtype=dtype_dict,
    parse_dates=parse_dates,
)

### Compute N datasets with K sampled detectors

In [None]:
# Number of distinct detectors (unique detid) recorded for each city
detectors_per_city = data.groupby('city')['detid'].nunique()

# Cities kept for the analysis. The note for this step says that cities with
# less than 100 detectors are dropped (as specified in the paper & report);
# the selection itself is the hard-coded list of cities we ended up using.
cities = [
    'Augsburg', 'Bern', 'Bordeaux', 'Bremen', 'Darmstadt',
    'Graz', 'Hamburg', 'Kassel', 'London', 'Losangeles',
    'Madrid', 'Marseille', 'Santander', 'Speyer', 'Strasbourg',
    'Stuttgart', 'Taipeh', 'Toronto', 'Toulouse', 'Zurich',
]

# Keep only the rows that belong to one of the selected cities
filtered_data = data.loc[data['city'].isin(cities)]

These functions handle the dataset creation: a specified number of detectors is sampled for each city, averages are calculated and normalized, and this is repeated N times to create N datasets for each city.

In [None]:
# Function to sample detectors for each city
def sample_detectors(data, detectors_per_city, min_detectors=100, sample_size=25, rng=None):
    """Draw a random sample of detector ids for every eligible city.

    A city is eligible when its entry in ``detectors_per_city`` is strictly
    greater than ``min_detectors`` and it has at least one row in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'city' and 'detid'.
    detectors_per_city : pandas.Series
        Detector count per city, indexed by city.
    min_detectors : int, default 100
        Cities whose count is not above this threshold are skipped.
    sample_size : int, default 25
        Number of distinct detectors drawn (without replacement) per city.
    rng : random.Random, optional
        Random generator used for the draw. When omitted, the module-level
        ``random`` generator is used, so ``random.seed`` controls the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'city' and 'detid', one row per sampled detector.

    Raises
    ------
    ValueError
        If an eligible city has fewer than ``sample_size`` distinct detectors
        in ``data``.
    """
    sampler = random if rng is None else rng
    eligible_cities = detectors_per_city[detectors_per_city > min_detectors].index

    # observed=True: when 'city' is categorical, categories that have no rows
    # left would otherwise show up as empty groups, which cannot be sampled.
    detectors_by_city = (
        data[data['city'].isin(eligible_cities)]
        .groupby('city', observed=True)['detid']
        .unique()
    )

    # Fail with a message that names the offending cities instead of the
    # generic error random.sample would raise.
    too_small = [
        city for city, detids in detectors_by_city.items()
        if len(detids) < sample_size
    ]
    if too_small:
        raise ValueError(
            f"Cannot sample {sample_size} detectors; these cities have fewer "
            f"distinct detectors: {too_small}"
        )

    sampled_detectors = detectors_by_city.apply(
        lambda detids: sampler.sample(list(detids), sample_size)
    )
    # One row per (city, detid) pair
    return sampled_detectors.explode().reset_index()

# Function to calculate averages
def calculate_averages(data):
    """Average flow and occupancy over all rows sharing city, day and interval.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'city', 'day', 'interval', 'flow' and 'occ'.

    Returns
    -------
    pandas.DataFrame
        Columns 'city', 'day', 'interval', 'avg_flow' and 'avg_occupancy',
        one row per (city, day, interval) combination present in ``data``,
        sorted by those keys. Empty input yields an empty frame with the
        same columns.
    """
    # A single vectorized aggregation instead of a Python loop over every
    # group. observed=True keeps only key combinations that actually occur
    # when 'city' is categorical.
    averages = (
        data.groupby(['city', 'day', 'interval'], observed=True)
        .agg(avg_flow=('flow', 'mean'), avg_occupancy=('occ', 'mean'))
        .reset_index()
    )
    # Return plain city labels rather than a categorical column that still
    # carries categories with no rows.
    if isinstance(averages['city'].dtype, pd.CategoricalDtype):
        averages['city'] = averages['city'].astype(object)
    return averages

# Function to normalize data
def normalize_data(df):
    """Add per-city min-max normalized flow and occupancy columns.

    For each city, 'avg_flow' and 'avg_occupancy' are rescaled to
    ``(x - min) / (max - min)`` using that city's own minimum and maximum,
    and stored as 'norm_flow' and 'norm_occ'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'city', 'avg_flow' and 'avg_occupancy'.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two new columns added.

    Notes
    -----
    A city whose values are all identical has ``max == min``, so its
    normalized values are NaN (0 / 0).
    """
    for source_col, target_col in (('avg_flow', 'norm_flow'),
                                   ('avg_occupancy', 'norm_occ')):
        # observed=True: skip categories without rows when 'city' is categorical
        per_city = df.groupby('city', observed=True)[source_col]
        city_min = per_city.transform('min')
        city_max = per_city.transform('max')
        df[target_col] = (df[source_col] - city_min) / (city_max - city_min)
    return df

# Function to create N datasets
def create_datasets(data, detectors_per_city, N, min_detectors=100, sample_size=25, seed=None):
    """Build N averaged and normalized datasets from random detector samples.

    Each of the N rounds samples detectors per city, keeps only the rows of
    those detectors, averages them per (city, day, interval), normalizes the
    averages per city and tags the result with the round number.

    Parameters
    ----------
    data : pandas.DataFrame
        Detector-level measurements; must contain 'city' and 'detid' plus
        the columns required by ``calculate_averages``.
    detectors_per_city : pandas.Series
        Detector count per city, passed on to ``sample_detectors``.
    N : int
        Number of datasets to generate; must be at least 1.
    min_detectors : int, default 100
        Passed on to ``sample_detectors``.
    sample_size : int, default 25
        Number of detectors sampled per city in every round.
    seed : int, optional
        When given, the module-level ``random`` generator is seeded with it
        before sampling, which makes the whole collection reproducible.

    Returns
    -------
    pandas.DataFrame
        All N datasets stacked on top of each other, with a column 'N'
        holding the 1-based number of the dataset each row belongs to.

    Raises
    ------
    ValueError
        If ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError(f"N must be at least 1, got {N}")
    if seed is not None:
        random.seed(seed)

    # (city, detid) key of every row. It does not depend on the sample, so it
    # is built once instead of re-indexing the full frame in every round.
    row_keys = pd.MultiIndex.from_frame(data[['city', 'detid']])

    all_datasets = []
    for n in range(1, N + 1):
        # Step 1: Sample detectors
        sampled_detectors = sample_detectors(data, detectors_per_city, min_detectors, sample_size)

        # Step 2: Keep only the rows of the sampled (city, detid) pairs
        sampled_keys = sampled_detectors.set_index(['city', 'detid']).index
        sampled_rows = data[row_keys.isin(sampled_keys)]

        # Step 3: Calculate averages
        averages = calculate_averages(sampled_rows)

        # Step 4: Normalize the data
        normalized_averages = normalize_data(averages)

        # Add the dataset number
        normalized_averages['N'] = n
        all_datasets.append(normalized_averages)

    # Combine all datasets into one DataFrame
    return pd.concat(all_datasets, ignore_index=True)

In [None]:
# Sample use - generate a 'collection' of 2 datasets for each city, each based on 10 randomly sampled detectors

N = 2  # Number of datasets to generate
K = 10 # Number of detectors to sample for each dataset
SEED = 42  # Fixed seed so that re-running this cell draws the same detectors

# The detectors are drawn with the module-level `random` generator
random.seed(SEED)
combined_datasets = create_datasets(filtered_data, detectors_per_city, N, sample_size=K)

# Save the combined dataframe to a CSV; the file name follows K and N
combined_datasets.to_csv(
    f'C:/DTU/Speciale/Kode/Speciale/data/processed/reduced loop detectors/{K}_rand_det_{N}_times.csv',
    index=False,
)