In [None]:
pip install pyinstaller

Collecting pyinstaller
  Downloading pyinstaller-6.16.0-py3-none-manylinux2014_x86_64.whl.metadata (8.5 kB)
Collecting altgraph (from pyinstaller)
  Downloading altgraph-0.17.4-py2.py3-none-any.whl.metadata (7.3 kB)
Collecting pyinstaller-hooks-contrib>=2025.8 (from pyinstaller)
[0m  Downloading pyinstaller_hooks_contrib-2025.9-py3-none-any.whl.metadata (16 kB)
Downloading pyinstaller-6.16.0-py3-none-manylinux2014_x86_64.whl (733 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m733.8/733.8 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyinstaller_hooks_contrib-2025.9-py3-none-any.whl (444 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m444.3/444.3 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading altgraph-0.17.4-py2.py3-none-any.whl (21 kB)
Installing collected packages: altgraph, pyinstaller-hooks-contrib, pyinstaller
Successfully installed altgraph-0.17.4 pyinstaller-6.16.0 pyinstaller-hooks-contrib-2025.9


In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=1000, seed=None):
    """
    Generate a synthetic dataset for delivery-time prediction.

    All features are sampled independently at random (the distance is NOT
    derived from the two cities in this version). The target column
    ``delivery_time_days`` is then computed from distance, transport mode,
    weather, processing time and holiday season, plus Gaussian noise.

    Args:
        num_samples (int): Number of records to generate.
        seed (int | None): Seed for the random number generator. Pass an
            integer to obtain the same dataset on every call; ``None`` (the
            default) draws fresh random data each time.

    Returns:
        pl.DataFrame: One row per order. ``order_date`` is a ``YYYY-MM-DD``
        string; ``delivery_time_days`` is rounded to one decimal place and is
        never below 1.0.
    """
    # A local Generator gives reproducible draws without touching NumPy's global random state
    rng = np.random.default_rng(seed)

    # Value pools for the categorical features
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']
    supplier_locations = ['Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf']
    destination_locations = ['Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf']
    transport_modes = ['Straße', 'Schiene', 'Luft', 'See']
    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Order dates are spread uniformly over the 365 days starting 2023-01-01
    first_order_day = datetime(2023, 1, 1)
    day_offsets = rng.integers(0, 365, num_samples)

    # Raw feature columns
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': rng.choice(material_types, num_samples),
        'quantity': rng.integers(1, 1000, num_samples),
        'supplier_location': rng.choice(supplier_locations, num_samples),
        'destination_location': rng.choice(destination_locations, num_samples),
        'distance_km': rng.uniform(50, 2500, num_samples).round(2),
        'transport_mode': rng.choice(transport_modes, num_samples),
        # int() is needed because timedelta does not accept NumPy integer scalars
        'order_date': [first_order_day + timedelta(days=int(offset)) for offset in day_offsets],
        'processing_time_days': rng.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': rng.choice(weather_conditions, num_samples),
        'holiday_season': rng.choice([True, False], num_samples, p=[0.1, 0.9])
    }

    df = pl.DataFrame(data)

    # Base transit time (target variable); assumption: 200 km/day base speed
    df = df.with_columns(
        (pl.col('distance_km') / 200).alias('delivery_time_days')
    )

    # Transport mode effect: air x0.5, rail x0.8, sea x2.0, road unchanged
    df = df.with_columns(
        pl.when(pl.col('transport_mode') == 'Luft').then(pl.col('delivery_time_days') * 0.5)
        .when(pl.col('transport_mode') == 'Schiene').then(pl.col('delivery_time_days') * 0.8)
        .when(pl.col('transport_mode') == 'See').then(pl.col('delivery_time_days') * 2.0)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Weather effect: snow/storm x1.5, fog x1.2, otherwise unchanged
    df = df.with_columns(
        pl.when(pl.col('weather_conditions').is_in(['Schnee', 'Sturm'])).then(pl.col('delivery_time_days') * 1.5)
        .when(pl.col('weather_conditions') == 'Nebel').then(pl.col('delivery_time_days') * 1.2)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Processing time is added on top of the transit time
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.col('processing_time_days')).alias('delivery_time_days')
    )

    # Holiday season stretches the whole duration (transit + processing) by 30 %
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(pl.col('delivery_time_days') * 1.3)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Add Gaussian noise (mean 0, standard deviation 2 days)
    noise = pl.Series(name="noise", values=rng.normal(0, 2, num_samples))
    df = df.with_columns(
        (pl.col('delivery_time_days') + noise).alias('delivery_time_days')
    )

    # Round to one decimal and enforce a minimum delivery time of 1 day
    df = df.with_columns(
        pl.when(pl.col('delivery_time_days').round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(pl.col('delivery_time_days').round(1))
        .alias('delivery_time_days')
    )

    # Store `order_date` as an ISO-formatted string instead of a datetime
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: generate 15,000 synthetic delivery records
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset
print(synthetic_data.head())

# Column names and dtypes (Polars schema)
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics of the columns
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

shape: (5, 12)
┌──────────┬────────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ order_id ┆ material_t ┆ quantity ┆ supplier_ ┆ … ┆ processin ┆ weather_c ┆ holiday_s ┆ delivery_ │
│ ---      ┆ ype        ┆ ---      ┆ location  ┆   ┆ g_time_da ┆ onditions ┆ eason     ┆ time_days │
│ i64      ┆ ---        ┆ i64      ┆ ---       ┆   ┆ ys        ┆ ---       ┆ ---       ┆ ---       │
│          ┆ str        ┆          ┆ str       ┆   ┆ ---       ┆ str       ┆ bool      ┆ f64       │
│          ┆            ┆          ┆           ┆   ┆ f64       ┆           ┆           ┆           │
╞══════════╪════════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ Elektronik ┆ 496      ┆ Hamburg   ┆ … ┆ 3.8       ┆ Klar      ┆ true      ┆ 10.5      │
│ 2        ┆ Chemikalie ┆ 505      ┆ Köln      ┆ … ┆ 3.0       ┆ Regnerisc ┆ false     ┆ 7.3       │
│          ┆ n          ┆          ┆           ┆   ┆           ┆ h         ┆

In [None]:
# Export the dataset to CSV with Polars; the relative path resolves against the current working directory
synthetic_data.write_csv('delivery_data.csv')
print('Datensatz erfolgreich als delivery_data.csv exportiert.')

Datensatz erfolgreich als delivery_data.csv exportiert.


# Task
Improve the data generation script in cell `RiB8v2X7m6GY` by expanding the list of cities, implementing a function to generate realistic distances between them, assigning transport modes dynamically based on these distances, adjusting delivery time calculations accordingly, and increasing the sample size to 15000.

## Expand City List

### Subtask:
Introduce a shared `ALL_CITIES` list containing more German cities and use it for both `supplier_location` and `destination_location`, so the generated routes show greater variety.


**Reasoning**:
To expand the list of cities, I need to modify the `generate_delivery_data` function in cell `RiB8v2X7m6GY` by creating a comprehensive `ALL_CITIES` list and assigning it to both `supplier_locations` and `destination_locations`.



In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=1000, seed=None):
    """
    Generate a synthetic dataset for delivery-time prediction.

    Supplier and destination are drawn from an expanded list of German
    cities. All features are sampled independently at random (the distance is
    NOT derived from the two cities in this version). The target column
    ``delivery_time_days`` is computed from distance, transport mode, weather,
    processing time and holiday season, plus Gaussian noise.

    Args:
        num_samples (int): Number of records to generate.
        seed (int | None): Seed for the random number generator. Pass an
            integer to obtain the same dataset on every call; ``None`` (the
            default) draws fresh random data each time.

    Returns:
        pl.DataFrame: One row per order. ``order_date`` is a ``YYYY-MM-DD``
        string; ``delivery_time_days`` is rounded to one decimal place and is
        never below 1.0.
    """
    # A local Generator gives reproducible draws without touching NumPy's global random state
    rng = np.random.default_rng(seed)

    # Value pools for the categorical features
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']

    # Expanded list of German cities, shared by supplier and destination
    ALL_CITIES = [
        'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
        'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
        'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
        'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
        'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
        'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
        'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt'
    ]

    transport_modes = ['Straße', 'Schiene', 'Luft', 'See']
    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Order dates are spread uniformly over the 365 days starting 2023-01-01
    first_order_day = datetime(2023, 1, 1)
    day_offsets = rng.integers(0, 365, num_samples)

    # Raw feature columns
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': rng.choice(material_types, num_samples),
        'quantity': rng.integers(1, 1000, num_samples),
        'supplier_location': rng.choice(ALL_CITIES, num_samples),
        'destination_location': rng.choice(ALL_CITIES, num_samples),
        'distance_km': rng.uniform(50, 2500, num_samples).round(2),
        'transport_mode': rng.choice(transport_modes, num_samples),
        # int() is needed because timedelta does not accept NumPy integer scalars
        'order_date': [first_order_day + timedelta(days=int(offset)) for offset in day_offsets],
        'processing_time_days': rng.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': rng.choice(weather_conditions, num_samples),
        'holiday_season': rng.choice([True, False], num_samples, p=[0.1, 0.9])
    }

    df = pl.DataFrame(data)

    # Base transit time (target variable); assumption: 200 km/day base speed
    df = df.with_columns(
        (pl.col('distance_km') / 200).alias('delivery_time_days')
    )

    # Transport mode effect: air x0.5, rail x0.8, sea x2.0, road unchanged
    df = df.with_columns(
        pl.when(pl.col('transport_mode') == 'Luft').then(pl.col('delivery_time_days') * 0.5)
        .when(pl.col('transport_mode') == 'Schiene').then(pl.col('delivery_time_days') * 0.8)
        .when(pl.col('transport_mode') == 'See').then(pl.col('delivery_time_days') * 2.0)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Weather effect: snow/storm x1.5, fog x1.2, otherwise unchanged
    df = df.with_columns(
        pl.when(pl.col('weather_conditions').is_in(['Schnee', 'Sturm'])).then(pl.col('delivery_time_days') * 1.5)
        .when(pl.col('weather_conditions') == 'Nebel').then(pl.col('delivery_time_days') * 1.2)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Processing time is added on top of the transit time
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.col('processing_time_days')).alias('delivery_time_days')
    )

    # Holiday season stretches the whole duration (transit + processing) by 30 %
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(pl.col('delivery_time_days') * 1.3)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Add Gaussian noise (mean 0, standard deviation 2 days)
    noise = pl.Series(name="noise", values=rng.normal(0, 2, num_samples))
    df = df.with_columns(
        (pl.col('delivery_time_days') + noise).alias('delivery_time_days')
    )

    # Round to one decimal and enforce a minimum delivery time of 1 day
    df = df.with_columns(
        pl.when(pl.col('delivery_time_days').round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(pl.col('delivery_time_days').round(1))
        .alias('delivery_time_days')
    )

    # Store `order_date` as an ISO-formatted string instead of a datetime
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: generate 15,000 synthetic delivery records
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset
print(synthetic_data.head())

# Column names and dtypes (Polars schema)
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics of the columns
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

shape: (5, 12)
┌──────────┬────────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ order_id ┆ material_t ┆ quantity ┆ supplier_ ┆ … ┆ processin ┆ weather_c ┆ holiday_s ┆ delivery_ │
│ ---      ┆ ype        ┆ ---      ┆ location  ┆   ┆ g_time_da ┆ onditions ┆ eason     ┆ time_days │
│ i64      ┆ ---        ┆ i64      ┆ ---       ┆   ┆ ys        ┆ ---       ┆ ---       ┆ ---       │
│          ┆ str        ┆          ┆ str       ┆   ┆ ---       ┆ str       ┆ bool      ┆ f64       │
│          ┆            ┆          ┆           ┆   ┆ f64       ┆           ┆           ┆           │
╞══════════╪════════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ Chemikalie ┆ 627      ┆ Saarbrück ┆ … ┆ 2.9       ┆ Sturm     ┆ false     ┆ 11.3      │
│          ┆ n          ┆          ┆ en        ┆   ┆           ┆           ┆           ┆           │
│ 2        ┆ Bauteile   ┆ 304      ┆ Mönchengl ┆ … ┆ 3.8       ┆ Schnee    ┆

**Reasoning**:
To generate realistic distances between cities, I need to define geographical coordinates for each city and implement the Haversine formula to calculate the distance. I will add a `CITY_COORDINATES` dictionary and a `haversine` function within the `generate_delivery_data` function, then update the `distance_km` generation to use these. Note that the Haversine formula yields the great-circle (straight-line) distance, so actual road or rail distances will be somewhat longer.



In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=1000, seed=None):
    """
    Generate a synthetic dataset for delivery-time prediction.

    Supplier and destination are drawn from a list of German cities and
    ``distance_km`` is the great-circle (haversine) distance between them.
    When both cities coincide, a random local distance of 10-50 km is used
    instead of zero. The transport mode is still sampled independently of the
    distance in this version. The target ``delivery_time_days`` is computed
    from distance, transport mode, weather, processing time and holiday
    season, plus Gaussian noise.

    Args:
        num_samples (int): Number of records to generate.
        seed (int | None): Seed for the random number generator. Pass an
            integer to obtain the same dataset on every call; ``None`` (the
            default) draws fresh random data each time.

    Returns:
        pl.DataFrame: One row per order. ``order_date`` is a ``YYYY-MM-DD``
        string; ``delivery_time_days`` is rounded to one decimal place and is
        never below 1.0.

    Raises:
        KeyError: If a city in ``ALL_CITIES`` has no entry in
            ``CITY_COORDINATES``.
    """
    # A local Generator gives reproducible draws without touching NumPy's global random state
    rng = np.random.default_rng(seed)

    # Value pools for the categorical features
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']

    # Expanded list of German cities, shared by supplier and destination
    ALL_CITIES = [
        'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
        'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
        'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
        'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
        'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
        'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
        'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt'
    ]

    # Approximate coordinates for the cities (latitude, longitude) in degrees
    CITY_COORDINATES = {
        'Berlin': (52.5200, 13.4050), 'München': (48.1351, 11.5820), 'Hamburg': (53.5511, 9.9937),
        'Frankfurt': (50.1109, 8.6821), 'Stuttgart': (48.7758, 9.1829), 'Köln': (50.9375, 6.9603),
        'Düsseldorf': (51.2277, 6.7735), 'Leipzig': (51.3397, 12.3731), 'Dresden': (51.0504, 13.7373),
        'Hannover': (52.3759, 9.7320), 'Nürnberg': (49.4521, 11.0767), 'Duisburg': (51.4332, 6.7663),
        'Bochum': (51.4818, 7.2165), 'Wuppertal': (51.2562, 7.1507), 'Bielefeld': (52.0211, 8.5347),
        'Bonn': (50.7374, 7.0982), 'Münster': (51.9616, 7.6280), 'Karlsruhe': (49.0069, 8.4037),
        'Mannheim': (49.4875, 8.4660), 'Augsburg': (48.3705, 10.8978), 'Wiesbaden': (50.0833, 8.2333),
        'Gelsenkirchen': (51.5167, 7.1000), 'Mönchengladbach': (51.1960, 6.4385), 'Braunschweig': (52.2690, 10.5215),
        'Chemnitz': (50.8333, 12.9167), 'Kiel': (54.3233, 10.1228), 'Aachen': (50.7750, 6.0833),
        'Halle (Saale)': (51.4828, 11.9692), 'Magdeburg': (52.1205, 11.6276), 'Freiburg im Breisgau': (47.9990, 7.8421),
        'Krefeld': (51.3391, 6.5847), 'Lübeck': (53.8655, 10.6865), 'Oberhausen': (51.4691, 6.8660),
        'Erfurt': (50.9787, 11.0292), 'Mainz': (50.0000, 8.2711), 'Rostock': (54.0900, 12.1400),
        'Kassel': (51.3172, 9.4926), 'Hagen': (51.3592, 7.4692), 'Hamm': (51.6811, 7.8174),
        'Saarbrücken': (49.2333, 7.0000), 'Herne': (51.5333, 7.2167), 'Mülheim an der Ruhr': (51.4326, 6.8837),
        'Osnabrück': (52.2793, 8.0494), 'Solingen': (51.1712, 7.0836), 'Leverkusen': (51.0406, 6.9912),
        'Heidelberg': (49.4076, 8.6905), 'Darmstadt': (49.8728, 8.6512)
    }

    def haversine(lat1, lon1, lat2, lon2):
        """Great-circle distance in km between points given in degrees (scalars or NumPy arrays)."""
        R = 6371  # Earth radius in kilometers
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        return R * c

    transport_modes = ['Straße', 'Schiene', 'Luft', 'See']
    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Lookup arrays aligned with ALL_CITIES so cities can be sampled by index;
    # building the coordinate table here fails fast if a city lacks coordinates
    city_names = np.array(ALL_CITIES)
    city_coords = np.array([CITY_COORDINATES[city] for city in ALL_CITIES])

    supplier_idx = rng.integers(0, len(ALL_CITIES), num_samples)
    destination_idx = rng.integers(0, len(ALL_CITIES), num_samples)

    # Vectorised haversine distance for all supplier/destination pairs at once
    distances = haversine(
        city_coords[supplier_idx, 0], city_coords[supplier_idx, 1],
        city_coords[destination_idx, 0], city_coords[destination_idx, 1],
    )

    # Supplier and destination may coincide; such intra-city deliveries get a
    # small random distance (10-50 km) instead of a zero distance
    same_city = supplier_idx == destination_idx
    distances = np.where(same_city, rng.uniform(10, 50, num_samples), distances)

    # Order dates are spread uniformly over the 365 days starting 2023-01-01
    first_order_day = datetime(2023, 1, 1)
    day_offsets = rng.integers(0, 365, num_samples)

    # Raw feature columns (distance_km last, matching the previous column order)
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': rng.choice(material_types, num_samples),
        'quantity': rng.integers(1, 1000, num_samples),
        'supplier_location': city_names[supplier_idx],
        'destination_location': city_names[destination_idx],
        'transport_mode': rng.choice(transport_modes, num_samples),
        # int() is needed because timedelta does not accept NumPy integer scalars
        'order_date': [first_order_day + timedelta(days=int(offset)) for offset in day_offsets],
        'processing_time_days': rng.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': rng.choice(weather_conditions, num_samples),
        'holiday_season': rng.choice([True, False], num_samples, p=[0.1, 0.9]),
        'distance_km': distances.round(2)
    }

    df = pl.DataFrame(data)

    # Base transit time (target variable); assumption: 200 km/day base speed
    df = df.with_columns(
        (pl.col('distance_km') / 200).alias('delivery_time_days')
    )

    # Transport mode effect: air x0.5, rail x0.8, sea x2.0, road unchanged
    df = df.with_columns(
        pl.when(pl.col('transport_mode') == 'Luft').then(pl.col('delivery_time_days') * 0.5)
        .when(pl.col('transport_mode') == 'Schiene').then(pl.col('delivery_time_days') * 0.8)
        .when(pl.col('transport_mode') == 'See').then(pl.col('delivery_time_days') * 2.0)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Weather effect: snow/storm x1.5, fog x1.2, otherwise unchanged
    df = df.with_columns(
        pl.when(pl.col('weather_conditions').is_in(['Schnee', 'Sturm'])).then(pl.col('delivery_time_days') * 1.5)
        .when(pl.col('weather_conditions') == 'Nebel').then(pl.col('delivery_time_days') * 1.2)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Processing time is added on top of the transit time
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.col('processing_time_days')).alias('delivery_time_days')
    )

    # Holiday season stretches the whole duration (transit + processing) by 30 %
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(pl.col('delivery_time_days') * 1.3)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Add Gaussian noise (mean 0, standard deviation 2 days)
    noise = pl.Series(name="noise", values=rng.normal(0, 2, num_samples))
    df = df.with_columns(
        (pl.col('delivery_time_days') + noise).alias('delivery_time_days')
    )

    # Round to one decimal and enforce a minimum delivery time of 1 day
    df = df.with_columns(
        pl.when(pl.col('delivery_time_days').round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(pl.col('delivery_time_days').round(1))
        .alias('delivery_time_days')
    )

    # Store `order_date` as an ISO-formatted string instead of a datetime
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: generate 15,000 synthetic delivery records
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset
print(synthetic_data.head())

# Column names and dtypes (Polars schema)
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics of the columns
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

shape: (5, 12)
┌──────────┬────────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ order_id ┆ material_t ┆ quantity ┆ supplier_ ┆ … ┆ weather_c ┆ holiday_s ┆ distance_ ┆ delivery_ │
│ ---      ┆ ype        ┆ ---      ┆ location  ┆   ┆ onditions ┆ eason     ┆ km        ┆ time_days │
│ i64      ┆ ---        ┆ i64      ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆ str        ┆          ┆ str       ┆   ┆ str       ┆ bool      ┆ f64       ┆ f64       │
╞══════════╪════════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ Rohstoffe  ┆ 633      ┆ Wuppertal ┆ … ┆ Schnee    ┆ false     ┆ 343.18    ┆ 2.1       │
│ 2        ┆ Chemikalie ┆ 29       ┆ Münster   ┆ … ┆ Klar      ┆ true      ┆ 67.87     ┆ 7.6       │
│          ┆ n          ┆          ┆           ┆   ┆           ┆           ┆           ┆           │
│ 3        ┆ Fertigprod ┆ 757      ┆ Münster   ┆ … ┆ Klar      ┆ false     ┆

**Reasoning**:
The next step is to dynamically assign transport modes based on the calculated distances. This involves updating the `transport_mode` generation logic within the `generate_delivery_data` function to reflect typical usage for different distance ranges.



In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=1000, seed=None):
    """
    Generate a synthetic dataset for delivery-time prediction.

    Supplier and destination are drawn from a list of German cities and
    ``distance_km`` is the great-circle (haversine) distance between them.
    When both cities coincide, a random local distance of 10-50 km is used
    instead of zero. The transport mode is sampled depending on the distance:

    * below 100 km: road (80 %) or rail (20 %)
    * 100 km to below 500 km: road (50 %), rail (30 %) or air (20 %)
    * 500 km and above: rail (60 %) or air (40 %); if supplier or destination
      is a coastal city, rail (40 %), air (30 %) or sea (30 %)

    The target ``delivery_time_days`` is computed from distance, transport
    mode, weather, processing time and holiday season, plus Gaussian noise.

    Args:
        num_samples (int): Number of records to generate.
        seed (int | None): Seed for the random number generator. Pass an
            integer to obtain the same dataset on every call; ``None`` (the
            default) draws fresh random data each time.

    Returns:
        pl.DataFrame: One row per order. ``order_date`` is a ``YYYY-MM-DD``
        string; ``delivery_time_days`` is rounded to one decimal place and is
        never below 1.0.

    Raises:
        KeyError: If a city in ``ALL_CITIES`` has no entry in
            ``CITY_COORDINATES``.
    """
    # A local Generator gives reproducible draws without touching NumPy's global random state
    rng = np.random.default_rng(seed)

    # Value pools for the categorical features
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']

    # Expanded list of German cities, shared by supplier and destination
    ALL_CITIES = [
        'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
        'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
        'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
        'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
        'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
        'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
        'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt'
    ]

    # Approximate coordinates for the cities (latitude, longitude) in degrees
    CITY_COORDINATES = {
        'Berlin': (52.5200, 13.4050), 'München': (48.1351, 11.5820), 'Hamburg': (53.5511, 9.9937),
        'Frankfurt': (50.1109, 8.6821), 'Stuttgart': (48.7758, 9.1829), 'Köln': (50.9375, 6.9603),
        'Düsseldorf': (51.2277, 6.7735), 'Leipzig': (51.3397, 12.3731), 'Dresden': (51.0504, 13.7373),
        'Hannover': (52.3759, 9.7320), 'Nürnberg': (49.4521, 11.0767), 'Duisburg': (51.4332, 6.7663),
        'Bochum': (51.4818, 7.2165), 'Wuppertal': (51.2562, 7.1507), 'Bielefeld': (52.0211, 8.5347),
        'Bonn': (50.7374, 7.0982), 'Münster': (51.9616, 7.6280), 'Karlsruhe': (49.0069, 8.4037),
        'Mannheim': (49.4875, 8.4660), 'Augsburg': (48.3705, 10.8978), 'Wiesbaden': (50.0833, 8.2333),
        'Gelsenkirchen': (51.5167, 7.1000), 'Mönchengladbach': (51.1960, 6.4385), 'Braunschweig': (52.2690, 10.5215),
        'Chemnitz': (50.8333, 12.9167), 'Kiel': (54.3233, 10.1228), 'Aachen': (50.7750, 6.0833),
        'Halle (Saale)': (51.4828, 11.9692), 'Magdeburg': (52.1205, 11.6276), 'Freiburg im Breisgau': (47.9990, 7.8421),
        'Krefeld': (51.3391, 6.5847), 'Lübeck': (53.8655, 10.6865), 'Oberhausen': (51.4691, 6.8660),
        'Erfurt': (50.9787, 11.0292), 'Mainz': (50.0000, 8.2711), 'Rostock': (54.0900, 12.1400),
        'Kassel': (51.3172, 9.4926), 'Hagen': (51.3592, 7.4692), 'Hamm': (51.6811, 7.8174),
        'Saarbrücken': (49.2333, 7.0000), 'Herne': (51.5333, 7.2167), 'Mülheim an der Ruhr': (51.4326, 6.8837),
        'Osnabrück': (52.2793, 8.0494), 'Solingen': (51.1712, 7.0836), 'Leverkusen': (51.0406, 6.9912),
        'Heidelberg': (49.4076, 8.6905), 'Darmstadt': (49.8728, 8.6512)
    }

    def haversine(lat1, lon1, lat2, lon2):
        """Great-circle distance in km between points given in degrees (scalars or NumPy arrays)."""
        R = 6371  # Earth radius in kilometers
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        return R * c

    # Coastal cities for which the 'See' (sea) transport option is available
    COASTAL_CITIES = ['Hamburg', 'Kiel', 'Rostock', 'Lübeck']

    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Lookup arrays aligned with ALL_CITIES so cities can be sampled by index;
    # building the coordinate table here fails fast if a city lacks coordinates
    city_names = np.array(ALL_CITIES)
    city_coords = np.array([CITY_COORDINATES[city] for city in ALL_CITIES])
    is_coastal_city = np.isin(city_names, COASTAL_CITIES)

    supplier_idx = rng.integers(0, len(ALL_CITIES), num_samples)
    destination_idx = rng.integers(0, len(ALL_CITIES), num_samples)

    # Vectorised haversine distance for all supplier/destination pairs at once
    distances = haversine(
        city_coords[supplier_idx, 0], city_coords[supplier_idx, 1],
        city_coords[destination_idx, 0], city_coords[destination_idx, 1],
    )

    # Supplier and destination may coincide; such intra-city deliveries get a
    # small random distance (10-50 km) instead of a zero distance
    same_city = supplier_idx == destination_idx
    distances = np.where(same_city, rng.uniform(10, 50, num_samples), distances)

    # Distance-dependent transport mode: each rule is (row mask, allowed modes, probabilities).
    # The four masks are mutually exclusive and together cover every row.
    touches_coast = is_coastal_city[supplier_idx] | is_coastal_city[destination_idx]
    is_long = distances >= 500
    mode_rules = [
        # Short distances: road, sometimes rail
        (distances < 100, ['Straße', 'Schiene'], [0.8, 0.2]),
        # Medium distances: road, rail, sometimes air
        ((distances >= 100) & ~is_long, ['Straße', 'Schiene', 'Luft'], [0.5, 0.3, 0.2]),
        # Long distances touching a coastal city: rail, air or sea
        (is_long & touches_coast, ['Schiene', 'Luft', 'See'], [0.4, 0.3, 0.3]),
        # Long inland distances: rail or air
        (is_long & ~touches_coast, ['Schiene', 'Luft'], [0.6, 0.4]),
    ]
    transport_mode = np.empty(num_samples, dtype=object)
    for mask, modes, probabilities in mode_rules:
        transport_mode[mask] = rng.choice(modes, size=int(mask.sum()), p=probabilities)

    # Order dates are spread uniformly over the 365 days starting 2023-01-01
    first_order_day = datetime(2023, 1, 1)
    day_offsets = rng.integers(0, 365, num_samples)

    # Raw feature columns (distance_km and transport_mode last, matching the previous column order)
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': rng.choice(material_types, num_samples),
        'quantity': rng.integers(1, 1000, num_samples),
        'supplier_location': city_names[supplier_idx],
        'destination_location': city_names[destination_idx],
        # int() is needed because timedelta does not accept NumPy integer scalars
        'order_date': [first_order_day + timedelta(days=int(offset)) for offset in day_offsets],
        'processing_time_days': rng.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': rng.choice(weather_conditions, num_samples),
        'holiday_season': rng.choice([True, False], num_samples, p=[0.1, 0.9]),
        'distance_km': distances.round(2),
        # Plain Python list of str so Polars infers a string column
        'transport_mode': transport_mode.tolist()
    }

    df = pl.DataFrame(data)

    # Base transit time (target variable); assumption: 200 km/day base speed
    df = df.with_columns(
        (pl.col('distance_km') / 200).alias('delivery_time_days')
    )

    # Transport mode effect: air x0.5, rail x0.8, sea x2.0, road unchanged
    df = df.with_columns(
        pl.when(pl.col('transport_mode') == 'Luft').then(pl.col('delivery_time_days') * 0.5)
        .when(pl.col('transport_mode') == 'Schiene').then(pl.col('delivery_time_days') * 0.8)
        .when(pl.col('transport_mode') == 'See').then(pl.col('delivery_time_days') * 2.0)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Weather effect: snow/storm x1.5, fog x1.2, otherwise unchanged
    df = df.with_columns(
        pl.when(pl.col('weather_conditions').is_in(['Schnee', 'Sturm'])).then(pl.col('delivery_time_days') * 1.5)
        .when(pl.col('weather_conditions') == 'Nebel').then(pl.col('delivery_time_days') * 1.2)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Processing time is added on top of the transit time
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.col('processing_time_days')).alias('delivery_time_days')
    )

    # Holiday season stretches the whole duration (transit + processing) by 30 %
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(pl.col('delivery_time_days') * 1.3)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Add Gaussian noise (mean 0, standard deviation 2 days)
    noise = pl.Series(name="noise", values=rng.normal(0, 2, num_samples))
    df = df.with_columns(
        (pl.col('delivery_time_days') + noise).alias('delivery_time_days')
    )

    # Round to one decimal and enforce a minimum delivery time of 1 day
    df = df.with_columns(
        pl.when(pl.col('delivery_time_days').round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(pl.col('delivery_time_days').round(1))
        .alias('delivery_time_days')
    )

    # Store `order_date` as an ISO-formatted string instead of a datetime
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: generate 15,000 synthetic delivery records
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset
print(synthetic_data.head())

# Column names and dtypes (Polars schema)
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics of the columns
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

shape: (5, 12)
┌──────────┬────────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ order_id ┆ material_t ┆ quantity ┆ supplier_ ┆ … ┆ holiday_s ┆ distance_ ┆ transport ┆ delivery_ │
│ ---      ┆ ype        ┆ ---      ┆ location  ┆   ┆ eason     ┆ km        ┆ _mode     ┆ time_days │
│ i64      ┆ ---        ┆ i64      ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆ str        ┆          ┆ str       ┆   ┆ bool      ┆ f64       ┆ str       ┆ f64       │
╞══════════╪════════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ Bauteile   ┆ 635      ┆ Osnabrück ┆ … ┆ false     ┆ 129.48    ┆ Straße    ┆ 2.7       │
│ 2        ┆ Rohstoffe  ┆ 163      ┆ Mannheim  ┆ … ┆ false     ┆ 510.25    ┆ Schiene   ┆ 3.6       │
│ 3        ┆ Chemikalie ┆ 493      ┆ Osnabrück ┆ … ┆ false     ┆ 415.28    ┆ Schiene   ┆ 1.2       │
│          ┆ n          ┆          ┆           ┆   ┆           ┆           ┆

## Update Delivery Time Calculation

### Subtask:
Review and adjust the `delivery_time_days` calculation to ensure it aligns with the updated `transport_mode` logic, specifically removing the 'See' case.


**Reasoning**:
I need to modify the `generate_delivery_data` function to remove the 'See' transport mode impact from the delivery time calculation as per the instructions.



In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=1000, seed=None):
    """
    Generate a synthetic dataset for delivery-time prediction.

    Every row describes one order shipped between two German cities. The
    distance is the great-circle (haversine) distance between the two cities,
    the transport mode is drawn at random with distance-dependent
    probabilities, and the target column `delivery_time_days` is derived from
    distance, transport mode, weather, processing time, holiday season and
    Gaussian noise.

    Args:
        num_samples (int): Number of records to generate.
        seed (int | None): Optional seed for reproducible output. With the
            default (None) the global NumPy random state is used, so callers
            that rely on `np.random.seed(...)` keep getting the same stream.

    Returns:
        pl.DataFrame: One row per order with the columns order_id,
        material_type, quantity, supplier_location, destination_location,
        order_date (formatted as 'YYYY-MM-DD' string), processing_time_days,
        weather_conditions, holiday_season, distance_km, transport_mode and
        delivery_time_days.
    """

    # A private generator when a seed is given; otherwise the global legacy
    # state (the np.random module exposes the same choice/randint/uniform/normal
    # functions, so the draws below are identical to the unseeded original).
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Value lists for the categorical features
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']

    # Expanded list of German cities for greater variety
    ALL_CITIES = [
        'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
        'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
        'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
        'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
        'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
        'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
        'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt'
    ]

    # Approximate coordinates for the cities (latitude, longitude)
    CITY_COORDINATES = {
        'Berlin': (52.5200, 13.4050), 'München': (48.1351, 11.5820), 'Hamburg': (53.5511, 9.9937),
        'Frankfurt': (50.1109, 8.6821), 'Stuttgart': (48.7758, 9.1829), 'Köln': (50.9375, 6.9603),
        'Düsseldorf': (51.2277, 6.7735), 'Leipzig': (51.3397, 12.3731), 'Dresden': (51.0504, 13.7373),
        'Hannover': (52.3759, 9.7320), 'Nürnberg': (49.4521, 11.0767), 'Duisburg': (51.4332, 6.7663),
        'Bochum': (51.4818, 7.2165), 'Wuppertal': (51.2562, 7.1507), 'Bielefeld': (52.0211, 8.5347),
        'Bonn': (50.7374, 7.0982), 'Münster': (51.9616, 7.6280), 'Karlsruhe': (49.0069, 8.4037),
        'Mannheim': (49.4875, 8.4660), 'Augsburg': (48.3705, 10.8978), 'Wiesbaden': (50.0833, 8.2333),
        'Gelsenkirchen': (51.5167, 7.1000), 'Mönchengladbach': (51.1960, 6.4385), 'Braunschweig': (52.2690, 10.5215),
        'Chemnitz': (50.8333, 12.9167), 'Kiel': (54.3233, 10.1228), 'Aachen': (50.7750, 6.0833),
        'Halle (Saale)': (51.4828, 11.9692), 'Magdeburg': (52.1205, 11.6276), 'Freiburg im Breisgau': (47.9990, 7.8421),
        'Krefeld': (51.3391, 6.5847), 'Lübeck': (53.8655, 10.6865), 'Oberhausen': (51.4691, 6.8660),
        'Erfurt': (50.9787, 11.0292), 'Mainz': (50.0000, 8.2711), 'Rostock': (54.0900, 12.1400),
        'Kassel': (51.3172, 9.4926), 'Hagen': (51.3592, 7.4692), 'Hamm': (51.6811, 7.8174),
        'Saarbrücken': (49.2333, 7.0000), 'Herne': (51.5333, 7.2167), 'Mülheim an der Ruhr': (51.4326, 6.8837),
        'Osnabrück': (52.2793, 8.0494), 'Solingen': (51.1712, 7.0836), 'Leverkusen': (51.0406, 6.9912),
        'Heidelberg': (49.4076, 8.6905), 'Darmstadt': (49.8728, 8.6512)
    }

    # Haversine formula to calculate distance between two lat/lon points
    def haversine(lat1, lon1, lat2, lon2):
        R = 6371  # Earth radius in kilometers
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        return R * c

    supplier_locations = ALL_CITIES
    destination_locations = ALL_CITIES

    # Coastal cities for 'See' transport option
    COASTAL_CITIES = ['Hamburg', 'Kiel', 'Rostock', 'Lübeck']

    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Generate the independent base features with NumPy / plain lists.
    # The order of the random draws is kept stable on purpose.
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': rng.choice(material_types, num_samples),
        'quantity': rng.randint(1, 1000, num_samples),
        'supplier_location': rng.choice(supplier_locations, num_samples),
        'destination_location': rng.choice(destination_locations, num_samples),
        'order_date': [datetime(2023, 1, 1) + timedelta(days=int(rng.randint(0, 365))) for _ in range(num_samples)],
        'processing_time_days': rng.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': rng.choice(weather_conditions, num_samples),
        'holiday_season': rng.choice([True, False], num_samples, p=[0.1, 0.9])
    }

    df = pl.DataFrame(data)

    # Calculate distance and transport mode per order. The two location columns
    # are extracted once instead of indexing the DataFrame cell by cell.
    distances = []
    transport_modes_dynamic = []
    location_pairs = zip(df['supplier_location'].to_list(), df['destination_location'].to_list())
    for sup_loc, dest_loc in location_pairs:
        # Ensure supplier and destination are not the same to avoid zero distance issues
        if sup_loc == dest_loc:
            # Assign a small random distance if locations are the same
            dist = rng.uniform(10, 50)
        else:
            lat1, lon1 = CITY_COORDINATES[sup_loc]
            lat2, lon2 = CITY_COORDINATES[dest_loc]
            dist = haversine(lat1, lon1, lat2, lon2)
        distances.append(dist)

        # Dynamically assign transport mode based on distance
        if dist < 100:
            # Short distances: Road, sometimes Rail
            mode_options, mode_probs = ['Straße', 'Schiene'], [0.8, 0.2]
        elif dist < 500:
            # Medium distances: Road, Rail, sometimes Air
            mode_options, mode_probs = ['Straße', 'Schiene', 'Luft'], [0.5, 0.3, 0.2]
        elif sup_loc in COASTAL_CITIES and dest_loc in COASTAL_CITIES:
            # Long distances between two coastal cities: Rail, Air or Sea.
            # NOTE: with the coordinates above every pair of coastal cities is
            # far closer than 500 km, so this branch is not reached for the
            # current city list; it only matters if the lists are extended.
            mode_options, mode_probs = ['Schiene', 'Luft', 'See'], [0.4, 0.3, 0.3]
        else:
            # Long distances otherwise: Rail or Air
            mode_options, mode_probs = ['Schiene', 'Luft'], [0.6, 0.4]
        transport_modes_dynamic.append(rng.choice(mode_options, p=mode_probs))

    df = df.with_columns(
        pl.Series(name='distance_km', values=np.array(distances).round(2)),
        pl.Series(name='transport_mode', values=transport_modes_dynamic)
    )

    # Compute the delivery time (target variable) from the other features
    df = df.with_columns(
        (pl.col('distance_km') / 200).alias('delivery_time_days') # Assumption: 200 km/day base speed
    )

    # Effect of the transport mode
    df = df.with_columns(
        pl.when(pl.col('transport_mode') == 'Luft').then(pl.col('delivery_time_days') * 0.5) # Faster by air
        .when(pl.col('transport_mode') == 'Schiene').then(pl.col('delivery_time_days') * 0.8) # Faster by rail
        .otherwise(pl.col('delivery_time_days')) # Handles 'Straße' and 'See' (if any, will be default)
        .alias('delivery_time_days')
    )

    # Effect of the weather conditions
    df = df.with_columns(
        pl.when(pl.col('weather_conditions').is_in(['Schnee', 'Sturm'])).then(pl.col('delivery_time_days') * 1.5)
        .when(pl.col('weather_conditions') == 'Nebel').then(pl.col('delivery_time_days') * 1.2)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Effect of the processing time (added on top of the transit time)
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.col('processing_time_days')).alias('delivery_time_days')
    )

    # Effect of the holiday season
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(pl.col('delivery_time_days') * 1.3)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Add random noise (normal, mean 0, standard deviation 2 days)
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.Series(name="noise", values=rng.normal(0, 2, num_samples))).alias('delivery_time_days')
    )

    # Make sure the delivery time is at least 1 day, rounded to one decimal
    df = df.with_columns(
        pl.when(pl.col('delivery_time_days').round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(pl.col('delivery_time_days').round(1))
        .alias('delivery_time_days')
    )

    # Convert `order_date` to an ISO date string (drop this step to keep a datetime column)
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: build a synthetic dataset with 15,000 records
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset
print(synthetic_data.head(5))

# Dataset overview: column names and dtypes (the Polars schema)
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics of the columns
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

shape: (5, 12)
┌──────────┬────────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ order_id ┆ material_t ┆ quantity ┆ supplier_ ┆ … ┆ holiday_s ┆ distance_ ┆ transport ┆ delivery_ │
│ ---      ┆ ype        ┆ ---      ┆ location  ┆   ┆ eason     ┆ km        ┆ _mode     ┆ time_days │
│ i64      ┆ ---        ┆ i64      ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆ str        ┆          ┆ str       ┆   ┆ bool      ┆ f64       ┆ str       ┆ f64       │
╞══════════╪════════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ Elektronik ┆ 7        ┆ Solingen  ┆ … ┆ false     ┆ 470.73    ┆ Straße    ┆ 7.2       │
│ 2        ┆ Rohstoffe  ┆ 977      ┆ Chemnitz  ┆ … ┆ false     ┆ 133.33    ┆ Schiene   ┆ 1.0       │
│ 3        ┆ Bauteile   ┆ 813      ┆ Hagen     ┆ … ┆ false     ┆ 50.54     ┆ Schiene   ┆ 1.0       │
│ 4        ┆ Rohstoffe  ┆ 934      ┆ Solingen  ┆ … ┆ false     ┆ 137.76    ┆

## Increase Sample Size

### Subtask:
Increase the default `num_samples` in the `generate_delivery_data` function to generate a larger, more diverse dataset (e.g., 15000).


**Reasoning**:
To increase the default `num_samples` to 15000, I will update the `generate_delivery_data` function definition and its example call in the latest version of the code.



In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=15000, seed=None):
    """
    Generate a synthetic dataset for delivery-time prediction.

    Every row describes one order shipped between two German cities. The
    distance is the great-circle (haversine) distance between the two cities,
    the transport mode is drawn at random with distance-dependent
    probabilities, and the target column `delivery_time_days` is derived from
    distance, transport mode, weather, processing time, holiday season and
    Gaussian noise.

    Args:
        num_samples (int): Number of records to generate.
        seed (int | None): Optional seed for reproducible output. With the
            default (None) the global NumPy random state is used, so callers
            that rely on `np.random.seed(...)` keep getting the same stream.

    Returns:
        pl.DataFrame: One row per order with the columns order_id,
        material_type, quantity, supplier_location, destination_location,
        order_date (formatted as 'YYYY-MM-DD' string), processing_time_days,
        weather_conditions, holiday_season, distance_km, transport_mode and
        delivery_time_days.
    """

    # A private generator when a seed is given; otherwise the global legacy
    # state (the np.random module exposes the same choice/randint/uniform/normal
    # functions, so the draws below are identical to the unseeded original).
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Value lists for the categorical features
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']

    # Expanded list of German cities for greater variety
    ALL_CITIES = [
        'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
        'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
        'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
        'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
        'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
        'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
        'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt'
    ]

    # Approximate coordinates for the cities (latitude, longitude)
    CITY_COORDINATES = {
        'Berlin': (52.5200, 13.4050), 'München': (48.1351, 11.5820), 'Hamburg': (53.5511, 9.9937),
        'Frankfurt': (50.1109, 8.6821), 'Stuttgart': (48.7758, 9.1829), 'Köln': (50.9375, 6.9603),
        'Düsseldorf': (51.2277, 6.7735), 'Leipzig': (51.3397, 12.3731), 'Dresden': (51.0504, 13.7373),
        'Hannover': (52.3759, 9.7320), 'Nürnberg': (49.4521, 11.0767), 'Duisburg': (51.4332, 6.7663),
        'Bochum': (51.4818, 7.2165), 'Wuppertal': (51.2562, 7.1507), 'Bielefeld': (52.0211, 8.5347),
        'Bonn': (50.7374, 7.0982), 'Münster': (51.9616, 7.6280), 'Karlsruhe': (49.0069, 8.4037),
        'Mannheim': (49.4875, 8.4660), 'Augsburg': (48.3705, 10.8978), 'Wiesbaden': (50.0833, 8.2333),
        'Gelsenkirchen': (51.5167, 7.1000), 'Mönchengladbach': (51.1960, 6.4385), 'Braunschweig': (52.2690, 10.5215),
        'Chemnitz': (50.8333, 12.9167), 'Kiel': (54.3233, 10.1228), 'Aachen': (50.7750, 6.0833),
        'Halle (Saale)': (51.4828, 11.9692), 'Magdeburg': (52.1205, 11.6276), 'Freiburg im Breisgau': (47.9990, 7.8421),
        'Krefeld': (51.3391, 6.5847), 'Lübeck': (53.8655, 10.6865), 'Oberhausen': (51.4691, 6.8660),
        'Erfurt': (50.9787, 11.0292), 'Mainz': (50.0000, 8.2711), 'Rostock': (54.0900, 12.1400),
        'Kassel': (51.3172, 9.4926), 'Hagen': (51.3592, 7.4692), 'Hamm': (51.6811, 7.8174),
        'Saarbrücken': (49.2333, 7.0000), 'Herne': (51.5333, 7.2167), 'Mülheim an der Ruhr': (51.4326, 6.8837),
        'Osnabrück': (52.2793, 8.0494), 'Solingen': (51.1712, 7.0836), 'Leverkusen': (51.0406, 6.9912),
        'Heidelberg': (49.4076, 8.6905), 'Darmstadt': (49.8728, 8.6512)
    }

    # Haversine formula to calculate distance between two lat/lon points
    def haversine(lat1, lon1, lat2, lon2):
        R = 6371  # Earth radius in kilometers
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        return R * c

    supplier_locations = ALL_CITIES
    destination_locations = ALL_CITIES

    # Coastal cities for 'See' transport option
    COASTAL_CITIES = ['Hamburg', 'Kiel', 'Rostock', 'Lübeck']

    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Generate the independent base features with NumPy / plain lists.
    # The order of the random draws is kept stable on purpose.
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': rng.choice(material_types, num_samples),
        'quantity': rng.randint(1, 1000, num_samples),
        'supplier_location': rng.choice(supplier_locations, num_samples),
        'destination_location': rng.choice(destination_locations, num_samples),
        'order_date': [datetime(2023, 1, 1) + timedelta(days=int(rng.randint(0, 365))) for _ in range(num_samples)],
        'processing_time_days': rng.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': rng.choice(weather_conditions, num_samples),
        'holiday_season': rng.choice([True, False], num_samples, p=[0.1, 0.9])
    }

    df = pl.DataFrame(data)

    # Calculate distance and transport mode per order. The two location columns
    # are extracted once instead of indexing the DataFrame cell by cell.
    distances = []
    transport_modes_dynamic = []
    location_pairs = zip(df['supplier_location'].to_list(), df['destination_location'].to_list())
    for sup_loc, dest_loc in location_pairs:
        # Ensure supplier and destination are not the same to avoid zero distance issues
        if sup_loc == dest_loc:
            # Assign a small random distance if locations are the same
            dist = rng.uniform(10, 50)
        else:
            lat1, lon1 = CITY_COORDINATES[sup_loc]
            lat2, lon2 = CITY_COORDINATES[dest_loc]
            dist = haversine(lat1, lon1, lat2, lon2)
        distances.append(dist)

        # Dynamically assign transport mode based on distance
        if dist < 100:
            # Short distances: Road, sometimes Rail
            mode_options, mode_probs = ['Straße', 'Schiene'], [0.8, 0.2]
        elif dist < 500:
            # Medium distances: Road, Rail, sometimes Air
            mode_options, mode_probs = ['Straße', 'Schiene', 'Luft'], [0.5, 0.3, 0.2]
        elif sup_loc in COASTAL_CITIES and dest_loc in COASTAL_CITIES:
            # Long distances between two coastal cities: Rail, Air or Sea.
            # NOTE: with the coordinates above every pair of coastal cities is
            # far closer than 500 km, so this branch is not reached for the
            # current city list; it only matters if the lists are extended.
            mode_options, mode_probs = ['Schiene', 'Luft', 'See'], [0.4, 0.3, 0.3]
        else:
            # Long distances otherwise: Rail or Air
            mode_options, mode_probs = ['Schiene', 'Luft'], [0.6, 0.4]
        transport_modes_dynamic.append(rng.choice(mode_options, p=mode_probs))

    df = df.with_columns(
        pl.Series(name='distance_km', values=np.array(distances).round(2)),
        pl.Series(name='transport_mode', values=transport_modes_dynamic)
    )

    # Compute the delivery time (target variable) from the other features
    df = df.with_columns(
        (pl.col('distance_km') / 200).alias('delivery_time_days') # Assumption: 200 km/day base speed
    )

    # Effect of the transport mode
    df = df.with_columns(
        pl.when(pl.col('transport_mode') == 'Luft').then(pl.col('delivery_time_days') * 0.5) # Faster by air
        .when(pl.col('transport_mode') == 'Schiene').then(pl.col('delivery_time_days') * 0.8) # Faster by rail
        .otherwise(pl.col('delivery_time_days')) # Handles 'Straße' and 'See' (if any, will be default)
        .alias('delivery_time_days')
    )

    # Effect of the weather conditions
    df = df.with_columns(
        pl.when(pl.col('weather_conditions').is_in(['Schnee', 'Sturm'])).then(pl.col('delivery_time_days') * 1.5)
        .when(pl.col('weather_conditions') == 'Nebel').then(pl.col('delivery_time_days') * 1.2)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Effect of the processing time (added on top of the transit time)
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.col('processing_time_days')).alias('delivery_time_days')
    )

    # Effect of the holiday season
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(pl.col('delivery_time_days') * 1.3)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Add random noise (normal, mean 0, standard deviation 2 days)
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.Series(name="noise", values=rng.normal(0, 2, num_samples))).alias('delivery_time_days')
    )

    # Make sure the delivery time is at least 1 day, rounded to one decimal
    df = df.with_columns(
        pl.when(pl.col('delivery_time_days').round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(pl.col('delivery_time_days').round(1))
        .alias('delivery_time_days')
    )

    # Convert `order_date` to an ISO date string (drop this step to keep a datetime column)
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: build a synthetic dataset with 15,000 records
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset
print(synthetic_data.head(5))

# Dataset overview: column names and dtypes (the Polars schema)
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics of the columns
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

shape: (5, 12)
┌──────────┬────────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ order_id ┆ material_t ┆ quantity ┆ supplier_ ┆ … ┆ holiday_s ┆ distance_ ┆ transport ┆ delivery_ │
│ ---      ┆ ype        ┆ ---      ┆ location  ┆   ┆ eason     ┆ km        ┆ _mode     ┆ time_days │
│ i64      ┆ ---        ┆ i64      ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆ str        ┆          ┆ str       ┆   ┆ bool      ┆ f64       ┆ str       ┆ f64       │
╞══════════╪════════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ Chemikalie ┆ 865      ┆ Magdeburg ┆ … ┆ false     ┆ 323.17    ┆ Straße    ┆ 9.3       │
│          ┆ n          ┆          ┆           ┆   ┆           ┆           ┆           ┆           │
│ 2        ┆ Fertigprod ┆ 659      ┆ Mainz     ┆ … ┆ false     ┆ 139.54    ┆ Luft      ┆ 6.3       │
│          ┆ ukte       ┆          ┆           ┆   ┆           ┆           ┆

## Summary:

### Data Analysis Key Findings

*   The `ALL_CITIES` list was successfully expanded from 7 to 47 German cities, significantly increasing the variety for `supplier_location` and `destination_location`.
*   Realistic distances (`distance_km`) are now calculated using the Haversine formula based on geographical coordinates (`CITY_COORDINATES`), replacing the previous random distance generation. Deliveries between the same city now have a small random distance (10 to 50 km).
*   Transport modes are dynamically assigned based on `distance_km` thresholds and whether locations are `COASTAL_CITIES`:
    *   Distances under 100 km are primarily "Straße" (80%) with some "Schiene" (20%).
    *   Distances between 100 km and 500 km use "Straße" (50%), "Schiene" (30%), and "Luft" (20%).
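    *   Longer distances (≥500 km) use "Schiene" (60%) and "Luft" (40%); if both supplier and destination are coastal cities, the split is instead "Schiene" (40%), "Luft" (30%), and "See" (30%). Note that with the current German-only city list no two coastal cities are 500 km or more apart, so "See" does not actually occur in the generated data.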
*   The `delivery_time_days` calculation was updated by removing the specific multiplier for the "See" transport mode, ensuring it now defaults to the base speed (200 km/day) if not explicitly adjusted by "Luft" or "Schiene" multipliers.
*   The sample size for data generation was successfully increased to 15,000 records, providing a larger dataset for analysis.

### Insights or Next Steps

*   **Enhanced Realism**: The generated dataset now offers a more realistic representation of logistics scenarios due to geographically accurate distances and dynamic transport mode assignments, which will improve the robustness of any models trained on this data.
*   **Further Refinement of Delivery Time Logic**: The delivery time calculation could be further refined by adding specific speed parameters for "Straße" and "See" transport modes, rather than relying on a default or implicit calculation. This would ensure all transport modes have explicit, realistic speed profiles.


# Task
Modify the `generate_delivery_data` function to include a much larger set of international cities, along with their geographical coordinates, and create a mapping to identify the country for each city.

## Expand City List and Coordinates

### Subtask:
Modify the `generate_delivery_data` function to include a much larger set of international cities, along with their geographical coordinates. Create a mapping (e.g., a dictionary) to identify the country for each city, which will be crucial for classifying routes.


**Reasoning**:
I need to update the `generate_delivery_data` function in the provided notebook cell `6726e4f9` to incorporate international cities, their coordinates, and a mapping of cities to countries as per the subtask instructions. This involves editing the `ALL_CITIES` list, expanding the `CITY_COORDINATES` dictionary, and adding a new `CITY_TO_COUNTRY` dictionary.



In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=15000):
    """
    Generiert einen synthetischen Datensatz für die Vorhersage von Lieferzeiten.

    Args:
        num_samples (int): Die Anzahl der zu generierenden Datensätze.

    Returns:
        pl.DataFrame: Ein DataFrame mit synthetischen Lieferdaten.
    """

    # Listen für Kategorische Merkmale
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']

    # Expanded list of German and International cities for greater variety
    ALL_CITIES = [
        'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
        'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
        'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
        'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
        'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
        'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
        'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt',
        # International Cities
        'New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami', # USA
        'London', 'Manchester', 'Birmingham', # UK
        'Paris', 'Marseille', 'Lyon', # France
        'Shanghai', 'Beijing', 'Guangzhou', # China
        'Rio de Janeiro', 'Sao Paulo', 'Brasilia', # Brazil
        'Sydney', 'Melbourne', 'Brisbane', # Australia
        'Tokyo', 'Osaka', # Japan
        'Dubai', # UAE
        'Mumbai', # India
        'Moscow', # Russia
        'Toronto', # Canada
        'Mexico City' # Mexico
    ]

    # Approximate coordinates for the cities (latitude, longitude)
    CITY_COORDINATES = {
        'Berlin': (52.5200, 13.4050), 'München': (48.1351, 11.5820), 'Hamburg': (53.5511, 9.9937),
        'Frankfurt': (50.1109, 8.6821), 'Stuttgart': (48.7758, 9.1829), 'Köln': (50.9375, 6.9603),
        'Düsseldorf': (51.2277, 6.7735), 'Leipzig': (51.3397, 12.3731), 'Dresden': (51.0504, 13.7373),
        'Hannover': (52.3759, 9.7320), 'Nürnberg': (49.4521, 11.0767), 'Duisburg': (51.4332, 6.7663),
        'Bochum': (51.4818, 7.2165), 'Wuppertal': (51.2562, 7.1507), 'Bielefeld': (52.0211, 8.5347),
        'Bonn': (50.7374, 7.0982), 'Münster': (51.9616, 7.6280), 'Karlsruhe': (49.0069, 8.4037),
        'Mannheim': (49.4875, 8.4660), 'Augsburg': (48.3705, 10.8978), 'Wiesbaden': (50.0833, 8.2333),
        'Gelsenkirchen': (51.5167, 7.1000), 'Mönchengladbach': (51.1960, 6.4385), 'Braunschweig': (52.2690, 10.5215),
        'Chemnitz': (50.8333, 12.9167), 'Kiel': (54.3233, 10.1228), 'Aachen': (50.7750, 6.0833),
        'Halle (Saale)': (51.4828, 11.9692), 'Magdeburg': (52.1205, 11.6276), 'Freiburg im Breisgau': (47.9990, 7.8421),
        'Krefeld': (51.3391, 6.5847), 'Lübeck': (53.8655, 10.6865), 'Oberhausen': (51.4691, 6.8660),
        'Erfurt': (50.9787, 11.0292), 'Mainz': (50.0000, 8.2711), 'Rostock': (54.0900, 12.1400),
        'Kassel': (51.3172, 9.4926), 'Hagen': (51.3592, 7.4692), 'Hamm': (51.6811, 7.8174),
        'Saarbrücken': (49.2333, 7.0000), 'Herne': (51.5333, 7.2167), 'Mülheim an der Ruhr': (51.4326, 6.8837),
        'Osnabrück': (52.2793, 8.0494), 'Solingen': (51.1712, 7.0836), 'Leverkusen': (51.0406, 6.9912),
        'Heidelberg': (49.4076, 8.6905), 'Darmstadt': (49.8728, 8.6512),
        # International Coordinates
        'New York': (40.7128, -74.0060), 'Los Angeles': (34.0522, -118.2437), 'Chicago': (41.8781, -87.6298), 'Houston': (29.7604, -95.3698), 'Miami': (25.7617, -80.1918),
        'London': (51.5074, -0.1278), 'Manchester': (53.4808, -2.2426), 'Birmingham': (52.4862, -1.8904),
        'Paris': (48.8566, 2.3522), 'Marseille': (43.2965, 5.3698), 'Lyon': (45.7640, 4.8357),
        'Shanghai': (31.2304, 121.4737), 'Beijing': (39.9042, 116.4074), 'Guangzhou': (23.1300, 113.2600),
        'Rio de Janeiro': (-22.9068, -43.1729), 'Sao Paulo': (-23.5505, -46.6333), 'Brasilia': (-15.7801, -47.9292),
        'Sydney': (-33.8688, 151.2093), 'Melbourne': (-37.8136, 144.9631), 'Brisbane': (-27.4698, 153.0251),
        'Tokyo': (35.6762, 139.6503), 'Osaka': (34.6937, 135.5023),
        'Dubai': (25.2048, 55.2708),
        'Mumbai': (19.0760, 72.8777),
        'Moscow': (55.7558, 37.6173),
        'Toronto': (43.6532, -79.3832),
        'Mexico City': (19.4326, -99.1332)
    }

    # Mapping of cities to countries
    CITY_TO_COUNTRY = {
        **{city: 'Germany' for city in [
            'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
            'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
            'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
            'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
            'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
            'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
            'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt'
        ]},
        'New York': 'USA', 'Los Angeles': 'USA', 'Chicago': 'USA', 'Houston': 'USA', 'Miami': 'USA',
        'London': 'UK', 'Manchester': 'UK', 'Birmingham': 'UK',
        'Paris': 'France', 'Marseille': 'France', 'Lyon': 'France',
        'Shanghai': 'China', 'Beijing': 'China', 'Guangzhou': 'China',
        'Rio de Janeiro': 'Brazil', 'Sao Paulo': 'Brazil', 'Brasilia': 'Brazil',
        'Sydney': 'Australia', 'Melbourne': 'Australia', 'Brisbane': 'Australia',
        'Tokyo': 'Japan', 'Osaka': 'Japan',
        'Dubai': 'UAE',
        'Mumbai': 'India',
        'Moscow': 'Russia',
        'Toronto': 'Canada',
        'Mexico City': 'Mexico'
    }

    # Haversine formula to calculate distance between two lat/lon points
    def haversine(lat1, lon1, lat2, lon2):
        R = 6371  # Earth radius in kilometers
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        return R * c

    supplier_locations = ALL_CITIES
    destination_locations = ALL_CITIES

    # Coastal cities for 'See' transport option
    COASTAL_CITIES = ['Hamburg', 'Kiel', 'Rostock', 'Lübeck', 'New York', 'Los Angeles', 'Miami', 'London', 'Marseille', 'Rio de Janeiro', 'Sydney', 'Melbourne', 'Brisbane', 'Dubai', 'Mumbai', 'Osaka']

    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Generate initial data using numpy/lists
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': np.random.choice(material_types, num_samples),
        'quantity': np.random.randint(1, 1000, num_samples),
        'supplier_location': np.random.choice(supplier_locations, num_samples),
        'destination_location': np.random.choice(destination_locations, num_samples),
        'order_date': [datetime(2023, 1, 1) + timedelta(days=np.random.randint(0, 365)) for _ in range(num_samples)],
        'processing_time_days': np.random.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': np.random.choice(weather_conditions, num_samples),
        'holiday_season': np.random.choice([True, False], num_samples, p=[0.1, 0.9])
    }

    df = pl.DataFrame(data)

    # Calculate distance based on supplier and destination locations
    distances = []
    transport_modes_dynamic = []
    for i in range(num_samples):
        sup_loc = df[i, 'supplier_location']
        dest_loc = df[i, 'destination_location']

        # Ensure supplier and destination are not the same to avoid zero distance issues
        if sup_loc == dest_loc:
            # Assign a small random distance if locations are the same
            dist = np.random.uniform(10, 50)
        else:
            lat1, lon1 = CITY_COORDINATES[sup_loc]
            lat2, lon2 = CITY_COORDINATES[dest_loc]
            dist = haversine(lat1, lon1, lat2, lon2)
        distances.append(dist)

        # Dynamically assign transport mode based on distance
        # Consider intercontinental vs. intracontinental travel
        supplier_country = CITY_TO_COUNTRY[sup_loc]
        destination_country = CITY_TO_COUNTRY[dest_loc]

        if supplier_country != destination_country:
            # Intercontinental: prioritize Air or Sea for long distances
            if sup_loc in COASTAL_CITIES and dest_loc in COASTAL_CITIES:
                transport_modes_dynamic.append(np.random.choice(['Luft', 'See'], p=[0.6, 0.4])) # More air for speed
            else:
                transport_modes_dynamic.append(np.random.choice(['Luft'], p=[1.0])) # Air is primary for non-coastal intercontinental
        else: # Intracontinental/National
            if dist < 100:
                # Short distances: Road, sometimes Rail
                transport_modes_dynamic.append(np.random.choice(['Straße', 'Schiene'], p=[0.8, 0.2]))
            elif 100 <= dist < 500:
                # Medium distances: Road, Rail, sometimes Air
                transport_modes_dynamic.append(np.random.choice(['Straße', 'Schiene', 'Luft'], p=[0.5, 0.3, 0.2]))
            else:
                # Long distances: Rail, Air, and if coastal, Sea (only if both supplier and destination are coastal)
                if sup_loc in COASTAL_CITIES and dest_loc in COASTAL_CITIES:
                    transport_modes_dynamic.append(np.random.choice(['Schiene', 'Luft', 'See'], p=[0.4, 0.3, 0.3]))
                else:
                    transport_modes_dynamic.append(np.random.choice(['Schiene', 'Luft'], p=[0.6, 0.4]))

    df = df.with_columns(
        pl.Series(name='distance_km', values=np.array(distances).round(2)),
        pl.Series(name='transport_mode', values=transport_modes_dynamic)
    )

    # Berechnung der Lieferzeit (Zielvariable) basierend auf anderen Merkmalen
    df = df.with_columns(
        (pl.col('distance_km') / 200).alias('delivery_time_days') # Annahme: 200 km/Tag als Basisgeschwindigkeit
    )

    # Auswirkungen des Transportmittels
    df = df.with_columns(
        pl.when(pl.col('transport_mode') == 'Luft').then(pl.col('delivery_time_days') * 0.5) # Schneller per Luft
        .when(pl.col('transport_mode') == 'Schiene').then(pl.col('delivery_time_days') * 0.8) # Schneller per Schiene
        .when(pl.col('transport_mode') == 'See').then(pl.col('delivery_time_days') * 2.5) # Sea transport made slower due to longer routes
        .otherwise(pl.col('delivery_time_days')) # Handles 'Straße'
        .alias('delivery_time_days')
    )

    # Auswirkungen der Wetterbedingungen
    df = df.with_columns(
        pl.when(pl.col('weather_conditions').is_in(['Schnee', 'Sturm'])).then(pl.col('delivery_time_days') * 1.5)
        .when(pl.col('weather_conditions') == 'Nebel').then(pl.col('delivery_time_days') * 1.2)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Auswirkungen der Bearbeitungszeit
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.col('processing_time_days')).alias('delivery_time_days')
    )

    # Auswirkungen der Ferienzeit
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(pl.col('delivery_time_days') * 1.3)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Zufälliges Rauschen hinzufügen
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.Series(name="noise", values=np.random.normal(0, 2, num_samples))).alias('delivery_time_days')
    )

    # Sicherstellen, dass die Lieferzeit mindestens 1 Tag beträgt und runden
    df = df.with_columns(
        pl.when(pl.col('delivery_time_days').round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(pl.col('delivery_time_days').round(1))
        .alias('delivery_time_days')
    )

    # Die `order_date` als Datums-Objekt beibehalten oder in String konvertieren, je nach Bedarf
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: build a dataset of 15,000 synthetic orders
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset
print(synthetic_data.head(5))

# Column names and dtypes (the polars schema)
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics of the columns
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

shape: (5, 12)
┌──────────┬────────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ order_id ┆ material_t ┆ quantity ┆ supplier_ ┆ … ┆ holiday_s ┆ distance_ ┆ transport ┆ delivery_ │
│ ---      ┆ ype        ┆ ---      ┆ location  ┆   ┆ eason     ┆ km        ┆ _mode     ┆ time_days │
│ i64      ┆ ---        ┆ i64      ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆ str        ┆          ┆ str       ┆   ┆ bool      ┆ f64       ┆ str       ┆ f64       │
╞══════════╪════════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ Fertigprod ┆ 877      ┆ Rio de    ┆ … ┆ false     ┆ 9887.81   ┆ Luft      ┆ 40.5      │
│          ┆ ukte       ┆          ┆ Janeiro   ┆   ┆           ┆           ┆           ┆           │
│ 2        ┆ Elektronik ┆ 788      ┆ Braunschw ┆ … ┆ false     ┆ 282.77    ┆ Straße    ┆ 1.0       │
│          ┆            ┆          ┆ eig       ┆   ┆           ┆           ┆

## Develop Route Classification Logic

### Subtask:
Implement logic within the `generate_delivery_data` function to classify each order's route as 'domestic_germany', 'intra_european', or 'intercontinental' based on the countries of the supplier and destination locations.


**Reasoning**:
I need to modify the `generate_delivery_data` function to include a new 'route_type' column, classify routes based on supplier and destination countries, and update the DataFrame accordingly.



In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=15000, seed=None):
    """
    Generate a synthetic dataset for delivery-time prediction.

    Every record is one order between two randomly drawn cities. The
    great-circle distance, a route classification and a transport mode are
    derived from the city pair; the target ``delivery_time_days`` is then
    built from distance, transport mode, weather, processing time, holiday
    season and Gaussian noise.

    Args:
        num_samples (int): Number of records to generate.
        seed (int | None): Optional seed. When given, all random draws come
            from a private ``np.random.RandomState(seed)`` so the result is
            reproducible. With the default ``None`` the global ``np.random``
            state is used.

    Returns:
        pl.DataFrame: One row per order with the columns order_id,
        material_type, quantity, supplier_location, destination_location,
        order_date (formatted as a 'YYYY-MM-DD' string), processing_time_days,
        weather_conditions, holiday_season, distance_km, transport_mode,
        route_type and delivery_time_days.
    """

    # The np.random module and RandomState expose the same sampling functions,
    # so either one can serve as the random source below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Values for the categorical features
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']

    # Expanded list of German and international cities for greater variety
    ALL_CITIES = [
        'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
        'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
        'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
        'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
        'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
        'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
        'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt',
        # International cities
        'New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami', # USA
        'London', 'Manchester', 'Birmingham', # UK
        'Paris', 'Marseille', 'Lyon', # France
        'Shanghai', 'Beijing', 'Guangzhou', # China
        'Rio de Janeiro', 'Sao Paulo', 'Brasilia', # Brazil
        'Sydney', 'Melbourne', 'Brisbane', # Australia
        'Tokyo', 'Osaka', # Japan
        'Dubai', # UAE
        'Mumbai', # India
        'Moscow', # Russia
        'Toronto', # Canada
        'Mexico City' # Mexico
    ]

    # Approximate coordinates for the cities (latitude, longitude)
    CITY_COORDINATES = {
        'Berlin': (52.5200, 13.4050), 'München': (48.1351, 11.5820), 'Hamburg': (53.5511, 9.9937),
        'Frankfurt': (50.1109, 8.6821), 'Stuttgart': (48.7758, 9.1829), 'Köln': (50.9375, 6.9603),
        'Düsseldorf': (51.2277, 6.7735), 'Leipzig': (51.3397, 12.3731), 'Dresden': (51.0504, 13.7373),
        'Hannover': (52.3759, 9.7320), 'Nürnberg': (49.4521, 11.0767), 'Duisburg': (51.4332, 6.7663),
        'Bochum': (51.4818, 7.2165), 'Wuppertal': (51.2562, 7.1507), 'Bielefeld': (52.0211, 8.5347),
        'Bonn': (50.7374, 7.0982), 'Münster': (51.9616, 7.6280), 'Karlsruhe': (49.0069, 8.4037),
        'Mannheim': (49.4875, 8.4660), 'Augsburg': (48.3705, 10.8978), 'Wiesbaden': (50.0833, 8.2333),
        'Gelsenkirchen': (51.5167, 7.1000), 'Mönchengladbach': (51.1960, 6.4385), 'Braunschweig': (52.2690, 10.5215),
        'Chemnitz': (50.8333, 12.9167), 'Kiel': (54.3233, 10.1228), 'Aachen': (50.7750, 6.0833),
        'Halle (Saale)': (51.4828, 11.9692), 'Magdeburg': (52.1205, 11.6276), 'Freiburg im Breisgau': (47.9990, 7.8421),
        'Krefeld': (51.3391, 6.5847), 'Lübeck': (53.8655, 10.6865), 'Oberhausen': (51.4691, 6.8660),
        'Erfurt': (50.9787, 11.0292), 'Mainz': (50.0000, 8.2711), 'Rostock': (54.0900, 12.1400),
        'Kassel': (51.3172, 9.4926), 'Hagen': (51.3592, 7.4692), 'Hamm': (51.6811, 7.8174),
        'Saarbrücken': (49.2333, 7.0000), 'Herne': (51.5333, 7.2167), 'Mülheim an der Ruhr': (51.4326, 6.8837),
        'Osnabrück': (52.2793, 8.0494), 'Solingen': (51.1712, 7.0836), 'Leverkusen': (51.0406, 6.9912),
        'Heidelberg': (49.4076, 8.6905), 'Darmstadt': (49.8728, 8.6512),
        # International coordinates
        'New York': (40.7128, -74.0060), 'Los Angeles': (34.0522, -118.2437), 'Chicago': (41.8781, -87.6298), 'Houston': (29.7604, -95.3698), 'Miami': (25.7617, -80.1918),
        'London': (51.5074, -0.1278), 'Manchester': (53.4808, -2.2426), 'Birmingham': (52.4862, -1.8904),
        'Paris': (48.8566, 2.3522), 'Marseille': (43.2965, 5.3698), 'Lyon': (45.7640, 4.8357),
        'Shanghai': (31.2304, 121.4737), 'Beijing': (39.9042, 116.4074), 'Guangzhou': (23.1300, 113.2600),
        'Rio de Janeiro': (-22.9068, -43.1729), 'Sao Paulo': (-23.5505, -46.6333), 'Brasilia': (-15.7801, -47.9292),
        'Sydney': (-33.8688, 151.2093), 'Melbourne': (-37.8136, 144.9631), 'Brisbane': (-27.4698, 153.0251),
        'Tokyo': (35.6762, 139.6503), 'Osaka': (34.6937, 135.5023),
        'Dubai': (25.2048, 55.2708),
        'Mumbai': (19.0760, 72.8777),
        'Moscow': (55.7558, 37.6173),
        'Toronto': (43.6532, -79.3832),
        'Mexico City': (19.4326, -99.1332)
    }

    # Mapping of cities to countries
    CITY_TO_COUNTRY = {
        **{city: 'Germany' for city in [
            'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
            'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
            'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
            'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
            'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
            'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
            'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt'
        ]},
        'New York': 'USA', 'Los Angeles': 'USA', 'Chicago': 'USA', 'Houston': 'USA', 'Miami': 'USA',
        'London': 'UK', 'Manchester': 'UK', 'Birmingham': 'UK',
        'Paris': 'France', 'Marseille': 'France', 'Lyon': 'France',
        'Shanghai': 'China', 'Beijing': 'China', 'Guangzhou': 'China',
        'Rio de Janeiro': 'Brazil', 'Sao Paulo': 'Brazil', 'Brasilia': 'Brazil',
        'Sydney': 'Australia', 'Melbourne': 'Australia', 'Brisbane': 'Australia',
        'Tokyo': 'Japan', 'Osaka': 'Japan',
        'Dubai': 'UAE',
        'Mumbai': 'India',
        'Moscow': 'Russia',
        'Toronto': 'Canada',
        'Mexico City': 'Mexico'
    }

    # European countries used for route classification (set: membership tests only)
    EUROPEAN_COUNTRIES = {'Germany', 'UK', 'France'}

    # Haversine formula to calculate distance between two lat/lon points
    def haversine(lat1, lon1, lat2, lon2):
        R = 6371  # Earth radius in kilometers
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        return R * c

    def classify_route(supplier_country, destination_country):
        """Return the route label for a pair of countries."""
        if supplier_country == 'Germany' and destination_country == 'Germany':
            return 'domestic_germany'
        if supplier_country in EUROPEAN_COUNTRIES and destination_country in EUROPEAN_COUNTRIES:
            return 'intra_european'
        # NOTE(review): 'intercontinental' is the catch-all label. It also covers
        # routes inside a single non-European country (e.g. New York -> Chicago),
        # because no dedicated label exists for them.
        return 'intercontinental'

    def pick_transport_mode(route_type, dist, both_coastal):
        """Draw a transport mode for one order from route type, distance and coast access."""
        if route_type == 'intercontinental':
            if both_coastal and dist > 1000:
                # Only long coastal routes may go by sea; air stays more likely for speed
                return str(rng.choice(['Luft', 'See'], p=[0.7, 0.3]))
            # Non-coastal or shorter intercontinental routes always go by air
            return 'Luft'
        if dist < 100:
            # Short distances: road, sometimes rail
            return str(rng.choice(['Straße', 'Schiene'], p=[0.8, 0.2]))
        if dist < 500:
            # Medium distances: road, rail, sometimes air
            return str(rng.choice(['Straße', 'Schiene', 'Luft'], p=[0.5, 0.3, 0.2]))
        if both_coastal:
            # Long distances between two coastal cities: rail, air or sea
            return str(rng.choice(['Schiene', 'Luft', 'See'], p=[0.4, 0.3, 0.3]))
        # Long distances otherwise: rail or air
        return str(rng.choice(['Schiene', 'Luft'], p=[0.6, 0.4]))

    supplier_locations = ALL_CITIES
    destination_locations = ALL_CITIES

    # Coastal cities for the 'See' transport option (set: membership tests only)
    COASTAL_CITIES = {'Hamburg', 'Kiel', 'Rostock', 'Lübeck', 'New York', 'Los Angeles', 'Miami', 'London', 'Marseille', 'Rio de Janeiro', 'Sydney', 'Melbourne', 'Brisbane', 'Dubai', 'Mumbai', 'Osaka'}

    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Order dates are spread over 2023: one random day offset per order
    first_order_day = datetime(2023, 1, 1)
    day_offsets = rng.randint(0, 365, num_samples)

    # Generate the independent base features
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': rng.choice(material_types, num_samples),
        'quantity': rng.randint(1, 1000, num_samples),
        'supplier_location': rng.choice(supplier_locations, num_samples),
        'destination_location': rng.choice(destination_locations, num_samples),
        'order_date': [first_order_day + timedelta(days=int(offset)) for offset in day_offsets],
        'processing_time_days': rng.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': rng.choice(weather_conditions, num_samples),
        'holiday_season': rng.choice([True, False], num_samples, p=[0.1, 0.9])
    }

    df = pl.DataFrame(data)

    # Derive distance, route type and transport mode per order. The loop walks
    # plain Python lists of the sampled cities instead of doing one scalar
    # DataFrame lookup per row.
    suppliers = data['supplier_location'].tolist()
    destinations = data['destination_location'].tolist()

    distances = []
    transport_modes_dynamic = []
    route_types = []
    for sup_loc, dest_loc in zip(suppliers, destinations):
        if sup_loc == dest_loc:
            # Same city on both ends: assign a small random distance to avoid zero distance
            dist = rng.uniform(10, 50)
        else:
            lat1, lon1 = CITY_COORDINATES[sup_loc]
            lat2, lon2 = CITY_COORDINATES[dest_loc]
            dist = haversine(lat1, lon1, lat2, lon2)
        distances.append(dist)

        route_type = classify_route(CITY_TO_COUNTRY[sup_loc], CITY_TO_COUNTRY[dest_loc])
        route_types.append(route_type)

        both_coastal = sup_loc in COASTAL_CITIES and dest_loc in COASTAL_CITIES
        transport_modes_dynamic.append(pick_transport_mode(route_type, dist, both_coastal))

    df = df.with_columns(
        pl.Series(name='distance_km', values=np.array(distances).round(2)),
        pl.Series(name='transport_mode', values=transport_modes_dynamic),
        pl.Series(name='route_type', values=route_types)
    )

    # Target variable: start from a base speed assumption of 200 km per day
    df = df.with_columns(
        (pl.col('distance_km') / 200).alias('delivery_time_days')
    )

    # Effect of the transport mode
    df = df.with_columns(
        pl.when(pl.col('transport_mode') == 'Luft').then(pl.col('delivery_time_days') * 0.5) # Air is faster
        .when(pl.col('transport_mode') == 'Schiene').then(pl.col('delivery_time_days') * 0.8) # Rail is faster
        .when(pl.col('transport_mode') == 'See').then(pl.col('delivery_time_days') * 2.5) # Sea transport made slower due to longer routes
        .otherwise(pl.col('delivery_time_days')) # Handles 'Straße'
        .alias('delivery_time_days')
    )

    # Effect of the weather conditions
    df = df.with_columns(
        pl.when(pl.col('weather_conditions').is_in(['Schnee', 'Sturm'])).then(pl.col('delivery_time_days') * 1.5)
        .when(pl.col('weather_conditions') == 'Nebel').then(pl.col('delivery_time_days') * 1.2)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Processing time is added on top of the transit time
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.col('processing_time_days')).alias('delivery_time_days')
    )

    # Holiday season scales the whole delivery time (transit plus processing)
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(pl.col('delivery_time_days') * 1.3)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Add Gaussian noise (mean 0, standard deviation 2 days)
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.Series(name="noise", values=rng.normal(0, 2, num_samples))).alias('delivery_time_days')
    )

    # Round to one decimal and enforce a minimum delivery time of 1 day
    df = df.with_columns(
        pl.when(pl.col('delivery_time_days').round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(pl.col('delivery_time_days').round(1))
        .alias('delivery_time_days')
    )

    # Store `order_date` as a 'YYYY-MM-DD' string
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: build a dataset of 15,000 synthetic orders
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset
print(synthetic_data.head(5))

# Column names and dtypes (the polars schema)
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics of the columns
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

shape: (5, 13)
┌──────────┬────────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ order_id ┆ material_t ┆ quantity ┆ supplier_ ┆ … ┆ distance_ ┆ transport ┆ route_typ ┆ delivery_ │
│ ---      ┆ ype        ┆ ---      ┆ location  ┆   ┆ km        ┆ _mode     ┆ e         ┆ time_days │
│ i64      ┆ ---        ┆ i64      ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆ str        ┆          ┆ str       ┆   ┆ f64       ┆ str       ┆ str       ┆ f64       │
╞══════════╪════════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ Bauteile   ┆ 754      ┆ Melbourne ┆ … ┆ 16672.22  ┆ Luft      ┆ intercont ┆ 54.0      │
│          ┆            ┆          ┆           ┆   ┆           ┆           ┆ inental   ┆           │
│ 2        ┆ Bauteile   ┆ 253      ┆ Halle     ┆ … ┆ 9858.99   ┆ Luft      ┆ intercont ┆ 24.6      │
│          ┆            ┆          ┆ (Saale)   ┆   ┆           ┆           ┆

## Refine Dynamic Transport Mode Assignment

### Subtask:
Update the transport mode assignment to consider the new route classification and distance. This will involve more granular probability distributions for 'Straße', 'Schiene', 'Luft', and 'See'.


**Reasoning**:
The subtask requires refining the dynamic transport mode assignment based on the new route classification and distance, incorporating granular probability distributions. I will modify the `generate_delivery_data` function to update the `transport_modes_dynamic` assignment logic accordingly, applying different probability distributions for 'domestic_germany', 'intra_european', and 'intercontinental' routes, and considering coastal cities for 'See' transport.



In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=15000, seed=None):
    """
    Generate a synthetic dataset for delivery-time prediction.

    Every record is one order between two randomly drawn cities. The
    great-circle distance, a route classification and a transport mode are
    derived from the city pair; the target ``delivery_time_days`` is then
    built from distance, transport mode, weather, processing time, holiday
    season and Gaussian noise.

    Args:
        num_samples (int): Number of records to generate.
        seed (int | None): Optional seed. When given, all random draws come
            from a private ``np.random.RandomState(seed)`` so the result is
            reproducible. With the default ``None`` the global ``np.random``
            state is used.

    Returns:
        pl.DataFrame: One row per order with the columns order_id,
        material_type, quantity, supplier_location, destination_location,
        order_date (formatted as a 'YYYY-MM-DD' string), processing_time_days,
        weather_conditions, holiday_season, distance_km, transport_mode,
        route_type and delivery_time_days.
    """

    # The np.random module and RandomState expose the same sampling functions,
    # so either one can serve as the random source below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Values for the categorical features
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']

    # Expanded list of German and international cities for greater variety
    ALL_CITIES = [
        'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
        'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
        'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
        'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
        'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
        'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
        'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt',
        # International cities
        'New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami', # USA
        'London', 'Manchester', 'Birmingham', # UK
        'Paris', 'Marseille', 'Lyon', # France
        'Shanghai', 'Beijing', 'Guangzhou', # China
        'Rio de Janeiro', 'Sao Paulo', 'Brasilia', # Brazil
        'Sydney', 'Melbourne', 'Brisbane', # Australia
        'Tokyo', 'Osaka', # Japan
        'Dubai', # UAE
        'Mumbai', # India
        'Moscow', # Russia
        'Toronto', # Canada
        'Mexico City' # Mexico
    ]

    # Approximate coordinates for the cities (latitude, longitude)
    CITY_COORDINATES = {
        'Berlin': (52.5200, 13.4050), 'München': (48.1351, 11.5820), 'Hamburg': (53.5511, 9.9937),
        'Frankfurt': (50.1109, 8.6821), 'Stuttgart': (48.7758, 9.1829), 'Köln': (50.9375, 6.9603),
        'Düsseldorf': (51.2277, 6.7735), 'Leipzig': (51.3397, 12.3731), 'Dresden': (51.0504, 13.7373),
        'Hannover': (52.3759, 9.7320), 'Nürnberg': (49.4521, 11.0767), 'Duisburg': (51.4332, 6.7663),
        'Bochum': (51.4818, 7.2165), 'Wuppertal': (51.2562, 7.1507), 'Bielefeld': (52.0211, 8.5347),
        'Bonn': (50.7374, 7.0982), 'Münster': (51.9616, 7.6280), 'Karlsruhe': (49.0069, 8.4037),
        'Mannheim': (49.4875, 8.4660), 'Augsburg': (48.3705, 10.8978), 'Wiesbaden': (50.0833, 8.2333),
        'Gelsenkirchen': (51.5167, 7.1000), 'Mönchengladbach': (51.1960, 6.4385), 'Braunschweig': (52.2690, 10.5215),
        'Chemnitz': (50.8333, 12.9167), 'Kiel': (54.3233, 10.1228), 'Aachen': (50.7750, 6.0833),
        'Halle (Saale)': (51.4828, 11.9692), 'Magdeburg': (52.1205, 11.6276), 'Freiburg im Breisgau': (47.9990, 7.8421),
        'Krefeld': (51.3391, 6.5847), 'Lübeck': (53.8655, 10.6865), 'Oberhausen': (51.4691, 6.8660),
        'Erfurt': (50.9787, 11.0292), 'Mainz': (50.0000, 8.2711), 'Rostock': (54.0900, 12.1400),
        'Kassel': (51.3172, 9.4926), 'Hagen': (51.3592, 7.4692), 'Hamm': (51.6811, 7.8174),
        'Saarbrücken': (49.2333, 7.0000), 'Herne': (51.5333, 7.2167), 'Mülheim an der Ruhr': (51.4326, 6.8837),
        'Osnabrück': (52.2793, 8.0494), 'Solingen': (51.1712, 7.0836), 'Leverkusen': (51.0406, 6.9912),
        'Heidelberg': (49.4076, 8.6905), 'Darmstadt': (49.8728, 8.6512),
        # International coordinates
        'New York': (40.7128, -74.0060), 'Los Angeles': (34.0522, -118.2437), 'Chicago': (41.8781, -87.6298), 'Houston': (29.7604, -95.3698), 'Miami': (25.7617, -80.1918),
        'London': (51.5074, -0.1278), 'Manchester': (53.4808, -2.2426), 'Birmingham': (52.4862, -1.8904),
        'Paris': (48.8566, 2.3522), 'Marseille': (43.2965, 5.3698), 'Lyon': (45.7640, 4.8357),
        'Shanghai': (31.2304, 121.4737), 'Beijing': (39.9042, 116.4074), 'Guangzhou': (23.1300, 113.2600),
        'Rio de Janeiro': (-22.9068, -43.1729), 'Sao Paulo': (-23.5505, -46.6333), 'Brasilia': (-15.7801, -47.9292),
        'Sydney': (-33.8688, 151.2093), 'Melbourne': (-37.8136, 144.9631), 'Brisbane': (-27.4698, 153.0251),
        'Tokyo': (35.6762, 139.6503), 'Osaka': (34.6937, 135.5023),
        'Dubai': (25.2048, 55.2708),
        'Mumbai': (19.0760, 72.8777),
        'Moscow': (55.7558, 37.6173),
        'Toronto': (43.6532, -79.3832),
        'Mexico City': (19.4326, -99.1332)
    }

    # Mapping of cities to countries
    CITY_TO_COUNTRY = {
        **{city: 'Germany' for city in [
            'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
            'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
            'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
            'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
            'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
            'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
            'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt'
        ]},
        'New York': 'USA', 'Los Angeles': 'USA', 'Chicago': 'USA', 'Houston': 'USA', 'Miami': 'USA',
        'London': 'UK', 'Manchester': 'UK', 'Birmingham': 'UK',
        'Paris': 'France', 'Marseille': 'France', 'Lyon': 'France',
        'Shanghai': 'China', 'Beijing': 'China', 'Guangzhou': 'China',
        'Rio de Janeiro': 'Brazil', 'Sao Paulo': 'Brazil', 'Brasilia': 'Brazil',
        'Sydney': 'Australia', 'Melbourne': 'Australia', 'Brisbane': 'Australia',
        'Tokyo': 'Japan', 'Osaka': 'Japan',
        'Dubai': 'UAE',
        'Mumbai': 'India',
        'Moscow': 'Russia',
        'Toronto': 'Canada',
        'Mexico City': 'Mexico'
    }

    # European countries used for route classification (set: membership tests only)
    EUROPEAN_COUNTRIES = {'Germany', 'UK', 'France'}

    # Haversine formula to calculate distance between two lat/lon points
    def haversine(lat1, lon1, lat2, lon2):
        R = 6371  # Earth radius in kilometers
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        return R * c

    def classify_route(supplier_country, destination_country):
        """Return the route label for a pair of countries."""
        if supplier_country == 'Germany' and destination_country == 'Germany':
            return 'domestic_germany'
        # Routes inside one European country (e.g. Paris -> Lyon) count as
        # intra-European too; requiring two different countries here would push
        # them into the 'intercontinental' branch and force air freight.
        if supplier_country in EUROPEAN_COUNTRIES and destination_country in EUROPEAN_COUNTRIES:
            return 'intra_european'
        # NOTE(review): 'intercontinental' is the catch-all label. It also covers
        # routes inside a single non-European country (e.g. New York -> Chicago),
        # because no dedicated label exists for them.
        return 'intercontinental'

    def pick_transport_mode(route_type, dist, both_coastal):
        """Draw a transport mode for one order from route type, distance and coast access."""
        if route_type == 'intercontinental':
            if both_coastal and dist > 1000:
                # Only long coastal routes may go by sea; air stays more likely for speed
                return str(rng.choice(['Luft', 'See'], p=[0.7, 0.3]))
            # Non-coastal or shorter intercontinental routes always go by air
            return 'Luft'
        # 'domestic_germany' and 'intra_european' share the distance-based rules
        if dist < 100:
            # Short distances: road, sometimes rail
            return str(rng.choice(['Straße', 'Schiene'], p=[0.8, 0.2]))
        if dist < 500:
            # Medium distances: road, rail, sometimes air
            return str(rng.choice(['Straße', 'Schiene', 'Luft'], p=[0.5, 0.3, 0.2]))
        if both_coastal:
            # Long distances between two coastal cities: rail, air or sea
            return str(rng.choice(['Schiene', 'Luft', 'See'], p=[0.4, 0.3, 0.3]))
        # Long distances otherwise: rail or air
        return str(rng.choice(['Schiene', 'Luft'], p=[0.6, 0.4]))

    supplier_locations = ALL_CITIES
    destination_locations = ALL_CITIES

    # Coastal cities for the 'See' transport option (set: membership tests only)
    COASTAL_CITIES = {'Hamburg', 'Kiel', 'Rostock', 'Lübeck', 'New York', 'Los Angeles', 'Miami', 'London', 'Marseille', 'Rio de Janeiro', 'Sydney', 'Melbourne', 'Brisbane', 'Dubai', 'Mumbai', 'Osaka'}

    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Order dates are spread over 2023: one random day offset per order
    first_order_day = datetime(2023, 1, 1)
    day_offsets = rng.randint(0, 365, num_samples)

    # Generate the independent base features
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': rng.choice(material_types, num_samples),
        'quantity': rng.randint(1, 1000, num_samples),
        'supplier_location': rng.choice(supplier_locations, num_samples),
        'destination_location': rng.choice(destination_locations, num_samples),
        'order_date': [first_order_day + timedelta(days=int(offset)) for offset in day_offsets],
        'processing_time_days': rng.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': rng.choice(weather_conditions, num_samples),
        'holiday_season': rng.choice([True, False], num_samples, p=[0.1, 0.9])
    }

    df = pl.DataFrame(data)

    # Derive distance, route type and transport mode per order. The loop walks
    # plain Python lists of the sampled cities instead of doing one scalar
    # DataFrame lookup per row.
    suppliers = data['supplier_location'].tolist()
    destinations = data['destination_location'].tolist()

    distances = []
    transport_modes_dynamic = []
    route_types = []
    for sup_loc, dest_loc in zip(suppliers, destinations):
        if sup_loc == dest_loc:
            # Same city on both ends: assign a small random distance to avoid zero distance
            dist = rng.uniform(10, 50)
        else:
            lat1, lon1 = CITY_COORDINATES[sup_loc]
            lat2, lon2 = CITY_COORDINATES[dest_loc]
            dist = haversine(lat1, lon1, lat2, lon2)
        distances.append(dist)

        route_type = classify_route(CITY_TO_COUNTRY[sup_loc], CITY_TO_COUNTRY[dest_loc])
        route_types.append(route_type)

        both_coastal = sup_loc in COASTAL_CITIES and dest_loc in COASTAL_CITIES
        transport_modes_dynamic.append(pick_transport_mode(route_type, dist, both_coastal))

    df = df.with_columns(
        pl.Series(name='distance_km', values=np.array(distances).round(2)),
        pl.Series(name='transport_mode', values=transport_modes_dynamic),
        pl.Series(name='route_type', values=route_types)
    )

    # Target variable: start from a base speed assumption of 200 km per day
    df = df.with_columns(
        (pl.col('distance_km') / 200).alias('delivery_time_days')
    )

    # Effect of the transport mode
    df = df.with_columns(
        pl.when(pl.col('transport_mode') == 'Luft').then(pl.col('delivery_time_days') * 0.5) # Air is faster
        .when(pl.col('transport_mode') == 'Schiene').then(pl.col('delivery_time_days') * 0.8) # Rail is faster
        .when(pl.col('transport_mode') == 'See').then(pl.col('delivery_time_days') * 2.5) # Sea transport made slower due to longer routes
        .otherwise(pl.col('delivery_time_days')) # Handles 'Straße'
        .alias('delivery_time_days')
    )

    # Effect of the weather conditions
    df = df.with_columns(
        pl.when(pl.col('weather_conditions').is_in(['Schnee', 'Sturm'])).then(pl.col('delivery_time_days') * 1.5)
        .when(pl.col('weather_conditions') == 'Nebel').then(pl.col('delivery_time_days') * 1.2)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Processing time is added on top of the transit time
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.col('processing_time_days')).alias('delivery_time_days')
    )

    # Holiday season scales the whole delivery time (transit plus processing)
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(pl.col('delivery_time_days') * 1.3)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Add Gaussian noise (mean 0, standard deviation 2 days)
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.Series(name="noise", values=rng.normal(0, 2, num_samples))).alias('delivery_time_days')
    )

    # Round to one decimal and enforce a minimum delivery time of 1 day
    df = df.with_columns(
        pl.when(pl.col('delivery_time_days').round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(pl.col('delivery_time_days').round(1))
        .alias('delivery_time_days')
    )

    # Store `order_date` as a 'YYYY-MM-DD' string
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: build a dataset of 15,000 synthetic orders
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset
print(synthetic_data.head(5))

# Column names and dtypes (the polars schema)
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics of the columns
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

shape: (5, 13)
┌──────────┬────────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ order_id ┆ material_t ┆ quantity ┆ supplier_ ┆ … ┆ distance_ ┆ transport ┆ route_typ ┆ delivery_ │
│ ---      ┆ ype        ┆ ---      ┆ location  ┆   ┆ km        ┆ _mode     ┆ e         ┆ time_days │
│ i64      ┆ ---        ┆ i64      ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆ str        ┆          ┆ str       ┆   ┆ f64       ┆ str       ┆ str       ┆ f64       │
╞══════════╪════════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ Fertigprod ┆ 5        ┆ Hamburg   ┆ … ┆ 9075.58   ┆ Luft      ┆ intercont ┆ 25.8      │
│          ┆ ukte       ┆          ┆           ┆   ┆           ┆           ┆ inental   ┆           │
│ 2        ┆ Bauteile   ┆ 705      ┆ Leverkuse ┆ … ┆ 152.59    ┆ Schiene   ┆ domestic_ ┆ 7.7       │
│          ┆            ┆          ┆ n         ┆   ┆           ┆           ┆

## Adjust Delivery Time Impact

### Subtask:
Review and adjust the multipliers for `delivery_time_days` for each transport mode, considering the new, more realistic assignment logic, especially for 'See' (longer transit times) and 'Luft' (shorter transit times) on intercontinental routes. Also, consider the impact of 'customs_clearance_days' for international shipments.


**Reasoning**:
I need to modify the `generate_delivery_data` function to include a `customs_clearance_days` column, which will be populated based on the `route_type` and added to the `delivery_time_days` calculation.



In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=15000, seed=None):
    """
    Generate a synthetic dataset for delivery-time prediction.

    Every row is one order: randomly drawn order features, the great-circle
    distance between supplier and destination city, a route classification,
    a transport mode drawn according to route type and distance, customs
    clearance days, and the target column ``delivery_time_days``.

    Args:
        num_samples (int): Number of rows to generate.
        seed (int | None): When given, NumPy's global RNG is seeded with it
            (``np.random.seed``) so the output is reproducible. ``None``
            (the default) leaves the RNG state untouched.

    Returns:
        pl.DataFrame: The synthetic delivery data. ``order_date`` is returned
        as a ``'%Y-%m-%d'`` string; ``delivery_time_days`` is rounded to one
        decimal place and is never below 1.0.
    """
    if seed is not None:
        np.random.seed(seed)

    # Vocabularies for the categorical features (emitted into the data as-is).
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']
    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Single source of truth: country -> {city: (latitude, longitude)}.
    # The city list, coordinate lookup and city->country lookup are all derived
    # from this table, so a city only has to be added in one place.
    CITIES_BY_COUNTRY = {
        'Germany': {
            'Berlin': (52.5200, 13.4050), 'München': (48.1351, 11.5820), 'Hamburg': (53.5511, 9.9937),
            'Frankfurt': (50.1109, 8.6821), 'Stuttgart': (48.7758, 9.1829), 'Köln': (50.9375, 6.9603),
            'Düsseldorf': (51.2277, 6.7735), 'Leipzig': (51.3397, 12.3731), 'Dresden': (51.0504, 13.7373),
            'Hannover': (52.3759, 9.7320), 'Nürnberg': (49.4521, 11.0767), 'Duisburg': (51.4332, 6.7663),
            'Bochum': (51.4818, 7.2165), 'Wuppertal': (51.2562, 7.1507), 'Bielefeld': (52.0211, 8.5347),
            'Bonn': (50.7374, 7.0982), 'Münster': (51.9616, 7.6280), 'Karlsruhe': (49.0069, 8.4037),
            'Mannheim': (49.4875, 8.4660), 'Augsburg': (48.3705, 10.8978), 'Wiesbaden': (50.0833, 8.2333),
            'Gelsenkirchen': (51.5167, 7.1000), 'Mönchengladbach': (51.1960, 6.4385), 'Braunschweig': (52.2690, 10.5215),
            'Chemnitz': (50.8333, 12.9167), 'Kiel': (54.3233, 10.1228), 'Aachen': (50.7750, 6.0833),
            'Halle (Saale)': (51.4828, 11.9692), 'Magdeburg': (52.1205, 11.6276), 'Freiburg im Breisgau': (47.9990, 7.8421),
            'Krefeld': (51.3391, 6.5847), 'Lübeck': (53.8655, 10.6865), 'Oberhausen': (51.4691, 6.8660),
            'Erfurt': (50.9787, 11.0292), 'Mainz': (50.0000, 8.2711), 'Rostock': (54.0900, 12.1400),
            'Kassel': (51.3172, 9.4926), 'Hagen': (51.3592, 7.4692), 'Hamm': (51.6811, 7.8174),
            'Saarbrücken': (49.2333, 7.0000), 'Herne': (51.5333, 7.2167), 'Mülheim an der Ruhr': (51.4326, 6.8837),
            'Osnabrück': (52.2793, 8.0494), 'Solingen': (51.1712, 7.0836), 'Leverkusen': (51.0406, 6.9912),
            'Heidelberg': (49.4076, 8.6905), 'Darmstadt': (49.8728, 8.6512),
        },
        'USA': {
            'New York': (40.7128, -74.0060), 'Los Angeles': (34.0522, -118.2437), 'Chicago': (41.8781, -87.6298),
            'Houston': (29.7604, -95.3698), 'Miami': (25.7617, -80.1918),
        },
        'UK': {'London': (51.5074, -0.1278), 'Manchester': (53.4808, -2.2426), 'Birmingham': (52.4862, -1.8904)},
        'France': {'Paris': (48.8566, 2.3522), 'Marseille': (43.2965, 5.3698), 'Lyon': (45.7640, 4.8357)},
        'China': {'Shanghai': (31.2304, 121.4737), 'Beijing': (39.9042, 116.4074), 'Guangzhou': (23.1300, 113.2600)},
        'Brazil': {'Rio de Janeiro': (-22.9068, -43.1729), 'Sao Paulo': (-23.5505, -46.6333), 'Brasilia': (-15.7801, -47.9292)},
        'Australia': {'Sydney': (-33.8688, 151.2093), 'Melbourne': (-37.8136, 144.9631), 'Brisbane': (-27.4698, 153.0251)},
        'Japan': {'Tokyo': (35.6762, 139.6503), 'Osaka': (34.6937, 135.5023)},
        'UAE': {'Dubai': (25.2048, 55.2708)},
        'India': {'Mumbai': (19.0760, 72.8777)},
        'Russia': {'Moscow': (55.7558, 37.6173)},
        'Canada': {'Toronto': (43.6532, -79.3832)},
        'Mexico': {'Mexico City': (19.4326, -99.1332)},
    }
    CITY_COORDINATES = {
        city: coords
        for cities in CITIES_BY_COUNTRY.values()
        for city, coords in cities.items()
    }
    CITY_TO_COUNTRY = {
        city: country
        for country, cities in CITIES_BY_COUNTRY.items()
        for city in cities
    }
    ALL_CITIES = list(CITY_COORDINATES)  # German cities first, then international

    # Countries treated as "European" for route classification.
    EUROPEAN_COUNTRIES = {'Germany', 'UK', 'France'}

    # Cities between which sea transport ('See') may be drawn.
    COASTAL_CITIES = {
        'Hamburg', 'Kiel', 'Rostock', 'Lübeck', 'New York', 'Los Angeles', 'Miami', 'London',
        'Marseille', 'Rio de Janeiro', 'Sydney', 'Melbourne', 'Brisbane', 'Dubai', 'Mumbai', 'Osaka',
    }

    def haversine(lat1, lon1, lat2, lon2):
        """Great-circle distance in kilometres between two (lat, lon) points in degrees."""
        R = 6371  # Earth radius in kilometres
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        return R * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    def classify_route(supplier_country, destination_country):
        """Map a country pair onto one of the three route_type labels."""
        if supplier_country == 'Germany' and destination_country == 'Germany':
            return 'domestic_germany'
        # Any route that stays inside the European set counts as intra-European,
        # including same-country pairs such as London -> Manchester. Requiring
        # the countries to differ used to push those into 'intercontinental'.
        if supplier_country in EUROPEAN_COUNTRIES and destination_country in EUROPEAN_COUNTRIES:
            return 'intra_european'
        # NOTE(review): same-country routes outside Europe (e.g. New York ->
        # Chicago) still end up here because there is no label for them; they
        # are at least exempt from customs below. Consider a dedicated label.
        return 'intercontinental'

    def pick_transport_mode(route_type, dist, both_coastal):
        """Draw a transport mode for one shipment from the route/distance rules."""
        if route_type == 'intercontinental':
            if not (both_coastal and dist > 1000):
                # Non-coastal or short intercontinental legs always go by air.
                return 'Luft'
            modes, weights = ['Luft', 'See'], [0.7, 0.3]
        elif dist < 100:
            # Short haul: mostly road, sometimes rail.
            modes, weights = ['Straße', 'Schiene'], [0.8, 0.2]
        elif dist < 500:
            # Medium haul: road, rail, occasionally air.
            modes, weights = ['Straße', 'Schiene', 'Luft'], [0.5, 0.3, 0.2]
        elif both_coastal:
            # Long haul between two ports: sea becomes an option.
            modes, weights = ['Schiene', 'Luft', 'See'], [0.4, 0.3, 0.3]
        else:
            modes, weights = ['Schiene', 'Luft'], [0.6, 0.4]
        return str(np.random.choice(modes, p=weights))

    # Independent per-order features.
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': np.random.choice(material_types, num_samples),
        'quantity': np.random.randint(1, 1000, num_samples),
        'supplier_location': np.random.choice(ALL_CITIES, num_samples),
        'destination_location': np.random.choice(ALL_CITIES, num_samples),
        'order_date': [datetime(2023, 1, 1) + timedelta(days=np.random.randint(0, 365)) for _ in range(num_samples)],
        'processing_time_days': np.random.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': np.random.choice(weather_conditions, num_samples),
        'holiday_season': np.random.choice([True, False], num_samples, p=[0.1, 0.9])
    }
    df = pl.DataFrame(data)

    # Features derived from the (supplier, destination) pair of each order.
    distances = []
    transport_modes_dynamic = []
    route_types = []
    customs_clearance_days = []

    # Walk both location columns once instead of indexing the frame per row.
    for sup_loc, dest_loc in zip(df['supplier_location'].to_list(),
                                 df['destination_location'].to_list()):
        if sup_loc == dest_loc:
            # Same city: use a small random local distance rather than 0 km.
            dist = np.random.uniform(10, 50)
        else:
            dist = haversine(*CITY_COORDINATES[sup_loc], *CITY_COORDINATES[dest_loc])
        distances.append(dist)

        supplier_country = CITY_TO_COUNTRY[sup_loc]
        destination_country = CITY_TO_COUNTRY[dest_loc]
        route_type = classify_route(supplier_country, destination_country)
        route_types.append(route_type)

        # Customs only applies when an intercontinental shipment actually
        # crosses a border. Built-in round() is used because
        # np.random.uniform with scalar bounds returns a plain float, which
        # has no .round() method.
        if route_type == 'intercontinental' and supplier_country != destination_country:
            customs_clearance_days.append(round(float(np.random.uniform(2, 7)), 1))
        else:
            customs_clearance_days.append(0.0)

        both_coastal = sup_loc in COASTAL_CITIES and dest_loc in COASTAL_CITIES
        transport_modes_dynamic.append(pick_transport_mode(route_type, dist, both_coastal))

    df = df.with_columns(
        pl.Series(name='distance_km', values=np.array(distances).round(2)),
        pl.Series(name='transport_mode', values=transport_modes_dynamic),
        pl.Series(name='route_type', values=route_types),
        pl.Series(name='customs_clearance_days', values=np.array(customs_clearance_days).round(1)),
    )

    days = pl.col('delivery_time_days')
    mode = pl.col('transport_mode')
    weather = pl.col('weather_conditions')

    # Target variable, step 1: base transit time at an assumed 200 km per day.
    df = df.with_columns((pl.col('distance_km') / 200).alias('delivery_time_days'))

    # Step 2: transport-mode multiplier (road, 'Straße', keeps the base time).
    df = df.with_columns(
        pl.when(mode == 'Luft').then(days * 0.5)       # air: half the base time
        .when(mode == 'Schiene').then(days * 0.8)      # rail: slightly faster
        .when(mode == 'See').then(days * 2.5)          # sea: much slower
        .otherwise(days)
        .alias('delivery_time_days')
    )

    # Step 3: weather multiplier.
    df = df.with_columns(
        pl.when(weather.is_in(['Schnee', 'Sturm'])).then(days * 1.5)
        .when(weather == 'Nebel').then(days * 1.2)
        .otherwise(days)
        .alias('delivery_time_days')
    )

    # Step 4: add order processing time.
    df = df.with_columns((days + pl.col('processing_time_days')).alias('delivery_time_days'))

    # Step 5: holiday season stretches transit plus processing by 30 %.
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(days * 1.3)
        .otherwise(days)
        .alias('delivery_time_days')
    )

    # Step 6: customs clearance is added after the multipliers, before noise.
    df = df.with_columns((days + pl.col('customs_clearance_days')).alias('delivery_time_days'))

    # Step 7: Gaussian noise (mean 0, standard deviation 2 days).
    df = df.with_columns(
        (days + pl.Series(name="noise", values=np.random.normal(0, 2, num_samples))).alias('delivery_time_days')
    )

    # Step 8: round to one decimal and enforce a minimum of one day.
    df = df.with_columns(
        pl.when(days.round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(days.round(1))
        .alias('delivery_time_days')
    )

    # Return order_date as an ISO 'YYYY-MM-DD' string rather than a datetime.
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: generate 15000 synthetic delivery records.
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset.
print(synthetic_data.head())

# Column names and dtypes (the Polars schema), printed under its own heading.
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics as reported by DataFrame.describe().
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

AttributeError: 'float' object has no attribute 'round'

**Reasoning**:
The error `AttributeError: 'float' object has no attribute 'round'` occurred because `np.random.uniform(2, 7)`, called with scalar bounds and no `size` argument, returns a plain Python `float`, which has no `.round()` method (unlike the NumPy array returned when `size` is given). To fix this, I need to call `np.round()` as a function, passing the generated float and the desired number of decimal places.



In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=15000, seed=None):
    """
    Generate a synthetic dataset for delivery-time prediction.

    Every row is one order: randomly drawn order features, the great-circle
    distance between supplier and destination city, a route classification,
    a transport mode drawn according to route type and distance, customs
    clearance days, and the target column ``delivery_time_days``.

    Args:
        num_samples (int): Number of rows to generate.
        seed (int | None): When given, NumPy's global RNG is seeded with it
            (``np.random.seed``) so the output is reproducible. ``None``
            (the default) leaves the RNG state untouched.

    Returns:
        pl.DataFrame: The synthetic delivery data. ``order_date`` is returned
        as a ``'%Y-%m-%d'`` string; ``delivery_time_days`` is rounded to one
        decimal place and is never below 1.0.
    """
    if seed is not None:
        np.random.seed(seed)

    # Vocabularies for the categorical features (emitted into the data as-is).
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']
    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Single source of truth: country -> {city: (latitude, longitude)}.
    # The city list, coordinate lookup and city->country lookup are all derived
    # from this table, so a city only has to be added in one place.
    CITIES_BY_COUNTRY = {
        'Germany': {
            'Berlin': (52.5200, 13.4050), 'München': (48.1351, 11.5820), 'Hamburg': (53.5511, 9.9937),
            'Frankfurt': (50.1109, 8.6821), 'Stuttgart': (48.7758, 9.1829), 'Köln': (50.9375, 6.9603),
            'Düsseldorf': (51.2277, 6.7735), 'Leipzig': (51.3397, 12.3731), 'Dresden': (51.0504, 13.7373),
            'Hannover': (52.3759, 9.7320), 'Nürnberg': (49.4521, 11.0767), 'Duisburg': (51.4332, 6.7663),
            'Bochum': (51.4818, 7.2165), 'Wuppertal': (51.2562, 7.1507), 'Bielefeld': (52.0211, 8.5347),
            'Bonn': (50.7374, 7.0982), 'Münster': (51.9616, 7.6280), 'Karlsruhe': (49.0069, 8.4037),
            'Mannheim': (49.4875, 8.4660), 'Augsburg': (48.3705, 10.8978), 'Wiesbaden': (50.0833, 8.2333),
            'Gelsenkirchen': (51.5167, 7.1000), 'Mönchengladbach': (51.1960, 6.4385), 'Braunschweig': (52.2690, 10.5215),
            'Chemnitz': (50.8333, 12.9167), 'Kiel': (54.3233, 10.1228), 'Aachen': (50.7750, 6.0833),
            'Halle (Saale)': (51.4828, 11.9692), 'Magdeburg': (52.1205, 11.6276), 'Freiburg im Breisgau': (47.9990, 7.8421),
            'Krefeld': (51.3391, 6.5847), 'Lübeck': (53.8655, 10.6865), 'Oberhausen': (51.4691, 6.8660),
            'Erfurt': (50.9787, 11.0292), 'Mainz': (50.0000, 8.2711), 'Rostock': (54.0900, 12.1400),
            'Kassel': (51.3172, 9.4926), 'Hagen': (51.3592, 7.4692), 'Hamm': (51.6811, 7.8174),
            'Saarbrücken': (49.2333, 7.0000), 'Herne': (51.5333, 7.2167), 'Mülheim an der Ruhr': (51.4326, 6.8837),
            'Osnabrück': (52.2793, 8.0494), 'Solingen': (51.1712, 7.0836), 'Leverkusen': (51.0406, 6.9912),
            'Heidelberg': (49.4076, 8.6905), 'Darmstadt': (49.8728, 8.6512),
        },
        'USA': {
            'New York': (40.7128, -74.0060), 'Los Angeles': (34.0522, -118.2437), 'Chicago': (41.8781, -87.6298),
            'Houston': (29.7604, -95.3698), 'Miami': (25.7617, -80.1918),
        },
        'UK': {'London': (51.5074, -0.1278), 'Manchester': (53.4808, -2.2426), 'Birmingham': (52.4862, -1.8904)},
        'France': {'Paris': (48.8566, 2.3522), 'Marseille': (43.2965, 5.3698), 'Lyon': (45.7640, 4.8357)},
        'China': {'Shanghai': (31.2304, 121.4737), 'Beijing': (39.9042, 116.4074), 'Guangzhou': (23.1300, 113.2600)},
        'Brazil': {'Rio de Janeiro': (-22.9068, -43.1729), 'Sao Paulo': (-23.5505, -46.6333), 'Brasilia': (-15.7801, -47.9292)},
        'Australia': {'Sydney': (-33.8688, 151.2093), 'Melbourne': (-37.8136, 144.9631), 'Brisbane': (-27.4698, 153.0251)},
        'Japan': {'Tokyo': (35.6762, 139.6503), 'Osaka': (34.6937, 135.5023)},
        'UAE': {'Dubai': (25.2048, 55.2708)},
        'India': {'Mumbai': (19.0760, 72.8777)},
        'Russia': {'Moscow': (55.7558, 37.6173)},
        'Canada': {'Toronto': (43.6532, -79.3832)},
        'Mexico': {'Mexico City': (19.4326, -99.1332)},
    }
    CITY_COORDINATES = {
        city: coords
        for cities in CITIES_BY_COUNTRY.values()
        for city, coords in cities.items()
    }
    CITY_TO_COUNTRY = {
        city: country
        for country, cities in CITIES_BY_COUNTRY.items()
        for city in cities
    }
    ALL_CITIES = list(CITY_COORDINATES)  # German cities first, then international

    # Countries treated as "European" for route classification.
    EUROPEAN_COUNTRIES = {'Germany', 'UK', 'France'}

    # Cities between which sea transport ('See') may be drawn.
    COASTAL_CITIES = {
        'Hamburg', 'Kiel', 'Rostock', 'Lübeck', 'New York', 'Los Angeles', 'Miami', 'London',
        'Marseille', 'Rio de Janeiro', 'Sydney', 'Melbourne', 'Brisbane', 'Dubai', 'Mumbai', 'Osaka',
    }

    def haversine(lat1, lon1, lat2, lon2):
        """Great-circle distance in kilometres between two (lat, lon) points in degrees."""
        R = 6371  # Earth radius in kilometres
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        return R * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    def classify_route(supplier_country, destination_country):
        """Map a country pair onto one of the three route_type labels."""
        if supplier_country == 'Germany' and destination_country == 'Germany':
            return 'domestic_germany'
        # Any route that stays inside the European set counts as intra-European,
        # including same-country pairs such as London -> Manchester. Requiring
        # the countries to differ used to push those into 'intercontinental'.
        if supplier_country in EUROPEAN_COUNTRIES and destination_country in EUROPEAN_COUNTRIES:
            return 'intra_european'
        # NOTE(review): same-country routes outside Europe (e.g. New York ->
        # Chicago) still end up here because there is no label for them; they
        # are at least exempt from customs below. Consider a dedicated label.
        return 'intercontinental'

    def pick_transport_mode(route_type, dist, both_coastal):
        """Draw a transport mode for one shipment from the route/distance rules."""
        if route_type == 'intercontinental':
            if not (both_coastal and dist > 1000):
                # Non-coastal or short intercontinental legs always go by air.
                return 'Luft'
            modes, weights = ['Luft', 'See'], [0.7, 0.3]
        elif dist < 100:
            # Short haul: mostly road, sometimes rail.
            modes, weights = ['Straße', 'Schiene'], [0.8, 0.2]
        elif dist < 500:
            # Medium haul: road, rail, occasionally air.
            modes, weights = ['Straße', 'Schiene', 'Luft'], [0.5, 0.3, 0.2]
        elif both_coastal:
            # Long haul between two ports: sea becomes an option.
            modes, weights = ['Schiene', 'Luft', 'See'], [0.4, 0.3, 0.3]
        else:
            modes, weights = ['Schiene', 'Luft'], [0.6, 0.4]
        return str(np.random.choice(modes, p=weights))

    # Independent per-order features.
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': np.random.choice(material_types, num_samples),
        'quantity': np.random.randint(1, 1000, num_samples),
        'supplier_location': np.random.choice(ALL_CITIES, num_samples),
        'destination_location': np.random.choice(ALL_CITIES, num_samples),
        'order_date': [datetime(2023, 1, 1) + timedelta(days=np.random.randint(0, 365)) for _ in range(num_samples)],
        'processing_time_days': np.random.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': np.random.choice(weather_conditions, num_samples),
        'holiday_season': np.random.choice([True, False], num_samples, p=[0.1, 0.9])
    }
    df = pl.DataFrame(data)

    # Features derived from the (supplier, destination) pair of each order.
    distances = []
    transport_modes_dynamic = []
    route_types = []
    customs_clearance_days = []

    # Walk both location columns once instead of indexing the frame per row.
    for sup_loc, dest_loc in zip(df['supplier_location'].to_list(),
                                 df['destination_location'].to_list()):
        if sup_loc == dest_loc:
            # Same city: use a small random local distance rather than 0 km.
            dist = np.random.uniform(10, 50)
        else:
            dist = haversine(*CITY_COORDINATES[sup_loc], *CITY_COORDINATES[dest_loc])
        distances.append(dist)

        supplier_country = CITY_TO_COUNTRY[sup_loc]
        destination_country = CITY_TO_COUNTRY[dest_loc]
        route_type = classify_route(supplier_country, destination_country)
        route_types.append(route_type)

        # Customs only applies when an intercontinental shipment actually
        # crosses a border. Built-in round() is used because
        # np.random.uniform with scalar bounds returns a plain float, which
        # has no .round() method.
        if route_type == 'intercontinental' and supplier_country != destination_country:
            customs_clearance_days.append(round(float(np.random.uniform(2, 7)), 1))
        else:
            customs_clearance_days.append(0.0)

        both_coastal = sup_loc in COASTAL_CITIES and dest_loc in COASTAL_CITIES
        transport_modes_dynamic.append(pick_transport_mode(route_type, dist, both_coastal))

    df = df.with_columns(
        pl.Series(name='distance_km', values=np.array(distances).round(2)),
        pl.Series(name='transport_mode', values=transport_modes_dynamic),
        pl.Series(name='route_type', values=route_types),
        pl.Series(name='customs_clearance_days', values=np.array(customs_clearance_days).round(1)),
    )

    days = pl.col('delivery_time_days')
    mode = pl.col('transport_mode')
    weather = pl.col('weather_conditions')

    # Target variable, step 1: base transit time at an assumed 200 km per day.
    df = df.with_columns((pl.col('distance_km') / 200).alias('delivery_time_days'))

    # Step 2: transport-mode multiplier (road, 'Straße', keeps the base time).
    df = df.with_columns(
        pl.when(mode == 'Luft').then(days * 0.5)       # air: half the base time
        .when(mode == 'Schiene').then(days * 0.8)      # rail: slightly faster
        .when(mode == 'See').then(days * 2.5)          # sea: much slower
        .otherwise(days)
        .alias('delivery_time_days')
    )

    # Step 3: weather multiplier.
    df = df.with_columns(
        pl.when(weather.is_in(['Schnee', 'Sturm'])).then(days * 1.5)
        .when(weather == 'Nebel').then(days * 1.2)
        .otherwise(days)
        .alias('delivery_time_days')
    )

    # Step 4: add order processing time.
    df = df.with_columns((days + pl.col('processing_time_days')).alias('delivery_time_days'))

    # Step 5: holiday season stretches transit plus processing by 30 %.
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(days * 1.3)
        .otherwise(days)
        .alias('delivery_time_days')
    )

    # Step 6: customs clearance is added after the multipliers, before noise.
    df = df.with_columns((days + pl.col('customs_clearance_days')).alias('delivery_time_days'))

    # Step 7: Gaussian noise (mean 0, standard deviation 2 days).
    df = df.with_columns(
        (days + pl.Series(name="noise", values=np.random.normal(0, 2, num_samples))).alias('delivery_time_days')
    )

    # Step 8: round to one decimal and enforce a minimum of one day.
    df = df.with_columns(
        pl.when(days.round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(days.round(1))
        .alias('delivery_time_days')
    )

    # Return order_date as an ISO 'YYYY-MM-DD' string rather than a datetime.
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: generate 15000 synthetic delivery records.
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset.
print(synthetic_data.head())

# Column names and dtypes (the Polars schema), printed under its own heading.
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics as reported by DataFrame.describe().
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

shape: (5, 14)
┌──────────┬────────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ order_id ┆ material_t ┆ quantity ┆ supplier_ ┆ … ┆ transport ┆ route_typ ┆ customs_c ┆ delivery_ │
│ ---      ┆ ype        ┆ ---      ┆ location  ┆   ┆ _mode     ┆ e         ┆ learance_ ┆ time_days │
│ i64      ┆ ---        ┆ i64      ┆ ---       ┆   ┆ ---       ┆ ---       ┆ days      ┆ ---       │
│          ┆ str        ┆          ┆ str       ┆   ┆ str       ┆ str       ┆ ---       ┆ f64       │
│          ┆            ┆          ┆           ┆   ┆           ┆           ┆ f64       ┆           │
╞══════════╪════════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ Chemikalie ┆ 854      ┆ Solingen  ┆ … ┆ Straße    ┆ domestic_ ┆ 0.0       ┆ 6.4       │
│          ┆ n          ┆          ┆           ┆   ┆           ┆ germany   ┆           ┆           │
│ 2        ┆ Fertigprod ┆ 814      ┆ Chicago   ┆ … ┆ Luft      ┆ intercont ┆

**Reasoning**:
The previous code raised an `AttributeError` because `np.random.uniform(2, 7)`, called without a `size` argument, returns a plain Python `float`, which does not have a `.round()` method. To fix this, I replaced `.round(1)` with `np.round(..., 1)` so that the value is rounded correctly before it is appended to the `customs_clearance_days` list. This allows the data generation to proceed as intended, incorporating customs clearance days for intercontinental routes.



In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

def generate_delivery_data(num_samples=15000, seed=None):
    """
    Generate a synthetic dataset for delivery-time prediction.

    Each record describes one order shipped between two cities. The target
    column ``delivery_time_days`` is derived from the great-circle distance,
    the transport mode, weather, processing time, holiday season and customs
    clearance, plus Gaussian noise, and is clamped to a minimum of 1.0 day.

    Args:
        num_samples (int): Number of records to generate.
        seed (int | None): Optional seed for reproducible output. When ``None``
            (the default) the global ``np.random`` state is used, exactly as
            before this parameter existed.

    Returns:
        pl.DataFrame: One row per order with the columns ``order_id``,
        ``material_type``, ``quantity``, ``supplier_location``,
        ``destination_location``, ``order_date`` (``'YYYY-MM-DD'`` string),
        ``processing_time_days``, ``weather_conditions``, ``holiday_season``,
        ``distance_km``, ``transport_mode``, ``route_type``,
        ``customs_clearance_days`` and ``delivery_time_days``.
    """
    # Legacy RandomState API on purpose: with seed=None this is the global
    # np.random module, so existing callers see unchanged behaviour.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Categorical feature values
    material_types = ['Elektronik', 'Rohstoffe', 'Fertigprodukte', 'Bauteile', 'Chemikalien']
    weather_conditions = ['Klar', 'Regnerisch', 'Schnee', 'Nebel', 'Sturm']

    # Approximate coordinates for the cities (latitude, longitude)
    CITY_COORDINATES = {
        'Berlin': (52.5200, 13.4050), 'München': (48.1351, 11.5820), 'Hamburg': (53.5511, 9.9937),
        'Frankfurt': (50.1109, 8.6821), 'Stuttgart': (48.7758, 9.1829), 'Köln': (50.9375, 6.9603),
        'Düsseldorf': (51.2277, 6.7735), 'Leipzig': (51.3397, 12.3731), 'Dresden': (51.0504, 13.7373),
        'Hannover': (52.3759, 9.7320), 'Nürnberg': (49.4521, 11.0767), 'Duisburg': (51.4332, 6.7663),
        'Bochum': (51.4818, 7.2165), 'Wuppertal': (51.2562, 7.1507), 'Bielefeld': (52.0211, 8.5347),
        'Bonn': (50.7374, 7.0982), 'Münster': (51.9616, 7.6280), 'Karlsruhe': (49.0069, 8.4037),
        'Mannheim': (49.4875, 8.4660), 'Augsburg': (48.3705, 10.8978), 'Wiesbaden': (50.0833, 8.2333),
        'Gelsenkirchen': (51.5167, 7.1000), 'Mönchengladbach': (51.1960, 6.4385), 'Braunschweig': (52.2690, 10.5215),
        'Chemnitz': (50.8333, 12.9167), 'Kiel': (54.3233, 10.1228), 'Aachen': (50.7750, 6.0833),
        'Halle (Saale)': (51.4828, 11.9692), 'Magdeburg': (52.1205, 11.6276), 'Freiburg im Breisgau': (47.9990, 7.8421),
        'Krefeld': (51.3391, 6.5847), 'Lübeck': (53.8655, 10.6865), 'Oberhausen': (51.4691, 6.8660),
        'Erfurt': (50.9787, 11.0292), 'Mainz': (50.0000, 8.2711), 'Rostock': (54.0900, 12.1400),
        'Kassel': (51.3172, 9.4926), 'Hagen': (51.3592, 7.4692), 'Hamm': (51.6811, 7.8174),
        'Saarbrücken': (49.2333, 7.0000), 'Herne': (51.5333, 7.2167), 'Mülheim an der Ruhr': (51.4326, 6.8837),
        'Osnabrück': (52.2793, 8.0494), 'Solingen': (51.1712, 7.0836), 'Leverkusen': (51.0406, 6.9912),
        'Heidelberg': (49.4076, 8.6905), 'Darmstadt': (49.8728, 8.6512),
        # International coordinates
        'New York': (40.7128, -74.0060), 'Los Angeles': (34.0522, -118.2437), 'Chicago': (41.8781, -87.6298), 'Houston': (29.7604, -95.3698), 'Miami': (25.7617, -80.1918),
        'London': (51.5074, -0.1278), 'Manchester': (53.4808, -2.2426), 'Birmingham': (52.4862, -1.8904),
        'Paris': (48.8566, 2.3522), 'Marseille': (43.2965, 5.3698), 'Lyon': (45.7640, 4.8357),
        'Shanghai': (31.2304, 121.4737), 'Beijing': (39.9042, 116.4074), 'Guangzhou': (23.1300, 113.2600),
        'Rio de Janeiro': (-22.9068, -43.1729), 'Sao Paulo': (-23.5505, -46.6333), 'Brasilia': (-15.7801, -47.9292),
        'Sydney': (-33.8688, 151.2093), 'Melbourne': (-37.8136, 144.9631), 'Brisbane': (-27.4698, 153.0251),
        'Tokyo': (35.6762, 139.6503), 'Osaka': (34.6937, 135.5023),
        'Dubai': (25.2048, 55.2708),
        'Mumbai': (19.0760, 72.8777),
        'Moscow': (55.7558, 37.6173),
        'Toronto': (43.6532, -79.3832),
        'Mexico City': (19.4326, -99.1332)
    }

    # Every city with known coordinates can be a supplier or a destination.
    # Derived from CITY_COORDINATES so the two can never drift apart.
    ALL_CITIES = list(CITY_COORDINATES)

    # Mapping of cities to countries
    CITY_TO_COUNTRY = {
        **{city: 'Germany' for city in [
            'Berlin', 'München', 'Hamburg', 'Frankfurt', 'Stuttgart', 'Köln', 'Düsseldorf',
            'Leipzig', 'Dresden', 'Hannover', 'Nürnberg', 'Duisburg', 'Bochum', 'Wuppertal',
            'Bielefeld', 'Bonn', 'Münster', 'Karlsruhe', 'Mannheim', 'Augsburg', 'Wiesbaden',
            'Gelsenkirchen', 'Mönchengladbach', 'Braunschweig', 'Chemnitz', 'Kiel', 'Aachen',
            'Halle (Saale)', 'Magdeburg', 'Freiburg im Breisgau', 'Krefeld', 'Lübeck', 'Oberhausen',
            'Erfurt', 'Mainz', 'Rostock', 'Kassel', 'Hagen', 'Hamm', 'Saarbrücken', 'Herne',
            'Mülheim an der Ruhr', 'Osnabrück', 'Solingen', 'Leverkusen', 'Heidelberg', 'Darmstadt'
        ]},
        'New York': 'USA', 'Los Angeles': 'USA', 'Chicago': 'USA', 'Houston': 'USA', 'Miami': 'USA',
        'London': 'UK', 'Manchester': 'UK', 'Birmingham': 'UK',
        'Paris': 'France', 'Marseille': 'France', 'Lyon': 'France',
        'Shanghai': 'China', 'Beijing': 'China', 'Guangzhou': 'China',
        'Rio de Janeiro': 'Brazil', 'Sao Paulo': 'Brazil', 'Brasilia': 'Brazil',
        'Sydney': 'Australia', 'Melbourne': 'Australia', 'Brisbane': 'Australia',
        'Tokyo': 'Japan', 'Osaka': 'Japan',
        'Dubai': 'UAE',
        'Mumbai': 'India',
        'Moscow': 'Russia',
        'Toronto': 'Canada',
        'Mexico City': 'Mexico'
    }

    # European countries used for route classification
    EUROPEAN_COUNTRIES = {'Germany', 'UK', 'France'}

    # Coastal cities; 'See' transport is only offered when both ends are coastal
    COASTAL_CITIES = {
        'Hamburg', 'Kiel', 'Rostock', 'Lübeck', 'New York', 'Los Angeles', 'Miami', 'London',
        'Marseille', 'Rio de Janeiro', 'Sydney', 'Melbourne', 'Brisbane', 'Dubai', 'Mumbai', 'Osaka'
    }

    def haversine(lat1, lon1, lat2, lon2):
        """Great-circle distance in kilometres between two lat/lon points (degrees)."""
        R = 6371  # Earth radius in kilometers
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        return R * c

    def classify_route(supplier_country, destination_country):
        """Map a country pair to 'domestic_germany', 'intra_european' or 'intercontinental'."""
        if supplier_country == 'Germany' and destination_country == 'Germany':
            return 'domestic_germany'
        # Any other pair of European countries counts as intra-European. This
        # includes routes inside a single European country (e.g. London ->
        # Manchester), which previously fell through to 'intercontinental'
        # and were charged customs clearance days.
        if supplier_country in EUROPEAN_COUNTRIES and destination_country in EUROPEAN_COUNTRIES:
            return 'intra_european'
        # NOTE: routes inside one non-European country (e.g. New York -> Chicago)
        # still land here because there is no dedicated label for them.
        return 'intercontinental'

    def transport_options(route_type, dist, both_coastal):
        """Return (modes, probabilities) from which the transport mode is drawn."""
        if route_type == 'intercontinental':
            if both_coastal and dist > 1000:
                # Long coastal intercontinental routes: mostly air, sometimes sea
                return ['Luft', 'See'], [0.7, 0.3]
            # Non-coastal or shorter intercontinental routes: air only
            return ['Luft'], [1.0]
        if dist < 100:
            # Short distances: road, sometimes rail
            return ['Straße', 'Schiene'], [0.8, 0.2]
        if dist < 500:
            # Medium distances: road, rail, sometimes air
            return ['Straße', 'Schiene', 'Luft'], [0.5, 0.3, 0.2]
        if both_coastal:
            # Long distances between two coastal cities: rail, air or sea
            return ['Schiene', 'Luft', 'See'], [0.4, 0.3, 0.3]
        # Long distances otherwise: rail or air
        return ['Schiene', 'Luft'], [0.6, 0.4]

    # Independent random base features
    data = {
        'order_id': list(range(1, num_samples + 1)),
        'material_type': rng.choice(material_types, num_samples),
        'quantity': rng.randint(1, 1000, num_samples),
        'supplier_location': rng.choice(ALL_CITIES, num_samples),
        'destination_location': rng.choice(ALL_CITIES, num_samples),
        'order_date': [datetime(2023, 1, 1) + timedelta(days=rng.randint(0, 365)) for _ in range(num_samples)],
        'processing_time_days': rng.uniform(0.5, 5, num_samples).round(1),
        'weather_conditions': rng.choice(weather_conditions, num_samples),
        'holiday_season': rng.choice([True, False], num_samples, p=[0.1, 0.9])
    }

    df = pl.DataFrame(data)

    # Route-dependent features, derived row by row from the city pair.
    # Iterating over plain Python lists avoids one DataFrame cell lookup per row.
    supplier_cities = data['supplier_location'].tolist()
    destination_cities = data['destination_location'].tolist()

    distances = []
    transport_modes_dynamic = []
    route_types = []
    customs_clearance_days = []

    for sup_loc, dest_loc in zip(supplier_cities, destination_cities):
        if sup_loc == dest_loc:
            # Same city on both ends: use a small random distance instead of zero
            dist = rng.uniform(10, 50)
        else:
            lat1, lon1 = CITY_COORDINATES[sup_loc]
            lat2, lon2 = CITY_COORDINATES[dest_loc]
            dist = haversine(lat1, lon1, lat2, lon2)
        distances.append(dist)

        route_type = classify_route(CITY_TO_COUNTRY[sup_loc], CITY_TO_COUNTRY[dest_loc])
        route_types.append(route_type)

        # Only intercontinental routes incur customs clearance (2 to 7 days)
        if route_type == 'intercontinental':
            customs_clearance_days.append(np.round(rng.uniform(2, 7), 1))
        else:
            customs_clearance_days.append(0.0)

        both_coastal = sup_loc in COASTAL_CITIES and dest_loc in COASTAL_CITIES
        modes, probabilities = transport_options(route_type, dist, both_coastal)
        if len(modes) == 1:
            # Nothing to draw when only one mode is possible
            transport_modes_dynamic.append(modes[0])
        else:
            # str() turns the numpy string scalar into a plain Python str
            transport_modes_dynamic.append(str(rng.choice(modes, p=probabilities)))

    df = df.with_columns(
        pl.Series(name='distance_km', values=np.array(distances).round(2)),
        pl.Series(name='transport_mode', values=transport_modes_dynamic),
        pl.Series(name='route_type', values=route_types),
        pl.Series(name='customs_clearance_days', values=np.array(customs_clearance_days).round(1))
    )

    # Target variable: start from a base speed of 200 km per day
    df = df.with_columns(
        (pl.col('distance_km') / 200).alias('delivery_time_days')
    )

    # Transport mode multiplier
    df = df.with_columns(
        pl.when(pl.col('transport_mode') == 'Luft').then(pl.col('delivery_time_days') * 0.5)  # air is faster
        .when(pl.col('transport_mode') == 'Schiene').then(pl.col('delivery_time_days') * 0.8)  # rail is faster
        .when(pl.col('transport_mode') == 'See').then(pl.col('delivery_time_days') * 2.5)  # sea is slower (longer routes)
        .otherwise(pl.col('delivery_time_days'))  # 'Straße' keeps the base time
        .alias('delivery_time_days')
    )

    # Weather multiplier
    df = df.with_columns(
        pl.when(pl.col('weather_conditions').is_in(['Schnee', 'Sturm'])).then(pl.col('delivery_time_days') * 1.5)
        .when(pl.col('weather_conditions') == 'Nebel').then(pl.col('delivery_time_days') * 1.2)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Processing time is added on top of the transit time
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.col('processing_time_days')).alias('delivery_time_days')
    )

    # Holiday season slows the total (transit + processing) by 30 %
    df = df.with_columns(
        pl.when(pl.col('holiday_season')).then(pl.col('delivery_time_days') * 1.3)
        .otherwise(pl.col('delivery_time_days'))
        .alias('delivery_time_days')
    )

    # Customs clearance is added after the multipliers, before noise and rounding
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.col('customs_clearance_days')).alias('delivery_time_days')
    )

    # Gaussian noise (mean 0, standard deviation 2 days)
    df = df.with_columns(
        (pl.col('delivery_time_days') + pl.Series(name="noise", values=rng.normal(0, 2, num_samples))).alias('delivery_time_days')
    )

    # Round to one decimal and enforce a minimum delivery time of 1 day
    df = df.with_columns(
        pl.when(pl.col('delivery_time_days').round(1) < 1.0)
        .then(pl.lit(1.0))
        .otherwise(pl.col('delivery_time_days').round(1))
        .alias('delivery_time_days')
    )

    # Store order_date as a 'YYYY-MM-DD' string
    df = df.with_columns(
        pl.col('order_date').dt.strftime('%Y-%m-%d').alias('order_date')
    )

    return df

# Example run: build a synthetic dataset with 15,000 records
synthetic_data = generate_delivery_data(num_samples=15000)

# Preview the first five rows of the generated dataset
print(synthetic_data.head(5))

# Column names and dtypes (the Polars schema), preceded by a blank line and heading
print("\nDatensatz-Schema:", synthetic_data.schema, sep="\n")

# Summary statistics as produced by DataFrame.describe()
print("\nStatistische Zusammenfassung:", synthetic_data.describe(), sep="\n")

shape: (5, 14)
┌──────────┬────────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ order_id ┆ material_t ┆ quantity ┆ supplier_ ┆ … ┆ transport ┆ route_typ ┆ customs_c ┆ delivery_ │
│ ---      ┆ ype        ┆ ---      ┆ location  ┆   ┆ _mode     ┆ e         ┆ learance_ ┆ time_days │
│ i64      ┆ ---        ┆ i64      ┆ ---       ┆   ┆ ---       ┆ ---       ┆ days      ┆ ---       │
│          ┆ str        ┆          ┆ str       ┆   ┆ str       ┆ str       ┆ ---       ┆ f64       │
│          ┆            ┆          ┆           ┆   ┆           ┆           ┆ f64       ┆           │
╞══════════╪════════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1        ┆ Fertigprod ┆ 839      ┆ Mannheim  ┆ … ┆ Schiene   ┆ domestic_ ┆ 0.0       ┆ 3.0       │
│          ┆ ukte       ┆          ┆           ┆   ┆           ┆ germany   ┆           ┆           │
│ 2        ┆ Elektronik ┆ 782      ┆ Paris     ┆ … ┆ Straße    ┆ intra_eur ┆

## Summary:

### Data Analysis Key Findings
*   **Expanded Geographical Scope**: The `generate_delivery_data` function now includes a much larger set of international cities (e.g., New York, London, Shanghai, Sydney) alongside German cities, expanding the potential origin and destination points for deliveries.
*   **Comprehensive Location Mapping**: Geographical coordinates have been added for all new international cities, and a `CITY_TO_COUNTRY` mapping was established, which accurately links each city to its respective country. A `COASTAL_CITIES` list was also updated to include international port cities.
*   **Route Classification Introduced**: A new `route_type` column has been successfully implemented, classifying each delivery route as 'domestic\_germany', 'intra\_european', or 'intercontinental' based on the countries of the supplier and destination.
*   **Refined Transport Mode Assignment**: The logic for assigning transport modes (`Straße`, `Schiene`, `Luft`, `See`) has been made more granular. It now considers both distance and the newly introduced `route_type`. For instance, 'Luft' is prioritized for intercontinental routes, and 'See' is used for long intercontinental coastal routes, reflecting more realistic shipping practices.
*   **Incorporation of Customs Clearance**: A new `customs_clearance_days` column has been added to the dataset. Intercontinental routes are assigned a random customs clearance time between 2 and 7 days, while domestic and intra-European routes have 0 days, adding a crucial realistic delay factor to international shipments.
*   **Adjusted Delivery Time Calculation**: The overall `delivery_time_days` calculation now accounts for `customs_clearance_days` for international shipments, in addition to existing factors like distance, transport mode multipliers (e.g., 'Luft' x0.5, 'See' x2.5), processing time, weather conditions, and holiday seasons.

### Insights or Next Steps
*   The enriched synthetic dataset, featuring international routes, diverse transport modes, and customs delays, is now significantly more realistic and robust for training machine learning models to predict delivery times in complex global logistics scenarios.
*   The next logical step is to utilize this comprehensive dataset to train and evaluate predictive models, potentially exploring the differential impact of `route_type` and `customs_clearance_days` on prediction accuracy.


In [None]:
# Write the generated dataset to a CSV file in the working directory (Polars writer)
synthetic_data.write_csv(file='delivery_data.csv')

# Confirm the export to the reader
print('Datensatz erfolgreich als delivery_data.csv exportiert.')

Datensatz erfolgreich als delivery_data.csv exportiert.
