In [0]:
%pip install faker
# Restart the notebook's Python process right after the install
# (presumably so the newly installed package is importable by the cells below).
dbutils.library.restartPython()

In [0]:
import yaml

# Parse the notebook configuration; the relative path is resolved against the
# current working directory.
with open('params.yml', 'r') as params_file:
    params = yaml.safe_load(params_file)

# Catalog and schema names that prefix every table name and volume path used below.
data_params = params.get('data_params')
CATALOG, SCHEMA = (data_params.get(key) for key in ('catalog', 'schema'))

In [0]:
import pandas as pd 

# Column name -> comment stored on the corresponding table column.
# Names and comments live in ONE mapping so they cannot drift out of step the
# way two parallel lists joined with zip() can (zip silently truncates).
column_comments = {
    'radio': "Network type. One of the strings GSM, UMTS, LTE or CDMA.",
    'mcc': "Mobile Country Code, for example 260 for Poland.",
    'net': "For GSM, UMTS and LTE networks, this is the Mobile Network Code (MNC). For CDMA networks, this is the System IDentification number (SID).",
    'area': "Location Area Code (LAC) for GSM and UMTS networks. Tracking Area Code (TAC) for LTE networks. Network IDenfitication number (NID) for CDMA networks.",
    'cell': "Cell ID (CID) for GSM and LTE networks. UTRAN Cell ID / LCID for UMTS networks, which is the concatenation of 2 or 4 bytes of Radio Network Controller (RNC) code and 4 bytes of Cell ID. Base station IDentifier number (BID) for CDMA networks.",
    'unit': "Primary Scrambling Code (PSC) for UMTS networks. Physical Cell ID (PCI) for LTE networks. An empty value for GSM and CDMA networks.",
    'lon': "Longitude in degrees between -180.0 and 180.0 changeable=1: average of longitude values of all related measurements changeable=0: exact GPS position of the cell tower",
    'lat': "Latitude in degrees between -90.0 and 90.0 changeable=1: average of latitude values of all related measurements changeable=0: exact GPS position of the tower",
    'range': "Estimate of cell range, in meters.",
    'samples': "Total number of measurements assigned to the cell tower",
    'changeable': "Defines if coordinates of the cell tower are exact or approximate. changeable=1: the GPS position of the cell tower has been calculated from all available measurements changeable=0: the GPS position of the cell tower is precise - no measurements have been used to calculate it.",
    'created': "The first time when the cell tower was seen and added to the OpenCellID database. A date in timestamp format: number of seconds since the UTC Unix Epoch of 1970-01-01T00:00:00Z For example 1409522613 is the timestamp for 2014-08-31T22:03:33Z.",
    'updated': "The last time when the cell tower was seen and update. A date in timestamp format: number of seconds since the UTC Unix Epoch of 1970-01-01T00:00:00Z For example 1409522613 is the timestamp for 2014-08-31T22:03:33Z.",
    'averageSignal': "Average signal strength.",
}
colnames = list(column_comments)

# Explicit `names` are supplied, so pandas treats the first line of the file as data.
mobilelocations = pd.read_csv(f"/Volumes/{CATALOG}/{SCHEMA}/{data_params.get('towers_volume')}/Tower Locations.csv", names=colnames)

spark.createDataFrame(mobilelocations).write.saveAsTable(f"{CATALOG}.{SCHEMA}.mobilelocations", mode="overwrite")

# Attach the comments. The loop variable is deliberately NOT called `col`: at
# notebook scope that would rebind the name later cells use for
# pyspark.sql.functions.col.
for column_name, comment in column_comments.items():
    # Escape backslashes and single quotes so the comment stays a valid SQL string literal.
    escaped_comment = comment.replace("\\", "\\\\").replace("'", "\\'")
    # Backticks keep column names such as `range` from being parsed as keywords.
    spark.sql(f"ALTER TABLE {CATALOG}.{SCHEMA}.mobilelocations CHANGE COLUMN `{column_name}` COMMENT '{escaped_comment}'")

In [0]:
from pyspark.sql.functions import col, expr, from_unixtime

# Re-read the raw tower table and convert the epoch-second columns to timestamps.
mobilelocations = spark.table(f"{CATALOG}.{SCHEMA}.mobilelocations")
for epoch_column in ("created", "updated"):
    mobilelocations = mobilelocations.withColumn(
        epoch_column, from_unixtime(col(epoch_column)).cast("timestamp")
    )

# Keep LTE towers inside a fixed latitude/longitude box (bounds inclusive);
# the `sf_` prefix suggests the box is meant to cover San Francisco.
inside_lat_band = col("lat").between(37.6, 37.9)
inside_lon_band = col("lon").between(-123.0, -122.3)
is_lte = col("radio") == 'LTE'

# Every surviving tower gets a freshly generated identifier.
sf_mobilelocations = (
    mobilelocations
    .filter(inside_lat_band & inside_lon_band & is_lte)
    .withColumn("tower_id", expr("uuid()"))
)

(
    sf_mobilelocations.write
    .option("mergeSchema", "true")
    .saveAsTable(f"{CATALOG}.{SCHEMA}.sf_mobilelocations", mode="overwrite")
)

# Field Technician Data Simulation


In [0]:
from pyspark.sql import Row
import pandas as pd
import random
from faker import Faker

fake = Faker()

# How many technicians to simulate, as configured in params.yml.
n_fieldtechs = params.get('simulation_params').get('n_fieldtechs')


def _new_technician():
    """Build one technician Row: a UUID, a fake name and a random 4-digit truck number."""
    return Row(tech_id=fake.uuid4(), name=fake.name(), truckid=random.randint(1000, 9999))


tech_data = [_new_technician() for _ in range(n_fieldtechs)]

# Turn the Rows into a DataFrame, persist it, then show it.
field_technicians = spark.createDataFrame(tech_data)
field_technicians.write.mode('overwrite').saveAsTable(f"{CATALOG}.{SCHEMA}.field_technicians")
display(field_technicians)

In [0]:
# The candidate scenarios are taken once, on the driver, from the configuration
# this notebook already loaded into `params`. The UDF captures the list, so it no
# longer re-opens and re-parses a YAML file at a hard-coded absolute workspace
# path for every row it is evaluated on.
_scenario_choices = list(params.get('Scenarios') or [])
if not _scenario_choices:
    raise ValueError("No 'Scenarios' entries found in params; nothing to choose from")


def _pick_scenario():
    """Return one scenario picked at random from the configured list."""
    import random  # imported locally, as before, so the function is self-contained on executors
    return random.choice(_scenario_choices)


# Zero-argument UDF with the default (string) return type. It is declared
# non-deterministic so Spark does not assume repeated evaluations return the
# same value when optimizing a query.
choose_scenario = udf(_pick_scenario).asNondeterministic()

# Calling the UDF here only builds a Column expression; nothing is evaluated
# until the column is used in a DataFrame.
choose_scenario()

In [0]:
# Persist the configured scenarios as a single-column table.
scenarios = params.get("Scenarios")
scenarios_df = spark.createDataFrame([(scenario,) for scenario in scenarios], ["scenarios"])
scenarios_df.write.mode('overwrite').saveAsTable(f"{CATALOG}.{SCHEMA}.scenarios")
# Show the DataFrame rather than the raw Python list, so the preview matches
# what was written to the table.
display(scenarios_df)

In [0]:
# Device models; the next cell attaches this full list to every tower.
devicelist = [
    'Nokia AirScale Baseband Unit',
    'Ericsson RBS 2106',
    'Ericsson RRH 6000',
    'Ericsson AIR Antenna 5331',
    'Delta D750 DC Power System',
    'Delta DPR 4000B EnergE',
    'Generac MDG25IF4-STD3',
    'Swift Sensors SS3-301 Wireless Door Sensor',
    'Cisco Meraki MT10 Sensor',
    'Cisco Catalyst 2960',
    'Cisco ISR 4000',
]

In [0]:
from pyspark.sql.functions import lit, explode, expr
tower_devices_table = f"{CATALOG}.{SCHEMA}.tower_devices"

# One row per (tower, device): attach the full device list to every tower,
# explode it into rows, and give each resulting row its own generated id.
towers = (
    spark.table(f"{CATALOG}.{SCHEMA}.sf_mobilelocations")
    .withColumn('devices', lit(devicelist))
    .withColumn('device', explode('devices'))
    .drop('devices')
    .withColumn('device_id', expr("uuid()"))
)
towers.write.mode('overwrite').saveAsTable(tower_devices_table)

# The plan above is lazy and uuid() yields new values on every evaluation, so
# reusing that DataFrame (e.g. in display) would show device_ids different from
# the ones just saved. Re-bind `towers` to the persisted table instead.
towers = spark.table(tower_devices_table)

In [0]:
# Preview the `towers` DataFrame (one row per tower/device pair) bound by the previous cell.
display(towers)

In [0]:
from pyspark.sql.functions import col, sequence, to_date, expr
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
import random

from datetime import timedelta

# Schema of the simulated rows; both dates are stored as ISO-formatted strings.
schema = StructType([
    StructField("tower_id", IntegerType(), False),
    StructField("device_name", StringType(), False),
    StructField("outage_detected_date", StringType(), False),
    StructField("outage_resolved_date", StringType(), False)
])

# One row per calendar day between the two dates (inclusive), generated by Spark.
start_date = "2020-06-06"
end_date = "2025-06-06"
date_seq = spark.sql(f"SELECT sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day) as date_seq").selectExpr("explode(date_seq) as date")

# Simulate data: for every day, one outage per (tower, device), resolved the next day.
data = []
for row in date_seq.collect():
    date = row['date']  # collected to the driver as a Python datetime.date
    outage_detected_date = date
    # Plain Python date arithmetic. Adding a Spark `expr(...)` Column here would
    # produce a Column object whose str() is a "Column<...>" repr, not a date.
    outage_resolved_date = date + timedelta(days=1)
    for tower_id in range(1, 11):  # Assuming 10 towers
        for device_id in range(1, 6):  # Assuming 5 devices per tower
            data.append((tower_id, f"device_{device_id}", str(outage_detected_date), str(outage_resolved_date)))

# Create DataFrame
simulated_df = spark.createDataFrame(data, schema)

# Display the simulated data
display(simulated_df)

In [0]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker (the instance is not used anywhere in this cell)
fake = Faker()

# Set seed for reproducibility; DataFrame.sample below draws from the same
# global NumPy state because no random_state is passed.
np.random.seed(42)
towers = spark.table(f"{CATALOG}.{SCHEMA}.tower_devices")

# Create device dataframe: one (tower_id, device_id) pair per row, on the driver
device_df = towers.select(['tower_id', 'device_id']).toPandas()

# Take a single clock reading so the window start, the window end and the
# "last month" cut-off are all measured from the same instant.
now = datetime.now()
start_date = now - timedelta(days=5*365)  # Five years ago
end_date = now
last_month_start = now - timedelta(days=30)  # Define last month's start

dates = pd.date_range(start_date, end_date, freq='D')

# Exclusive upper bound for the number of outages per day. Clamped to at least 1
# because np.random.randint(0, 0) raises ValueError, which is what happens when
# the device table has fewer than 10 rows.
max_daily_outages = max(len(device_df) // 10, 1)

data = []
for date in dates:
    num_outages = np.random.randint(0, max_daily_outages)  # Random number of outages per day
    sampled_devices = device_df.sample(num_outages)

    # Outages from the last 30 days resolve faster (1-14 days) than older ones (1-29 days).
    max_resolution_days = 15 if date >= last_month_start else 30

    # itertuples visits the rows in the same order as iterrows without building
    # a Series for each row.
    for device in sampled_devices.itertuples(index=False):
        outage_resolved_days = np.random.randint(1, max_resolution_days)
        data.append({
            'outage_date': date,
            'tower_id': device.tower_id,
            'device_id': device.device_id,
            'outage_resolved_date': date + timedelta(days=outage_resolved_days)
        })

# Create DataFrame (explicit columns keep the layout stable even with no rows)
outage_df = pd.DataFrame(data, columns=['outage_date', 'tower_id', 'device_id', 'outage_resolved_date'])

# Display sample data
print(outage_df.tail())


In [0]:
from pyspark.sql.functions import datediff, col

# Number of days between the outage and its resolution.
resolution_days = datediff(col("outage_resolved_date"), col("outage_date"))

# Move the simulated outages from pandas into Spark and attach that difference.
outage_spark_df = spark.createDataFrame(outage_df).withColumn("date_diff", resolution_days)

# Persist the result, replacing any previous version of the table.
(
    outage_spark_df.write
    .mode('overwrite')
    .saveAsTable(f"{CATALOG}.{SCHEMA}.outage_resolution")
)