In [12]:
%%pyspark
df = spark.read.load('abfss://files@datalakefv41bzh.dfs.core.windows.net/data/RLC_bearer.csv', format='csv'
## If header exists uncomment line below
, header=True
)
display(df.limit(10))

StatementMeta(sparkfv41bzh, 7, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 770953b3-c35c-44f6-aedc-3f7bb7fbd8ff)

In [13]:
# Total number of rows in the loaded DataFrame (shown as the cell output).
df.count()

StatementMeta(sparkfv41bzh, 7, 9, Finished, Available, Finished)

8931

In [14]:
# Print column names and types. The CSV is read without a schema or schema
# inference, so every column is reported as string and must be cast before
# numeric use.
df.printSchema()


StatementMeta(sparkfv41bzh, 7, 10, Finished, Available, Finished)

root
 |-- tstamp: string (nullable = true)
 |-- ngran_node: string (nullable = true)
 |-- mcc: string (nullable = true)
 |-- mnc: string (nullable = true)
 |-- mnc_digit_len: string (nullable = true)
 |-- nb_id: string (nullable = true)
 |-- cu_du_id: string (nullable = true)
 |-- txpdu_pkts: string (nullable = true)
 |-- txpdu_bytes: string (nullable = true)
 |-- txpdu_wt_ms: string (nullable = true)
 |-- txpdu_dd_pkts: string (nullable = true)
 |-- txpdu_dd_bytes: string (nullable = true)
 |-- txpdu_retx_pkts: string (nullable = true)
 |-- txpdu_retx_bytes: string (nullable = true)
 |-- txpdu_segmented: string (nullable = true)
 |-- txpdu_status_pkts: string (nullable = true)
 |-- txpdu_status_bytes: string (nullable = true)
 |-- txbuf_occ_bytes: string (nullable = true)
 |-- txbuf_occ_pkts: string (nullable = true)
 |-- rxpdu_pkts: string (nullable = true)
 |-- rxpdu_bytes: string (nullable = true)
 |-- rxpdu_dup_pkts: string (nullable = true)
 |-- rxpdu_dup_bytes: string (nullable 

In [15]:
# Packet Transmission Metrics
# These columns measure how much data is transmitted and its associated issues.

# txpdu_pkts → Number of transmitted PDUs (Packet Data Units).
# txpdu_bytes → Total bytes transmitted.
# txpdu_wt_ms → Transmission waiting time (in milliseconds).
# txpdu_dd_pkts, txpdu_dd_bytes → Dropped or discarded PDUs on the transmit side (packet and byte counts).
# txpdu_retx_pkts, txpdu_retx_bytes → Retransmitted PDUs due to errors.
# txpdu_segmented → Number of PDUs that were segmented (split into smaller packets).
# txpdu_status_pkts, txpdu_status_bytes → Status PDUs transmitted (packet and byte counts; used for ARQ feedback).

StatementMeta(sparkfv41bzh, 7, 11, Finished, Available, Finished)

In [16]:
from pyspark.sql.functions import col, sum, avg, when


# Cast the transmit counters used below from string to a numeric type.
# "long" is used instead of "int": with Spark's default (non-ANSI) settings,
# cast("int") silently yields NULL for any value above 2**31 - 1, and that row
# would then drop out of the sums and averages without any warning.
df_pack = (
    df
    .withColumn("txpdu_pkts", col("txpdu_pkts").cast("long"))
    .withColumn("txpdu_bytes", col("txpdu_bytes").cast("long"))
    .withColumn("txpdu_retx_pkts", col("txpdu_retx_pkts").cast("long"))
    .withColumn("txpdu_dd_pkts", col("txpdu_dd_pkts").cast("long"))
)

# Aggregating transmission metrics
# - total_* columns are plain sums over all rows.
# - *_rate columns are the mean of the per-row ratio (not the ratio of the
#   totals), so each row contributes equally regardless of its packet count.
# - NULLIF(txpdu_pkts, 0) makes the "no packets transmitted" case an explicit
#   NULL that AVG skips; without it the division returns NULL only when ANSI
#   mode is off and raises a divide-by-zero error when ANSI mode is on.
df_pack.selectExpr(
    "SUM(txpdu_pkts) as total_tx_pkts",
    "SUM(txpdu_bytes) as total_tx_bytes",
    "SUM(txpdu_retx_pkts) as total_retransmissions",
    "AVG(txpdu_retx_pkts / NULLIF(txpdu_pkts, 0)) as retransmission_rate",
    "AVG(txpdu_dd_pkts / NULLIF(txpdu_pkts, 0)) as discarded_pkt_rate"
).show()


StatementMeta(sparkfv41bzh, 7, 12, Finished, Available, Finished)

+-------------+--------------+---------------------+-------------------+------------------+
|total_tx_pkts|total_tx_bytes|total_retransmissions|retransmission_rate|discarded_pkt_rate|
+-------------+--------------+---------------------+-------------------+------------------+
|      5079248|       4843195|              5260726|  8.270201598530184| 4.253294704577163|
+-------------+--------------+---------------------+-------------------+------------------+



In [17]:
# Packet Reception Metrics
# These columns measure how much data is received and its quality.

# rxpdu_pkts → Number of received PDUs.
# rxpdu_bytes → Total bytes received.
# rxpdu_dup_pkts, rxpdu_dup_bytes → Duplicate PDUs received (could indicate network issues).
# rxpdu_dd_pkts, rxpdu_dd_bytes → Received PDUs that were dropped or discarded (packet and byte counts).
# rxpdu_ow_pkts, rxpdu_ow_bytes → Out-of-window PDUs (PDUs received outside the current receive window).
# rxpdu_status_pkts, rxpdu_status_bytes → Status PDUs received (packet and byte counts).

StatementMeta(sparkfv41bzh, 7, 13, Finished, Available, Finished)

In [18]:
# Buffer Occupancy Metrics (Queueing Delay & Congestion Indicators)
# These columns help analyze network congestion and buffer utilization.

# txbuf_occ_bytes, txbuf_occ_pkts → Transmission Buffer Occupancy (data waiting to be transmitted).
# rxbuf_occ_bytes, rxbuf_occ_pkts → Reception Buffer Occupancy (data waiting to be processed).

StatementMeta(sparkfv41bzh, 7, 14, Finished, Available, Finished)

In [21]:
# Buffer occupancy: mean TX/RX occupancy and its correlation with retransmissions.

# The CSV columns are strings, so the three columns used here are cast to
# integers first; all other columns of df are carried over unchanged.
dfbuff = (
    df
    .withColumn("txbuf_occ_bytes", col("txbuf_occ_bytes").cast("int"))
    .withColumn("txpdu_retx_pkts", col("txpdu_retx_pkts").cast("int"))
    .withColumn("rxbuf_occ_bytes", col("rxbuf_occ_bytes").cast("int"))
)

# Mean occupancy (in bytes) of the transmit and receive buffers over all rows.
dfbuff.agg(
    avg("txbuf_occ_bytes").alias("avg_tx_buffer_bytes"),
    avg("rxbuf_occ_bytes").alias("avg_rx_buffer_bytes"),
).show()

# Correlation coefficient of each buffer's occupancy against the number of
# retransmitted PDUs.
corr_txbuf_retx = dfbuff.corr("txbuf_occ_bytes", "txpdu_retx_pkts")
corr_rxbuf_retx = dfbuff.corr("rxbuf_occ_bytes", "txpdu_retx_pkts")

print(f"Correlation between TX buffer occupancy and retransmissions: {corr_txbuf_retx}")
print(f"Correlation between RX buffer occupancy and retransmissions: {corr_rxbuf_retx}")


StatementMeta(sparkfv41bzh, 7, 17, Finished, Available, Finished)

+-------------------+-------------------+
|avg_tx_buffer_bytes|avg_rx_buffer_bytes|
+-------------------+-------------------+
|  474.9706639793976|  547.6097861381704|
+-------------------+-------------------+

Correlation between TX buffer occupancy and retransmissions: 0.0401767088239992
Correlation between RX buffer occupancy and retransmissions: -0.2098824579205666


In [19]:
# SDU-Level Statistics (Higher-Layer Packets)
# These columns relate to SDUs (Service Data Units), which are higher-level packets before they are segmented into PDUs.

# txsdu_pkts, txsdu_bytes → Number of transmitted SDUs and their size.
# txsdu_avg_time_to_tx → Average time taken to transmit an SDU (in ms).
# txsdu_wt_us → Waiting time for SDUs before transmission (in µs).
# rxsdu_pkts, rxsdu_bytes → Number of received SDUs and their size.
# rxsdu_dd_pkts, rxsdu_dd_bytes → Discarded SDUs (dropped due to errors or congestion).


StatementMeta(sparkfv41bzh, 7, 15, Finished, Available, Finished)