In [9]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
import numpy as np

This notebook uses data from the Canadian Radio-Television and Telecommunications Commission (CRTC).  Every broadcaster is mandated to provide a complete log of the programs and commercials showcased to the Canadian public.

We're interested in answering the question: What are the channels with the greatest and least proportion of commercials?

The full dataset is available from https://open.canada.ca/data/en/dataset/800106c1-0b08-401e-8be2-ac45d62e662e. In this notebook, we operate on a sample of this data from **Q3 2018**.

In [2]:
# Obtain the SparkSession used below to read the broadcast logs.
# getOrCreate() hands back the already-active session when one exists rather
# than starting a second one.
spark = SparkSession.builder.getOrCreate()

In [7]:
# Load the broadcast-log sample into a Spark data frame.
DIRECTORY = "../../data/broadcast_logs"
logs_path = os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.csv")

logs = spark.read.csv(
    logs_path,
    sep="|",  # fields are separated by pipes, not commas
    header=True,  # the first line carries the column names
    inferSchema=True,  # have Spark work out the column types from the data
    timestampFormat="yyyy-MM-dd",  # pattern applied when parsing timestamps
)

In [10]:
# Preview the first five rows of the data frame a few columns at a time, so
# that each printed table stays narrow enough to read.
#
# The split points are passed explicitly (every COLUMNS_PER_GROUP-th index)
# instead of a section count: `len(logs.columns) // 3` sections is 0 for a
# frame with fewer than three columns, which makes np.array_split raise, and
# it produces groups wider than three whenever the number of columns is not a
# multiple of three. With explicit indices every group holds exactly
# COLUMNS_PER_GROUP columns, except possibly a shorter final group.
COLUMNS_PER_GROUP = 3

column_names = np.array(logs.columns)
column_split = np.array_split(
    column_names,
    list(range(COLUMNS_PER_GROUP, len(column_names), COLUMNS_PER_GROUP)),
)

for column_group in column_split:
    # Second argument is `truncate`; False prints full cell contents.
    logs.select(*column_group).show(5, False)

+--------------+------------+----------+
|BroadcastLogID|LogServiceID|LogDate   |
+--------------+------------+----------+
|1196192316    |3157        |2018-08-01|
|1196192317    |3157        |2018-08-01|
|1196192318    |3157        |2018-08-01|
|1196192319    |3157        |2018-08-01|
|1196192320    |3157        |2018-08-01|
+--------------+------------+----------+
only showing top 5 rows

+----------+-------------------+----------------------+
|SequenceNO|AudienceTargetAgeID|AudienceTargetEthnicID|
+----------+-------------------+----------------------+
|1         |4                  |null                  |
|2         |null               |null                  |
|3         |null               |null                  |
|4         |null               |null                  |
|5         |null               |null                  |
+----------+-------------------+----------------------+
only showing top 5 rows

+----------+---------------+-----------------+
|CategoryID|ClosedCaptionID|Co

Let's remove a couple of columns which we won't be using:
- BroadcastLogID is the primary key of the table and won't help us in our analysis
- SequenceNO doesn't appear in the accompanying data dictionary

In [11]:
# Discard the columns the analysis does not need: the table's primary key and
# a field that is absent from the accompanying data dictionary.
unused_columns = ("BroadcastLogID", "SequenceNO")
logs = logs.drop(*unused_columns)

Let's convert the Duration column to a duration in seconds, and store this as a new column, `duration_seconds`.

In [12]:
# Preview the conversion before storing it. In the sample rows Duration looks
# like "HH:MM:SS.fffffff", so the hour, minute and second digits are sliced out
# by position (substr is 1-based) and cast to integers.
duration_col = F.col("Duration")
hours_part = duration_col.substr(1, 2).cast("int")
minutes_part = duration_col.substr(4, 2).cast("int")
seconds_part = duration_col.substr(7, 2).cast("int")

logs.select(
    duration_col,
    (hours_part * 60 * 60 + minutes_part * 60 + seconds_part).alias(
        "Duration_seconds"
    ),
).distinct().show(5)

+----------------+----------------+
|        Duration|Duration_seconds|
+----------------+----------------+
|01:59:30.0000000|            7170|
|00:31:00.0000000|            1860|
|00:28:08.0000000|            1688|
|00:32:00.0000000|            1920|
|00:30:00.0000000|            1800|
+----------------+----------------+
only showing top 5 rows



In [13]:
# Store the duration, expressed in whole seconds, as a new column. The hour,
# minute and second digits are taken from fixed positions of the Duration
# string (substr is 1-based) and cast to integers before being combined.
raw_duration = F.col("Duration")
duration_in_seconds = (
    raw_duration.substr(1, 2).cast("int") * 60 * 60
    + raw_duration.substr(4, 2).cast("int") * 60
    + raw_duration.substr(7, 2).cast("int")
)

logs = logs.withColumn("duration_seconds", duration_in_seconds)

In [17]:
# Print summary statistics (count, mean, stddev, min, quartiles, max) for each
# column separately so that every table stays narrow.
for column_name in logs.columns:
    column_summary = logs.select(column_name).summary()
    column_summary.show()

+-------+------------------+
|summary|      LogServiceID|
+-------+------------------+
|  count|            238945|
|   mean| 3450.890284375065|
| stddev|199.50673962554782|
|    min|              3157|
|    25%|              3287|
|    50%|              3379|
|    75%|              3627|
|    max|              3925|
+-------+------------------+

+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    25%|
|    50%|
|    75%|
|    max|
+-------+

+-------+-------------------+
|summary|AudienceTargetAgeID|
+-------+-------------------+
|  count|              16112|
|   mean| 3.4929245283018866|
| stddev| 1.0415963394745122|
|    min|                  1|
|    25%|                  4|
|    50%|                  4|
|    75%|                  4|
|    max|                  4|
+-------+-------------------+

+-------+----------------------+
|summary|AudienceTargetEthnicID|
+-------+----------------------+
|  count|                  1710|
|   mean|    120.56432748538012|
| st