In [77]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
import numpy as np

This notebook uses data from the Canadian Radio-Television and Telecommunications Commission (CRTC).  Every broadcaster is mandated to provide a complete log of the programs and commercials showcased to the Canadian public.

We're interest in answering the question: What are the channels with the greatest and least proportion of commercials?

The full dataset is available from https://open.canada.ca/data/en/dataset/800106c1-0b08-401e-8be2-ac45d62e662e. In this notebook, we operate on a sample of this data from **Q3 2018**.

In [43]:
spark = SparkSession.builder.getOrCreate()

In [44]:
# read in the broadcasting information
DIRECTORY = "../../data/broadcast_logs"
logs = spark.read.csv(
    os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.csv"),
    sep="|",
    header=True,
    inferSchema=True,
    timestampFormat="yyyy-MM-dd"
)

In [45]:
# split the data frame columns into groups of three so that
# we can display them in small groups
column_split = np.array_split(
    np.array(logs.columns), len(logs.columns) // 3
)

for x in column_split:
    logs.select(*x).show(5, False)

+--------------+------------+----------+
|BroadcastLogID|LogServiceID|LogDate   |
+--------------+------------+----------+
|1196192316    |3157        |2018-08-01|
|1196192317    |3157        |2018-08-01|
|1196192318    |3157        |2018-08-01|
|1196192319    |3157        |2018-08-01|
|1196192320    |3157        |2018-08-01|
+--------------+------------+----------+
only showing top 5 rows

+----------+-------------------+----------------------+
|SequenceNO|AudienceTargetAgeID|AudienceTargetEthnicID|
+----------+-------------------+----------------------+
|1         |4                  |null                  |
|2         |null               |null                  |
|3         |null               |null                  |
|4         |null               |null                  |
|5         |null               |null                  |
+----------+-------------------+----------------------+
only showing top 5 rows

+----------+---------------+-----------------+
|CategoryID|ClosedCaptionID|Co

Let's remove a couple of columns which we won't be using:
- BroadcastLogID is the primary key of the table and won't help us in our analysis
- SequenceNo doesn't appear in the accompanying data dictionary

In [46]:
# Let's remove a couple of columns which we won't be using:
logs = logs.drop("BroadcastLogID", "SequenceNO")

Let's convert the Duration column the duration in seconds, and store this as a new column, `Duration_seconds`.

In [47]:
logs.select(
    F.col("Duration"),
    (
        F.col("Duration").substr(1, 2).cast("int") * 60 * 60
        + F.col("Duration").substr(4, 2).cast("int") * 60
        + F.col("Duration").substr(7, 2).cast("int")
    ).alias("Duration_seconds"),
).distinct().show(5)

+----------------+----------------+
|        Duration|Duration_seconds|
+----------------+----------------+
|01:59:30.0000000|            7170|
|00:31:00.0000000|            1860|
|00:28:08.0000000|            1688|
|00:32:00.0000000|            1920|
|00:30:00.0000000|            1800|
+----------------+----------------+
only showing top 5 rows



In [48]:
logs = logs.withColumn(
    "duration_seconds",
    (
        F.col("Duration").substr(1, 2).cast("int") * 60 * 60
        + F.col("Duration").substr(4, 2).cast("int") * 60
        + F.col("Duration").substr(7, 2).cast("int")
    )
)

In [49]:
for i in logs.columns:
    logs.select(i).summary().show()

+-------+------------------+
|summary|      LogServiceID|
+-------+------------------+
|  count|            238945|
|   mean| 3450.890284375065|
| stddev|199.50673962554782|
|    min|              3157|
|    25%|              3287|
|    50%|              3379|
|    75%|              3627|
|    max|              3925|
+-------+------------------+

+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    25%|
|    50%|
|    75%|
|    max|
+-------+

+-------+-------------------+
|summary|AudienceTargetAgeID|
+-------+-------------------+
|  count|              16112|
|   mean| 3.4929245283018866|
| stddev| 1.0415963394745122|
|    min|                  1|
|    25%|                  4|
|    50%|                  4|
|    75%|                  4|
|    max|                  4|
+-------+-------------------+

+-------+----------------------+
|summary|AudienceTargetEthnicID|
+-------+----------------------+
|  count|                  1710|
|   mean|    120.56432748538012|
| st

By joining with the `LogIdentifier.csv` reference table, we can filter our logs to just those for primary channels.

In [50]:
DIRECTORY = "../../data/broadcast_logs"
log_identifier = spark.read.csv(
    os.path.join(DIRECTORY, "ReferenceTables/LogIdentifier.csv"),
    sep="|",
    header=True,
    inferSchema=True
)

In [51]:
log_identifier.printSchema()

root
 |-- LogIdentifierID: string (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- PrimaryFG: integer (nullable = true)



In [52]:
log_identifier.show(5)

+---------------+------------+---------+
|LogIdentifierID|LogServiceID|PrimaryFG|
+---------------+------------+---------+
|           13ST|        3157|        1|
|         2000SM|        3466|        1|
|           70SM|        3883|        1|
|           80SM|        3590|        1|
|           90SM|        3470|        1|
+---------------+------------+---------+
only showing top 5 rows



In [53]:
logs_and_channels = logs.join(log_identifier, "LogServiceID", how="left")

We will link two additional tables:
- `CategoryID` which contains information about the types of programs
- `ProgramClassID` which contains data that allows us to pinpoint the commercials

In [54]:
cd_category = spark.read.csv(
    os.path.join(DIRECTORY, "ReferenceTables/CD_Category.csv"),
    sep="|",
    header=True,
    inferSchema=True
).select(
    "CategoryID",
    "CategoryCD",
    F.col("EnglishDescription").alias("Category_Description")
)

In [55]:
cd_program_class = spark.read.csv(
    os.path.join(DIRECTORY, "ReferenceTables/CD_ProgramClass.csv"),
    sep="|",
    header=True,
    inferSchema=True
).select(
    "ProgramClassID",
    "ProgramClassCD",
    F.col("EnglishDescription").alias("ProgramClass_Description")
)

In [56]:
full_log = (
    logs_and_channels
    .join(cd_category, "CategoryID", how="left")
    .join(cd_program_class, "ProgramClassID", how="left")
)

Here, we show the total duration minutes for each program class:

In [57]:
(
    full_log
    .groupby("ProgramClassCD", "ProgramClass_Description")
    .agg(F.sum("duration_seconds").alias("duration_total"))
    .orderBy("duration_total", ascending=False).show(100, False)
 )

+--------------+--------------------------------------+--------------+
|ProgramClassCD|ProgramClass_Description              |duration_total|
+--------------+--------------------------------------+--------------+
|PGR           |PROGRAM                               |29440180      |
|COM           |COMMERCIAL MESSAGE                    |4959005       |
|PFS           |PROGRAM FIRST SEGMENT                 |1897637       |
|SEG           |SEGMENT OF A PROGRAM                  |1535873       |
|PRC           |PROMOTION OF UPCOMING CANADIAN PROGRAM|1359433       |
|PGI           |PROGRAM INFOMERCIAL                   |765074        |
|PRO           |PROMOTION OF NON-CANADIAN PROGRAM     |416717        |
|OFF           |SCHEDULED OFF AIR TIME PERIOD         |187304        |
|ID            |NETWORK IDENTIFICATION MESSAGE        |117735        |
|MAG           |MAGAZINE PROGRAM                      |75624         |
|NRN           |No recognized nationality             |72195         |
|PSA  

Here are the program types which we'll consider as commercials:

| ProgramClassCD  | ProgramClass_Description               |
|-----------------|----------------------------------------|
| COM             | COMMERCIAL MESSAGE                     |
| PRC             | PROMOTION OF UPCOMING CANADIAN PROGRAM |
| PGI             | PROGRAM INFOMERCIAL                    |
| PRO             | PROMOTION OF NON-CANADIAN PROGRAM      |
| LOC             | LOCAL ADVERTISING                      |
| SPO             | SPONSORSHIP MESSAGE                    |
| MER             | MERCHANDISING                          |
| SOL             | SOLICITATION MESSAGE                   |

We can now aggregate our `full_log` DataFrame to answer the question: _which TV stations have the highest ratio of commercials?_

In [58]:
answer = (
    full_log.groupby("LogIdentifierID")
    .agg(
        F.sum(
            F.when(
                F.trim(F.col("ProgramclassCD")).isin(
                    ["COM", "PRC", "PGI", "PRO", "LOC", "SPO", "MER", "SOL"]
                ),
                F.col("duration_seconds")
            ).otherwise(0)
        ).alias("duration_commercial"),
        F.sum("duration_seconds").alias("duration_total")
    ).withColumn(
        "commercial_ratio", F.col("duration_commercial") / F.col("duration_total")
    )
)

We can see a few channels which broadbast _only_ commercials also have just a few seconds of programming in our DataFrame.

In [68]:
answer_no_null = answer.fillna(0)

(
    answer_no_null.orderBy("commercial_ratio", ascending=False)
    .show(1000, False)
)

+---------------+-------------------+--------------+---------------------+
|LogIdentifierID|duration_commercial|duration_total|commercial_ratio     |
+---------------+-------------------+--------------+---------------------+
|CIMT           |775                |775           |1.0                  |
|MSET           |2700               |2700          |1.0                  |
|TLNSP          |15480              |15480         |1.0                  |
|TELENO         |17790              |17790         |1.0                  |
|TRN            |13                 |13            |1.0                  |
|HPITV          |13                 |13            |1.0                  |
|TANG           |8125               |8125          |1.0                  |
|MUSIMAX        |23333              |23582         |0.9894410991434145   |
|MMAX           |23333              |23582         |0.9894410991434145   |
|MUSIP          |20587              |20912         |0.9844586840091814   |
|MPLU           |20587   

## Additional Exercises

### Exercise 5.5

Using the data from the `data/broadcast_logs/Call_Signs.csv` (careful: the delimiter here is the comma, not the pipe!), add the Undertaking_Name to our final table to display a human-readable description of the channel.

In [60]:
# read in the broadcasting information
call_signs = spark.read.csv(
    os.path.join(DIRECTORY, "Call_Signs.csv"),
    sep=",",
    header=True,
    inferSchema=True,
    timestampFormat="yyyy-MM-dd"
)

In [69]:
answer_no_null = (
    answer_no_null
    .join(call_signs, "LogIdentifierID", how="left")
    .drop(F.col("UndertakingNO"))
)

answer_no_null = answer_no_null.drop(F.col("UndertakingNO"))

(answer_no_null
 .orderBy("commercial_ratio", ascending=False)
 .show(1000, False))

+---------------+-------------------+--------------+--------------------+--------------------+
|LogIdentifierID|duration_commercial|duration_total|    commercial_ratio|    Undertaking_Name|
+---------------+-------------------+--------------+--------------------+--------------------+
|          BRAVO|              22370|        108920| 0.20538009548292324|              Bravo!|
|             CI|              22567|        108982| 0.20707089244095356|Crime + Investiga...|
|         BBCKID|               3689|         92104|0.040052549292104576|                null|
|           BOOK|              19305|        105885| 0.18232044198895028|Book Television (...|
|           BITE|              23315|        110196|  0.2115775527242368|Makeful TV (forme...|
|           CBKT|              16950|        103410| 0.16391064693936758|Canadian Broadcas...|
|           CBHT|              17319|        103779| 0.16688347353510827|Canadian Broadcas...|
|           CANZ|              21542|         2196

### Exercise 5.6

The government of Canada is asking for your analysis, but they’d like the PRC to be weighted differently. They’d like each PRC second to be considered 0.75 commercial
seconds. Modify the program to account for this change.

In [70]:
answer_weighted = (
    full_log.groupby("LogIdentifierID")
    .agg(
        F.sum(
            F.when(
                F.trim(F.col("ProgramclassCD")).isin(
                    ["COM", "PGI", "PRO", "LOC", "SPO", "MER", "SOL"]
                ),
                F.col("duration_seconds")
            )
            .when(
                F.trim(F.col("ProgramclassCD")) == "PRC",
                F.col("duration_seconds") * 0.75
            ).otherwise(0)
        ).alias("duration_commercial"),
        F.sum("duration_seconds").alias("duration_total")
    ).withColumn(
        "commercial_ratio", F.col("duration_commercial") / F.col("duration_total")
    )
)

### Exercise 5.7

On the data frame returned from commercials.py, return the number of channels in each bucket based on their commercial_ratio. (Hint: look at the documentation for
round on how to round a value.)

In [102]:
bucket_counts = (
    answer_no_null
    .groupby(F.round(F.col("commercial_ratio"), 1).alias("commercial_ratio"))
    .count()
)

+-------------------+-----+
|                bin|count|
+-------------------+-----+
|                0.0|   81|
|                0.2|  147|
|0.30000000000000004|   12|
|                0.1|  165|
|                1.0|    7|
|                0.8|   10|
|                0.5|    1|
|                0.4|    3|
|                0.9|   20|
+-------------------+-----+



In [100]:
(
    bucket_counts
    .orderBy("commercial_ratio", ascending=False)
    .show()
)

+----------------+-----+
|commercial_ratio|count|
+----------------+-----+
|             1.0|   26|
|             0.9|    5|
|             0.8|    6|
|             0.5|    2|
|             0.4|    4|
|             0.3|   44|
|             0.2|  229|
|             0.1|   73|
|             0.0|   57|
+----------------+-----+

