In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
# Entry point for every DataFrame / SQL call below. getOrCreate() returns the
# session that is already active in this kernel, or builds a new one.
spark = SparkSession.builder.getOrCreate()

In [9]:
DATA_DIRECTORY = "../../data/backblaze/data_Q3_2019"

# Read every CSV file found in the quarter's directory; the first line of each
# file holds the column names and the column types are inferred from the data.
q3 = spark.read.csv(DATA_DIRECTORY, header=True, inferSchema=True)

# Set the layout for each column according to the schema: every column whose
# name starts with "smart" is cast to a long, all other columns are kept with
# their inferred type.
typed_columns = []
for column_name in q3.columns:
    typed_column = F.col(column_name)
    if column_name.startswith("smart"):
        typed_column = typed_column.cast(T.LongType())
    typed_columns.append(typed_column)

backblaze_2019 = q3.select(typed_columns)

# Expose the typed data frame to Spark SQL under a view name.
backblaze_2019.createOrReplaceTempView("backblaze_stats_2019")

### Exercise 7.2

If we look at the code that follows, we can simplify it even further and avoid creating two tables outright. Can you write summarized_data using no table
other than full_data and without any join? (Bonus: Try using pure PySpark, then pure Spark
SQL, and then a combo of both.)

```
full_data = backblaze2019.selectExpr(
 "model", "capacity_bytes / pow(1024, 3) capacity_GB", "date", "failure"
)

drive_days = full_data.groupby("model", "capacity_GB").agg(
 F.count("*").alias("drive_days")
)

failures = (
 full_data.where("failure = 1")
 .groupby("model", "capacity_GB")
 .agg(F.count("*").alias("failures"))
)

summarized_data = (
 drive_days.join(failures, on=["model", "capacity_GB"], how="left")
 .fillna(0.0, ["failures"])
 .selectExpr("model", "capacity_GB", "failures / drive_days failure_rate")
 .cache()
)
```

In [11]:
# Pure PySpark version

# Keep only the columns needed for the failure rate and express the capacity
# in GB (bytes divided by 1024^3).
full_data = backblaze_2019.selectExpr(
    "model", "capacity_bytes / pow(1024, 3) AS capacity_GB", "date", "failure"
)

# A single aggregation over full_data: no intermediate table and no join.
#   - drive_days: number of records in the (model, capacity_GB) group;
#   - failures:   number of those records with failure = 1. F.when() without
#                 otherwise() yields NULL for the other records and F.count()
#                 skips NULLs, so a group without any failure gets 0 directly
#                 (no fillna needed).
# Because nothing is joined, groups whose capacity_GB is NULL keep their
# failures too (an equality join on a NULL key would have dropped them).
summarized_data = (
    full_data.groupBy("model", "capacity_GB")
    .agg(
        F.count("*").alias("drive_days"),
        F.count(F.when(F.col("failure") == 1, 1)).alias("failures"),
    )
    .select(
        "model",
        "capacity_GB",
        (F.col("failures") / F.col("drive_days")).alias("failure_rate"),
    )
    .cache()
)

# Show the summarized data
summarized_data.show(5)

+--------------------+--------------------+--------------------+
|               model|         capacity_GB|        failure_rate|
+--------------------+--------------------+--------------------+
|       ST12000NM0117|             11176.0|0.019305019305019305|
|      WDC WD5000LPCX|   465.7617416381836|                 0.0|
|         ST6000DM004|    5589.02986907959|                 0.0|
|         ST4000DM005|   3726.023277282715|                 0.0|
|HGST HMS5C4040BLE641|   3726.023277282715|                 0.0|
|       ST500LM012 HN|   465.7617416381836|1.511585221015353...|
|HGST HUH721010ALE600|-9.31322574615478...|                 0.0|
|         ST6000DX000|    5589.02986907959|4.907252919815487...|
|       ST12000NM0007|-9.31322574615478...|                 0.0|
|         ST8000DM004|   7452.036460876465|                 0.0|
|         ST6000DM001|    5589.02986907959|                 0.0|
|      WDC WD5000BPKT|   465.7617416381836|                 0.0|
|        WDC WD60EFRX|   

In [6]:
# Pure SQL version
spark.sql(
    """
    -- One scan of the view: no CTE and no join.
    -- COUNT(*) is the number of drive days of the group and the conditional
    -- COUNT only counts the rows with failure = 1 (CASE without ELSE yields
    -- NULL, which COUNT ignores), so groups without failures get a rate of 0.
    SELECT
        model,
        capacity_bytes / pow(1024, 3) AS capacity_GB,
        COUNT(CASE WHEN failure = 1 THEN 1 END) / COUNT(*) AS failure_rate
    FROM
        backblaze_stats_2019
    GROUP BY
        model,
        capacity_bytes
    """
).show(5)

+------------------+------------------+--------------------+
|             model|       capacity_GB|        failure_rate|
+------------------+------------------+--------------------+
|       ST9250315AS|232.88591766357422|                 0.0|
|        ST320LT007|298.09114837646484|                 0.0|
|    WDC WD5000LPVX| 465.7617416381836|5.070222582771384E-5|
|TOSHIBA MQ01ABF050| 465.7617416381836|5.579360828423496E-4|
|   TOSHIBA HDWE160|  5589.02986907959|                 0.0|
+------------------+------------------+--------------------+
only showing top 5 rows



### Exercise 7.3

The analysis in the chapter is flawed in that the age of a drive is not taken into consideration. Instead of ordering the model by failure rate, order by average age at failure
(assume that every drive fails on the maximum date reported if they are still alive).
(Hint: Remember that you need to count the age of each drive first.)

### Exercise 7.4

What is the total capacity (in TB) that Backblaze records at the beginning of each month?

In [34]:
# Total capacity (in TB, bytes divided by 1024^4) reported on the first day of
# each month. Only the records of drives that did not fail on that day are
# summed (failure = 0).
spark.sql(
    """
    SELECT date, round(sum(capacity_bytes) / pow(1024, 4), 0) AS capacity_TB
    FROM backblaze_stats_2019
    WHERE failure = 0
    -- First day of the month, tested on the date itself instead of comparing
    -- the column with a re-formatted string.
    AND dayofmonth(date) = 1
    GROUP BY date
    ORDER BY date
    """
).show()

+----------+-----------+
|      date|capacity_TB|
+----------+-----------+
|2019-07-01|   833798.0|
|2019-08-01|   846104.0|
|2019-09-01|   858427.0|
+----------+-----------+



### Exercise 7.5

If you look at the data, you’ll see that some drive models can report an erroneous
capacity. In the data preparation stage, restage the full_data data frame so that the
most common capacity for each drive is used.


In [24]:
# We can update the data preparation stage like this: every record of a model
# gets the capacity that this model reports most often (its mode), so that the
# occasional erroneous capacity no longer splits a model into several groups.

# Number of records for each capacity reported by each model.
capacity_counts = backblaze_2019.groupBy("model", "capacity_bytes").agg(
    F.count("*").alias("occurrences")
)

# Structs are compared field by field, so the maximum of
# (occurrences, capacity_bytes) is the capacity with the most records for the
# model; when two capacities are tied, the larger one is kept.
mode_capacity_gb = (
    capacity_counts.groupBy("model")
    .agg(F.max(F.struct("occurrences", "capacity_bytes")).alias("most_common"))
    .selectExpr(
        "model",
        "most_common.capacity_bytes / pow(1024, 3) AS mode_capacity_GB",
    )
)

# Attach the most common capacity to every record and keep the same four
# columns as the original full_data.
full_data = backblaze_2019.join(mode_capacity_gb, on="model", how="inner").select(
    "model",
    F.col("mode_capacity_GB").alias("capacity_GB"),
    "date",
    "failure",
)