In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

from pyspark.sql import functions as F
from pyspark.sql import Column
from typing import List

In [2]:
# Local Spark session for this notebook, running with two local worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("AoC_2022_7[2]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/08 00:28:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read each input file as a single row (wholetext=True) and record which file it came from.
inputs = (
    spark.read
    .text("./data/aoc_7.txt", wholetext=True)
    .withColumn("filename", F.input_file_name())
)

In [4]:
# Build the list of files, each tagged with the full directory path it lives in.

def remove_last_element(arr: Column) -> Column:
    """Drop the final element of ``arr``, leaving single-element arrays untouched.

    Args:
        arr: An array column (here, the list of path segments).

    Returns:
        ``arr`` itself when it holds exactly one element, otherwise the
        slice covering every element except the last one.
    """
    length = F.size(arr)
    # slice() is 1-based: start at the first element and keep length - 1 items.
    all_but_last = F.slice(arr, F.lit(1), length - 1)
    return F.when(length == 1, arr).otherwise(all_but_last)

def construct_accumulator(path: Column, absolute_paths: Column) -> Column:
    """Build the struct that is threaded through the line-by-line fold.

    Args:
        path: Current directory as an array of path segments.
        absolute_paths: Files recorded so far, each a ``(path, size)`` struct.

    Returns:
        A struct column with fields ``path`` (``array<string>``) and
        ``absolute_paths`` (``array<struct<path, size>>``). Both fields are
        cast explicitly to those types.
    """
    current_dir = F.lit(path).cast("array<string>")
    recorded_files = absolute_paths.cast("array<struct<path: array<string>, size: bigint>>")
    return F.struct(
        current_dir.alias("path"),
        recorded_files.alias("absolute_paths"),
    )

def process_file_listing(acc, x: Column) -> Column:
    """Record one ``<size> <name>`` listing line against the current directory.

    Args:
        acc: Accumulator struct with ``path`` and ``absolute_paths`` fields.
        x: The listing line; its first space-separated token is read as the size.

    Returns:
        A new accumulator with the same ``path`` and one extra
        ``(path, size)`` entry appended to ``absolute_paths``.
    """
    size_token = F.split(x, " ")[0]
    entry = F.struct(
        acc.path.alias("path"),
        size_token.cast("bigint").alias("size"),
    )
    extended_files = F.concat(acc.absolute_paths, F.array(entry))
    return construct_accumulator(acc.path, extended_files)

def process_listing(acc: Column, x: Column) -> Column:
    """Handle one line of ``ls`` output.

    Lines starting with ``dir`` leave the accumulator unchanged; every
    other line is treated as a file entry and recorded.
    """
    unchanged = acc
    with_file_recorded = process_file_listing(acc, x)
    return F.when(x.startswith("dir"), unchanged).otherwise(with_file_recorded)

def process_cd(acc: Column, command_parts: Column) -> Column:
    """Apply a ``cd`` command to the accumulator's current path.

    Args:
        acc: Accumulator struct with ``path`` and ``absolute_paths`` fields.
        command_parts: The command line split on spaces; index 2 is the target.

    Returns:
        A new accumulator whose ``path`` is:
        * a one-element array holding the target, when it starts with ``/``;
        * the current path minus its last segment, when the target is ``..``;
        * the current path with the target appended, otherwise.
        ``absolute_paths`` is carried over unchanged in every case.
    """
    target = command_parts[2]
    files_so_far = acc.absolute_paths

    jump_to_absolute = construct_accumulator(F.array(target), files_so_far)
    step_to_parent = construct_accumulator(remove_last_element(acc.path), files_so_far)
    descend_into_child = construct_accumulator(
        F.concat(acc.path, F.array(target)),
        files_so_far,
    )

    return (
        F.when(target.startswith("/"), jump_to_absolute)
        .when(target == F.lit(".."), step_to_parent)
        .otherwise(descend_into_child)
    )
    

def process_command(acc: Column, x: Column) -> Column:
    """Apply one ``$ ...`` command line to the accumulator.

    Args:
        acc: Accumulator struct with ``path`` and ``absolute_paths`` fields.
        x: The command line; it is split on spaces and index 1 is the command name.

    Returns:
        The accumulator produced by ``process_cd`` when the command is ``cd``.
        For ``ls`` and for any other command, the accumulator is returned
        unchanged.
    """
    command_parts = F.split(x, " ")
    is_cd = command_parts[1] == F.lit("cd")
    # Only `cd` changes state. Dispatching on `is_cd` (rather than "anything
    # that is not ls") keeps an unrecognised command from being handled as a
    # `cd` with a null target, which would append a null path segment.
    return F.when(is_cd, process_cd(acc, command_parts)).otherwise(acc)
    

def merge(acc: Column, x: Column) -> Column:
    """Fold step: route one input line to the command or the listing handler.

    Lines starting with ``$`` go to ``process_command``; all others go to
    ``process_listing``.
    """
    after_command = process_command(acc, x)
    after_listing = process_listing(acc, x)
    return F.when(x.startswith("$"), after_command).otherwise(after_listing)

def path(splits: Column) -> Column:
    """Fold ``merge`` over the array of input lines.

    The fold starts in a directory whose only segment is the empty string,
    with a file list holding a single null placeholder entry.

    Returns:
        The final accumulator struct (``path`` and ``absolute_paths``).
    """
    start_dir = F.array(F.lit(""))
    placeholder_files = F.array(F.lit(None))
    initial_state = construct_accumulator(start_dir, placeholder_files)
    return F.aggregate(splits, initial_state, merge)

def construct_complete_paths(value: Column) -> Column:
    """Turn the raw text of a terminal session into an array of file entries.

    Args:
        value: String column with the whole session text, lines separated by ``\\n``.

    Returns:
        An array column aliased ``files`` holding the non-null
        ``(path, size)`` entries collected by ``path``.
    """
    lines = F.split(value, "\n")
    collected = path(lines).absolute_paths
    # Remove null entries, such as the placeholder that seeds the fold.
    return F.filter(collected, lambda entry: entry.isNotNull()).alias("files")

In [5]:
def contributions(arr: Column) -> Column:
    """List every ancestor directory of a path given as an array of segments.

    The first segment is kept as-is; each later prefix is the previous
    prefix followed by the next segment and a trailing ``/``. For example
    ``["/", "a", "e"]`` becomes ``["/", "/a/", "/a/e/"]``.

    Args:
        arr: Array column of path segments.

    Returns:
        Array column of cumulative directory strings, or null when ``arr``
        has no elements (the fold starts from a null array).
    """
    def extend_prefixes(prefixes, segment):
        longer_prefix = F.concat(F.element_at(prefixes, -1), segment, F.lit("/"))
        appended = F.concat(prefixes, F.array(longer_prefix))
        # A null accumulator means this is the first segment: start the list with it.
        return F.when(prefixes.isNotNull(), appended).otherwise(F.array(segment))

    no_prefixes_yet = F.lit(None).cast("array<string>")
    return F.aggregate(arr, no_prefixes_yet, extend_prefixes)

In [6]:
# Smaller sample session, kept for manual experiments:
# man_inputs = "$ cd /\n$ ls\ndir a\n14848514 b.txt\n8504156 c.dat\ndir d\n$ cd a\n$ ls\ndir e\n29116 f\n2557 g\n62596 h.lst\n$ cd e\n$ ls\n584 i\n$ cd ..\n$ cd ..\n$ cd d\n$ ls\n4060174 j\n8033020 d.log\n5626152 d.ext\n7214296 k"

# Same sample with an extra directory `a` nested under `d`.
man_inputs = "$ cd /\n$ ls\ndir a\n14848514 b.txt\n8504156 c.dat\ndir d\n$ cd a\n$ ls\ndir e\n29116 f\n2557 g\n62596 h.lst\n$ cd e\n$ ls\n584 i\n$ cd ..\n$ cd ..\n$ cd d\n$ ls\n4060174 j\n8033020 d.log\n5626152 d.ext\n7214296 k\ndir a\n$ cd a\n$ ls\n6259611 anested"

# To run against the sample instead of the input file, use:
# file = F.explode(construct_complete_paths(F.lit(man_inputs))).alias("file")

# One output row per recorded file entry.
file = F.explode(construct_complete_paths(F.col("value"))).alias("file")

# Flatten the file struct into `path` and `size` columns next to the source filename.
file_size = (
    inputs
    .select("*", file)
    .select("filename", "file.*")
)

# Repeat each file once per ancestor directory, then total the sizes per directory.
dirs_level = file_size.select("*", F.explode(contributions(F.col("path"))).alias("dir"))
total_size = (
    dirs_level
    .groupBy("filename", "dir")
    .agg(F.sum("size").alias("totalsize"))
)

# Sum of the totals of all directories whose total is at most 100000.
(
    total_size
    .where("totalsize <= 100000")
    .select(F.sum("totalsize").alias("result"))
    .head()
)

# Debug view of the per-directory rows:
# dirs_level.show(200, truncate=False)

Row(result=1517599)

In [7]:
# Find the smallest directory whose removal frees enough space.
total_space = 70000000
# Free space required. The name (probably meant to be "space_for_upgrade")
# is kept unchanged in case other cells refer to it.
spark_for_upgrade = 30000000

# Total size of everything under the root directory.
root_size = total_size.where("dir == '/'").head().totalsize
unused_space = total_space - root_size

# Use the named constant instead of repeating the literal.
space_to_free = spark_for_upgrade - unused_space
total_size.where(f"totalsize >= {space_to_free}").select(F.min("totalsize")).head()

Row(min(totalsize)=2481982)