# Solutions: Quiz 04 — Working with Files

This notebook contains reference solutions for Problems 0–5 from the quiz. Each solution is implemented as a small, well-documented Python function and followed by a short demonstration using temporary files. The demos create temporary files using the `tempfile` and `pathlib` modules so you can re-run cells safely.

### Problem 0 — Count non-empty lines
Implement `count_nonempty_lines(path)` that returns the number of non-empty (non-whitespace) lines in a file.

In [13]:
def count_nonempty_lines(path):
    """Count the lines of *path* that hold at least one non-whitespace character.

    The file is consumed lazily, one line at a time, so it is never held
    in memory as a whole.
    """
    with open(path, 'r') as handle:
        # A line that strips down to '' is blank or whitespace-only.
        return sum(1 for text in handle if text.strip())

In [14]:
# Demo for Problem 0
import tempfile
from pathlib import Path

# The context manager deletes the scratch directory as soon as the demo
# finishes, instead of leaving cleanup to garbage collection.
with tempfile.TemporaryDirectory() as tmpdir:
    p = Path(tmpdir) / 'demo0.txt'
    p.write_text('hello\n\n  \nworld\n')
    print('file contents:', p.read_text())
    print('count_nonempty_lines ->', count_nonempty_lines(str(p)))

file contents: hello

  
world

count_nonempty_lines -> 2


### Problem 1 — Count word frequencies
Implement `count_word_freq(path)` which reads a UTF-8 text file and returns a dictionary mapping normalized words to counts. Normalization: lowercase and strip surrounding punctuation.

In [16]:
import string

def normalize_token(token):
    """Return *token* without surrounding ASCII punctuation, in lowercase.

    Only leading and trailing punctuation is removed; punctuation inside
    the token (as in "don't") is kept.
    """
    trimmed = token.strip(string.punctuation)
    return trimmed.lower()

def count_word_freq(path):
    """Count how often each normalized word occurs in a UTF-8 text file.

    Every line is split on whitespace and each piece is passed through
    ``normalize_token``. Pieces that normalize to the empty string
    (pure punctuation) are skipped.

    Args:
        path: Location of the text file to read.

    Returns:
        A dict mapping each normalized word to its number of occurrences,
        keyed in order of first appearance.
    """
    counts = {}
    # Decode explicitly as UTF-8: without ``encoding`` Python uses the
    # platform's locale encoding, which is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            for raw_token in line.split():
                token = normalize_token(raw_token)
                if token:
                    counts[token] = counts.get(token, 0) + 1
    return counts

In [17]:
# Demo for Problem 1
import tempfile
from pathlib import Path

# The context manager deletes the scratch directory as soon as the demo
# finishes, instead of leaving cleanup to garbage collection.
with tempfile.TemporaryDirectory() as tmpdir:
    p = Path(tmpdir) / 'demo1.txt'
    p.write_text('Hello hello world.\nHi! Hi? Hi...\nA, B; C: a b')
    print(p.read_text())
    print('count_word_freq ->', count_word_freq(str(p)))

Hello hello world.
Hi! Hi? Hi...
A, B; C: a b
count_word_freq -> {'hello': 2, 'world': 1, 'hi': 3, 'a': 2, 'b': 2, 'c': 1}


### Problem 2 — Top-N longest lines
Implement `top_n_longest_lines(path, n)`, which returns the `n` longest lines (with the trailing newline removed) in descending order of length; ties are broken by file order, earlier lines first.

In [18]:
def top_n_longest_lines(path, n):
    """Return the *n* longest lines of a file, longest first.

    Lines are compared without their trailing newline. Lines of equal
    length keep their file order (the earlier line comes first). If the
    file has fewer than *n* lines, all of them are returned.

    Args:
        path: Location of the text file to read.
        n: Maximum number of lines to return.

    Returns:
        A list of at most *n* strings; empty when ``n <= 0``.
    """
    if n <= 0:
        return []

    # ``best`` is kept sorted best-first and never grows beyond n entries.
    # (A heap would be better but we don't know that yet.)
    best = []
    with open(path, 'r') as f:
        for line_number, raw_line in enumerate(f):
            line = raw_line.rstrip('\n')
            # Negating the line number makes the earlier of two equally
            # long lines compare as the larger tuple, so it ranks first.
            candidate = (len(line), -line_number, line)

            if len(best) < n:
                best.append(candidate)
            elif candidate > best[-1]:
                best[-1] = candidate  # evict the current smallest entry
            else:
                continue  # nothing changed, so no need to re-sort

            best.sort(reverse=True)

    return [line for _, _, line in best]

In [20]:
# Demo for Problem 2
import tempfile
from pathlib import Path

# The context manager deletes the scratch directory as soon as the demo
# finishes, instead of leaving cleanup to garbage collection.
with tempfile.TemporaryDirectory() as tmpdir:
    p = Path(tmpdir) / 'demo2.txt'
    p.write_text('apple\nbanana\ncherry pie\n')
    print(top_n_longest_lines(str(p), 2))
    # Asking for more lines than the file has returns all of them.
    p.write_text('a\nbb\ncc\n')
    print(top_n_longest_lines(str(p), 5))

['cherry pie', 'banana']
['bb', 'cc', 'a']


### Problem 3 — Merge two sorted integer files
Implement `merge_sorted_files(path_a, path_b)` to merge two sorted integer files efficiently (streaming) and return a single list of integers.

In [28]:
def _next_int(handle):
    """Return the next integer read from *handle*, or None at end of file.

    Blank and whitespace-only lines are skipped. A non-blank line that is
    not a valid integer raises ValueError.
    """
    for line in handle:
        text = line.strip()
        if text:
            return int(text)
    return None


def merge_sorted_files(path_a, path_b):
    """Merge two files of ascending integers (one per line) into one list.

    Both files are read one line at a time; only the current value of each
    file is held besides the output list.

    Args:
        path_a: Path of the first sorted file.
        path_b: Path of the second sorted file.

    Returns:
        A list with every integer from both files in ascending order. On
        equal values the one from *path_a* comes first.

    Raises:
        ValueError: If a non-blank line cannot be parsed as an integer.
    """
    merged = []
    with open(path_a, 'r') as file_a, open(path_b, 'r') as file_b:
        a = _next_int(file_a)
        b = _next_int(file_b)

        # Compare against None explicitly: 0 is a legitimate value and
        # must not be mistaken for "file exhausted".
        while a is not None and b is not None:
            if a <= b:
                merged.append(a)
                a = _next_int(file_a)
            else:
                merged.append(b)
                b = _next_int(file_b)

        # At most one of the two files still has values; drain it.
        while a is not None:
            merged.append(a)
            a = _next_int(file_a)
        while b is not None:
            merged.append(b)
            b = _next_int(file_b)

    return merged

In [29]:
# Demo for Problem 3
import tempfile
from pathlib import Path

# The context manager deletes the scratch directory as soon as the demo
# finishes, instead of leaving cleanup to garbage collection.
with tempfile.TemporaryDirectory() as tmpdir:
    pa = Path(tmpdir) / 'a.txt'
    pb = Path(tmpdir) / 'b.txt'
    pa.write_text('1\n3\n5\n')
    pb.write_text('2\n4\n6\n')
    print('merge ->', merge_sorted_files(str(pa), str(pb)))

merge -> [1, 2, 3, 4, 5, 6]


### Problem 4 — Lines containing all query words
Implement `lines_with_all_words(path, queries)`, which returns the lines that contain every query word. Matching is case-insensitive and ignores punctuation surrounding each word. An empty `queries` list returns `[]` by explicit choice.

In [None]:
import string

def tokenize(line):
    """Split *line* into lowercase words with surrounding punctuation removed.

    The line is split on whitespace. Each piece loses its leading and
    trailing ASCII punctuation, and pieces that end up empty are dropped.
    """
    trimmed = (piece.strip(string.punctuation) for piece in line.split())
    return [word.lower() for word in trimmed if word]

def lines_with_all_words(path, queries):
    """Return the lines of a file that contain every query word.

    Lines are split into words with ``tokenize``. Each query is normalized
    the same way (surrounding punctuation removed, lowercased), so a query
    such as ``'Bananas!'`` matches the word ``bananas``.

    Args:
        path: Location of the text file to search.
        queries: Iterable of query words.

    Returns:
        The matching lines in file order, without their trailing newline.
        An empty *queries* returns ``[]`` by explicit choice.
    """
    if not queries:
        return []

    # A query made only of punctuation normalizes to '', which is never a
    # token, so such a query matches no line.
    required = {query.strip(string.punctuation).lower() for query in queries}

    result = []
    with open(path, 'r') as f:
        for line in f:
            # Subset test: every required word must be among the line's tokens.
            if required <= set(tokenize(line)):
                result.append(line.rstrip('\n'))
    return result

In [30]:
# Demo for Problem 4
import tempfile
from pathlib import Path

# The context manager deletes the scratch directory as soon as the demo
# finishes, instead of leaving cleanup to garbage collection.
with tempfile.TemporaryDirectory() as tmpdir:
    p = Path(tmpdir) / 't.txt'
    p.write_text('I like apples and bananas\nBananas are tasty\n')
    print(lines_with_all_words(str(p), ['apples','bananas']))
    print(lines_with_all_words(str(p), ['bananas']))
    print(lines_with_all_words(str(p), []))

['I like apples and bananas']
['I like apples and bananas', 'Bananas are tasty']
[]


### Problem 5 — Replace lines in a file safely
Implement `replace_lines_in_file(path, predicate, replacer)` that writes to a temporary file and atomically replaces the original. Returns number of replacements.

In [6]:
import tempfile
import os
from pathlib import Path


def replace_lines_in_file(path, predicate, replacer):
    """Rewrite a file, replacing every line for which *predicate* is true.

    The new content is written to a temporary file in the same directory
    and then moved over the original with ``os.replace``, so the original
    is left untouched until the rewrite has fully succeeded.

    Args:
        path: File to rewrite (``str`` or ``Path``).
        predicate: Called with each line (newline included); a truthy
            result marks the line for replacement.
        replacer: Called with each marked line; its return value is
            written in place of the line, exactly as returned.

    Returns:
        The number of lines that were replaced.

    Raises:
        Whatever *predicate*, *replacer* or the file operations raise. In
        that case the temporary file is removed and the original is kept.
    """
    path = Path(path)
    replaced = 0

    # NamedTemporaryFile (not TemporaryFile) is needed here: it guarantees
    # a real path to hand to os.replace, and it accepts ``delete=False`` on
    # every platform. Creating it next to the target keeps the final rename
    # on one filesystem.
    # NOTE: the result carries the temp file's permissions, not the
    # original file's.
    temp_file = tempfile.NamedTemporaryFile(
        "w", dir=path.parent, suffix=".txt", delete=False
    )
    try:
        # Leaving the ``with`` closes (and flushes) both files before the
        # rename below.
        with temp_file, open(path, "r") as src:
            for line in src:
                if predicate(line):
                    temp_file.write(replacer(line))
                    replaced += 1
                else:
                    temp_file.write(line)
        os.replace(temp_file.name, path)
    except BaseException:
        # Do not leave a half-written temp file behind on failure.
        try:
            os.unlink(temp_file.name)
        except FileNotFoundError:
            pass
        raise
    return replaced

In [8]:
# Demo for Problem 5
import tempfile
from pathlib import Path

# The context manager deletes the scratch directory as soon as the demo
# finishes, instead of leaving cleanup to garbage collection.
with tempfile.TemporaryDirectory() as tmpdir:
    p = Path(tmpdir) / 'colors.txt'
    p.write_text('red\nblue\nyellow\n')
    print('before ->', p.read_text().splitlines())
    n = replace_lines_in_file(str(p), lambda l: l.strip()=='blue', lambda l: 'azure\n')
    print('n replacements ->', n)
    print('after ->', p.read_text().splitlines())

before -> ['red', 'blue', 'yellow']
n replacements -> 1
after -> ['red', 'azure', 'yellow']
