In [3]:
import math,heapq,collections,requests,io,re,struct,random
import numpy as np

# Download the Project Gutenberg plain-text file that serves as the corpus.
GUTEN_URL="https://www.gutenberg.org/cache/epub/19/pg19.txt"
r=requests.get(GUTEN_URL,timeout=10)
r.raise_for_status()  # stop here on any HTTP error status
raw=r.text

# Keep what lies between the first title occurrence and the last end marker
# (greedy, case-insensitive, "." also matches newlines); fall back to the
# whole download when the markers are not found.
m=re.search(r"THE SONG OF HIAWATHA(.*)End of the Project Gutenberg EBook",raw,flags=re.S|re.I)
poem_text=m.group(1) if m else raw

# Pick the first blank-line-separated block that mentions "Hiawatha" and is
# longer than 500 characters; otherwise use the first 4000 characters.
# NOTE(review): the split happens before "\r\n" is normalised below, so for a
# CRLF response the pattern "\n{2,}" never matches and the whole extract is
# treated as a single block - confirm this is intended.
chapters=re.split(r"\n{2,}",poem_text.strip())
chosen=next(
    (block for block in chapters if "Hiawatha" in block and len(block)>500),
    None,
)
if chosen is None:
    chosen=poem_text[:4000]

# Normalise line endings, then drop every non-ASCII character.
text=re.sub(r"\r\n", "\n", chosen).strip()
data=text.encode("ascii","ignore")
if not data:
    raise RuntimeError("No ASCII text extracted")

class HuffNode:
    """One node of a Huffman tree.

    Stores a frequency, an optional symbol and optional left/right children.
    Nodes compare by frequency only, which is what lets them be kept in a
    ``heapq`` priority queue.
    """

    def __init__(self,freq,symbol=None,left=None,right=None):
        self.freq=freq
        self.symbol=symbol
        self.left=left
        self.right=right

    def __lt__(self,other):
        # Ordering ignores symbol and children; only the weight matters.
        return self.freq < other.freq

# Count byte frequencies and seed a min-heap with one leaf per distinct byte.
freq=collections.Counter(data)
heap=[HuffNode(count,sym) for sym,count in freq.items()]
heapq.heapify(heap)

if not heap:
    # Nothing to encode: no codes, zero bits.
    huffman_codes={}
    huffman_bits_len=0
elif len(heap)==1:
    # A single distinct symbol still needs one bit per occurrence.
    node=heapq.heappop(heap)
    huffman_codes={node.symbol:"0"}
    huffman_bits_len=len(data)
else:
    # Repeatedly merge the two lightest nodes until one tree remains.
    while len(heap)>1:
        a=heapq.heappop(heap)
        b=heapq.heappop(heap)
        heapq.heappush(heap,HuffNode(a.freq+b.freq,None,a,b))
    root=heap[0]

    # Depth-first, left-before-right traversal with an explicit stack:
    # a left edge appends "0", a right edge appends "1".
    huffman_codes={}
    pending=[(root,"")]
    while pending:
        node,prefix=pending.pop()
        if node is None:
            continue
        if node.symbol is None:
            # Push right first so the left child is handled first.
            pending.append((node.right,prefix+"1"))
            pending.append((node.left,prefix+"0"))
        else:
            huffman_codes[node.symbol]=prefix or "0"

    # Encoded length = sum over symbols of (occurrences * code length).
    huffman_bits_len=sum(
        count*len(huffman_codes[sym]) for sym,count in freq.items()
    )

# Rough table cost: 8 bits for the symbol plus the bits of its code.
huffman_table_size_bits=sum(len(code)+8 for code in huffman_codes.values())

def huffman_encode(data,codes):
    """Return the concatenated code strings for every item of ``data``.

    ``codes`` maps each item yielded by iterating ``data`` to its bit string;
    an item missing from ``codes`` raises ``KeyError``.
    """
    return "".join([codes[symbol] for symbol in data])

def huffman_decode(bitstr,root):
    """Decode a string of "0"/"1" characters by walking a Huffman tree.

    Parameters:
        bitstr: iterable of one-character strings; "0" follows the left
            child, any other character follows the right child.
        root: tree node exposing ``symbol``, ``left`` and ``right``; a node
            whose ``symbol`` is not None is treated as a leaf.

    Returns:
        The decoded symbols as ``bytes``. Trailing bits that do not complete
        a code are dropped.
    """
    # A tree with one distinct symbol is a bare leaf with no children, so the
    # usual walk would dereference None. Such a tree assigns a one-bit code to
    # its only symbol, hence every bit decodes to that symbol.
    if root.symbol is not None:
        return bytes([root.symbol])*len(bitstr)

    out=bytearray()
    node=root
    for bit in bitstr:
        node=node.left if bit=="0" else node.right
        if node.symbol is not None:
            # Reached a leaf: emit it and restart from the top of the tree.
            out.append(node.symbol)
            node=root
    return bytes(out)

# Upper bound on the number of LZW dictionary entries (4096 = 12-bit codes).
max_table_size=4096
def lzw_encode_bytes(data):
    """LZW-encode ``data`` and estimate the size of a fixed-width encoding.

    The dictionary starts with the 256 single-byte strings and stops growing
    once it holds ``max_table_size`` entries.

    Parameters:
        data: iterable of byte values (e.g. ``bytes``).

    Returns:
        A tuple ``(codes, total_bits, table_entries)``: the list of emitted
        codes, ``len(codes)`` times the bit width needed for the largest
        emitted code, and the final dictionary size.
    """
    table={bytes([i]):i for i in range(256)}
    code=256
    w=b""
    out=[]
    for c in data:
        wc=w+bytes([c])
        if wc in table:
            # Still a known phrase: keep extending it.
            w=wc
            continue
        out.append(table[w])
        if code<max_table_size:
            table[wc]=code
            code+=1
        w=bytes([c])
    if w:
        # Flush the phrase still pending at end of input.
        out.append(table[w])
    # Bits needed to represent the largest emitted code. bit_length() is exact
    # integer arithmetic, and the clamp keeps the width at one bit when the
    # largest code is 0 (a log2-based width would be 0 there and report a
    # non-empty output as zero bits).
    width=max(1,max(out).bit_length()) if out else 1
    total_bits=len(out)*width
    return out,total_bits,len(table)

# Run LZW on the same bytes and compare both schemes against 8 bits per byte.
lzw_codes,lzw_bits_len,lzw_table_entries=lzw_encode_bytes(data)
ascii_bits=8*len(data)
if ascii_bits==0:
    # No input: ratios are undefined.
    huffman_ratio=None
    lzw_ratio=None
else:
    huffman_ratio=huffman_bits_len/ascii_bits
    lzw_ratio=lzw_bits_len/ascii_bits

print(f"Original ascii bits: {ascii_bits}")
print(f"Huffman bits: {huffman_bits_len}")
print(f"Huffman ratio: {huffman_ratio}")
print(f"Huffman table size bits (approx): {huffman_table_size_bits}")
print(f"LZW bits (approx): {lzw_bits_len}")
print(f"LZW ratio: {lzw_ratio}")
print(f"LZW dictionary entries: {lzw_table_entries}")

print("\nTop 20 Huffman codes by frequency:")
# Most frequent first; the sort is stable, so ties keep code-table order.
items=sorted(
    huffman_codes.items(),
    key=lambda kv:freq[kv[0]],
    reverse=True,
)[:20]
for b,code in items:
    # Show the byte as a character next to its numeric value.
    ch=bytes([b]).decode("ascii","replace")
    print(f"{repr(ch)} ({b}) : {code}")

print("\nFirst 80 LZW output codes:")
print(lzw_codes[:80])


Original ascii bits: 1498224
Huffman bits: 862721
Huffman ratio: 0.5758291150054998
Huffman table size bits (approx): 1327
LZW bits (approx): 743016
LZW ratio: 0.4959311825201038
LZW dictionary entries: 4096

Top 20 Huffman codes by frequency:
' ' (32) : 110
'e' (101) : 001
't' (116) : 1001
'a' (97) : 1000
'h' (104) : 0110
'n' (110) : 0101
'o' (111) : 0100
'i' (105) : 0001
's' (115) : 0000
'r' (114) : 11110
'd' (100) : 10110
'\n' (10) : 10100
'l' (108) : 01110
',' (44) : 111111
'w' (119) : 111010
'g' (103) : 111001
'u' (117) : 111000
'm' (109) : 101110
'f' (102) : 101010
'y' (121) : 1111100

First 80 LZW output codes:
[84, 104, 105, 115, 32, 101, 98, 111, 111, 107, 32, 258, 32, 102, 111, 114, 32, 116, 104, 101, 32, 117, 115, 275, 111, 102, 32, 97, 110, 121, 111, 110, 275, 283, 121, 119, 274, 114, 275, 105, 110, 272, 274, 32, 85, 110, 105, 116, 101, 100, 32, 83, 116, 97, 303, 259, 283, 100, 10, 109, 111, 115, 116, 32, 111, 273, 101, 271, 112, 97, 114, 116, 259, 280, 297, 275, 119, 270, 

LZW was better: it compressed the poem to ~49.6% of the original size versus Huffman’s ~57.6%. LZW learns and reuses repeated multi-symbol substrings (words and common phrases) across the text, while Huffman only optimizes individual-symbol codes; LZW therefore captures higher-order redundancy beyond the per-symbol entropy that limits Huffman.

6. <h3>Try to invent and explain an algorithm for embedding the name of a state (from
Afghanistan to Zimbabwe, https://en.wikipedia.org/wiki/List_of_sovereign_states)
in some world-wide data set for some ML task. No code required: only the idea and a
brief description.</h3>

Method: content-aware semantic watermark via controlled label-conditional augmentation:

choose a small subset of records across geographically diverse source partitions.

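select a robust natural-language transformation mapping: for textual fields, insert a short, semantically consistent phrase derived deterministically from the state name via a keyed, reversible mapping (example: map each character to a word from a curated synonym list or, so that long names such as "Afghanistan" still fit, map each chunk of a compact keyed encoding of the name, such as its index in the list of states, to one word, producing a terse phrase of 3–6 words that reads plausibly).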

apply this phrase only in contexts where it is syntactically natural (e.g., as a location qualifier, a comment field, or an auxiliary description), ensuring minimal perturbation to feature distribution.

keep embedding frequency low and spread geographically to avoid introducing class/feature imbalance; record the mapping key privately.

detection/extraction: given the private key, regenerate the phrase for each candidate state name with the same deterministic mapping and search the dataset's text fields for matches; the presence and frequency of a phrase reliably indicate which state name was embedded.

The phrase remains human-plausible so downstream models and human auditors see no obvious artifact; deterministic keyed mapping makes detection robust to noise and prevents accidental collisions across different keys or runs.