## 0. Bootstrap

In [1]:
import sys
import platform
from pyspark.sql import SparkSession
import pyspark

spark = (
    SparkSession.builder
    .appName("BDA-A01")
    .master("local[*]")
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

print(f"Spark version: {spark.version}")
print(f"PySpark version: {pyspark.__version__}")
print(f"Python version: {sys.version.split()[0]}")
print(f"Session timezone: {spark.conf.get('spark.sql.session.timeZone')}")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/06 14:40:50 WARN Utils: Your hostname, a03-341a, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/06 14:40:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/06 14:40:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/06 14:40:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark version: 4.0.1
PySpark version: 4.0.1
Python version: 3.10.19
Session timezone: UTC


In [2]:
spark.catalog.clearCache()

# 1. Load dataset

In [3]:
from pathlib import Path
import urllib.request

BASE_DIR = Path.cwd() 
DATA_DIR = BASE_DIR / "data"
OUTPUTS_DIR = BASE_DIR / "outputs"
PROOF_DIR = BASE_DIR / "proof"

for directory in (DATA_DIR, OUTPUTS_DIR, PROOF_DIR):
    directory.mkdir(exist_ok=True)


SHAKESPEARE_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
TEXT_PATH = DATA_DIR / "shakespeare.txt"

if not TEXT_PATH.exists():
    print(f"Downloading Shakespeare corpus...")
    urllib.request.urlretrieve(SHAKESPEARE_URL, TEXT_PATH)
    print(f"Downloaded to: {TEXT_PATH}")


raw_rdd = spark.sparkContext.textFile(str(TEXT_PATH)).cache()
lines_df = spark.read.text(str(TEXT_PATH)).withColumnRenamed("value", "line").cache()

num_lines = raw_rdd.count()
lines_df.count()

print(f"Data loaded from: {TEXT_PATH}")
print(f"Total lines: {num_lines:,}")
print("\nSample lines:")
lines_df.show(5, truncate=False)

                                                                                

Data loaded from: /home/img/BigData/Lab01/lab1-assignement/data/shakespeare.txt
Total lines: 122,458

Sample lines:
+----------------------+
|line                  |
+----------------------+
|1609                  |
|                      |
|THE SONNETS           |
|                      |
|by William Shakespeare|
+----------------------+
only showing top 5 rows


# 2. Part A — “perfect x” follower counts

In [4]:
import re
from pyspark.sql import functions as F
from contextlib import redirect_stdout
from io import StringIO

token_pattern = re.compile(r"[a-z]+")

def extract_perfect_followers(line):
    tokens = token_pattern.findall(line.lower())
    followers = []
    for i in range(len(tokens) - 1):
        if tokens[i] == "perfect":
            follower = tokens[i + 1]
            if follower:  # Non-empty
                followers.append(follower)
    return followers


followers_rdd = (
    raw_rdd
    .flatMap(extract_perfect_followers)
)

followers_df = followers_rdd.map(lambda token: (token,)).toDF(["follower"])

perfect_counts_df = (
    followers_df
    .groupBy("follower")
    .count()
    .filter(F.col("count") > 1) 
    .orderBy(F.desc("count"), F.asc("follower"))
)

perfect_counts_df.show(20, truncate=False)


perfect_counts_df.toPandas().to_csv(OUTPUTS_DIR / "perfect_followers.csv", index=False)
print(f"✓ Saved to: {OUTPUTS_DIR / 'perfect_followers.csv'}")

plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    perfect_counts_df.explain("formatted")
(PROOF_DIR / "plan_perfect.txt").write_text(plan_buffer.getvalue())
print(f"✓ Plan saved to: {PROOF_DIR / 'plan_perfect.txt'}")

perfect_counts_df.explain("formatted")

+--------+-----+
|follower|count|
+--------+-----+
|in      |4    |
|love    |4    |
|honour  |2    |
|that    |2    |
|yellow  |2    |
+--------+-----+

✓ Saved to: /home/img/BigData/Lab01/lab1-assignement/outputs/perfect_followers.csv
✓ Plan saved to: /home/img/BigData/Lab01/lab1-assignement/proof/plan_perfect.txt
== Physical Plan ==
AdaptiveSparkPlan (18)
+- == Final Plan ==
   ResultQueryStage (12)
   +- * Sort (11)
      +- AQEShuffleRead (10)
         +- ShuffleQueryStage (9), Statistics(sizeInBytes=160.0 B, rowCount=5)
            +- Exchange (8)
               +- * Filter (7)
                  +- * HashAggregate (6)
                     +- AQEShuffleRead (5)
                        +- ShuffleQueryStage (4), Statistics(sizeInBytes=1232.0 B, rowCount=38)
                           +- Exchange (3)
                              +- * HashAggregate (2)
                                 +- * Scan ExistingRDD (1)
+- == Initial Plan ==
   Sort (17)
   +- Exchange (16)
      +- Filter (15

# 3. Part B — PMI with RDDs: pairs

In [5]:
import math
from itertools import combinations
from operator import add


MAX_TOKENS = 40       
PMI_THRESHOLD = 5     

def dedupe_preserve_order(tokens):
    seen = set()
    ordered = []
    for token in tokens:
        if token not in seen:
            seen.add(token)
            ordered.append(token)
    return ordered

pmi_token_pattern = re.compile(r"[a-z]+")

tokens_per_line = (
    raw_rdd
    .map(lambda line: pmi_token_pattern.findall(line.lower())[:MAX_TOKENS])
    .map(lambda tokens: [t for t in tokens if t])  
    .map(dedupe_preserve_order)
    .filter(lambda tokens: len(tokens) > 1)  
    .cache()
)

num_docs = tokens_per_line.count()

marginal_counts = (
    tokens_per_line
    .flatMap(lambda tokens: [(token, 1) for token in tokens])
    .reduceByKey(add)
)


marginal_dict = dict(marginal_counts.collect())
marginal_bc = spark.sparkContext.broadcast(marginal_dict)

print(f"Top 10 most frequent tokens:")
for token, count in sorted(marginal_dict.items(), key=lambda x: -x[1])[:10]:
    print(f"  {token}: {count}")

pair_counts = (
    tokens_per_line
    .flatMap(lambda tokens: [((min(a, b), max(a, b)), 1) for a, b in combinations(tokens, 2)])
    .reduceByKey(add)
    .filter(lambda kv: kv[1] >= PMI_THRESHOLD) 
)

print(f"Pairs with co-occurrence >= {PMI_THRESHOLD}: {pair_counts.count():,}")

def compute_pmi_pair(kv):
    (x, y), co_count = kv
    count_x = marginal_bc.value.get(x)
    count_y = marginal_bc.value.get(y)
    
    if not count_x or not count_y:
        return None
    
    pmi = math.log10((co_count * num_docs) / (count_x * count_y))
    return (x, y, float(pmi), int(co_count))

pmi_pairs_rdd = (
    pair_counts
    .map(compute_pmi_pair)
    .filter(lambda row: row is not None)
)

pmi_pairs_df = (
    spark.createDataFrame(pmi_pairs_rdd, schema=["x", "y", "pmi", "count"])
    .orderBy(F.desc("pmi"))
)

print("Top 20 pairs by PMI:")
pmi_pairs_df.show(20, truncate=False)

pmi_pairs_df.toPandas().to_csv(OUTPUTS_DIR / "pmi_pairs_sample.csv", index=False)
print(f" Saved to: {OUTPUTS_DIR / 'pmi_pairs_sample.csv'}")

plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    pmi_pairs_df.explain("formatted")
(PROOF_DIR / "plan_pmi_pairs.txt").write_text(plan_buffer.getvalue())
print(f" Plan saved to: {PROOF_DIR / 'plan_pmi_pairs.txt'}")


Top 10 most frequent tokens:
  and: 24606
  the: 24300
  i: 20190
  to: 18337
  of: 16631
  a: 13281
  you: 12322
  my: 11549
  that: 10932
  in: 10723


                                                                                

Pairs with co-occurrence >= 5: 92,232
Top 20 pairs by PMI:


                                                                                

+---------+---------+------------------+-----+
|x        |y        |pmi               |count|
+---------+---------+------------------+-----+
|mell     |pell     |4.267109122884396 |5    |
|sauf     |votre    |4.091017863828714 |5    |
|jourdain |margery  |4.011836617781089 |5    |
|cine     |med      |4.003867688109814 |7    |
|phrynia  |timandra |3.9660791272204143|5    |
|dogberry |verges   |3.920321636659739 |6    |
|dit      |il       |3.789987868164733 |5    |
|clitus   |dardanius|3.753004301911563 |5    |
|envoy    |l        |3.72304107853412  |14   |
|cleomenes|dion     |3.644845745468146 |7    |
|antenor  |deiphobus|3.6138966091090516|5    |
|anjou    |maine    |3.590415513259529 |12   |
|cimber   |metellus |3.590415513259529 |8    |
|haven    |milford  |3.5774505360951614|11   |
|acold    |tom      |3.568139118548377 |5    |
|sisters  |weird    |3.551105779249596 |5    |
|cawdor   |glamis   |3.5469498194784386|6    |
|alice    |madame   |3.5223816279877016|9    |
|glendower|ow

                                                                                

 Saved to: /home/img/BigData/Lab01/lab1-assignement/outputs/pmi_pairs_sample.csv
 Plan saved to: /home/img/BigData/Lab01/lab1-assignement/proof/plan_pmi_pairs.txt


# 4. Part B — PMI with RDDs: stripes

In [6]:
from collections import Counter

def build_stripes(tokens):
    for x in tokens:
        counter = Counter()
        for y in tokens:
            if y != x:
                counter[y] += 1
        if counter:
            yield (x, counter)

def merge_counters(c1, c2):
    c1.update(c2)
    return c1

stripes_counts = (
    tokens_per_line
    .flatMap(build_stripes)
    .reduceByKey(merge_counters)
)

def stripe_to_pmi_rows(item):
    x, counter = item
    count_x = marginal_bc.value.get(x)
    if not count_x:
        return []
    
    rows = []
    for y, co_count in counter.items():
        if co_count >= PMI_THRESHOLD:
            count_y = marginal_bc.value.get(y)
            if not count_y:
                continue
            pmi = math.log10((co_count * num_docs) / (count_x * count_y))
            rows.append((x, y, float(pmi), int(co_count)))
    return rows


pmi_stripes_rdd = stripes_counts.flatMap(stripe_to_pmi_rows)

pmi_stripes_df = (
    spark.createDataFrame(pmi_stripes_rdd, schema=["x", "y", "pmi", "count"])
    .orderBy(F.desc("pmi"))
)
print("Top 20 pairs by PMI :")
pmi_stripes_df.show(20, truncate=False)


pmi_stripes_df.toPandas().to_csv(OUTPUTS_DIR / "pmi_stripes_sample.csv", index=False)
print(f" Saved to: {OUTPUTS_DIR / 'pmi_stripes_sample.csv'}")

plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    pmi_stripes_df.explain("formatted")
(PROOF_DIR / "plan_pmi_stripes.txt").write_text(plan_buffer.getvalue())
print(f" Plan saved to: {PROOF_DIR / 'plan_pmi_stripes.txt'}")

                                                                                

Top 20 pairs by PMI :


                                                                                

+---------+---------+------------------+-----+
|x        |y        |pmi               |count|
+---------+---------+------------------+-----+
|pell     |mell     |4.267109122884396 |5    |
|mell     |pell     |4.267109122884396 |5    |
|sauf     |votre    |4.091017863828714 |5    |
|votre    |sauf     |4.091017863828714 |5    |
|margery  |jourdain |4.011836617781089 |5    |
|jourdain |margery  |4.011836617781089 |5    |
|med      |cine     |4.003867688109814 |7    |
|cine     |med      |4.003867688109814 |7    |
|timandra |phrynia  |3.9660791272204143|5    |
|phrynia  |timandra |3.9660791272204143|5    |
|dogberry |verges   |3.920321636659739 |6    |
|verges   |dogberry |3.920321636659739 |6    |
|il       |dit      |3.789987868164733 |5    |
|dit      |il       |3.789987868164733 |5    |
|dardanius|clitus   |3.753004301911563 |5    |
|clitus   |dardanius|3.753004301911563 |5    |
|envoy    |l        |3.72304107853412  |14   |
|l        |envoy    |3.72304107853412  |14   |
|cleomenes|di

                                                                                

 Saved to: /home/img/BigData/Lab01/lab1-assignement/outputs/pmi_stripes_sample.csv
 Plan saved to: /home/img/BigData/Lab01/lab1-assignement/proof/plan_pmi_stripes.txt


# 6. Environment and reproducibility

In [7]:
import json
import subprocess

def get_java_version():
    try:
        output = subprocess.check_output(["java", "-version"], stderr=subprocess.STDOUT)
        return output.decode("utf-8").strip().splitlines()[0]
    except Exception as exc:
        return f"Unavailable ({exc})"

java_output = get_java_version()
print(f"Java: {java_output}")

print("Spark configuration (selected):")
conf_items = sorted(spark.sparkContext.getConf().getAll())
for key, value in conf_items:
    print(f" - {key} = {value}")

env_summary = {
    "python": sys.version,
    "spark": spark.version,
    "pyspark": pyspark.__version__,
    "java": java_output,
    "os": platform.platform(),
    "spark_conf": {k: v for k, v in conf_items if k.startswith("spark.")}
}

env_lines = [
    "# Environment Summary",
    "",
    f"- Python: {sys.version.split()[0]}",
    f"- Spark: {spark.version}",
    f"- PySpark: {pyspark.__version__}",
    f"- Java: {java_output}",
    f"- OS: {platform.platform()}",
    "",
    "## Spark Configuration"
]

env_lines.extend(f"- {k} = {v}" for k, v in env_summary["spark_conf"].items())

ENV_PATH = Path("ENV.md")
ENV_PATH.write_text("\n".join(env_lines) + "\n")

print(f"Environment details saved to {ENV_PATH.resolve()}")

Java: openjdk version "21.0.6" 2025-01-21
Spark configuration (selected):
 - spark.app.id = local-1765028453025
 - spark.app.name = BDA-A01
 - spark.app.startTime = 1765028451710
 - spark.app.submitTime = 1765028451221
 - spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-modules=jdk.incubator.vector --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-open