In [1]:
import sys
import platform
from pyspark.sql import SparkSession
import pyspark

# Configure the session step by step: application name, session time zone and
# a small shuffle-partition count, then create (or reuse) the session.
_builder = SparkSession.builder.appName("BDA-PracticeLab02")
_builder = _builder.config("spark.sql.session.timeZone", "GMT+1")
_builder = _builder.config("spark.sql.shuffle.partitions", "4")
spark = _builder.getOrCreate()

# Only warnings and errors from Spark should reach the notebook output.
spark.sparkContext.setLogLevel("WARN")

# Report the versions and the effective values of the two settings above.
print(
    f"Spark version: {spark.version}",
    f"PySpark version: {pyspark.__version__}",
    f"Python version: {sys.version.split()[0]}",
    f"Session timezone: {spark.conf.get('spark.sql.session.timeZone')}",
    f"Shuffle partitions: {spark.conf.get('spark.sql.shuffle.partitions')}",
    sep="\n",
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/07 04:33:26 WARN Utils: Your hostname, a03-341a, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/07 04:33:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/07 04:33:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/07 04:33:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark version: 4.0.1
PySpark version: 4.0.1
Python version: 3.10.19
Session timezone: GMT+1
Shuffle partitions: 4


In [2]:
from pathlib import Path
import urllib.request
import re

# Working directories, all located under the directory the notebook runs in.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
OUTPUTS_DIR = BASE_DIR / "outputs"
PROOF_DIR = BASE_DIR / "proof"
for directory in (DATA_DIR, OUTPUTS_DIR, PROOF_DIR):
    directory.mkdir(exist_ok=True)

SOURCE_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
TEXT_PATH = DATA_DIR / "shakespeare.txt"
if not TEXT_PATH.exists():
    # Download into a sibling ".part" file and rename it only after the
    # transfer succeeded. Writing straight to TEXT_PATH would leave a truncated
    # corpus behind on an interrupted download, and the exists() guard above
    # would then silently accept it on every later run.
    partial_path = TEXT_PATH.with_name(TEXT_PATH.name + ".part")
    try:
        urllib.request.urlretrieve(SOURCE_URL, partial_path)
        partial_path.replace(TEXT_PATH)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial_path.unlink(missing_ok=True)

# The same file is exposed both as an RDD of lines and as a single-column
# DataFrame (column "line"); both are cached for reuse.
lines_rdd = spark.sparkContext.textFile(str(TEXT_PATH)).cache()
lines_df = spark.read.text(str(TEXT_PATH)).withColumnRenamed("value", "line").cache()

pattern = re.compile(r"[a-z]+")


def tokenize(text: str):
    """Lower-case ``text`` and return its maximal runs of ASCII letters, in order.

    Every character outside ``a``-``z`` acts as a separator, so digits and
    punctuation are dropped and an apostrophe splits a word in two
    (``"ne'er"`` becomes ``["ne", "er"]``).
    """
    lowered = text.lower()
    return [match.group(0) for match in pattern.finditer(lowered)]

# Cached tokenized lines for reuse
from pyspark import StorageLevel

# One token list per input line; empty tokens are filtered out and the result
# is kept in memory so later cells do not tokenize again.
tokenized_lines_rdd = (
    lines_rdd
    .map(tokenize)
    .map(lambda line_tokens: list(filter(None, line_tokens)))
    .persist(StorageLevel.MEMORY_ONLY)
)

# Counting also materializes the cached DataFrame; then preview the first rows.
num_lines = lines_df.count()
print(f"Loaded {num_lines} lines from {TEXT_PATH}")
lines_df.show(5, truncate=False)

                                                                                

Loaded 40000 lines from /home/img/BigData/Lab02/lab2-practice/data/shakespeare.txt
+---------------------------------------------+
|line                                         |
+---------------------------------------------+
|First Citizen:                               |
|Before we proceed any further, hear me speak.|
|                                             |
|All:                                         |
|Speak, speak.                                |
+---------------------------------------------+
only showing top 5 rows


In [3]:
from operator import add
from io import StringIO
from contextlib import redirect_stdout
from pyspark.sql import functions as F

# Bigram counts (pairs design)
def _bigram_events(tokens):
    """Emit ``((w1, w2), 1)`` for each pair of adjacent tokens on one line."""
    return [((left, right), 1) for left, right in zip(tokens, tokens[1:])]


def _marginal_events(tokens):
    """Emit ``((w1, '*'), 1)`` for each token that has a successor on its line."""
    return [((left, '*'), 1) for left in tokens[:-1]]


def _relative_frequency_row(joined):
    """Turn ``(w1, ((w2, pair_count), w1_total))`` into ``(w1, w2, rel_freq, count)``."""
    w1, ((w2, pair_count), w1_total) = joined
    return (w1, w2, pair_count / w1_total, pair_count)


# count(w1, w2) for every adjacent pair.
pair_counts = tokenized_lines_rdd.flatMap(_bigram_events).reduceByKey(add)

# count(w1, *): how many bigrams start with w1, re-keyed by w1 alone.
marginal_counts = (
    tokenized_lines_rdd
    .flatMap(_marginal_events)
    .reduceByKey(add)
    .map(lambda item: (item[0][0], item[1]))
)

# Join each pair count with its marginal to get count(w1, w2) / count(w1, *).
pairs_keyed_by_w1 = pair_counts.map(lambda item: (item[0][0], (item[0][1], item[1])))
relative_freq_rdd = pairs_keyed_by_w1.join(marginal_counts).map(_relative_frequency_row)

bigram_pairs_df = spark.createDataFrame(relative_freq_rdd, schema=["w1", "w2", "rel_freq", "count"])

# Top 50 rows; ties on rel_freq are broken by count, then alphabetically.
pairs_ranked_df = bigram_pairs_df.orderBy(
    F.desc("rel_freq"), F.desc("count"), F.asc("w1"), F.asc("w2")
)
pairs_top_df = pairs_ranked_df.limit(50)

pairs_top_df.show(20, truncate=False)

pairs_top_pdf = pairs_top_df.toPandas()
pairs_top_pdf.to_csv(OUTPUTS_DIR / "bigram_pairs_top.csv", index=False)

# explain() prints to stdout, so capture it in order to store the plan on disk.
plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    pairs_top_df.explain("formatted")
plan_path = PROOF_DIR / "plan_bigrams.txt"
plan_path.write_text(plan_buffer.getvalue())

25/12/07 04:33:42 WARN BlockManager: Block rdd_2_1 already exists on this machine; not re-adding it
25/12/07 04:33:42 WARN BlockManager: Block rdd_2_0 already exists on this machine; not re-adding it
                                                                                

+-------+-----+--------+-----+
|w1     |w2   |rel_freq|count|
+-------+-----+--------+-----+
|ne     |er   |1.0     |58   |
|ta     |en   |1.0     |24   |
|able   |to   |1.0     |10   |
|market |place|1.0     |10   |
|stabb  |d    |1.0     |10   |
|stopp  |d    |1.0     |9    |
|wont   |to   |1.0     |9    |
|whate  |er   |1.0     |8    |
|bottom |of   |1.0     |6    |
|scatter|d    |1.0     |6    |
|abhorr |d    |1.0     |5    |
|alban  |s    |1.0     |5    |
|begg   |d    |1.0     |5    |
|smother|d    |1.0     |5    |
|stol   |n    |1.0     |5    |
|unlook |d    |1.0     |5    |
|fruits |of   |1.0     |4    |
|ink    |and  |1.0     |4    |
|kites  |and  |1.0     |4    |
|looker |on   |1.0     |4    |
+-------+-----+--------+-----+
only showing top 20 rows


570

In [4]:
from collections import Counter


def build_stripes(tokens):
    """Build the per-line stripes: one follower ``Counter`` per head word.

    For every adjacent pair ``(head, follower)`` in ``tokens`` the follower is
    counted under its head. Returns a list of ``(head, Counter)`` tuples in
    order of first appearance of each head; sequences with fewer than two
    tokens yield an empty list.
    """
    stripes = {}
    for head, follower in zip(tokens, tokens[1:]):
        stripes.setdefault(head, Counter())[follower] += 1
    return list(stripes.items())


def merge_counters(c1, c2):
    """Return a new ``Counter`` with the summed counts of ``c1`` and ``c2``.

    Neither input is modified.
    """
    merged = Counter(c1)
    merged.update(c2)
    return merged

def _stripe_to_rows(stripe):
    """Expand one ``(head, Counter)`` stripe into ``(w1, w2, rel_freq, count)`` rows.

    The stripe total is computed once up front; recomputing it for every
    follower would make the expansion quadratic in the stripe size.
    """
    head, followers = stripe
    total = sum(followers.values())
    return [
        (head, follower, count / total, count)
        for follower, count in followers.items()
    ]


# One merged follower Counter per head word across all lines.
stripes_counts = (
    tokenized_lines_rdd
    .flatMap(build_stripes)
    .reduceByKey(merge_counters)
)

# Flatten every stripe into one row per (head, follower) pair.
stripes_rows = stripes_counts.flatMap(_stripe_to_rows)

stripes_df = spark.createDataFrame(stripes_rows, schema=["w1", "w2", "rel_freq", "count"])

# Top 50 rows; ties on rel_freq are broken by count, then alphabetically.
stripes_ranked_df = stripes_df.orderBy(
    F.desc("rel_freq"), F.desc("count"), F.asc("w1"), F.asc("w2")
)
stripes_top_df = stripes_ranked_df.limit(50)

stripes_top_df.show(20, truncate=False)

# Export the same 50 rows through pandas.
stripes_top_pdf = stripes_top_df.toPandas()
stripes_top_pdf.to_csv(OUTPUTS_DIR / "bigram_stripes_top.csv", index=False)

                                                                                

+-------+-----+--------+-----+
|w1     |w2   |rel_freq|count|
+-------+-----+--------+-----+
|ne     |er   |1.0     |58   |
|ta     |en   |1.0     |24   |
|able   |to   |1.0     |10   |
|market |place|1.0     |10   |
|stabb  |d    |1.0     |10   |
|stopp  |d    |1.0     |9    |
|wont   |to   |1.0     |9    |
|whate  |er   |1.0     |8    |
|bottom |of   |1.0     |6    |
|scatter|d    |1.0     |6    |
|abhorr |d    |1.0     |5    |
|alban  |s    |1.0     |5    |
|begg   |d    |1.0     |5    |
|smother|d    |1.0     |5    |
|stol   |n    |1.0     |5    |
|unlook |d    |1.0     |5    |
|fruits |of   |1.0     |4    |
|ink    |and  |1.0     |4    |
|kites  |and  |1.0     |4    |
|looker |on   |1.0     |4    |
+-------+-----+--------+-----+
only showing top 20 rows


In [5]:
from operator import add
from pyspark.sql import types as T


# Number of consecutive input lines grouped into one "document" of the index.
LINES_PER_DOC = 10


def _line_to_doc_terms(indexed_line):
    """Map ``(line, line_number)`` to ``((doc_id, token), 1)`` events.

    ``doc_id`` is the zero-based line number integer-divided by
    ``LINES_PER_DOC``.
    """
    line, line_number = indexed_line
    doc_id = line_number // LINES_PER_DOC
    return [((doc_id, token), 1) for token in tokenize(line) if token]


# Term frequency per (document, term).
doc_term_counts = (
    lines_rdd
    .zipWithIndex()
    .flatMap(_line_to_doc_terms)
    .reduceByKey(add)
)

# Re-key by term: term -> (doc_id, tf), with both numbers as plain ints.
term_postings_rdd = (
    doc_term_counts
    .map(lambda kv: (kv[0][1], (int(kv[0][0]), int(kv[1]))))
)

def to_postings(values):
    """Convert an iterable of ``(doc_id, tf)`` pairs into a sorted postings list.

    Returns a list of ``{"doc_id": ..., "tf": ...}`` dicts ordered by ascending
    ``doc_id``; pairs with equal ``doc_id`` keep their input order.
    """
    ordered = list(values)
    ordered.sort(key=lambda pair: pair[0])
    postings = []
    for doc_id, tf in ordered:
        postings.append({"doc_id": doc_id, "tf": tf})
    return postings

# Gather all (doc_id, tf) pairs of a term and turn them into a sorted postings list.
index_rows_rdd = term_postings_rdd.groupByKey().mapValues(to_postings)

# Explicit schema: term, document frequency, and the array of postings structs.
schema = T.StructType([
    T.StructField("term", T.StringType(), False),
    T.StructField("df", T.IntegerType(), False),
    T.StructField("postings", T.ArrayType(T.StructType([
        T.StructField("doc_id", T.IntegerType(), False),
        T.StructField("tf", T.IntegerType(), False),
    ]), False), False),
])

# df is the number of postings, i.e. the number of documents containing the term.
index_df = spark.createDataFrame(
    index_rows_rdd.map(lambda kv: (kv[0], len(kv[1]), kv[1])),
    schema=schema,
)

output_index_path = OUTPUTS_DIR / "index_parquet"
index_df.write.mode("overwrite").parquet(str(output_index_path))

print(f"Inverted index written to {output_index_path}")
# Preview the most frequent terms. The postings arrays of these terms are very
# long, so cap each cell at 80 characters instead of printing them in full.
index_df.orderBy(F.desc("df"), F.asc("term")).show(10, truncate=80)

                                                                                

Inverted index written to /home/img/BigData/Lab02/lab2-practice/outputs/index_parquet
+----+----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
import os
from collections import defaultdict

# Reload the persisted index and pull it to the driver as a plain
# term -> postings mapping for local query evaluation.
index_df = spark.read.parquet(str(OUTPUTS_DIR / "index_parquet"))
index_local = dict((row.term, row.postings) for row in index_df.collect())


def postings_to_dict(term):
    """Return ``{doc_id: tf}`` for ``term`` from ``index_local``.

    A term that is not in the index yields an empty dict.
    """
    doc_to_tf = {}
    for entry in index_local.get(term, []):
        doc_to_tf[entry['doc_id']] = entry['tf']
    return doc_to_tf


def evaluate_and(query_terms):
    """Rank the documents that contain every term in ``query_terms``.

    A document's score is the sum of the term frequencies of all query terms
    in it. Returns ``(doc_id, score)`` tuples sorted by descending score, then
    ascending ``doc_id``. An empty query, or any term without postings, gives
    an empty list.
    """
    if not query_terms:
        return []

    per_term = list(map(postings_to_dict, query_terms))
    # An empty postings dict means that term matches nothing, so AND is empty.
    if not all(per_term):
        return []

    # Intersect the document sets of all terms.
    shared_docs = set(per_term[0])
    for term_postings in per_term[1:]:
        shared_docs.intersection_update(term_postings)

    ranked = [
        (
            doc_id,
            sum(term_postings[doc_id] for term_postings in per_term if doc_id in term_postings),
        )
        for doc_id in shared_docs
    ]
    ranked.sort(key=lambda hit: (-hit[1], hit[0]))
    return ranked


def evaluate_or(query_terms):
    """Rank the documents that contain at least one term in ``query_terms``.

    A document's score is the sum of the term frequencies of the query terms
    found in it. Returns ``(doc_id, score)`` tuples sorted by descending score,
    then ascending ``doc_id``.
    """
    totals = {}
    for term in query_terms:
        for doc_id, tf in postings_to_dict(term).items():
            totals[doc_id] = totals.get(doc_id, 0) + tf
    ranked = list(totals.items())
    ranked.sort(key=lambda hit: (-hit[1], hit[0]))
    return ranked

# Queries evaluated in both AND and OR mode for the report.
sample_queries = [
    ["romeo", "juliet"],
    ["king", "queen"],
    ["thou", "art"],
    ["love", "hate"],
    ["storm"],
]


def _hits_section(title, hits, limit=10):
    """Return the markdown lines of one result table.

    The section is a ``### title`` heading, the table header, up to ``limit``
    ``doc | score`` rows (or a "no results" marker when ``hits`` is empty) and
    a trailing blank line.
    """
    section = [f"### {title}", "DocID | Score", "--- | ---"]
    if hits:
        section.extend(f"{doc} | {score}" for doc, score in hits[:limit])
    else:
        section.append("*(no results)*")
    section.append("")
    return section


md_lines = ["# Boolean Retrieval Results", ""]
for terms in sample_queries:
    and_hits = evaluate_and(terms)
    or_hits = evaluate_or(terms)
    md_lines.append(f"## Query: {' '.join(terms)}")
    md_lines.extend(_hits_section("AND", and_hits))
    md_lines.extend(_hits_section("OR", or_hits))

# write_text() opens the file in text mode, which already translates "\n" into
# the platform line ending. Joining with os.linesep would therefore produce
# "\r\r\n" on Windows, so a plain "\n" is the correct separator everywhere.
newline = "\n"
queries_path = OUTPUTS_DIR / "queries_and_results.md"
queries_path.write_text(newline.join(md_lines) + newline)

print(f"Query results saved to {queries_path}")

Query results saved to /home/img/BigData/Lab02/lab2-practice/outputs/queries_and_results.md


In [7]:
import os
import subprocess


def get_java_version():
    """Return the first line printed by ``java -version``.

    stderr is merged into the captured output. Any failure (missing
    executable, non-zero exit status, undecodable or empty output) is reported
    as the string ``"Unavailable (<error>)"`` instead of being raised.
    """
    try:
        raw = subprocess.check_output(["java", "-version"], stderr=subprocess.STDOUT)
        report = raw.decode("utf-8").strip()
        first_line = report.splitlines()[0]
    except Exception as exc:
        return f"Unavailable ({exc})"
    return first_line

java_version = get_java_version()
print(f"Java: {java_version}")

# Print every key/value pair of the active SparkConf, sorted by key.
print("Spark configuration (selected):")
conf_items = sorted(spark.sparkContext.getConf().getAll())
for key, value in conf_items:
    print(f" - {key} = {value}")

# Markdown summary: tool versions first, then the same configuration listing.
env_lines = [
    "# Environment Summary",
    "",
    f"- Python: {sys.version.split()[0]}",
    f"- Spark: {spark.version}",
    f"- PySpark: {pyspark.__version__}",
    f"- Java: {java_version}",
    f"- OS: {platform.platform()}",
    "",
    "## Spark Configuration",
]

env_lines.extend(f"- {k} = {v}" for k, v in conf_items)

# write_text() opens the file in text mode, which already translates "\n" into
# the platform line ending. Joining with os.linesep would therefore produce
# "\r\r\n" on Windows, so a plain "\n" is the correct separator everywhere.
newline = "\n"
ENV_PATH = BASE_DIR / "ENV.md"
ENV_PATH.write_text(newline.join(env_lines) + newline)

print(f"Environment summary saved to {ENV_PATH}")

Java: openjdk version "21.0.6" 2025-01-21
Spark configuration (selected):
 - spark.app.id = local-1765078410002
 - spark.app.name = BDA-PracticeLab02
 - spark.app.startTime = 1765078408217
 - spark.app.submitTime = 1765078407348
 - spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-modules=jdk.incubator.vector --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED 