In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

def mapper(line):
    words = line.split()
    return [(word.lower().strip(".,!?;:()[]{}\"'"), 1) for word in words]

def reducer(word_counts):
    return word_counts.reduceByKey(lambda a, b: a + b)

def run_word_count(file_path):
    spark = SparkSession.builder.appName("WordCount").getOrCreate()
    sc = spark.sparkContext

    # Read the file into an RDD
    rdd = sc.textFile(file_path)

    # Map and reduce steps
    mapped_rdd = rdd.flatMap(mapper)
    reduced_rdd = reducer(mapped_rdd)

    # Collect and print the results
    word_counts = reduced_rdd.collect()
    for word, count in sorted(word_counts, key=lambda x: -x[1])[:20]:  # Display top 20 words
        print(f"{word}: {count}")

    spark.stop()

# Example usage
if __name__ == "__main__":
    file_path = "/content/poem.txt"
    run_word_count(file_path)



and: 7
of: 5
to: 5
we: 5
letters: 3
that: 3
a: 3
them: 3
the: 3
poems: 2
they: 2
have: 2
context: 2
their: 2
own: 2
read: 2
are: 2
our: 2
epistolary: 2
itâ€™s: 2
