# Solution: Counting Bigrams

In [None]:
import findspark
findspark.init()

In [None]:
import pyspark

In [None]:
cd ../spark/

In [None]:
from ds101_spark_tools import use_spark_context

In [None]:
cd ../exercises

## Exercise: Bigram Count

In [None]:
text_path = "../.assets/data/iliad/iliad.txt"

with use_spark_context("BigramCount") as spark:
    text_file = spark.textFile(text_path)
    # Break each line into sentence fragments at "." so that bigrams never
    # span a sentence boundary, then tokenize each fragment.
    # str.split() without an argument splits on any run of whitespace and
    # drops leading/trailing whitespace; split(" ") would instead produce
    # empty-string tokens for consecutive spaces, which end up as bogus
    # bigrams such as ('', '').
    # zip(words, words[1:]) pairs each word with its successor and yields
    # nothing for fragments with fewer than two words.
    bigrams = text_file.flatMap(lambda line: line.split(".")) \
                       .map(lambda sentence: sentence.split()) \
                       .flatMap(lambda words: zip(words, words[1:]))
    # Classic word-count pattern, keyed by the bigram tuple, then ordered by
    # descending frequency.
    counts = bigrams.map(lambda bigram: (bigram, 1)) \
            .reduceByKey(lambda x, y: x + y) \
            .sortBy(lambda kv: kv[1], ascending=False)
    # Show the ten most frequent bigrams as (bigram, count) pairs.
    for kv in counts.take(10):
        print(kv)

## Exercise: The Greatest Name in British Surreal Comedy

In [None]:
%%file scripts/python_names.txt
Eric Idle
Graham Chapman
John Cleese
Terry Gilliam
Terry Jones
Michael Palin
Monty Python


In [None]:
# Read the names file back in, one whitespace-stripped entry per line,
# and display the resulting list.
with open("scripts/python_names.txt", "r") as names_file:
    names = []
    for raw_line in names_file:
        names.append(raw_line.strip())
    print(names)

In [None]:
%%file scripts/spark_greatest_name.py

from contextlib import contextmanager
from pyspark import SparkContext, SparkConf
import argparse

@contextmanager
def use_spark_context(appName):
    """Provide a SparkContext for the duration of a ``with`` block.

    A SparkContext configured with the application name ``appName`` is
    created and yielded to the caller. When the block is left, whether
    normally or through an exception, the context is stopped.

    Args:
        appName: Application name set on the Spark configuration.

    Yields:
        The newly created SparkContext.
    """
    sc = SparkContext(conf=SparkConf().setAppName(appName))

    try:
        print("starting ", appName)
        yield sc
    finally:
        # Always release the context, even if the body raised.
        sc.stop()
        print("stopping ", appName)

        
def load_name_bigrams(names_path):
    """Read a names file and return one tuple of words per non-blank line.

    Args:
        names_path: Path to a text file with one name per line.

    Returns:
        A list of tuples in file order, e.g. ``("Eric", "Idle")``. Lines that
        are empty or contain only whitespace are skipped, so a trailing
        newline in the file does not produce an empty pseudo-name.
    """
    with open(names_path, "r") as names_file:
        stripped_lines = (line.strip() for line in names_file)
        # split() without an argument tolerates repeated spaces or tabs
        # between the parts of a name.
        return [tuple(name.split()) for name in stripped_lines if name]


def extract_bigrams(line):
    """Return the word bigrams contained in one line of text.

    The line is cut into fragments at "." so bigrams never span a sentence
    boundary. Each fragment is tokenized on arbitrary whitespace, which avoids
    the empty-string tokens that ``split(" ")`` yields for consecutive spaces.

    Args:
        line: A single line of text.

    Returns:
        A list of ``(word, next_word)`` tuples; empty if no fragment has at
        least two words.
    """
    bigrams = []
    for sentence in line.split("."):
        words = sentence.split()
        bigrams.extend(zip(words, words[1:]))
    return bigrams


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-t','--text', help='path to text data', required=True)
    parser.add_argument('-n','--names', help='path to names file, one name per line', required=True)
    args = vars(parser.parse_args())

    text_path = args["text"]
    names_path = args["names"]
    name_bigrams = load_name_bigrams(names_path)

    print("input names: ", name_bigrams)
    # Constant-time membership test inside the Spark closure instead of
    # scanning a list for every record.
    wanted_bigrams = frozenset(name_bigrams)
    with use_spark_context("The Greatest Name") as spark:
        # Filter before reduceByKey so that only bigrams matching one of the
        # requested names are shuffled; the resulting counts are the same as
        # filtering afterwards.
        counts = spark.textFile(text_path) \
                .flatMap(extract_bigrams) \
                .filter(lambda bigram: bigram in wanted_bigrams) \
                .map(lambda bigram: (bigram, 1)) \
                .reduceByKey(lambda x, y: x + y) \
                .collect()
        print("Number of mentions for each name:")
        print(counts)

---
_This notebook is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/). Copyright © 2018-2025 [Point 8 GmbH](https://point-8.de)_