In [2]:
!pip install apache-beam

Collecting apache-beam
  Downloading apache_beam-2.69.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Collecting crcmod<2.0,>=1.7 (from apache-beam)
  Downloading crcmod-1.7.tar.gz (89 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fastavro<2,>=0.23.6 (from apache-beam)
  Downloading fastavro-1.12.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (5.8 kB)
Collecting fasteners<1.0,>=0.3 (from apache-beam)
  Downloading fasteners-0.20-py3-none-any.whl.metadata (4.8 kB)
Collecting grpcio!=1.48.0,!=1.59.*,!=1.60.*,!=1.61.*,!=1.62.0,!=1.62.1,<1.66.0,<2,>=1.33.1 (from apache-beam)
  Downloading grpcio-1.65.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.wh

# Try Apache Beam - Python

In this notebook, we set up your development environment and work through a simple example using the [DirectRunner](https://beam.apache.org/documentation/runners/direct/). You can explore other runners with the [Beam Capability Matrix](https://beam.apache.org/documentation/runners/capability-matrix/).

To navigate through different sections, use the table of contents. From the **View** drop-down list, select **Table of contents**.

To run a code cell, you can click the **Run cell** button at the top left of the cell, or select it and press **`Shift+Enter`**. Try modifying a code cell and re-running it to see what happens.

To learn more about Colab, see [Welcome to Colaboratory!](https://colab.sandbox.google.com/notebooks/welcome.ipynb).

# Setup

First, you need to set up your environment, which includes installing `apache-beam` and downloading a text file from Cloud Storage to your local file system. We are using this file to test your pipeline.

In [1]:
import apache_beam as beam
import re

# Name of the local text file that the pipeline reads as its input
input_file = "alice_large.txt"

print(f"Dataset ready: {input_file}")

Dataset ready: alice_large.txt


# Word count with comments

Below is mostly the same code as above, but with comments explaining every line in more detail.

In [10]:
import apache_beam as beam
import re

# DoFn that splits each input line into upper-cased alphabetic words
class ExtractWordsFn(beam.DoFn):
    """Emit every alphabetic word found in a line of text, upper-cased."""

    def process(self, element):
        """Yield the words of one line.

        Args:
            element: A line of text (a ``str``; in this notebook the lines
                come from ``ReadFromText``).

        Yields:
            Each maximal run of ASCII letters in the upper-cased line, in
            the order it appears. Digits, punctuation and whitespace act as
            separators and are never emitted.
        """
        # Upper-case before matching so that differently-cased spellings of
        # a word are emitted as the same string.
        yield from re.findall(r"[A-Za-z]+", element.upper())

# Prefix of the output file name; the shard suffix and ".txt" are appended to it
output_path = "part"

with beam.Pipeline() as pipeline:

    word_counts = (
        pipeline
        | "ReadFile" >> beam.io.ReadFromText(input_file)
          # Read the text line by line

        | "ExtractWords" >> beam.ParDo(ExtractWordsFn())
          # Split every line into words with the ExtractWordsFn DoFn

        | "CountWords" >> beam.combiners.Count.PerElement()
          # Produce one (word, count) pair per distinct word

        | "FormatOutput" >> beam.Map(lambda wc: str((wc[0].upper(), wc[1])))
          # Render each pair as the text of a tuple, e.g. "('ALICE', 400)"

    )

    # Write the formatted counts to a text file. num_shards=1 forces a single
    # output shard, so the result is always named
    # "<output_path>-00000-of-00001.txt" -- the exact file name that the
    # preview code after the pipeline opens. Without it the runner is free to
    # choose the number of shards.
    word_counts | "WriteOutput" >> beam.io.WriteToText(
        output_path,
        file_name_suffix=".txt",
        num_shards=1
    )

# Full name of the single-shard file written by the pipeline
output_file = output_path + "-00000-of-00001.txt"
# Number of lines of the output file to display
max_preview_lines = 50

print("Output file created at:", output_file)
print(f"Showing first {max_preview_lines} lines:\n")

# Stream the file and stop at the preview limit instead of loading every line
# into memory first. The encoding is explicit so the read does not depend on
# the platform's locale default (Beam's text sink writes str elements as UTF-8).
with open(output_file, "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f):
        if line_number >= max_preview_lines:
            break
        print(line.strip())

Output file created at: part-00000-of-00001.txt
Showing first 50 lines:

('CHAPTER', 200)
('I', 200)
('DOWN', 400)
('THE', 1800)
('RABBIT', 600)
('HOLE', 400)
('ALICE', 400)
('WAS', 600)
('BEGINNING', 200)
('TO', 400)
('GET', 200)
('VERY', 400)
('TIRED', 200)
('OF', 1000)
('SITTING', 200)
('BY', 400)
('HER', 1000)
('SISTER', 400)
('ON', 400)
('BANK', 200)
('AND', 1000)
('HAVING', 200)
('NOTHING', 200)
('DO', 200)
('ONCE', 200)
('OR', 600)
('TWICE', 200)
('SHE', 600)
('HAD', 400)
('PEEPED', 200)
('INTO', 200)
('BOOK', 400)
('READING', 200)
('BUT', 200)
('IT', 400)
('NO', 200)
('PICTURES', 400)
('CONVERSATIONS', 400)
('IN', 400)
('WHAT', 200)
('IS', 200)
('USE', 200)
('A', 800)
('THOUGHT', 200)
('WITHOUT', 200)
('SO', 200)
('CONSIDERING', 200)
('OWN', 200)
('MIND', 200)
('AS', 400)
