# Answer 1 - Create a Pipeline
Write a simple Apache Beam pipeline that reads a list of integers [1, 2, 3, 4, 5], multiplies each integer by 2, and prints the results. The output should be [2, 4, 6, 8, 10].


In [4]:
import apache_beam as beam 
# Double every element of a small in-memory collection and print each result.
with beam.Pipeline() as pipeline:
    numbers = pipeline | beam.Create([1, 2, 3, 4, 5])
    doubled = numbers | beam.Map(lambda value: value * 2)
    my_pipeline = doubled | beam.Map(print)

2
4
6
8
10


# Answer 2 - Filtering Elements
 Create a pipeline that filters out even numbers from a list of integers [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] and prints only the odd numbers. The output should be [1, 3, 5, 7, 9].

In [5]:
import apache_beam as beam 
# Keep only the odd numbers of the input list and print each of them.
with beam.Pipeline() as pipeline:
    numbers = pipeline | beam.Create([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    # A non-zero remainder means the number is odd.
    odd_numbers = numbers | beam.Filter(lambda n: n % 2 != 0)
    my_pipeline = odd_numbers | beam.Map(print)

1
3
5
7
9


# Answer 3 - Mapping Strings to Uppercase
 Write a pipeline that converts a list of strings ['apple', 'banana', 'cherry'] to uppercase. The output should be ['APPLE', 'BANANA', 'CHERRY'].

In [8]:
import apache_beam as beam 
# Upper-case every string of the input list and print each result.
with beam.Pipeline() as pipeline:
    fruits = pipeline | beam.Create(['apple', 'banana', 'cherry'])
    upper_fruits = fruits | beam.Map(lambda word: word.upper())
    my_pipeline = upper_fruits | beam.Map(print)

APPLE
BANANA
CHERRY


# Answer 4 - Counting Words
 Create a pipeline that reads a list of sentences ['hello world', 'hello Beam', 'Beam is fun'] and counts the occurrences of each word. The output should be {'hello': 2, 'world': 1, 'Beam': 2, 'is': 1, 'fun': 1}.

In [22]:
import apache_beam as beam 
# Count how often each word occurs across a list of sentences and print the
# resulting (word, count) pairs.
with beam.Pipeline() as pipeline:
    my_pipeline = (
        pipeline
        | beam.Create(['hello world', 'hello Beam', 'Beam is fun'])
        # One FlatMap both splits and flattens. split() without an argument
        # collapses runs of whitespace, so repeated or trailing spaces never
        # produce empty-string "words" that would be counted.
        | beam.FlatMap(lambda sentence: sentence.split())
        | beam.combiners.Count.PerElement()
        | beam.Map(print)
    )

('hello', 2)
('world', 1)
('Beam', 2)
('is', 1)
('fun', 1)


# Assignment 5 - Calculating Average
 Write a pipeline that calculates the average of a list of integers [10, 20, 30, 40, 50]. The output should be 30.


In [19]:
import apache_beam as beam

# Compute the mean of the whole collection and print it.
with beam.Pipeline() as p:
    values = p | beam.Create([10, 20, 30, 40, 50])
    average = values | beam.combiners.Mean.Globally()
    my_pipeline = average | beam.Map(print)

30.0


# Assignment 6 - Flattening Collections
 Create a pipeline that flattens a list of lists of integers [[1, 2, 3], [4, 5, 6], [7, 8, 9]] into a single list of integers. The output should be [1, 2, 3, 4, 5, 6, 7, 8, 9].


In [23]:
import apache_beam as beam 

# Flatten a list of lists into individual integers and print each one.
with beam.Pipeline() as p:
    nested = p | beam.Create([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    # FlatMap with the identity function emits every item of each inner list.
    flattened = nested | beam.FlatMap(lambda inner: inner)
    ip = flattened | beam.Map(print)

1
2
3
4
5
6
7
8
9


# Assignment 7 - Group by Key
 Write a pipeline that groups a list of key-value pairs [('apple', 1), ('banana', 2), ('apple', 3), ('banana', 4)] by key and prints the grouped results. The output should be {'apple': [1, 3], 'banana': [2, 4]}.


In [25]:
import apache_beam as beam 

# Group the values of the key-value pairs by key and print each group.
with beam.Pipeline() as p:
    pairs = p | beam.Create([('apple', 1), ('banana', 2), ('apple', 3), ('banana', 4)])
    grouped_by_key = pairs | beam.GroupByKey()
    ip = grouped_by_key | beam.Map(print)

('apple', [1, 3])
('banana', [2, 4])


# Assignment 8 - Combining Values by Key
 Create a pipeline that sums the values of each key in a list of key-value pairs [('apple', 1), ('banana', 2), ('apple', 3), ('banana', 4)]. The output should be {'apple': 4, 'banana': 6}.

In [26]:
import apache_beam as beam 

# Sum the values of each key and print the (key, total) pairs.
with beam.Pipeline() as p:
    pairs = p | beam.Create([('apple', 1), ('banana', 2), ('apple', 3), ('banana', 4)])
    totals = pairs | beam.CombinePerKey(sum)
    ip = totals | beam.Map(print)

('apple', 4)
('banana', 6)


# Assignment 9 - Joining Collections
 Write a pipeline that performs an inner join on two collections of key-value pairs [(1, 'Alice'), (2, 'Bob')] and [(1, 'Engineer'), (2, 'Doctor')] by key. The output should be [(1, ('Alice', 'Engineer')), (2, ('Bob', 'Doctor'))].


In [41]:
import apache_beam as beam
def format_result(element):
    """Expand one co-grouped element into inner-join rows.

    ``element`` is a ``(key, groups)`` pair in which ``groups`` maps
    ``'users'`` and ``'purchases'`` to iterables of values. One
    ``(key, (user, purchase))`` tuple is yielded per user/purchase
    combination; nothing is yielded when either side is empty.
    """
    key, groups = element
    left, right = groups['users'], groups['purchases']
    yield from ((key, (user, purchase)) for user in left for purchase in right)

# Inner-join two keyed collections and print the joined rows.
with beam.Pipeline() as p:
    users = p | 'Users' >> beam.Create([(1, 'Alice'), (2, 'Bob')])
    purchases = p | 'Purchases' >> beam.Create([(1, 'Engineer'), (2, 'Doctor')])

    # CoGroupByKey gathers both inputs under their shared key, e.g.
    #     (1, {'users': ['Alice'], 'purchases': ['Engineer']})
    #     (2, {'users': ['Bob'], 'purchases': ['Doctor']})
    tagged_inputs = {'users': users, 'purchases': purchases}
    grouped = tagged_inputs | beam.CoGroupByKey()

    # format_result turns each grouped element into (key, (user, purchase)) rows.
    result = grouped | beam.FlatMap(format_result)

    result | beam.Map(print)


(1, ('Alice', 'Engineer'))
(2, ('Bob', 'Doctor'))


# Assignment 10 - ParDo Transformation
 Create a pipeline that uses the ParDo transformation to filter out numbers less than 5 and then multiply each remaining integer in a list of integers [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] by 2. The output should be [10, 12, 14, 16, 18, 20].


In [45]:
class MyFilterAndMultiplyParDo(beam.DoFn):
    """DoFn that emits twice the element when the element is at least 5."""

    def process(self, element):
        # Guard clause: elements that are not >= 5 produce no output.
        if not (element >= 5):
            return
        yield element * 2
        

# Run the filter-and-double DoFn over 1..10 and print whatever it emits.
with beam.Pipeline() as p:
    numbers = p | beam.Create([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    kept_and_doubled = numbers | beam.ParDo(MyFilterAndMultiplyParDo())
    ip = kept_and_doubled | beam.Map(print)

10
12
14
16
18
20


# Assignment 11 
Reading from Text Files: Write a pipeline that reads lines from a text file containing:

hello world
hello Beam
Beam is fun
Splits each line into words, and counts the occurrences of each word. The output should be {'hello': 2, 'world': 1, 'Beam': 2, 'is': 1, 'fun': 1}.


In [49]:
import apache_beam as beam

# Read lines from a text file, split them into words, and print the number
# of occurrences of each word as (word, count) pairs.
with beam.Pipeline() as p:
    ip = (
        p
        | beam.io.ReadFromText("file1.txt")
        # split() without an argument splits on any run of whitespace and
        # returns [] for a blank line, so empty lines and double spaces in
        # the file do not show up as an empty-string "word" in the counts.
        | beam.FlatMap(lambda line: line.split())
        | beam.combiners.Count.PerElement()
        | beam.Map(print)
    )

('hello', 2)
('world', 1)
('Beam', 2)
('is', 1)
('fun', 1)


# Assignment 12 - Writing to Text Files
 Create a pipeline that reads a list of integers [1, 2, 3, 4, 5], multiplies each integer by 2, and writes the results to a text file. The content of the file should be:
2
4
6
8
10


In [50]:
import apache_beam as beam

# Double each integer and write the results, one per line, to result_12.txt.
with beam.Pipeline() as p:
    ip = (
        p
        | beam.Create([1, 2, 3, 4, 5])
        | beam.Map(lambda x: x * 2)
        # WriteToText treats its first argument as a file-name *prefix* and by
        # default appends a shard suffix such as "-00000-of-00001", so passing
        # "result_12.txt" would not produce a file with that name. Supplying
        # the extension as the suffix and an empty shard template yields a
        # single file named exactly "result_12.txt".
        # NOTE(review): the order of lines in the output file is not something
        # this pipeline controls -- confirm if a fixed order is required.
        | beam.io.WriteToText(
            "result_12",
            file_name_suffix=".txt",
            shard_name_template="",
        )
    )

# Assignment 13 - Using Side Inputs
 Write a pipeline that reads a list of integers [1, 2, 3, 4, 5] and uses a side input with the value 10 to add this constant value to each integer. The output should be [11, 12, 13, 14, 15].

In [55]:
def addConstNum(element, toAdd):
    """Return ``element`` increased by ``toAdd``."""
    shifted = element + toAdd
    return shifted

# Add a constant, supplied as a side input, to every element and print the
# results.
with beam.Pipeline() as p:
    # The constant lives in its own single-element PCollection so it can be
    # handed to Map as a real side input rather than a plain Python argument.
    constant = p | 'Create constant' >> beam.Create([10])
    ip = (
        p
        | 'Create numbers' >> beam.Create([1, 2, 3, 4, 5])
        # AsSingleton passes the side PCollection's single value to
        # addConstNum as its second argument (toAdd).
        | beam.Map(addConstNum, beam.pvalue.AsSingleton(constant))
        | beam.Map(print)
    )

11
12
13
14
15


# Assignment 14 - Composite Transformations
 Create a composite transformation that reads a list of sentences ['Apache Beam is powerful', 'Beam makes data processing easy'], splits each sentence into words, and counts the occurrences of each word. The output should be {'Apache': 1, 'Beam': 2, 'is': 1, 'powerful': 1, 'makes': 1, 'data': 1, 'processing': 1, 'easy': 1}.

In [58]:
# Non-composite version: split the sentences into words, count every word,
# and print the (word, count) pairs.
with beam.Pipeline() as p:
    sentences = p | beam.Create(['Apache Beam is powerful', 'Beam makes data processing easy'])
    words = sentences | beam.FlatMap(lambda sentence: sentence.split(" "))
    word_counts = words | beam.combiners.Count.PerElement()
    ip = word_counts | beam.Map(print)

('Apache', 1)
('Beam', 2)
('is', 1)
('powerful', 1)
('makes', 1)
('data', 1)
('processing', 1)
('easy', 1)


In [59]:
import apache_beam as beam
from apache_beam import PTransform

class CountWords(PTransform):
    """Composite transform: split strings on whitespace and count each word."""

    def expand(self, pcoll):
        # Step 1: one output element per whitespace-separated word.
        words = pcoll | beam.FlatMap(lambda sentence: sentence.split())
        # Step 2: (word, count) for every distinct word.
        return words | beam.combiners.Count.PerElement()

# Apply the CountWords composite transform to two sentences and print the
# resulting (word, count) pairs.
with beam.Pipeline() as p:
    sentences = p | beam.Create(['Apache Beam is powerful', 'Beam makes data processing easy'])
    word_counts = sentences | CountWords()
    word_counts | beam.Map(print)


('Apache', 1)
('Beam', 2)
('is', 1)
('powerful', 1)
('makes', 1)
('data', 1)
('processing', 1)
('easy', 1)


# Problem 15 - Data Enrichment
 Write a pipeline that reads a list of user data [(1, 'Alice'), (2, 'Bob')] and a list of purchase data [(1, 'Laptop'), (2, 'Smartphone')], and enriches the purchase data with user information. The output should be [(1, ('Alice', 'Laptop')), (2, ('Bob', 'Smartphone'))].


In [63]:
def format_nicely(element):
    """Turn one co-grouped element into enriched purchase rows.

    ``element`` is a ``(key, value)`` pair in which ``value`` maps ``"user"``
    and ``"purchase"`` to iterables. Yields ``(key, (user, purchase))`` for
    every user/purchase combination; yields nothing if either side is empty.
    """
    key, value = element
    names, items = value["user"], value["purchase"]
    yield from ((key, (name, item)) for name in names for item in items)

# Enrich purchase records with the matching user name and print the rows.
with beam.Pipeline() as p:
    user = p | "Create users  Pcollection" >> beam.Create([(1, 'Alice'), (2, 'Bob')])
    purchase = p | "Create Purchases Pcollection" >> beam.Create([(1, 'Laptop'), (2, 'Smartphone')])

    # Tag each input so format_nicely can look up the two sides by name.
    tagged_inputs = {"user": user, "purchase": purchase}
    grouped = tagged_inputs | beam.CoGroupByKey()

    result = grouped | beam.FlatMap(format_nicely)
    result | beam.Map(print)

(1, ('Alice', 'Laptop'))
(2, ('Bob', 'Smartphone'))


# Problem 16 - Handling Missing Values
 Create a pipeline that reads a list of key-value pairs [('a', 1), ('b', None), ('a', 2), ('c', 3), ('b', 4)], filters out pairs with missing values, and calculates the average value for each key. The output should be {'a': 1.5, 'c': 3.0, 'b': 4.0}.

# Can not solve

# Problem 17 - Processing Unbounded Data
 Write a pipeline that reads an unbounded data source (e.g., a streaming service that emits numbers) and processes the data in real-time, filtering out numbers less than 10 and printing the results.

# On GCP

# Problem 18 - Combining with Custom Logic
 Create a pipeline that uses a custom combine function to calculate the median of a list of integers [1, 3, 3, 6, 7, 8, 9]. The output should be 6.


In [110]:
import apache_beam as beam

class MedianFn(beam.CombineFn):
    """CombineFn that computes the median of the combined values.

    The accumulator is a plain list holding every value seen so far; the
    median is only worked out in ``extract_output``.
    """

    def create_accumulator(self):
        """Return a fresh, empty list of values."""
        return []

    def add_input(self, accumulator, input):
        """Append one value to the accumulator and return the accumulator."""
        accumulator.append(input)
        return accumulator

    def merge_accumulators(self, accumulators):
        """Concatenate several partial accumulators into a single new list."""
        merged = []
        for acc in accumulators:
            merged.extend(acc)
        return merged

    def extract_output(self, accumulator):
        """Return the median of the accumulated values.

        Returns ``None`` when nothing was accumulated, the middle value for
        an odd number of values, and the mean of the two middle values for an
        even number of values.
        """
        # An empty accumulator has no middle element to index.
        if not accumulator:
            return None
        # sorted() leaves the accumulator itself untouched.
        ordered = sorted(accumulator)
        mid, is_odd = divmod(len(ordered), 2)
        if is_odd:
            return ordered[mid]
        # Even count: the median lies halfway between the two middle values.
        return (ordered[mid - 1] + ordered[mid]) / 2

# Combine the whole collection with MedianFn and print the single result.
with beam.Pipeline() as p:
    values = p | beam.Create([1, 3, 3, 6, 7, 8, 9])
    median = values | beam.CombineGlobally(MedianFn())
    median | beam.Map(print)


6


# Answer 19 - Error Handling 
Write a pipeline that reads a list of integers [1, 'two', 3, 'four', 5], includes error handling for transformation errors (e.g., converting to integers), logs errors, and continues processing valid integers. The output should log errors for 'two' and 'four' and print [1, 3, 5].


In [112]:
import apache_beam as beam
import logging

class ConvertToInt(beam.DoFn):
    """DoFn that emits ``int(element)`` and logs-and-drops bad elements."""

    def process(self, element):
        try:
            converted = int(element)
        except (ValueError, TypeError):
            # int() raises ValueError for strings such as 'two' and TypeError
            # for values such as None. Either way the element is logged and
            # skipped, so one bad record does not abort the pipeline.
            logging.error("Failed to convert %s to integer.", element)
        else:
            yield converted

class LogAndFilter(beam.DoFn):
    """Pass-through DoFn: every element is re-emitted unchanged."""

    def process(self, element):
        # Despite the class name, nothing is logged or filtered here.
        yield from (element,)

def run_pipeline():
    """Run the error-handling pipeline and print every converted integer."""
    # Show log records of level ERROR and above (the conversion failures).
    logging.basicConfig(level=logging.ERROR)

    with beam.Pipeline() as p:
        raw = p | 'Create Input' >> beam.Create([1, 'two', 3, 'four', 5])
        integers = raw | 'Convert to Int' >> beam.ParDo(ConvertToInt())
        passed_on = integers | 'Log and Filter' >> beam.ParDo(LogAndFilter())
        passed_on | 'Print Results' >> beam.Map(print)


if __name__ == '__main__':
    run_pipeline()


ERROR:root:Failed to convert two to integer.
ERROR:root:Failed to convert four to integer.


1
3
5


# Problem 20 Chaining Transformations
 Create a complex pipeline that reads a list of user actions [(1, 'login'), (2, 'purchase'), (1, 'logout'), (2, 'login'), (3, 'login')], filters out invalid actions (e.g., 'logout'), maps each action to a user, groups actions by user, and calculates the total number of actions per user. Because user 1's 'logout' is filtered out before counting, the output should be [(1, 1), (2, 2), (3, 1)].

In [118]:
import apache_beam as beam

class FilterInvalidActions(beam.DoFn):
    """DoFn that keeps only pairs whose action is 'login' or 'purchase'."""

    def process(self, element):
        user_id, action = element
        # Anything other than the two accepted actions (e.g. 'logout') is dropped.
        if action not in ('login', 'purchase'):
            return
        yield (user_id, action)

class CountActions(beam.DoFn):
    """DoFn that turns ``(user_id, actions)`` into ``(user_id, action_count)``."""

    def process(self, element):
        user_id, actions = element
        # Count by iterating instead of calling len(): the grouped values are
        # only relied on to be iterable here, so this also works when they are
        # not a sized sequence.
        # NOTE(review): some runners are reported to hand back a lazy iterable
        # without __len__ after GroupByKey -- confirm for the target runner.
        yield (user_id, sum(1 for _ in actions))

def run_pipeline():
    """Run the user-action pipeline and print one (user_id, count) per user."""
    with beam.Pipeline() as p:
        actions = p | 'Create Input' >> beam.Create(
            [(1, 'login'), (2, 'purchase'), (1, 'logout'), (2, 'login'), (3, 'login')]
        )
        valid_actions = actions | 'Filter Invalid Actions' >> beam.ParDo(FilterInvalidActions())
        actions_by_user = valid_actions | 'Group By User' >> beam.GroupByKey()
        action_counts = actions_by_user | 'Count Actions' >> beam.ParDo(CountActions())
        action_counts | 'Print Results' >> beam.Map(print)


if __name__ == '__main__':
    run_pipeline()


(1, 1)
(2, 2)
(3, 1)
