In [2]:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

# ParDo Transform

In [7]:
class SplitRow(beam.DoFn):
    """DoFn that breaks one comma-separated text line into its fields."""

    def process(self, element):
        """Return a one-item list whose only item is the list of fields.

        Wrapping the field list in an outer list makes the whole row a
        single output element instead of one output per field.
        """
        fields = element.split(',')
        return [fields]

In [60]:
class FilterRow(beam.DoFn):
    """DoFn that keeps only rows whose fourth field equals "Accounts"."""

    def process(self, element):
        """Return ``[element]`` for a matching row, otherwise ``None``.

        ``element`` is indexed at position 3, so it must be a sequence
        with at least four items.
        """
        fourth_field = element[3]
        if fourth_field != "Accounts":
            # Nothing is emitted for a non-matching row.
            return None
        return [element]

In [61]:
class PairRow(beam.DoFn):
    """DoFn that turns a row into a ``(second field, 1)`` pair."""

    def process(self, element):
        """Return a one-item list holding the pair ``(element[1], 1)``."""
        key = element[1]
        pair = (key, 1)
        return [pair]

In [62]:
class Counting(beam.DoFn):
    """DoFn that sums the grouped values of a ``(key, values)`` pair."""

    def process(self, element):
        """Return a one-item list holding ``(key, sum(values))``."""
        key, grouped = element
        total = sum(grouped)
        return [(key, total)]

In [63]:
# ParDo-only pipeline: read 'dept_data.txt', split each line on commas, keep
# the rows whose fourth field is "Accounts", pair the second field with 1,
# group by that key, sum the ones per key, and write the result under the
# 'data/output' file prefix.
p1 = beam.Pipeline()
attendence_count = (
    p1
    | beam.io.ReadFromText('dept_data.txt')
    | beam.ParDo(SplitRow())
    | beam.ParDo(FilterRow())
    | beam.ParDo(PairRow())
    | beam.GroupByKey()
    | beam.ParDo(Counting())
    | beam.io.WriteToText('data/output')
)

# run() only hands back a PipelineResult; depending on the runner the job may
# still be executing. Block until it is done so that the output shard exists
# before the next notebook line reads it.
p1.run().wait_until_finish()

!{'head -n 20 data/output-00000-of-00001'}



('Marco', 31)
('Rebekah', 31)
('Itoe', 31)
('Edouard', 31)
('Kyle', 62)
('Kumiko', 31)
('Gaston', 31)
('Ayumi', 30)


-----------

In [44]:
class AverageFn(beam.CombineFn):
    """CombineFn that computes the arithmetic mean of its inputs.

    The accumulator is a ``(running sum, number of inputs)`` pair.
    """

    def create_accumulator(self):
        """Return the empty accumulator: a float sum of 0.0 and a count of 0."""
        return (0.0, 0)

    def add_input(self, sum_count, inp):
        """Fold one input value into the accumulator and return the new pair."""
        running_total, seen = sum_count
        return running_total + inp, seen + 1

    def merge_accumulators(self, accum):
        """Combine several accumulators by adding their sums and their counts."""
        # zip(*...) transposes the (sum, count) pairs into one tuple of sums
        # and one tuple of counts.
        partial_sums, partial_counts = zip(*accum)
        return sum(partial_sums), sum(partial_counts)

    def extract_output(self, sum_count):
        """Return sum / count, or NaN when no input was accumulated."""
        total, seen = sum_count
        if not seen:
            return float('NaN')
        return total / seen

In [46]:
# CombineGlobally demo: average a small in-memory list of numbers with
# AverageFn and write the single result under the 'data/output' file prefix.
p2 = beam.Pipeline()

small_count = (
    p2
    | beam.Create([15, 5, 7, 7, 9, 23, 13, 5])
    | beam.CombineGlobally(AverageFn())
    | beam.io.WriteToText('data/output')
)

# run() only hands back a PipelineResult; depending on the runner the job may
# still be executing. Block until it is done so that the output shard exists
# before the next notebook line reads it.
p2.run().wait_until_finish()

!{'head -n 20 data/output-00000-of-00001'}



10.5


------------

# Composite Transform

In [66]:
def filter_on_count(element):
    """Return ``element`` when its count is above 30, otherwise ``None``.

    ``element`` is a ``(name, count)`` pair. The returned pair is a non-empty
    tuple, so the result is truthy exactly when the count exceeds 30.
    """
    _, total = element
    return element if total > 30 else None

In [90]:
def format_output(element):
    """Render a ``(name, count)`` pair as ``"<name>, <count>, Regular Employee."``."""
    label, total = element
    pieces = (label, ", ", str(total), ", Regular Employee.")
    return "".join(pieces)
# .join((name.encode('ascii')), str(count), "Regular Employee")

In [91]:
class MyTransform(beam.PTransform):
    """Composite transform: sum per key, filter on the count, format as text.

    It bundles three steps so that several branches of a pipeline can apply
    them as one unit.
    """

    def expand(self, input_col):
        """Apply the three steps to ``input_col`` and return the result."""
        summed = input_col | "Group and sum" >> beam.CombinePerKey(sum)
        kept = summed | "Filter account" >> beam.Filter(filter_on_count)
        return kept | "Regular Employee" >> beam.Map(format_output)
        

In [92]:
# Branching pipeline: the file is read and split once, then two branches
# (rows whose fourth field is "Accounts" and rows whose fourth field is "HR")
# each apply the MyTransform composite and write to their own file prefix.
p8 = beam.Pipeline()

# Shared input for both branches: every line split into a list of fields.
input_collection = (
    p8
    | "Read from file" >> beam.io.ReadFromText('dept_data.txt')
    | "Map transform based on ," >> beam.Map(lambda record: record.split(','))
)

account_count = (
    input_collection
    | "Filtering based on 'accounts'" >> beam.Filter(lambda record: record[3] == "Accounts")
    | "Map transform based on account record" >> beam.Map(lambda record: ("Accounts, " + record[1], 1))
    | "compostite account" >> MyTransform()
    | "write to account" >> beam.io.WriteToText('data/account')
)

hr_count = (
    input_collection
    | "Filtering based on 'hr'" >> beam.Filter(lambda record: record[3] == "HR")
    | "Map transform based on hr record" >> beam.Map(lambda record: ("HR, " + record[1], 1))
    | "compostite hr" >> MyTransform()
    | "write to hr" >> beam.io.WriteToText('data/hr')
)

# run() only hands back a PipelineResult; depending on the runner the job may
# still be executing. Block until it is done so that both output shards exist
# before the following notebook lines read them.
p8.run().wait_until_finish()


!{'head -n 20 data/account-00000-of-00001'}

!{'head -n 20 data/hr-00000-of-00001'}



Accounts, Marco, 31, Regular Employee.
Accounts, Rebekah, 31, Regular Employee.
Accounts, Itoe, 31, Regular Employee.
Accounts, Edouard, 31, Regular Employee.
Accounts, Kyle, 62, Regular Employee.
Accounts, Kumiko, 31, Regular Employee.
Accounts, Gaston, 31, Regular Employee.
HR, Beryl, 62, Regular Employee.
HR, Olga, 31, Regular Employee.
HR, Leslie, 31, Regular Employee.
HR, Mindy, 31, Regular Employee.
HR, Vicky, 31, Regular Employee.
HR, Richard, 31, Regular Employee.
HR, Kirk, 31, Regular Employee.
HR, Kaori, 31, Regular Employee.
HR, Oscar, 31, Regular Employee.


--------------------

In [93]:
def ret_tuple(element):
    """Split a comma-separated line into ``(first field, remaining fields)``.

    The remaining fields are returned as a list, which is empty when the
    line contains no comma.
    """
    key, *rest = element.split(',')
    return (key, rest)

In [95]:
# CoGroupByKey join: both files are turned into (first field, other fields)
# pairs by ret_tuple and then grouped together on that first field.
p8 = beam.Pipeline()

# Transform labels must be unique within a pipeline (Beam raises RuntimeError
# on a duplicate), so the two reads and the two maps carry distinct labels.
dep_rows = (
    p8
    | "Read from file" >> beam.io.ReadFromText('dept_data.txt')
    | "Map transform1," >> beam.Map(ret_tuple)
)

loc_rows = (
    p8
    | "Read from loc file" >> beam.io.ReadFromText('location.txt')
    | "Map transform2," >> beam.Map(ret_tuple)
)

# The dict keys name the two inputs; they are used to tag each input's
# values in the grouped output.
result = (
    {"dep_data": dep_rows, "loc_data": loc_rows}
    | beam.CoGroupByKey()
    | "write to result" >> beam.io.WriteToText('data/result')
)

# run() only hands back a PipelineResult; depending on the runner the job may
# still be executing. Block until it is done so that the output shard exists
# before the next notebook line reads it.
p8.run().wait_until_finish()

!{'head -n 20 data/result-00000-of-00001'}

RuntimeError: A transform with label "[95]: Read from file" already exists in the pipeline. To apply a transform with a specified label write pvalue | "label" >> transform