In [1]:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

In [3]:
!{'head -n 4 exclude_ids.txt'}

149633CM
212539MU
231555ZZ
704275DC

In [4]:
!{'head -n 10 dept_data.txt'}

149633CM,Marco,10,Accounts,1-01-2019
212539MU,Rebekah,10,Accounts,1-01-2019
231555ZZ,Itoe,10,Accounts,1-01-2019
503996WI,Edouard,10,Accounts,1-01-2019
704275DC,Kyle,10,Accounts,1-01-2019
957149WC,Kyle,10,Accounts,1-01-2019
241316NX,Kumiko,10,Accounts,1-01-2019
796656IE,Gaston,10,Accounts,1-01-2019
331593PS,Beryl,20,HR,1-01-2019
560447WH,Olga,20,HR,1-01-2019


------

# Additional Inputs (Side Inputs) and Additional Outputs (Tagged Outputs) in ParDo

In [5]:
class FilterUsingLength(beam.DoFn):
    """Keep rows whose name length is within bounds and whose ID is not excluded.

    Each input element is one comma-separated text row; the first field is
    treated as the ID and the second as the name.
    """

    def process(self, element_row, side_list, lower, upper):
        """Return the split row wrapped in a list, or None to drop it.

        Args:
            element_row: One comma-separated line of text.
            side_list: Container of IDs that must be filtered out.
            lower: Minimum accepted name length (inclusive).
            upper: Maximum accepted name length (inclusive).

        Returns:
            ``[fields]`` (a single-element list holding the split row) when
            the row passes both checks, otherwise ``None``.
        """
        fields = element_row.split(',')
        employee_id, employee_name = fields[0], fields[1]

        # Guard clauses: length check first, then the exclusion lookup.
        if not (lower <= len(employee_name) <= upper):
            return None
        if employee_id in side_list:
            return None
        return [fields]

In [6]:
# Load the IDs to exclude, one per line, with trailing whitespace
# (including the newline) stripped from each entry.
side_list = []
with open('exclude_ids.txt', 'r') as exclude_file:
    side_list.extend(raw_line.rstrip() for raw_line in exclude_file)

print(side_list)

['149633CM', '212539MU', '231555ZZ', '704275DC']


In [7]:
# Sum one count per row for every "Accounts" row that passes
# FilterUsingLength, keyed by "<id>, <name>", and write the totals to data/dep.
with beam.Pipeline() as p1:
    raw_rows = p1 | "Read from file" >> beam.io.ReadFromText('dept_data.txt')

    # side_list, 3 and 10 are forwarded to FilterUsingLength.process as its
    # side_list, lower and upper arguments.
    kept_rows = raw_rows | "Par do with side inputs" >> beam.ParDo(
        FilterUsingLength(), side_list, 3, 10
    )

    # Rows are already split into fields here; index 3 is the department name.
    accounts_rows = kept_rows | "Filtering based on 'accounts'" >> beam.Filter(
        lambda record: record[3] == "Accounts"
    )

    # Pair each row's "<id>, <name>" key with a 1 so the rows can be summed.
    keyed_ones = accounts_rows | "Map transform based on account record" >> beam.Map(
        lambda record: (record[0] + ", " + record[1], 1)
    )

    totals_per_key = keyed_ones | "Group and sum" >> beam.CombinePerKey(sum)

    attendence_count = totals_per_key | "write to dep" >> beam.io.WriteToText('data/dep')
    
!{'head -n 20 data/dep-00000-of-00001'}



('503996WI, Edouard', 31)
('957149WC, Kyle', 31)
('241316NX, Kumiko', 31)
('796656IE, Gaston', 31)
('718737IX, Ayumi', 30)


----

In [8]:
class ProcessWords(beam.DoFn):
    """Route the name field of each row to tagged outputs and the main output.

    Every name is emitted to exactly one of the tagged outputs
    ``"Short_Names"`` or ``"Long_Names"``. Names that start with ``marker``
    are additionally emitted, untagged, to the main output.
    """

    def process(self, element, cutoff, marker):
        """Yield the outputs for one comma-separated row.

        Args:
            element: One comma-separated line of text; the second field is
                the name.
            cutoff: Names with ``len(name) <= cutoff`` go to "Short_Names",
                all others go to "Long_Names".
            marker: Prefix that selects names for the main output.

        Yields:
            A ``TaggedOutput`` for the short/long classification, followed by
            the bare name when it starts with ``marker``.
        """
        name = element.split(",")[1]

        # Yield instead of return: returning from both branches made the
        # marker check below unreachable, leaving the main output empty.
        if len(name) <= cutoff:
            yield beam.pvalue.TaggedOutput("Short_Names", name)
        else:
            yield beam.pvalue.TaggedOutput("Long_Names", name)

        # Untagged values go to the main output. Yielding the string emits it
        # as a single element rather than handing back a bare str.
        if name.startswith(marker):
            yield name

In [17]:
# Split the names from dept_data.txt into three collections with one ParDo:
# "Short_Names" and "Long_Names" are tagged outputs, "Names_M" is the main one.
p2 = beam.Pipeline()

attendence_count = (
    p2
    | beam.io.ReadFromText('dept_data.txt')
    | beam.ParDo(ProcessWords(), cutoff=4, marker='M').with_outputs("Short_Names","Long_Names",main="Names_M")
)

# Each declared output is read back as an attribute named after its tag.
short_coll = attendence_count.Short_Names
long_coll = attendence_count.Long_Names
start_M = attendence_count.Names_M

short_coll | "short" >> beam.io.WriteToText('data/short')
long_coll | "long" >> beam.io.WriteToText('data/long')
start_M | "start" >> beam.io.WriteToText('data/start')

# Wait for the run to finish so the output files exist before later cells
# read them; the result object is kept as the cell's displayed value.
p2_result = p2.run()
p2_result.wait_until_finish()
p2_result



<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7fbbbb28fe80>

In [18]:
!{'head -n 5 data/short-00000-of-00001'}

Itoe
Kyle
Kyle
Olga
Kirk


In [19]:
!{'head -n 5 data/long-00000-of-00001'}

Marco
Rebekah
Edouard
Kumiko
Gaston


In [20]:
!{'head -n 5 data/start-00000-of-00001'}