# Generators and Lazy Pipelines

- You can chain generator functions to form multi-stage data pipelines that process items one at a time.  
- No intermediate lists are built, so memory stays low even for very large streams.  
- Each generator only holds its own minimal state and passes items downstream on demand.  

## Memory Efficiency

- Lazy iterables maintain only minimal state (like start, stop, step) regardless of total length.  
- Eager collections (lists, tuples) grow in memory usage as you add items.  
- Use `sys.getsizeof()` to inspect the in-memory size of objects themselves (not their contents).  

In [38]:
# 1. Ingest the log lines from a file.
# 2.  Filter the log lines based on either level or message substring.
#3 . Extract and return only the message attributes from the filtered log lines.

import sys
import json
import os
from os import listdir

from pathlib import Path


file_path = Path.cwd()/"generators-decorators/large_logs.txt"
""" script_dir = os.path.dirname(__file__)           # Folder where the script is located
file_path = os.path.join(script_dir, "large_logs.txt") """

# print(os.getcwd())

def read_logs(filepath):
    """ Reads the  contents of a log file line by line and yields each line as a JSON object. 
    Args:
        filepath (str): The path to the log file.
    Yields:
        generator(dict(str)): A JSON object representing a log line.
    """

    with open(filepath, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue # Skip empty lines
            yield json.loads(line)

def filter_logs(logs, level=None, mesage_substring=None):
    """ Filters log entries based on log level or message substring.
    Args:
        logs (iterable(dict)): Iterable containing the logs to be filtered
        level (str): The log level to filter by (e.g., "ERROR", "INFO"). Defaults to None.
        message_substring (str): A substring to search for in the log messages. Defaults to None.
    Yields:
        generator(dict(str)): A filtered log entry as a JSON object.
    """
    for log in logs:
        if (
            level is not None 
            and log.get('level', "").lower() != level.lower()
            ):
            continue
        if (
            mesage_substring is not None 
            and mesage_substring.lower() not in log.get('message', '').lower()
            ):
            continue    
        yield log    

def extract_field(logs, field = "message"):
    """ Extracts a specific field from each log entry.
    Args:
        logs (iterable(dict)): Iterable containing the logs to extract the field from.
        field (str): The field to extract from each log entry. Defaults to "message".       
    Yields:
        generator(str): The extracted field value from each log entry.
    """
    for log in logs:
        yield log.get(field,"").strip()    


def get_first_n(logs, n=5):
    """ Retrieves the first n log entries from an iterable.
    Args:
        logs (iterable(dict)): Iterable containing the logs to retrieve from.
        n (int): The number of log entries to retrieve. Defaults to 5.  
    Yields:
        generator(dict(str)): The first n log entries as JSON objects.
    """
    count = 0
    for log in logs:
        if count >= n:
            break
        yield log
        count += 1

# print(listdir())
logs_gen = read_logs(file_path)
filter_gen = filter_logs(logs_gen, "INFO", "user")
#next(logs_gen)  # Advance the generator to the first log entry
#next(filter_gen)  # Advance the generator to the first filtered log entry
extract_gen = extract_field(filter_gen, "message")
for log in get_first_n(extract_gen, 4):
    print(log)

print("Generator object sizes (in bytes):",
      sys.getsizeof(logs_gen),
      sys.getsizeof(filter_gen),
      sys.getsizeof(extract_gen)
      )

User user77 logged out
User user75 logged out
User user1 logged in
User user85 logged in
Generator object sizes (in bytes): 232 240 224
