# 02 - Data Structures (Advanced)

Strengthen your command of Python's built-in and standard-library data structures for data engineering work.


## 1. Lists and List Comprehensions


In [1]:
# Sample event stream; it deliberately contains repeated values.
raw_events = ['check_in', 'check_out', 'check_in', 'cancel', 'check_in']

# dict keys are unique and keep insertion order, so unpacking them
# de-duplicates the stream while preserving first-seen order.
unique_events = [*dict.fromkeys(raw_events)]
print(unique_events)

# Keep every event except cancellations (repeats are retained).
recent_events = [name for name in raw_events if name != 'cancel']
print(recent_events)

room_numbers = list(range(101, 106))
# Positional slice: every element from index 2 through the end of the list.
upper_floor_rooms = room_numbers[2:]
print(upper_floor_rooms)


['check_in', 'check_out', 'cancel']
['check_in', 'check_out', 'check_in', 'check_in']
[103, 104, 105]


## 2. Tuples and Named Records


In [2]:
from collections import namedtuple

# Tuple subclass with two named fields; the field names are given here as a
# single whitespace-separated string, which namedtuple accepts.
BookingKey = namedtuple('BookingKey', 'property_id booking_id')

# Fields are filled positionally, in the order declared above.
booking_key = BookingKey('HTL-01', 'BKG-7781')
print(booking_key)
print(f'Property: {booking_key.property_id}, Booking: {booking_key.booking_id}')


BookingKey(property_id='HTL-01', booking_id='BKG-7781')
Property: HTL-01, Booking: BKG-7781


## 3. Sets and Set Operations


In [None]:
# Two guest-ID sets to reconcile against each other.
pms_guest_ids = {'G-101', 'G-102', 'G-103', 'G-104'}
crm_guest_ids = {'G-103', 'G-104', 'G-105'}

# Intersection: IDs present in both sets.
matched_guests = pms_guest_ids & crm_guest_ids
# Difference: IDs in pms_guest_ids that are absent from crm_guest_ids.
only_in_pms = pms_guest_ids - crm_guest_ids
# Union: every distinct ID from either set.
all_unique_guests = pms_guest_ids | crm_guest_ids

# Sets are unordered, and string hashing is randomized per Python process, so
# printing a set directly can show a different element order after every
# kernel restart. Printing a sorted view keeps the output reproducible while
# leaving the set variables themselves untouched.
print('Matched:', sorted(matched_guests))
print('Only in PMS:', sorted(only_in_pms))
print('All unique:', sorted(all_unique_guests))


## 4. Dictionaries and Comprehensions


In [None]:
from collections import Counter, defaultdict

# Booking records, one dict per booking.
bookings = [
    {'booking_id': 'B1', 'revenue': 120.0},
    {'booking_id': 'B2', 'revenue': 180.0},
    {'booking_id': 'B3', 'revenue': 90.0},
]

# Dict comprehension: index each record's revenue by its booking_id.
booking_revenue = {
    record['booking_id']: record['revenue']
    for record in bookings
}
print(booking_revenue)

# Parallel lists: position i of each list describes the same event.
channel_events = ['Direct', 'OTA', 'OTA', 'Direct', 'Corporate']
event_revenue = [120, 200, 180, 90, 300]

# defaultdict(float) starts every unseen channel at 0.0, so the running
# total needs no "key exists?" check.
revenue_by_channel = defaultdict(float)
for channel, revenue in zip(channel_events, event_revenue):
    revenue_by_channel[channel] = revenue_by_channel[channel] + revenue

print(dict(revenue_by_channel))
# Counter tallies how many times each channel appears.
print(Counter(channel_events))


## 5. Queues and Heaps


In [None]:
import heapq
from collections import deque

# FIFO queue: new items join on the right, the oldest item leaves from the left.
event_queue = deque(('check_in', 'check_out', 'cancel'))
event_queue.append('rate_update')
print(event_queue.popleft())
print([*event_queue])

# (priority, name) tuples compare by their first element first, so the heap
# orders jobs by the priority number.
priority_jobs = [(3, 'backfill'), (1, 'daily_load'), (2, 'quality_checks')]
heapq.heapify(priority_jobs)  # rearranges the list in place into a min-heap

# Each heappop removes the smallest remaining tuple; the loop drains the heap,
# leaving priority_jobs empty afterwards.
while len(priority_jobs) > 0:
    priority, job_name = heapq.heappop(priority_jobs)
    print(priority, job_name)


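## 6. Mini Exercise

Group the raw log events by guest, so that each `guest_id` maps to the list of its events in the order they were logged.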


In [None]:
# Imported in this cell so the exercise also runs when executed on its own
# from a fresh kernel.
from collections import defaultdict

# Raw log records: one dict per logged event.
raw_logs = [
    {'guest_id': 'G-201', 'event': 'check_in'},
    {'guest_id': 'G-202', 'event': 'check_in'},
    {'guest_id': 'G-201', 'event': 'check_out'},
    {'guest_id': 'G-203', 'event': 'cancel'},
]

# defaultdict(list) creates an empty list the first time a guest_id is seen,
# so events can be appended without checking whether the key exists.
events_by_guest = defaultdict(list)
for log_item in raw_logs:
    events_by_guest[log_item['guest_id']].append(log_item['event'])

# Convert to a plain dict for display only; events_by_guest stays a defaultdict.
print(dict(events_by_guest))


## Next Steps

Continue your learning with:
- **03_file_handling.ipynb** - Reading/writing CSV, JSON, Parquet
- **04_pandas_deep_dive.ipynb** - Advanced Pandas operations
