# Python Dictionaries for Data Engineering

Dictionaries are **key-value mappings** - one of the most important data structures in Python.

**Key Properties:**
- Keys must be hashable: strings, numbers, and tuples whose elements are all hashable (mutable containers such as lists, dicts, and sets cannot be keys)
- Values can be any type
- O(1) average lookup, insert, delete
- Maintains insertion order (Python 3.7+)

**DE Use Cases:**
- Lookup tables / mapping tables
- JSON data processing
- Configuration management
- Caching / memoization
- Record representation

---
# Section 1: Creating Dictionaries

In [None]:
# Method 1: Literal syntax - keys and values written out inline
customer = {
    "id": "C001",
    "name": "John Doe",
    "email": "john@example.com"
}
print(f"Literal: {customer}")

# Method 2: dict() constructor
# Keyword names become string keys, so this form only works when every
# key is a valid Python identifier.
customer2 = dict(id="C002", name="Jane Smith", email="jane@example.com")
print(f"Constructor: {customer2}")

# Method 3: From list of tuples - each (key, value) pair becomes one entry
pairs = [("id", "C003"), ("name", "Bob"), ("email", "bob@example.com")]
customer3 = dict(pairs)
print(f"From tuples: {customer3}")

# Method 4: Dict comprehension - maps each x in 1..5 to x squared
squares = {x: x**2 for x in range(1, 6)}
print(f"Comprehension: {squares}")

In [None]:
# fromkeys() - Create dict with same value for all keys
# NOTE: every key refers to the *same* value object. That is harmless for
# immutable values such as None and 0 (used below), but a mutable value
# like [] would be shared by all keys.
fields = ["name", "email", "city", "country"]
empty_record = dict.fromkeys(fields, None)
print(f"Empty record template: {empty_record}")

# Initialize counters to 0
status_codes = [200, 201, 400, 404, 500]
counters = dict.fromkeys(status_codes, 0)
print(f"Status counters: {counters}")

---
# Section 2: Accessing Values

## 2.1 Basic Access and `get()`

In [None]:
# Sample record; note that it has no "phone" key
customer = {"id": "C001", "name": "John Doe", "email": "john@example.com"}

# Direct access - raises KeyError if missing
print(f"Name: {customer['name']}")

# get() - returns None (or default) if missing
# "phone" is absent, so the first call yields None and the second yields
# the supplied default 'N/A'. get() never inserts the key.
print(f"Phone: {customer.get('phone')}")
print(f"Phone (with default): {customer.get('phone', 'N/A')}")

In [None]:
# DE Use Case: Safe access to nested JSON data
api_response = {
    "status": "success",
    "data": {
        "user": {
            "name": "John",
            "profile": {
                "age": 30
            }
        }
    }
}

# Chained get() for safe nested access
# Each intermediate get() falls back to {} so the next get() always has a
# dict to call; only the final get() carries the user-facing default.
# NOTE: the {} fallback applies only when a key is absent. If a key is
# present but holds None (JSON null) or another non-dict value, the next
# .get() raises AttributeError.
age = api_response.get("data", {}).get("user", {}).get("profile", {}).get("age", "Unknown")
print(f"Age: {age}")

# Missing path returns default, no error
# "address" does not exist under "user", so the chain ends with "Unknown".
missing = api_response.get("data", {}).get("user", {}).get("address", {}).get("city", "Unknown")
print(f"City: {missing}")

## 2.2 `setdefault()` - Get or Set Default

In [None]:
# setdefault() - Returns value if exists, otherwise sets and returns default
# Unlike get(), setdefault() writes the default into the dict when the key
# is missing, so the dict itself may change.

config = {"host": "localhost"}

# Key exists - just returns value
# The supplied default is ignored and config is left untouched.
host = config.setdefault("host", "default.server.com")
print(f"Host: {host}")
print(f"Config: {config}")

# Key doesn't exist - sets default AND returns it
# After this call config also contains "port".
port = config.setdefault("port", 5432)
print(f"Port: {port}")
print(f"Config: {config}")

In [None]:
# DE Use Case: Grouping without defaultdict
orders = [
    {"customer": "C001", "product": "laptop"},
    {"customer": "C002", "product": "mouse"},
    {"customer": "C001", "product": "keyboard"},
]

orders_by_customer = {}
for entry in orders:
    # The first time a customer is seen, setdefault stores a fresh list;
    # on later sightings it hands back the list already stored there.
    bucket = orders_by_customer.setdefault(entry["customer"], [])
    bucket.append(entry["product"])

print("Orders by customer:")
for customer_id in orders_by_customer:
    print(f"  {customer_id}: {orders_by_customer[customer_id]}")

---
# Section 3: Modifying Dictionaries

## 3.1 Adding and Updating

In [None]:
customer = {"id": "C001", "name": "John"}
print(f"Original: {customer}")

# Add/update single key
# Assignment inserts the key when it is new and overwrites the value when
# the key already exists.
customer["email"] = "john@example.com"
customer["name"] = "John Doe"  # Update existing
print(f"After updates: {customer}")

In [None]:
# update() - Merge another dict
# update() modifies customer in place; keys already present are overwritten.
customer = {"id": "C001", "name": "John"}
additional_info = {"email": "john@example.com", "city": "NYC"}

customer.update(additional_info)
print(f"After update(): {customer}")

# update() with keyword arguments
# Keyword names become string keys, so they must be valid identifiers.
customer.update(country="USA", phone="123-456-7890")
print(f"After keyword update: {customer}")

In [None]:
# Merge operator | (Python 3.9+)
defaults = {"host": "localhost", "port": 5432, "timeout": 30}
overrides = {"host": "production.server.com", "timeout": 60}

# Merge: right dict overrides left
# `|` builds a new dict; defaults and overrides are left unchanged here.
config = defaults | overrides
print(f"Merged config: {config}")

# In-place merge with |=
# `|=` updates defaults itself, so from this line on defaults holds the
# merged values.
defaults |= overrides
print(f"In-place merge: {defaults}")

## 3.2 Removing Items

In [None]:
customer = {"id": "C001", "name": "John", "email": "john@example.com", "temp": "delete_me"}
print(f"Original: {customer}")

# del - Remove by key (raises KeyError if missing)
del customer["temp"]
print(f"After del: {customer}")

# pop() - Remove and return value
# Without a default, pop() raises KeyError when the key is missing.
email = customer.pop("email")
print(f"Popped email: {email}")
print(f"After pop: {customer}")

# pop() with default - safe removal
# "phone" was never in the dict, so the default 'N/A' is returned and the
# dict is left unchanged.
phone = customer.pop("phone", "N/A")
print(f"Popped phone (missing): {phone}")

In [None]:
# popitem() - Remove and return last inserted item
# The result is a (key, value) tuple; calling popitem() on an empty dict
# raises KeyError.
customer = {"a": 1, "b": 2, "c": 3}
print(f"Original: {customer}")

last = customer.popitem()
print(f"Popped: {last}")
print(f"Remaining: {customer}")

In [None]:
# clear() - Remove all items
# clear() empties the existing dict object in place (it does not create a
# new one), so any other name bound to the same dict sees it empty too.
cache = {"key1": "value1", "key2": "value2"}
print(f"Before clear: {cache}")

cache.clear()
print(f"After clear: {cache}")

---
# Section 4: Iterating Over Dictionaries

In [None]:
customer = {"id": "C001", "name": "John", "email": "john@example.com"}

# Iterate over keys (default)
# Looping over a dict directly yields its keys in insertion order.
print("Keys:")
for key in customer:
    print(f"  {key}")

# Explicit keys()
# keys() returns a view object; list() materializes it for printing.
print(f"\nkeys(): {list(customer.keys())}")

In [None]:
# NOTE: this cell does not define `customer`; it relies on the binding left
# by an earlier cell (the one directly above when the notebook runs in order).

# values() - Iterate over values
print(f"values(): {list(customer.values())}")

# items() - Iterate over key-value pairs (most common)
# Each item is a (key, value) tuple, unpacked here into two loop variables.
print("\nitems():")
for key, value in customer.items():
    print(f"  {key}: {value}")

In [None]:
# DE Use Case: Processing records
records = [
    {"id": "C001", "name": "John", "amount": 100},
    {"id": "C002", "name": "Jane", "amount": 200},
    {"id": "C003", "name": "Bob", "amount": 150},
]

# Transform: build one new dict per input row. String values are
# upper-cased; every other value is carried over untouched. The input
# rows themselves are not modified.
normalized = [
    {
        field: (content.upper() if isinstance(content, str) else content)
        for field, content in row.items()
    }
    for row in records
]

print("Normalized records:")
for row in normalized:
    print(f"  {row}")

---
# Section 5: Dictionary Comprehension

In [None]:
# Basic dict comprehension
# General shape: {key_expr: value_expr for item in iterable}

# Lookup from each integer 1..5 to its square
squares = {n: n * n for n in range(1, 6)}
print(f"Squares: {squares}")

# Lookup from each name to its character count
names = ["John", "Jane", "Bob", "Alice"]
name_lengths = {person: len(person) for person in names}
print(f"Name lengths: {name_lengths}")

In [None]:
# Dict comprehension with condition
numbers = range(1, 11)

# Keep a number only when dividing it by two leaves no remainder
even_squares = {n: n * n for n in numbers if not n % 2}
print(f"Even squares: {even_squares}")

In [None]:
# Swap keys and values
# NOTE: this inverts cleanly only when the values are unique and hashable.
# Duplicate values collapse into a single key (the last pair wins), and an
# unhashable value such as a list raises TypeError.
original = {"a": 1, "b": 2, "c": 3}
swapped = {v: k for k, v in original.items()}

print(f"Original: {original}")
print(f"Swapped:  {swapped}")

In [None]:
# DE Use Case: Create lookup table from records
customers = [
    {"id": "C001", "name": "John", "email": "john@example.com"},
    {"id": "C002", "name": "Jane", "email": "jane@example.com"},
    {"id": "C003", "name": "Bob", "email": "bob@example.com"},
]

# Create lookup by ID (if two records shared an ID, the later one would win)
customer_by_id = {c["id"]: c for c in customers}

print("Lookup table:")
# The loop names are customer_id/record rather than id/customer so that the
# builtin id() is not shadowed at module level after this cell runs.
for customer_id, record in customer_by_id.items():
    print(f"  {customer_id}: {record}")

# Fast lookup
print(f"\nLookup C002: {customer_by_id['C002']}")

In [None]:
# Filter dict by keys or values
data = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}

# Retain only the entries whose value exceeds 2
filtered = {name: num for name, num in data.items() if 2 < num}
print(f"Values > 2: {filtered}")

# Retain only the entries whose key is in the allow-list
allowed_keys = {"a", "c", "e"}
filtered_keys = {name: data[name] for name in data if name in allowed_keys}
print(f"Selected keys: {filtered_keys}")

---
# Section 6: Checking Membership and Copying

In [None]:
customer = {"id": "C001", "name": "John", "email": "john@example.com"}

# Check if key exists
# `in` on a dict tests keys only, never values.
print(f"'name' in customer: {'name' in customer}")
print(f"'phone' in customer: {'phone' in customer}")

# Check if value exists
# Unlike the key test above, this scans the values one by one.
print(f"'John' in values: {'John' in customer.values()}")

In [None]:
# Copying dictionaries
original = {"a": 1, "b": [1, 2, 3]}

# Shallow copy
# copy() creates a new dict, but its values are the very same objects that
# original holds - the nested list is shared, not duplicated.
shallow = original.copy()
shallow["a"] = 999  # Rebinds the key in the copy only; original["a"] stays 1
shallow["b"].append(4)  # Modifies the original list too!

print(f"Original: {original}")
print(f"Shallow:  {shallow}")
print("Note: Nested list was modified in both!")

In [None]:
# Deep copy for nested structures
import copy

original = {"a": 1, "b": [1, 2, 3]}
# deepcopy() recursively copies nested objects, so deep["b"] is a separate
# list from original["b"].
deep = copy.deepcopy(original)

# Appending to the copy's list leaves original["b"] as [1, 2, 3]
deep["b"].append(4)

print(f"Original: {original}")
print(f"Deep copy: {deep}")
print("Now they are independent!")

---
# Section 7: Practical DE Examples

In [None]:
# Example 1: Data transformation pipeline

# Raw input rows: field names are upper-case with a USER_ prefix, and the
# e-mail values are upper-case.
raw_records = [
    {"USER_ID": "001", "USER_NAME": "John Doe", "USER_EMAIL": "JOHN@EXAMPLE.COM"},
    {"USER_ID": "002", "USER_NAME": "Jane Smith", "USER_EMAIL": "JANE@EXAMPLE.COM"},
]

def transform_record(record):
    """Normalize field names and values.

    A leading ``USER_`` is stripped from each key and the key is lowercased.
    The value stored under ``USER_EMAIL`` is lowercased when it is a string;
    every other value is passed through unchanged.

    Args:
        record: Dict mapping raw field names (strings) to values.

    Returns:
        A new dict with normalized keys; ``record`` itself is not modified.
    """
    # removeprefix (Python 3.9+) strips only a *leading* "USER_". str.replace
    # would also delete it from the middle of a key, e.g. turning
    # "SUPERUSER_ID" into "superid".
    # The isinstance guard lets a missing e-mail (None) pass through instead
    # of raising AttributeError on .lower().
    return {
        key.removeprefix("USER_").lower(): (
            value.lower() if key == "USER_EMAIL" and isinstance(value, str) else value
        )
        for key, value in record.items()
    }

# Run every raw record through transform_record, keeping input order
transformed = list(map(transform_record, raw_records))

print("Transformed records:")
for row in transformed:
    print(f"  {row}")

In [None]:
# Example 2: Merge records from different sources

source_a = [
    {"id": "C001", "name": "John", "email": "john@a.com"},
    {"id": "C002", "name": "Jane", "email": "jane@a.com"},
]

source_b = [
    {"id": "C001", "phone": "111-1111", "city": "NYC"},
    {"id": "C003", "phone": "333-3333", "city": "LA"},
]

# Convert to lookup by ID
lookup_a = {r["id"]: r for r in source_a}
lookup_b = {r["id"]: r for r in source_b}

# Merge: combine all unique IDs
# Key views support set operations directly, so no set() conversion is needed;
# the result of `|` on two key views is a plain set.
all_ids = lookup_a.keys() | lookup_b.keys()

# Walk the IDs in sorted order so that `merged` itself is deterministic (set
# iteration order is not). For each ID, fields from source B override
# same-named fields from source A, and a source that lacks the ID contributes
# nothing. The loop name is record_id rather than id so the builtin id() is
# not shadowed.
merged = [
    {"id": record_id} | lookup_a.get(record_id, {}) | lookup_b.get(record_id, {})
    for record_id in sorted(all_ids)
]

print("Merged records:")
for r in merged:
    print(f"  {r}")

In [None]:
# Example 3: Validate records against schema

# Schema: two lists of field names - fields that must be present and fields
# that may be present.
schema = {
    "required": ["id", "name", "email"],
    "optional": ["phone", "city"]
}

# Sample input: one valid record, one missing a required field, and one
# carrying a field the schema does not list.
records = [
    {"id": "C001", "name": "John", "email": "john@example.com"},
    {"id": "C002", "name": "Jane"},  # Missing email
    {"id": "C003", "name": "Bob", "email": "bob@example.com", "extra": "field"},  # Extra field
]

def validate(record, schema):
    """Validate record against schema.

    Only the record's keys are checked; values are not inspected.

    Args:
        record: Dict representing one record.
        schema: Dict with a "required" list of field names and, optionally,
            an "optional" list of field names.

    Returns:
        List of error message strings, empty when the record is valid.
        Field names inside each message are listed in sorted order.

    Raises:
        KeyError: If ``schema`` has no "required" entry.
    """
    required = set(schema["required"])
    # A schema without an "optional" entry simply has no optional fields.
    allowed = required | set(schema.get("optional", ()))
    present = set(record)

    errors = []

    # Check required fields
    missing = required - present
    if missing:
        # Sort before formatting: a set's repr order can differ between runs,
        # which would make the message unstable for logs and tests.
        errors.append(f"Missing required: {sorted(missing, key=str)}")

    # Check for unknown fields
    unknown = present - allowed
    if unknown:
        errors.append(f"Unknown fields: {sorted(unknown, key=str)}")

    return errors

print("Validation results:")
for record in records:
    errors = validate(record, schema)
    status = "✓ Valid" if not errors else f"✗ {errors}"
    # A record may lack "id" - exactly the kind of problem validate reports -
    # so use get() rather than indexing to keep the report loop from raising
    # KeyError before the validation result is shown.
    print(f"  {record.get('id', '<missing id>')}: {status}")

In [None]:
# Example 4: Aggregate data

transactions = [
    {"customer": "C001", "category": "electronics", "amount": 100},
    {"customer": "C001", "category": "clothing", "amount": 50},
    {"customer": "C002", "category": "electronics", "amount": 200},
    {"customer": "C001", "category": "electronics", "amount": 75},
    {"customer": "C002", "category": "food", "amount": 30},
]

# Aggregate: running totals per customer and per category, built in one
# pass. get(key, 0) supplies the starting total the first time a key is seen.
by_customer = {}
by_category = {}
for txn in transactions:
    amount = txn["amount"]
    customer_key = txn["customer"]
    category_key = txn["category"]
    by_customer[customer_key] = by_customer.get(customer_key, 0) + amount
    by_category[category_key] = by_category.get(category_key, 0) + amount

print(f"Total by customer: {by_customer}")

print(f"Total by category: {by_category}")

---
# Quick Reference: Dict Methods

| Method | Description | Returns |
|--------|-------------|--------|
| `d[key]` | Get value | Value or KeyError |
| `d.get(key, default)` | Safe get | Value or default |
| `d.setdefault(key, default)` | Get or set | Value |
| `d[key] = value` | Set value | None |
| `d.update(other)` | Merge dict | None |
| `d \| other` | Merge into a new dict (Python 3.9+) | New dict |
| `del d[key]` | Delete | None or KeyError |
| `d.pop(key, default)` | Remove and return | Value |
| `d.keys()` | All keys | View |
| `d.values()` | All values | View |
| `d.items()` | Key-value pairs | View |
| `key in d` | Check key | Bool |
| `d.copy()` | Shallow copy | New dict |