# Python Tuples for Data Engineering

Tuples are **immutable, ordered sequences**. They're like lists that cannot be changed after creation.

**Key Properties:**
- Immutable (cannot modify after creation)
- Ordered (maintains insertion order)
- Allows duplicates
- Hashable when every element is itself hashable (can then be used as dict keys, set elements)
- Slightly faster to create and smaller in memory than lists

**DE Use Cases:**
- Database row representation
- Returning multiple values from functions
- Dictionary keys (composite keys)
- Immutable configuration
- Data integrity (prevent accidental modification)

---
# Section 1: Creating Tuples

In [None]:
# Method 1: Parentheses (optional -- the commas make the tuple -- but recommended for readability)
point = (10, 20)
print(f"Point: {point}")

# Method 2: Without parentheses (tuple packing)
coordinates = 10, 20, 30
print(f"Coordinates: {coordinates}")

# Method 3: tuple() constructor (accepts any iterable; here a list)
from_list = tuple([1, 2, 3])
print(f"From list: {from_list}")

# Method 4: From string (a string iterates character by character,
# so the result is ('h', 'e', 'l', 'l', 'o'))
from_string = tuple("hello")
print(f"From string: {from_string}")

In [None]:
# IMPORTANT: Single element tuple needs trailing comma!
# The comma, not the parentheses, is what creates a tuple.

not_a_tuple = (42)      # Parentheses only group the expression: this is just an int
is_a_tuple = (42,)      # This is a tuple with one element
also_tuple = 42,        # Also works without parentheses

print(f"(42) type: {type(not_a_tuple)}")
print(f"(42,) type: {type(is_a_tuple)}")
print(f"42, type: {type(also_tuple)}")

In [None]:
# Empty tuple: empty parentheses, or the tuple() constructor with no argument
empty = ()
also_empty = tuple()  # equal to `empty`; defined for comparison, not printed below

print(f"Empty tuple: {empty}")
print(f"Length: {len(empty)}")

---
# Section 2: Accessing Elements

In [None]:
# Indexing and slicing (same syntax as lists)
record = ("C001", "John Doe", "john@example.com", 30, "NYC")

print(f"First element: {record[0]}")
print(f"Last element: {record[-1]}")  # negative indexes count from the end
print(f"Slice [1:3]: {record[1:3]}")  # a slice is a new tuple; the stop index (3) is excluded

In [None]:
# Tuple unpacking - Very powerful!
record = ("C001", "John Doe", "john@example.com")

# Unpack all elements
# The number of names on the left must equal len(record),
# otherwise Python raises ValueError.
customer_id, name, email = record
print(f"ID: {customer_id}")
print(f"Name: {name}")
print(f"Email: {email}")

In [None]:
# Extended unpacking with *
record = ("C001", "John", "Doe", "john@example.com", "NYC", "USA")

# Get first, last, and everything in between
# NOTE: the starred name always receives a list (not a tuple),
# and that list may be empty when there is nothing in between.
first, *middle, last = record
print(f"First: {first}")
print(f"Middle: {middle}")
print(f"Last: {last}")

In [None]:
# Ignore values with _
# `_` is an ordinary variable name; using it for values you do not need
# is only a convention that signals "intentionally unused".
record = ("C001", "John Doe", "john@example.com", 30)

# Only need ID and email
customer_id, _, email, _ = record
print(f"ID: {customer_id}, Email: {email}")

# Ignore multiple values: *_ absorbs everything between the first and last item
customer_id, *_, age = record
print(f"ID: {customer_id}, Age: {age}")

---
# Section 3: Tuple Methods

Tuples have only two public methods, `count()` and `index()`. Because they are immutable, they have no mutating methods such as `append()` or `remove()`:

In [None]:
# count() - Count occurrences of a value
# (returns 0, rather than raising, when the value is absent)
data = (1, 2, 2, 3, 2, 4, 2, 5)

count_of_2 = data.count(2)
print(f"Data: {data}")
print(f"Count of 2: {count_of_2}")

In [None]:
# index() - Find index of first occurrence
data = ("apple", "banana", "cherry", "banana", "date")

banana_index = data.index("banana")
print(f"Data: {data}")
print(f"First 'banana' at index: {banana_index}")

# index() with start position: begin searching just past the first match
# so that the next occurrence is found instead
second_banana = data.index("banana", banana_index + 1)
print(f"Second 'banana' at index: {second_banana}")

In [None]:
# index() raises ValueError if not found
# (`data` is the fruit tuple defined in the previous cell, so run that cell first)
try:
    data.index("grape")
except ValueError as e:
    print(f"ValueError: {e}")

---
# Section 4: Immutability

In [None]:
# Tuples cannot be modified: item assignment is not supported
point = (10, 20)

try:
    point[0] = 100  # This will fail with TypeError
except TypeError as e:
    print(f"TypeError: {e}")

In [None]:
# HOWEVER: Mutable objects INSIDE a tuple can be modified!
# The tuple only fixes WHICH objects it holds, not the contents of those objects.
# A tuple that contains a list is also unhashable, so it cannot be used
# as a dict key or set element.
data = ([1, 2, 3], "immutable string")

print(f"Before: {data}")

# The list inside can be modified
data[0].append(4)
print(f"After: {data}")

# But we still can't reassign the tuple element
try:
    data[0] = [5, 6, 7]
except TypeError as e:
    print(f"Cannot reassign: {e}")

In [None]:
# To "modify" a tuple, create a new one (`original` is never changed below)
original = (1, 2, 3)

# Concatenation creates new tuple
extended = original + (4, 5)
print(f"Extended: {extended}")

# Convert to list, modify, convert back
as_list = list(original)  # independent copy of the elements
as_list[0] = 100
modified = tuple(as_list)
print(f"Modified: {modified}")

---
# Section 5: Tuples as Dictionary Keys

Because tuples are immutable, a tuple whose elements are all hashable is itself hashable and can be used as a dict key. Lists are mutable and therefore cannot!

In [None]:
# DE Use Case: Composite keys for lookups

# Sales data with (region, product) as key
sales = {
    ("east", "laptop"): 150,
    ("east", "phone"): 200,
    ("west", "laptop"): 180,
    ("west", "phone"): 220,
}

# Fast lookup by composite key
# (the inner parentheses build the tuple that is used as the key)
print(f"East laptop sales: {sales[('east', 'laptop')]}")
print(f"West phone sales: {sales[('west', 'phone')]}")

In [None]:
# Roll transactions up under a composite (date, product) key
transactions = [
    {"date": "2024-01-15", "product": "laptop", "amount": 1000},
    {"date": "2024-01-15", "product": "phone", "amount": 500},
    {"date": "2024-01-16", "product": "laptop", "amount": 1200},
    {"date": "2024-01-15", "product": "laptop", "amount": 800},  # Same date+product
]

# Running total per (date, product) pair
totals = {}
for txn in transactions:
    composite_key = txn["date"], txn["product"]  # tuple packing builds the key
    totals.setdefault(composite_key, 0)  # first sighting starts at zero
    totals[composite_key] += txn["amount"]

print("Sales by date and product:")
# Tuple keys sort element by element: by date first, then by product
for composite_key in sorted(totals):
    date, product = composite_key
    amount = totals[composite_key]
    print(f"  {date} - {product}: ${amount}")

In [None]:
# Lists cannot be dict keys: a list is mutable and therefore unhashable,
# so building the dict literal raises TypeError
try:
    bad_dict = {[1, 2]: "value"}
except TypeError as e:
    print(f"TypeError: {e}")

---
# Section 6: Tuples in Functions

In [None]:
# Returning multiple values (actually returns a tuple)
def get_stats(numbers):
    """Return the minimum, maximum, and average of ``numbers`` as a tuple.

    Args:
        numbers: Any non-empty iterable of numbers. Lists and tuples are used
            as-is; other iterables (e.g. generators) are materialized once so
            that they can be scanned several times.

    Returns:
        A 3-tuple ``(minimum, maximum, average)`` where ``average`` is
        ``sum / count`` computed with true division.

    Raises:
        ValueError: If ``numbers`` contains no elements.
    """
    # min(), max() and sum() each need a full pass and len() needs a sized
    # container, so one-shot iterables must be captured in a list first.
    values = numbers if isinstance(numbers, (list, tuple)) else list(numbers)
    if not values:
        raise ValueError("get_stats() requires at least one number")
    return min(values), max(values), sum(values) / len(values)

data = [10, 20, 30, 40, 50]

# Get as tuple: the comma-separated return values arrive as one tuple object
result = get_stats(data)
print(f"Result tuple: {result}")

# Unpack directly: one name per returned value
minimum, maximum, average = get_stats(data)
print(f"Min: {minimum}, Max: {maximum}, Avg: {average}")

In [None]:
# DE Use Case: Validation function returning (is_valid, error_message)
def validate_record(record):
    """Validate a record, return (is_valid, error_or_none).

    Args:
        record: A mapping expected to contain the keys ``"id"`` and ``"email"``.

    Returns:
        ``(True, None)`` when the record passes every check, otherwise
        ``(False, message)`` where ``message`` describes the first failed check.
        Checks run in this order: ``id`` present, ``email`` present, ``email``
        is a string containing ``"@"``.
    """
    if "id" not in record:
        return False, "Missing required field: id"
    if "email" not in record:
        return False, "Missing required field: email"
    email = record["email"]
    # A present but non-string email (e.g. None) must be reported as invalid
    # rather than crash the membership test with a TypeError.
    if not isinstance(email, str) or "@" not in email:
        return False, "Invalid email format"
    return True, None

# Test records: one valid, one missing its email, one with a malformed email
records = [
    {"id": "C001", "email": "john@example.com"},
    {"id": "C002"},
    {"id": "C003", "email": "invalid-email"},
]

print("Validation results:")
for record in records:
    # The returned (is_valid, error) tuple is unpacked in a single step
    is_valid, error = validate_record(record)
    status = "✓" if is_valid else f"✗ {error}"
    # .get() with a default keeps the report working for records lacking an id
    print(f"  {record.get('id', 'N/A')}: {status}")

In [None]:
# Positional arguments captured by *args arrive packed in a tuple
def process_items(*args):
    """Print the type of the packed positional arguments, then the arguments."""
    for label, value in (("Type", type(args)), ("Items", args)):
        print(f"{label}: {value}")

process_items(1, 2, 3, "a", "b")

---
# Section 7: Tuple Operations

In [None]:
# Concatenation (builds a new tuple; t1 and t2 are left unchanged)
t1 = (1, 2, 3)
t2 = (4, 5, 6)
combined = t1 + t2
print(f"Concatenation: {combined}")

# Repetition (also a new tuple: the elements of t1 repeated three times)
repeated = t1 * 3
print(f"Repetition: {repeated}")

# Membership (compares against each element in turn)
print(f"2 in t1: {2 in t1}")
print(f"10 in t1: {10 in t1}")

# Length
print(f"Length: {len(t1)}")

In [None]:
# Comparison (element by element, i.e. lexicographic):
# the first differing position decides the result; if one tuple is a
# prefix of the other, the shorter one is the smaller.
print(f"(1, 2, 3) < (1, 2, 4): {(1, 2, 3) < (1, 2, 4)}")
print(f"(1, 2, 3) < (1, 3, 0): {(1, 2, 3) < (1, 3, 0)}")
print(f"(1, 2) < (1, 2, 3): {(1, 2) < (1, 2, 3)}")

In [None]:
# Built-in functions work with tuples (they accept any iterable)
numbers = (5, 2, 8, 1, 9, 3)

print(f"Tuple: {numbers}")
print(f"min: {min(numbers)}")
print(f"max: {max(numbers)}")
print(f"sum: {sum(numbers)}")
print(f"sorted: {sorted(numbers)}")  # Returns a list! Wrap in tuple(...) to get a tuple back

---
# Section 8: Practical DE Examples

In [None]:
# Example 1: Database-style rows
# Often database queries return tuples

# Simulated DB result
db_rows = [
    ("C001", "John Doe", "john@example.com", 100.00),
    ("C002", "Jane Smith", "jane@example.com", 250.50),
    ("C003", "Bob Johnson", "bob@example.com", 75.25),
]

# Define column names (must be listed in the same order as the row values)
columns = ("id", "name", "email", "balance")

# Convert to dicts for easier processing
# zip() pairs each column name with the value at the same position.
# NOTE: zip() stops at the shorter input, so a row with missing or extra
# values would be truncated silently rather than raise an error.
records = [dict(zip(columns, row)) for row in db_rows]

print("Converted records:")
for r in records:
    print(f"  {r}")

In [None]:
# Example 2: Multi-field sort driven by a tuple-valued key
employees = [
    ("Engineering", "Alice", 75000),
    ("Sales", "Bob", 65000),
    ("Engineering", "Charlie", 80000),
    ("Sales", "Diana", 70000),
    ("Engineering", "Eve", 75000),
]


def dept_then_salary_desc(employee):
    """Build the sort key: department ascending, then salary descending."""
    department, _name, pay = employee
    # Negating the salary turns the default ascending order into descending
    return department, -pay


sorted_employees = sorted(employees, key=dept_then_salary_desc)

print("Employees sorted by dept, then salary (desc):")
for dept, name, salary in sorted_employees:
    print(f"  {dept:12} | {name:10} | ${salary:,}")

In [None]:
# Example 3: Using zip with tuples
ids = ("C001", "C002", "C003")
names = ("John", "Jane", "Bob")
balances = (100.00, 250.50, 75.25)

# Combine parallel data: one (id, name, balance) tuple per position
records = list(zip(ids, names, balances))
print(f"Zipped: {records}")

# Unzip (transpose): *records passes each row as a separate argument,
# so zip regroups the values by column
unzipped = list(zip(*records))
print(f"Unzipped: {unzipped}")

In [None]:
# Example 4: enumerate() yields tuples
items = ["apple", "banana", "cherry"]

# enumerate yields (index, value) tuples, unpacked here into two loop variables;
# start=1 makes the numbering begin at 1 instead of 0
for index, item in enumerate(items, start=1):
    print(f"  {index}. {item}")

In [None]:
# Example 5: Swapping values (uses tuple packing/unpacking syntax)
a = 10
b = 20

print(f"Before: a={a}, b={b}")

# Python's elegant swap
# The right-hand side is evaluated in full before anything is assigned, so no
# temporary variable is needed. Conceptually this packs (b, a) and unpacks it;
# the interpreter may optimize the intermediate tuple away.
a, b = b, a

print(f"After: a={a}, b={b}")

In [None]:
# Example 6: Fixed configuration (immutable)
# Use a tuple when the set of values should never change at runtime

ALLOWED_EXTENSIONS = (".csv", ".json", ".parquet")
VALID_STATUS_CODES = (200, 201, 202, 204)
DB_CONFIG = ("localhost", 5432, "mydb")  # host, port, database

# The tuples above cannot be mutated in place (no append or item assignment),
# although the names themselves could still be rebound to other objects.
def is_valid_file(filename):
    """Return True if ``filename`` ends with one of ALLOWED_EXTENSIONS.

    The comparison is case-sensitive, so ``"DATA.CSV"`` is not accepted.

    Args:
        filename: File name or path as a string.

    Returns:
        bool: Whether the name has an allowed extension.
    """
    # str.endswith accepts a tuple of suffixes and checks them all in one
    # call, so no explicit loop over the extensions is needed.
    return filename.endswith(ALLOWED_EXTENSIONS)

print(f"'data.csv' valid: {is_valid_file('data.csv')}")
print(f"'data.xlsx' valid: {is_valid_file('data.xlsx')}")

---
# Section 9: Tuple vs List - When to Use Which?

| Use Tuple | Use List |
|-----------|----------|
| Data should not change | Data will be modified |
| Need hashable (dict key, set element) | Don't need hashable |
| Heterogeneous data (different types) | Homogeneous data (same type) |
| Fixed structure (like a record) | Variable length collection |
| Want to prevent accidental modification | Need append/remove/etc. |
| Slightly better performance | Flexibility more important |

In [None]:
# Memory footprint comparison
# sys.getsizeof is shallow: it measures the container object itself,
# not the integers it refers to.
import sys

# The same five elements, stored two ways
list_data = [1, 2, 3, 4, 5]
tuple_data = (1, 2, 3, 4, 5)

list_bytes = sys.getsizeof(list_data)
tuple_bytes = sys.getsizeof(tuple_data)

print(f"List size: {list_bytes} bytes")
print(f"Tuple size: {tuple_bytes} bytes")
print(f"Tuple saves: {list_bytes - tuple_bytes} bytes")

---
# Quick Reference: Tuple Operations

| Operation | Description | Example |
|-----------|-------------|--------|
| `t[i]` | Access element | `t[0]` |
| `t[i:j]` | Slice | `t[1:3]` |
| `a, b, c = t` | Unpacking | Assign to variables |
| `t.count(x)` | Count occurrences | `t.count(2)` |
| `t.index(x)` | Find index of first occurrence (raises `ValueError` if absent) | `t.index(2)` |
| `t1 + t2` | Concatenate | New tuple |
| `t * n` | Repeat | New tuple |
| `x in t` | Membership | Bool |
| `len(t)` | Length | Int |
| `tuple(iter)` | Convert to tuple | From iterable |