In [None]:
# 1. Threading Basics
# Thread Lifecycle: Understand thread creation, starting a thread, and how a thread terminates.
# threading.Thread Class: Learn to create and manage threads using the threading module, and how
#  to pass arguments to threads.
# join() Method: Blocking a thread until another thread completes. This is essential to prevent
#  race conditions or ensure proper task sequencing.

In [1]:
import threading # Import the threading module
import time # Import the time module for sleep functionality 

# Define a simple function for the thread to execute
def print_numbers(thread_name, start, end, delay):
    """Print every integer from ``start`` through ``end``, pausing before each one.

    Args:
        thread_name: Label used as the prefix of every printed line.
        start: First integer to print.
        end: Last integer to print (inclusive).
        delay: Seconds to sleep before printing each number.
    """
    print(f"Thread {thread_name} started")
    # range() excludes its upper bound, hence the +1 to make ``end`` inclusive.
    numbers = range(start, end + 1)
    for number in numbers:
        time.sleep(delay)  # stand-in for real work
        print(f"{thread_name}: {number}")
    print(f"Thread {thread_name} finished")

# Thread lifecycle and the threading.Thread class
def thread_lifecycle_example():
    """Create two worker threads, run them concurrently, and wait for both.

    Walks through the basic lifecycle: construct a Thread, start() it, let it
    run its target, and join() it.
    """
    # Constructing a Thread only records the target and its arguments;
    # nothing runs until start() is called.
    workers = [
        threading.Thread(target=print_numbers, args=("Thread 1", 1, 5, 1)),
        threading.Thread(target=print_numbers, args=("Thread 2", 6, 10, 0.5)),
    ]

    print("Starting threads...")

    # Both threads are started before either is joined, so they run
    # concurrently and their output interleaves.
    for worker in workers:
        worker.start()

    # join() blocks only the calling (main) thread. Joining the workers one
    # after the other does NOT make them run sequentially: both are already
    # running, and a join on a thread that has finished returns immediately.
    # The order of the joins therefore does not decide which worker finishes
    # first; it only guarantees that the line below runs after both are done.
    for worker in workers:
        worker.join()

    print("All threads completed")

# Run the example. The guard is true when this code is executed directly and
# false when it is imported, so an import never starts the threads.
if __name__ == "__main__":
    thread_lifecycle_example()


Starting threads...
Thread Thread 1 started
Thread Thread 2 started
Thread 2: 6
Thread 1: 1Thread 2: 7

Thread 2: 8
Thread 1: 2
Thread 2: 9
Thread 2: 10
Thread Thread 2 finished
Thread 1: 3
Thread 1: 4
Thread 1: 5
Thread Thread 1 finished
All threads completed


In [None]:
# Key Concepts Highlighted:

# Thread Creation: Two threads are created to perform tasks concurrently.
# Thread Starting: Both threads are started using the start() method.
# Join: The join() method ensures that the main program waits for both threads to complete before proceeding.
# Concurrency: The output from both threads is interleaved, demonstrating that they are running concurrently.
# Thread Lifecycle: The threads start, execute their tasks, and terminate. The main thread waits for them to complete before continuing execution.
# This example provides a solid foundation in basic threading concepts in Python.

In [None]:
# 2. Thread Safety: Managing race conditions and synchronizing access to shared resources 
# using locks is crucial in multithreading to avoid inconsistent or incorrect results.
# When multiple threads access and modify shared data concurrently, race conditions may arise.
# Race conditions occur when the outcome depends on the order in which threads execute. 
# This leads to unpredictable and incorrect results.

In [2]:
# Scenario 1: Race Condition Without Lock
# In this scenario, two threads increment a shared counter. Without synchronization, 
# this can lead to race conditions.

In [None]:
import threading

# Shared state: one module-level integer that every worker thread mutates.
counter = 0


def increment_counter():
    """Add 1 to the global ``counter`` 100,000 times without any synchronization.

    The update is a read, an add, and a write back. Nothing here stops another
    thread from writing between the read and the write, which is how an
    increment can be lost.
    """
    global counter
    for _ in range(100000):
        counter = counter + 1

# Two workers share the same target, so both mutate the one global counter.
thread1, thread2 = [threading.Thread(target=increment_counter) for _ in range(2)]

# Launch both before waiting on either so that their loops overlap in time.
for worker in (thread1, thread2):
    worker.start()

# Block until each worker is done; only then is the counter value final.
for worker in (thread1, thread2):
    worker.join()

print(f"Final counter value (without lock): {counter}")

In [None]:
# Explanation:
# Shared Resource (counter): Both threads increment the shared counter 100,000 times.
# Race Condition: Since both threads are trying to update the counter at the same time, the result will be unpredictable and often incorrect.
# Expected Outcome:
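# The correct final value should be 200,000 (because each thread increments the counter by 100,000). However, due to race conditions, the output may be lower than expected because both threads are modifying the counter simultaneously, and some increments are overwritten or lost.
# Note: whether lost updates actually show up depends on the interpreter version and on timing. On some Python builds this loop may still print 200,000; that is luck, not safety, because nothing in the code prevents the race.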

In [None]:
# Scenario 2: Using threading.Lock to Prevent Race Conditions
# A lock ensures that only one thread can execute a block of code (critical section) at a time.
#  This prevents race conditions by serializing access to shared resources.

In [None]:
import threading

# Shared state plus the mutex that guards it.
counter = 0
lock = threading.Lock()


def increment_counter_safe():
    """Add 1 to the global ``counter`` 100,000 times, holding ``lock`` for each step."""
    global counter
    for _ in range(100000):
        # Explicit acquire/release around the critical section; the finally
        # clause releases the lock even if the update were to raise.
        lock.acquire()
        try:
            counter += 1
        finally:
            lock.release()

# Two workers run the locked increment loop against the same counter.
thread1, thread2 = [threading.Thread(target=increment_counter_safe) for _ in range(2)]

# Start both first so they genuinely compete for the lock.
for worker in (thread1, thread2):
    worker.start()

# Wait for both before reading the total.
for worker in (thread1, thread2):
    worker.join()

print(f"Final counter value (with lock): {counter}")

In [None]:
# Explanation:
# Lock (lock = threading.Lock()): A lock is created to ensure only one thread can modify 
# the shared counter at a time.
# Critical Section (with lock): The with lock statement acquires the lock before modifying 
# the counter. Other threads must wait until the lock is released before they can enter the 
# critical section.
# Expected Outcome:
# Since the lock prevents simultaneous access to the shared counter, the final value should 
# be 200,000 as expected, with no race conditions.

In [None]:
# Scenario 3: Using threading.RLock for Reentrant Locks
# An RLock (reentrant lock) allows a thread to acquire the same lock multiple times without
#  causing a deadlock. This is useful when a thread may need to re-enter the critical section.

In [None]:
import threading

# Shared state plus a reentrant lock that its owning thread may take repeatedly.
counter = 0
rlock = threading.RLock()


def increment_counter_reentrant():
    """Add 1 to the global ``counter`` 100,000 times while holding ``rlock`` twice.

    The lock is held for the whole loop, so a second thread calling this
    function waits until the first has finished all of its increments.
    """
    global counter
    # A single `with` naming rlock twice acquires it twice, left to right,
    # and releases it twice on exit; this would block forever with a plain Lock.
    with rlock, rlock:
        for _ in range(100000):
            counter += 1

# Two workers run the reentrant-lock increment loop against the same counter.
thread1, thread2 = [threading.Thread(target=increment_counter_reentrant) for _ in range(2)]

# Start both workers before waiting on either.
for worker in (thread1, thread2):
    worker.start()

# Wait for both before reading the total.
for worker in (thread1, thread2):
    worker.join()

print(f"Final counter value (with RLock): {counter}")

In [None]:
# Explanation:
# RLock: Unlike a normal lock (Lock), an RLock can be acquired multiple times by the same 
# thread without causing a deadlock. In this example, the same lock is acquired twice (with
# rlock: nested).

# Critical Section: The shared counter is modified safely, even though the same thread
#  re-enters the critical section.

# Expected Outcome:
# The final counter value should still be 200,000. The RLock allows multiple acquisitions 
# by the same thread, but it still protects shared data from being accessed by other threads 
# concurrently.

In [None]:
# Scenario 4: Improper Use of Lock Leading to Deadlock

# Improper use of locks can lead to deadlock, where two or more threads are waiting 
# on each other to release locks.

In [None]:
import threading
import time

# Two independent locks; the two tasks in this cell take them in opposite orders.
lock1 = threading.Lock()
lock2 = threading.Lock()


def thread1_task():
    """Take lock1, pause one second, then take lock2 while still holding lock1."""
    lock1.acquire()
    try:
        time.sleep(1)  # Simulate work while lock1 is held
        lock2.acquire()
        try:
            print("Thread 1 has lock1 and lock2")
        finally:
            lock2.release()
    finally:
        lock1.release()

# Thread 2 tries to acquire lock2 and then lock1
def thread2_task():
    """Take lock2, pause one second, then take lock1 while still holding lock2."""
    lock2.acquire()
    try:
        time.sleep(1)  # Simulate work while lock2 is held
        lock1.acquire()
        try:
            print("Thread 2 has lock2 and lock1")
        finally:
            lock1.release()
    finally:
        lock2.release()

# Upper bound, in seconds, on how long to wait for each worker. Each task only
# simulates about one second of work, so a worker still alive after this is stuck.
JOIN_TIMEOUT_SECONDS = 3

# Create the workers as daemon threads. They are expected to deadlock, and a
# daemon thread does not keep the interpreter alive once the main thread ends.
thread1 = threading.Thread(target=thread1_task, daemon=True)
thread2 = threading.Thread(target=thread2_task, daemon=True)

# Start threads
thread1.start()
thread2.start()

# An unbounded join() would block this cell forever, because the deadlocked
# workers never finish. A bounded join lets the demonstration report the
# deadlock and then return control to the caller.
thread1.join(timeout=JOIN_TIMEOUT_SECONDS)
thread2.join(timeout=JOIN_TIMEOUT_SECONDS)

# join(timeout=...) returns either way; is_alive() tells whether the thread ended.
if thread1.is_alive() or thread2.is_alive():
    print("Deadlock detected: each thread holds one lock and is waiting for the other")
else:
    print("Both threads finished without deadlocking")

In [None]:
# Explanation:

# Deadlock: Thread 1 acquires lock1 and waits for lock2, while Thread 2 acquires lock2 and waits
#  for lock1. Both threads are stuck, waiting for the other to release the lock.

# Solution: Avoid acquiring multiple locks or ensure locks are always acquired in a consistent order.
# Outcome:
# The program will hang indefinitely due to deadlock.

In [None]:
# Key Concepts:
# Race Condition: Multiple threads accessing/modifying shared data simultaneously without proper 
# synchronization, leading to unpredictable results.
# Lock (threading.Lock): A basic synchronization primitive to prevent race conditions by allowing 
# only one thread to execute a critical section at a time.
# RLock (threading.RLock): A reentrant lock that allows the same thread to acquire the lock multiple
#  times without causing deadlock.
# Deadlock: A situation where two or more threads are waiting for each other to release resources,
#  causing them to freeze indefinitely.

In [None]:
# 3. Deadlocks and Starvation in Multithreading
# Deadlock:
# A deadlock occurs when two or more threads are waiting for each other to release resources,
#  resulting in a situation where none of the threads can proceed. For example, Thread A holds
# Lock 1 and is waiting for Lock 2, while Thread B holds Lock 2 and is waiting for Lock 1. Both
#  threads are stuck, and this creates a deadlock.

# Thread Starvation:
# Thread starvation happens when a low-priority thread is unable to get access to a resource 
# (CPU time or shared data) because higher-priority threads are continuously executing and 
# monopolizing the resource.

In [3]:
# Scenario 1: Deadlock Example
# In this example, we have two threads, each trying to acquire two locks in a different order,
#  which leads to a deadlock.

In [None]:
import threading
import time

# Locks representing shared resources
# Two independent locks standing in for two shared resources.
lock1 = threading.Lock()
lock2 = threading.Lock()


def thread1_task():
    """Take lock1, pause one second, then take lock2, logging every step."""
    print("Thread 1: Trying to acquire Lock 1")
    lock1.acquire()
    try:
        print("Thread 1: Acquired Lock 1")
        time.sleep(1)  # Simulate some work while lock1 is held
        print("Thread 1: Trying to acquire Lock 2")
        lock2.acquire()
        try:
            print("Thread 1: Acquired Lock 2")
        finally:
            lock2.release()
    finally:
        lock1.release()

# Thread 2 tries to acquire lock2 and then lock1 (opposite order)
def thread2_task():
    """Take lock2, pause one second, then take lock1, logging every step."""
    print("Thread 2: Trying to acquire Lock 2")
    lock2.acquire()
    try:
        print("Thread 2: Acquired Lock 2")
        time.sleep(1)  # Simulate some work while lock2 is held
        print("Thread 2: Trying to acquire Lock 1")
        lock1.acquire()
        try:
            print("Thread 2: Acquired Lock 1")
        finally:
            lock1.release()
    finally:
        lock2.release()

# Upper bound, in seconds, on how long to wait for each worker. Each task only
# simulates about one second of work, so a worker still alive after this is stuck.
JOIN_TIMEOUT_SECONDS = 3

# Create the workers as daemon threads. They are expected to deadlock, and a
# daemon thread does not keep the interpreter alive once the main thread ends.
thread1 = threading.Thread(target=thread1_task, daemon=True)
thread2 = threading.Thread(target=thread2_task, daemon=True)

# Start threads
thread1.start()
thread2.start()

# Wait with a bound: an unbounded join() on deadlocked workers never returns,
# which would freeze this cell and everything after it.
thread1.join(timeout=JOIN_TIMEOUT_SECONDS)
thread2.join(timeout=JOIN_TIMEOUT_SECONDS)

# join(timeout=...) returns either way; is_alive() tells whether the thread ended.
if thread1.is_alive() or thread2.is_alive():
    print("Deadlock detected: the threads are still waiting on each other's lock")
else:
    print("Both threads completed")

# Explanation:
# Thread 1 acquires Lock 1 and waits for Lock 2.
# Thread 2 acquires Lock 2 and waits for Lock 1.
# Both threads are stuck waiting for each other to release their locks, causing a deadlock.

In [None]:
# Solution: Preventing Deadlocks by Acquiring Locks in a Defined Order

# To avoid deadlock, we can enforce a strict ordering when acquiring multiple locks.
# For example, both threads should always acquire Lock 1 before Lock 2.

In [None]:
import threading
import time

# Locks representing shared resources
# Two independent locks standing in for two shared resources.
lock1 = threading.Lock()
lock2 = threading.Lock()


def thread1_task():
    """Take lock1 and then lock2 (the agreed order), logging every step."""
    print("Thread 1: Trying to acquire Lock 1")
    lock1.acquire()
    try:
        print("Thread 1: Acquired Lock 1")
        time.sleep(1)
        print("Thread 1: Trying to acquire Lock 2")
        lock2.acquire()
        try:
            print("Thread 1: Acquired Lock 2")
        finally:
            lock2.release()
    finally:
        lock1.release()

# Thread 2 also acquires locks in the same order (lock1, lock2)
def thread2_task():
    """Take lock1 and then lock2, the same order thread1_task uses, logging every step."""
    print("Thread 2: Trying to acquire Lock 1")
    lock1.acquire()
    try:
        print("Thread 2: Acquired Lock 1")
        time.sleep(1)
        print("Thread 2: Trying to acquire Lock 2")
        lock2.acquire()
        try:
            print("Thread 2: Acquired Lock 2")
        finally:
            lock2.release()
    finally:
        lock1.release()

# One worker per task; both tasks take the locks in the same order.
thread1 = threading.Thread(target=thread1_task)
thread2 = threading.Thread(target=thread2_task)

# Start both workers before waiting on either.
for worker in (thread1, thread2):
    worker.start()

# Block until each worker has finished.
for worker in (thread1, thread2):
    worker.join()

print("Both threads completed successfully")


In [None]:
# Explanation:

# Both threads now acquire locks in the same order (lock1 → lock2), preventing deadlocks.
# If Thread 1 acquires lock1, Thread 2 must wait for lock1 to be released before it can proceed,
#  ensuring there is no circular waiting.

# Outcome:
# The program will execute without deadlock, and the message "Both threads completed successfully"
#  will print.

In [None]:
# Scenario 2: Thread Starvation Example

# In thread starvation, low-priority threads do not get a chance to execute because higher-priority
# threads continuously monopolize the resources. In Python, we don't have thread priorities directly,
# but we can simulate starvation by having threads that take much longer to release resources or 
# run without yielding control.

In [None]:
import threading
import time

# Shared resource that both threads compete for.
lock = threading.Lock()

# Set by the driver code at the bottom of this cell to ask both loops to exit.
# Without it the loops never end and keep printing into every later cell.
stop_event = threading.Event()

# How long, in seconds, the demonstration runs before the workers are stopped.
DEMO_SECONDS = 6


def high_priority_task():
    """Repeatedly take ``lock`` and hold it for 2 seconds until asked to stop.

    The loop re-acquires the lock immediately after releasing it, so a thread
    waiting on the lock gets little or no opportunity to take it. How severe
    that is depends on how the interpreter hands a released lock to waiters.
    """
    while not stop_event.is_set():
        with lock:
            print("High Priority Thread: Working")
            time.sleep(2)  # Hold the lock for 2 seconds, simulating a long task


def low_priority_task():
    """Repeatedly take ``lock`` and hold it for 0.1 seconds until asked to stop."""
    while not stop_event.is_set():
        with lock:
            print("Low Priority Thread: Working")
            time.sleep(0.1)  # Hold the lock for a short time


# Create the threads
high_priority_thread = threading.Thread(target=high_priority_task)
low_priority_thread = threading.Thread(target=low_priority_task)

# Start the long-holding thread first so that it already owns the lock
# when the other thread makes its first attempt half a second later.
high_priority_thread.start()
time.sleep(0.5)  # Give the high-priority thread a head start
low_priority_thread.start()

# Let the demonstration run for a bounded time, then shut both workers down.
# Each loop checks stop_event between lock holds, so the joins return once the
# hold in progress (at most 2 seconds) has ended.
time.sleep(DEMO_SECONDS)
stop_event.set()
high_priority_thread.join()
low_priority_thread.join()

In [None]:
# Explanation:
# High-priority thread: Simulates a thread that holds the lock for a long time (2 seconds),
#  monopolizing the shared resource.
# Low-priority thread: Tries to acquire the lock but can only do so for very brief periods 
# (0.1 seconds) because the high-priority thread quickly reacquires the lock.
# Outcome:
# The low-priority thread will get very few chances to execute, leading to thread starvation.

# The high-priority thread keeps monopolizing the resource, preventing the low-priority thread
#  from accessing it frequently.

In [None]:
# Solution: Avoiding Starvation Using Time-Slicing or Yielding
# One solution to thread starvation is to introduce periodic yielding or reduce the time the high-priority thread holds the lock.

In [None]:
import threading
import time

# Shared resource that both threads compete for.
lock = threading.Lock()

# Set by the driver code at the bottom of this cell to ask both loops to exit.
# Without it the loops never end and keep printing into every later cell.
stop_event = threading.Event()

# How long, in seconds, the demonstration runs before the workers are stopped.
DEMO_SECONDS = 3

# Pause taken OUTSIDE the lock after each hold. Shortening the hold alone is not
# enough: a loop that releases and immediately re-acquires can still win the lock
# every time. Stepping away briefly gives a waiting thread a real chance.
YIELD_SECONDS = 0.01


def high_priority_task():
    """Take ``lock`` for 0.5 seconds at a time, yielding between holds, until asked to stop."""
    while not stop_event.is_set():
        with lock:
            print("High Priority Thread: Working")
            time.sleep(0.5)  # Hold the lock for less time to allow other threads
        time.sleep(YIELD_SECONDS)  # yield: the lock is free during this pause


def low_priority_task():
    """Take ``lock`` for 0.1 seconds at a time, yielding between holds, until asked to stop."""
    while not stop_event.is_set():
        with lock:
            print("Low Priority Thread: Working")
            time.sleep(0.1)
        # Yield here too, otherwise this loop could monopolize the lock instead.
        time.sleep(YIELD_SECONDS)


# Create the threads
high_priority_thread = threading.Thread(target=high_priority_task)
low_priority_thread = threading.Thread(target=low_priority_task)

# Start both threads
high_priority_thread.start()
time.sleep(0.5)  # Give the high-priority thread a head start
low_priority_thread.start()

# Run for a bounded time, then ask both loops to exit and wait for them.
time.sleep(DEMO_SECONDS)
stop_event.set()
high_priority_thread.join()
low_priority_thread.join()


In [None]:
# Summary of Key Concepts:
# Deadlock: Happens when two or more threads wait on each other to release resources, 
# resulting in a standstill.
# Prevention: Always acquire locks in a consistent order to avoid circular waiting.
# Thread Starvation: Occurs when a low-priority thread doesn't get sufficient CPU time 
# because other threads continuously hog resources.
# Solution: Use time-slicing, yielding, or ensure fair access to resources by not holding 
# locks for long periods.
# Understanding and managing these issues is critical for writing efficient, safe, and 
# reliable multithreaded applications.

In [None]:
# Overview of ThreadPoolExecutor

# ThreadPoolExecutor is a part of the concurrent.futures module in Python, 
# which provides a high-level interface for asynchronously executing callables using threads.
#  It allows you to create a pool of threads and manage them efficiently.

In [None]:
import concurrent.futures
import time

# A simple function that simulates a task
def task(n):
    """Sleep for ``n`` seconds, then return a message that reports ``n``."""
    time.sleep(n)
    message = f'Task completed in {n} seconds'
    return message

# Using ThreadPoolExecutor
def main():
    """Run five ``task`` calls on a three-thread pool and print each result as it finishes."""
    # The `with` block waits for the pool's outstanding work before exiting.
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as pool:
        # submit() queues a call and returns a Future immediately. Five tasks
        # are queued here, sleeping 1 to 5 seconds respectively.
        pending = []
        for seconds in range(1, 6):
            pending.append(pool.submit(task, seconds))
        # With 3 workers, tasks 1-3 start at once. A queued task starts as soon
        # as ANY worker becomes free: task 4 begins when the first task
        # finishes, not after tasks 1, 2 and 3 have all completed.

        # as_completed() yields futures in completion order, not submission
        # order; result() returns the task's return value.
        for finished in concurrent.futures.as_completed(pending):
            print(finished.result())

# Run the pool example only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


In [None]:
# Explanation
# Function Definition: The task function simulates a task that takes n seconds to complete by sleeping for that duration.

# ThreadPoolExecutor:

# We create an instance of ThreadPoolExecutor with max_workers=3, meaning up to 3 threads can run concurrently.
# The with statement ensures proper cleanup of threads when done.
# Submitting Tasks:

# We submit tasks to the pool using executor.submit(), which returns a Future object for each task.
# In this case, we submit 5 tasks, where each task sleeps for 1 to 5 seconds.
# Collecting Results:

# We use concurrent.futures.as_completed() to iterate over the completed futures as they finish, allowing us to print results in the order they complete, not the order they were submitted.
# Possible Data Scenarios
# Variable Task Durations:

# Tasks may have different execution times, which can lead to some tasks finishing before others, as shown in the example.
# Error Handling:

# You might want to handle exceptions if a task fails. You can catch exceptions when calling future.result().

In [None]:
# The `futures` list built in the previous example is a local variable of
# main(), so it is not visible here. Submit a fresh batch so that this cell
# runs on its own after the cell that defines task().
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    # time.sleep() rejects a negative duration with ValueError, so task(-1)
    # fails on purpose and exercises the except branch below.
    futures = [executor.submit(task, n) for n in (1, -1, 2)]

    for future in concurrent.futures.as_completed(futures):
        try:
            # result() re-raises, in this thread, whatever the task raised.
            print(future.result())
        except Exception as e:
            print(f'Task generated an exception: {e}')

In [None]:
# Dynamic Task Submission:

# Instead of a fixed list, you might generate tasks dynamically based on user input or data from a database.
# Task Dependencies:

# If tasks are dependent on the results of previous tasks, you would need to manage the order of execution, possibly by chaining futures.
# Resource Limitation:

# If tasks are resource-intensive (e.g., I/O bound), you might need to tune max_workers to avoid overwhelming the system.
# Performance Monitoring:

# Measure execution time for each task or the overall completion time, especially if tasks are expected to take varying amounts of time.
# Conclusion
# ThreadPoolExecutor provides a powerful way to manage threads and execute tasks concurrently. Understanding how to utilize it with different data scenarios will help you optimize performance in multithreaded applications.

In [None]:
# In this analogy:

# Threads can run different functions (like playing different games) independently. Each thread (kid) has to be managed individually, which can get chaotic!

# ThreadPoolExecutor: The Team of Helpers
# Now imagine there’s a group of helpers (the ThreadPoolExecutor) who organize the kids on the playground.

# Same Game: This time, let’s say all the kids want to play the same game, like hide and seek. The helpers assign kids to play this game.
# Limited Helpers: There are only a few helpers available (let’s say 3), so even though there are many kids, only 3 can play at a time. The helpers will make sure everyone gets a turn without chaos.

# In this analogy:

# ThreadPoolExecutor efficiently manages tasks, allowing multiple kids (threads) to play the same game (function) but ensuring that not too many play at once. When a kid finishes, the helper quickly assigns another kid to join the game.


In [None]:
# Summary
# Threads: Kids playing different games independently. Each one has to manage themselves.
# ThreadPoolExecutor: Helpers organizing kids to play the same game efficiently, with a limit on how many can play at once.
# Function Execution
# Threads can execute different functions (like playing different games).
# ThreadPoolExecutor often executes the same function (like all kids playing hide and seek) but can also execute different functions if needed, depending on how you submit the tasks.

In [None]:
# we can even combine both 

In [None]:
import concurrent.futures
import threading
import time

# Function that simulates a task
def independent_task(n):
    """Sleep for ``n`` seconds, then print a completion line that mentions ``n``."""
    time.sleep(n)
    report = f'Independent task {n} completed.'
    print(report)

# Function for the thread pool
def pool_task(n):
    """Sleep for ``n`` seconds, then return a completion message that mentions ``n``."""
    time.sleep(n)
    message = f'Pool task {n} completed.'
    return message

# Using both threads and ThreadPoolExecutor
def main():
    """Run three stand-alone threads alongside a two-worker pool and wait for all of them.

    NOTE(review): this reuses the name ``main`` from the earlier pool example,
    so running this cell replaces that definition.
    """
    # Hand-managed threads, one per independent task (arguments 1, 2, 3).
    side_threads = [
        threading.Thread(target=independent_task, args=(n,)) for n in range(1, 4)
    ]
    for side_thread in side_threads:
        side_thread.start()

    # Pooled work: five tasks share two worker threads.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
        submitted = [pool.submit(pool_task, n) for n in range(1, 6)]

        # Print each pooled result in completion order.
        for finished in concurrent.futures.as_completed(submitted):
            print(finished.result())

    # The stand-alone threads were running the whole time; make sure they ended.
    for side_thread in side_threads:
        side_thread.join()

# Run the combined example only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


In [None]:
# Explanation
# Independent Tasks:

# We create and start some threads for independent tasks (e.g., independent_task). Each thread runs a task that simulates a delay based on the input.
# ThreadPoolExecutor:

# We use ThreadPoolExecutor to manage a set of tasks (e.g., pool_task). We submit multiple tasks to the executor, which manages them efficiently, allowing up to 2 tasks to run concurrently.
# Collecting Results:

# We collect results from the pool of tasks using as_completed(), which allows us to process results as they finish.
# Joining Independent Threads:

# Finally, we ensure all independent threads have completed by calling join() on each.
# Benefits of Combining
# Flexibility: You can handle different types of tasks (some that run independently and others that benefit from pooling).
# Efficiency: The ThreadPoolExecutor helps optimize resource usage by controlling the number of concurrent tasks, while independent threads can still execute without being managed by the pool.

In [None]:
# Daemon Threads
# Background Workers: Daemon threads are like helpful assistants that work in the background.
# Automatic Termination: If the main program (the boss) finishes its job, the daemon threads automatically stop working and leave.
# Use Case: They are useful for tasks that don't need to finish if the main program exits, like background monitoring or logging.

In [None]:
# Non-Daemon Threads
# Essential Workers: Non-daemon threads are like essential employees. They must finish their tasks before the main program can close.
# Prevent Program Exit: If the main program finishes but non-daemon threads are still running, the program will wait for those threads to complete.
# Use Case: They are ideal for tasks that are critical to the application, like processing user data or completing a transaction.

In [None]:
import threading
import time

# Function for a daemon thread
def daemon_task():
    """Print a heartbeat line once per second, forever.

    The loop has no exit condition of its own; it ends only when the
    process running it ends.
    """
    heartbeat = "Daemon thread is running..."
    while True:
        print(heartbeat)
        time.sleep(1)

# Function for a non-daemon thread
def non_daemon_task():
    """Print a start line, sleep for 5 seconds, then print a finish line."""
    for stage, pause in (("started", 5), ("finished", None)):
        print(f"Non-daemon thread {stage}.")
        if pause is not None:
            time.sleep(pause)

# Build both workers. Passing daemon=True to the constructor marks the first
# one as a daemon thread; it has the same effect as setting .daemon before start().
daemon_thread = threading.Thread(target=daemon_task, daemon=True)
non_daemon_thread = threading.Thread(target=non_daemon_task)

# Start the daemon first, then the regular thread.
for worker in (daemon_thread, non_daemon_thread):
    worker.start()

# Only the non-daemon thread is joined; the daemon loop never returns.
non_daemon_thread.join()

print("Main program is exiting...")


In [None]:
# Explanation of the Code
# Daemon Thread:

# The daemon_task function runs indefinitely, printing a message every second.
# We set daemon_thread.daemon = True to make it a daemon thread.
# Non-Daemon Thread:

# The non_daemon_task function runs for 5 seconds, then finishes.
# It does not have the daemon property set, so it is a non-daemon thread.
# Thread Execution:

# When you start both threads, the main program will continue running.
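# However, when the non-daemon thread finishes (after 5 seconds), the main program will exit, and the daemon thread will be terminated immediately.
# Note: this is what happens when the code runs as a standalone script. Inside a notebook the kernel process keeps running after the cell finishes, so the daemon thread keeps printing until the kernel is restarted.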

# Summary
# Daemon Threads: Run in the background, automatically terminated when the main program exits. Useful for non-essential tasks.
# Non-Daemon Threads: Must finish their work before the main program can exit. Critical for tasks that need to be completed.

In [None]:
# Here are simple examples demonstrating multithreading use cases in data engineering: batch data processing, ETL pipelines, real-time data processing, and API rate limiting.

# 1. Batch Data Processing
# In this example, we simulate reading data from multiple partitions concurrently.

In [None]:
import threading
import time

# Simulated function to read data from a partition
def read_partition(partition_id):
    """Simulate reading one partition: log the start, wait 2 seconds, log the end."""
    label = f"partition {partition_id}"
    print(f"Starting to read from {label}")
    time.sleep(2)  # stand-in for the time a real read would take
    print(f"Finished reading from {label}")

# Simulating reading from multiple partitions
def batch_data_processing():
    """Read four simulated partitions concurrently, one thread per partition."""
    partition_ids = (1, 2, 3, 4)  # Simulated partition IDs

    # One reader thread per partition.
    readers = [
        threading.Thread(target=read_partition, args=(partition_id,))
        for partition_id in partition_ids
    ]
    for reader in readers:
        reader.start()

    # Block until every reader has finished.
    for reader in readers:
        reader.join()

    print("Batch data processing completed.")

# Run the example: starts one reader thread per partition and returns once
# all of them have finished.
batch_data_processing()


In [None]:
# ETL Pipelines
# This example shows using multithreading to perform data transformation and loading into a database.

In [4]:
import threading
import time

# Simulated function to transform data
def transform_data(data):
    """Simulate a 1-second transformation and return ``"Transformed <data>"``."""
    print(f"Transforming {data}")
    time.sleep(1)  # stand-in for the time a real transformation would take
    result = f"Transformed {data}"
    return result

# Simulated function to load data into a database
def load_to_database(transformed_data):
    """Simulate a 1-second database load, logging before and after the wait."""
    start_line = f"Loading {transformed_data} to database"
    end_line = f"Loaded {transformed_data}"
    print(start_line)
    time.sleep(1)  # stand-in for the time a real load would take
    print(end_line)

# Simulating ETL process
def etl_pipeline(data_list):
    """Transform each item in turn and load every result on its own thread.

    Args:
        data_list: Iterable of items to push through the pipeline.
    """
    loaders = []

    for item in data_list:
        # The transform runs synchronously in the calling thread; only the
        # load is handed to a new thread. A load can therefore overlap with
        # the transform of the next item, but transforms never overlap.
        loader = threading.Thread(target=load_to_database, args=(transform_data(item),))
        loader.start()
        loaders.append(loader)

    # Block until every load has finished.
    for loader in loaders:
        loader.join()

    print("ETL pipeline completed.")

# Run the example on three sample items; returns once every load has finished.
etl_pipeline(["data1", "data2", "data3"])


Transforming data1
Loading Transformed data1 to databaseTransforming data2

Loaded Transformed data1
Loading Transformed data2 to database
Transforming data3
Loaded Transformed data2
Loading Transformed data3 to database
Loaded Transformed data3
ETL pipeline completed.


In [None]:
# Real-time Data Processing
# In this example, we simulate processing data streams from multiple sources concurrently.

In [None]:
import threading
import time

# Simulated function to process a data stream
def process_stream(source):
    """Simulate consuming three items from ``source``, one second apart."""
    print(f"Starting to process stream from {source}")
    # Count from 1 so the printed item number needs no adjustment.
    for item_number in range(1, 4):
        time.sleep(1)  # stand-in for per-item processing time
        print(f"Processed item {item_number} from {source}")
    print(f"Finished processing stream from {source}")

# Simulating real-time data processing
def real_time_processing():
    """Process three simulated sources concurrently, one thread per source."""
    source_names = ("Source A", "Source B", "Source C")

    # One consumer thread per source.
    consumers = [
        threading.Thread(target=process_stream, args=(name,))
        for name in source_names
    ]
    for consumer in consumers:
        consumer.start()

    # Block until every consumer has finished.
    for consumer in consumers:
        consumer.join()

    print("Real-time data processing completed.")

# Run the example: starts one thread per source and returns once all have finished.
real_time_processing()


In [None]:
# API Rate Limiting
# This example demonstrates managing multiple API calls while respecting rate limits

In [None]:
import threading
import time
import random

# Simulated function to call an API
def call_api(api_id):
    """Simulate one API call that takes a random 0.5 to 1.5 seconds."""
    print(f"API call to {api_id} started")
    response_time = random.uniform(0.5, 1.5)  # Simulate varying response times
    time.sleep(response_time)
    print(f"API call to {api_id} completed")

# Simulating API rate limiting
def api_rate_limiting(api_ids=None, min_interval=1):
    """Launch one thread per API call, spacing the launches to respect a rate limit.

    Args:
        api_ids: Iterable of identifiers passed to ``call_api``. Defaults to
            "API 1" through "API 4".
        min_interval: Seconds to wait between two consecutive launches.
            Defaults to 1.
    """
    if api_ids is None:
        api_ids = ["API 1", "API 2", "API 3", "API 4"]
    threads = []

    for position, api_id in enumerate(api_ids):
        # The pause belongs BETWEEN launches. Sleeping after the final launch
        # would delay the caller without spacing out any request.
        if position:
            time.sleep(min_interval)

        thread = threading.Thread(target=call_api, args=(api_id,))
        thread.start()
        threads.append(thread)

    # Wait for all threads to finish
    for thread in threads:
        thread.join()

    print("All API calls completed.")

# Run the example: launches the simulated API calls with a pause between
# launches and returns once every call has completed.
api_rate_limiting()

# Summary: the examples in this section simulate scenarios in which multiple threads
#  perform different tasks concurrently.

# In the first example, we simulate reading data from multiple partitions concurrently.
# In the second example, we simulate an ETL pipeline where data is transformed and loaded
#  into a database.

# In the third example, we simulate real-time data processing from multiple sources.
# In the fourth example, we demonstrate managing multiple API calls while respecting rate limits.
# These examples illustrate how multithreading can be effectively used in data engineering tasks


# we are inducing a delay to respect the rate limit of the API calls. 
# which means we are ensuring that we are not exceeding the allowed number of requests to the API
#  within a certain time frame.