# 1. Polling or Retrying for Job Completion or File Availability:
- You are waiting for a file to arrive in a specific folder (like /data/input/) before starting your processing job.

In [None]:
import time
import os

while not os.path.exists("/data/input/daily_file.csv"):
    print("Waiting for file...")
    time.sleep(60)


Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...


In [None]:
# 1. **Polling or Retrying for Job Completion or File Availability**
# Used to check periodically if a file has landed in a folder (like in S3, HDFS) or if an external process (e.g., an ETL job or API call) has completed.

import time
import os

while not os.path.exists("/data/input/daily_file.csv"):
    print("Waiting for file...")
    time.sleep(60)

# 2. **Retry Logic for Unstable Connections**
# Especially for APIs or flaky DB connections.

import requests

url = "https://api.example.com/data"
max_retries = 5
attempts = 0

while attempts < max_retries:
    try:
        response = requests.get(url)
        if response.status_code == 200:
            break
    except Exception as e:
        print(f"Retrying due to error: {e}")
    attempts += 1

# 3. **Paginated API Calls**
# Some APIs return data in pages; you need to loop until all pages are fetched.

page = 1
while True:
    data = fetch_data_from_api(page=page)
    if not data:
        break
    process_data(data)
    page += 1

# 4. **Streaming Data Processing Loops**
# Useful in Kafka consumers, Spark Structured Streaming, or custom streaming solutions.

while True:
    message = kafka_consumer.poll()
    if message:
        process_message(message)

# 5. **Batch Processing Large Datasets**
# Breaking a huge task into chunks (e.g., reading 1 million records at a time).

offset = 0
batch_size = 100000

while True:
    df = read_data(offset, batch_size)
    if df.empty:
        break
    process(df)
    offset += batch_size

# 6. **Partition-wise Processing**
# In file systems like HDFS/S3 or partitioned databases.

for partition in partitions:
    offset = 0
    while True:
        batch = read_partition_data(partition, offset, batch_size)
        if not batch:
            break
        process(batch)
        offset += batch_size

# 7. **Loop Until a Condition in a Workflow Is Met**
# E.g., in Airflow or custom workflows: keep checking if a downstream system is ready.

ready = check_downstream_system_ready()
while not ready:
    print("Waiting for downstream system...")
    time.sleep(30)
    ready = check_downstream_system_ready()

# Would you like examples specific to **PySpark**, **Airflow**, or **cloud-based pipelines** (e.g., AWS Glue, Azure Data Factory)?


Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...
Waiting for file...


KeyboardInterrupt: 

In [None]:
import ssl
print(ssl.OPENSSL_VERSION)

OpenSSL 1.1.1d  10 Sep 2019


In [None]:
import requests

url = "https://raw.githubusercontent.com/ramkarpiyush/big_data/refs/heads/main/flight-data2015.json"

response = requests.get(url)
# print(response)
# print(response.status_code)

print("Status Code Name...")
if response.status_code == 200:
    print("‚úÖ Success")
elif response.status_code == 404:
    print("‚ùå Not Found")
elif response.status_code == 401:
    print("üîí Unauthorized")
elif response.status_code == 500:
    print("Server error üö®")
else:
    print("Status:", response.status_code)


Status Code Name...
‚úÖ Success


# 2. Retry Logic for Unstable Connections
    - Especially for APIs or flaky DB connections.

In [None]:
max_retries = 5
attempts = 0

while attempts < max_retries:
    try:
        response = requests.get(url)
        print(response.status_code)
        if response.status_code == 200:
            break
    except Exception as e:
        print(f"Retrying due to error: {e}")
    attempts += 1

200


# 3. Paginated API Calls

## What is a Paginated API?

It is an API that returns large sets of data in smaller pieces, called **pages**, instead of all at once.

## Common Query Parameters for Pagination

| Parameter | Meaning                                        |
|-----------|------------------------------------------------|
| `page`    | Which page you want (e.g., `page=2`)           |
| `limit`   | How many items per page (e.g., `limit=10`)     |
| `offset`  | How many items to skip (alternative to `page`) |

## Types of Pagination

| Type           | How It Works                                                  | Example                |
|----------------|---------------------------------------------------------------|------------------------|
| Page-based     | Use `page=2`, `limit=10`                                      | `?page=3&limit=10`     |
| Offset-based   | Use `offset=30`, `limit=10` (starts at item 30)               | `?offset=30&limit=10`  |
| Cursor-based   | Uses a token or ID from the last result (e.g., `?after=123`)  | Facebook, Twitter APIs |

## Benefits of Paginated APIs

1. Prevent slow or heavy responses.  
2. Improve API speed & performance.  
3. Support mobile apps and limited-bandwidth users.  
4. Avoid overloading servers and clients.


In [None]:
# 1. Page-Based Pagination:
"""
- You request a specific page number and optionally a page size (limit).
- Example: ?page=2&limit=5 means get the 2nd page with 5 items per page.
"""
import requests
import pandas as pd

url = "https://dummyjson.com/products"

params = {
    "_page": 10,
    "_limit": 5
}

# This sends an HTTP GET request to the specified url
response = requests.get(url, params=params)
print(response)

"""
The returned object response contains the server's response.
1. response.status_code     - HTTP status code (like 200, 404)
2. response.text            - raw text of the response
3. response.json()          - parsed JSON (if the response is JSON)
"""

# HTTP status code
status_code = response.status_code
print(status_code)

# Converts the response body (parsed JSON) into a Python dictionary or list
data = response.json()       
print(data)

product_data = [
    {
        "id": p.get("id"),
        "title": p.get("title"),
        "brand": p.get("brand", "N/A")  # default if 'brand' is missing
    }
    for p in data.get("products", [])
]

# df = pd.DataFrame(product_data)
# print(df)

# Use pandas.json_normalize():
df = pd.json_normalize(data['products'])
print(df.columns.tolist())
# print(df[['id', 'dimensions.width', 'sku', 'weight', 'price', 'tags', 'brand']])
print(df[df.columns.tolist()])


<Response [200]>
200
{'products': [{'id': 1, 'title': 'Essence Mascara Lash Princess', 'description': 'The Essence Mascara Lash Princess is a popular mascara known for its volumizing and lengthening effects. Achieve dramatic lashes with this long-lasting and cruelty-free formula.', 'category': 'beauty', 'price': 9.99, 'discountPercentage': 10.48, 'rating': 2.56, 'stock': 99, 'tags': ['beauty', 'mascara'], 'brand': 'Essence', 'sku': 'BEA-ESS-ESS-001', 'weight': 4, 'dimensions': {'width': 15.14, 'height': 13.08, 'depth': 22.99}, 'warrantyInformation': '1 week warranty', 'shippingInformation': 'Ships in 3-5 business days', 'availabilityStatus': 'In Stock', 'reviews': [{'rating': 3, 'comment': 'Would not recommend!', 'date': '2025-04-30T09:41:02.053Z', 'reviewerName': 'Eleanor Collins', 'reviewerEmail': 'eleanor.collins@x.dummyjson.com'}, {'rating': 4, 'comment': 'Very satisfied!', 'date': '2025-04-30T09:41:02.053Z', 'reviewerName': 'Lucas Gordon', 'reviewerEmail': 'lucas.gordon@x.dummyjso

In [None]:
# Count of records

import requests
import pandas as pd
import json

url = "https://dummyjson.com/products"

response = requests.get(url)
print(response.status_code)

data = response.json()
print(data)

# Count of number of items
print(len(data['products']))

df = pd.DataFrame(data)
df.count()


200
{'products': [{'id': 1, 'title': 'Essence Mascara Lash Princess', 'description': 'The Essence Mascara Lash Princess is a popular mascara known for its volumizing and lengthening effects. Achieve dramatic lashes with this long-lasting and cruelty-free formula.', 'category': 'beauty', 'price': 9.99, 'discountPercentage': 10.48, 'rating': 2.56, 'stock': 99, 'tags': ['beauty', 'mascara'], 'brand': 'Essence', 'sku': 'BEA-ESS-ESS-001', 'weight': 4, 'dimensions': {'width': 15.14, 'height': 13.08, 'depth': 22.99}, 'warrantyInformation': '1 week warranty', 'shippingInformation': 'Ships in 3-5 business days', 'availabilityStatus': 'In Stock', 'reviews': [{'rating': 3, 'comment': 'Would not recommend!', 'date': '2025-04-30T09:41:02.053Z', 'reviewerName': 'Eleanor Collins', 'reviewerEmail': 'eleanor.collins@x.dummyjson.com'}, {'rating': 4, 'comment': 'Very satisfied!', 'date': '2025-04-30T09:41:02.053Z', 'reviewerName': 'Lucas Gordon', 'reviewerEmail': 'lucas.gordon@x.dummyjson.com'}, {'rating

products    30
total       30
skip        30
limit       30
dtype: int64

In [None]:
import requests

url = "https://dummyjson.com/products"
page = 1
limit = 5

skip = (page - 1) * limit       # skip = (page - 1) * limit 
                                # How many items to skip
                                # Page 1: skip = 0, Page 2: skip = 5, Page 3: skip = 10, etc.
print(type(skip))

params = {
    "limit": limit,
    "skip": skip
}


response = requests.get(url, params=params)    
print(response)                                

data = response.json()

df = pd.DataFrame(data)
print(df)


<class 'int'>
<Response [200]>
                                            products  total  skip  limit
0  {'id': 1, 'title': 'Essence Mascara Lash Princ...    194     0      5
1  {'id': 2, 'title': 'Eyeshadow Palette with Mir...    194     0      5
2  {'id': 3, 'title': 'Powder Canister', 'descrip...    194     0      5
3  {'id': 4, 'title': 'Red Lipstick', 'descriptio...    194     0      5
4  {'id': 5, 'title': 'Red Nail Polish', 'descrip...    194     0      5


In [None]:
import requests
import pandas as pd
import time
import os

url = "https://dummyjson.com/products"
output_file = "d://results.csv"

limit = 10     # number of records per page
sleep_time = 2  # seconds to wait between requests
page = 0

while True:
    skip = (page - 1) * limit
    params = {
        "limit": limit,
        "skip": skip
    }

    response = requests.get(url, params=params)
    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        break

    data = response.json()

    # If the API returns a list directly
    if isinstance(data, list):
        df = pd.DataFrame(data)
    # If the API returns an object with a key like 'products' or 'items'
    elif 'products' in data:
        df = pd.DataFrame(data['products'])
    else:
        print("Unexpected data structure.")
        break

    # Break if no more data
    if df.empty:
        print("No more data to fetch.")
        break

    # Write or append
    if page == 0 and not os.path.exists(output_file):
        df.to_csv(output_file, index=False)
    else:
        df.to_csv(output_file, mode='a', header=False, index=False)

    print(f"Fetched and saved page {page + 1}")

    page += 1
    time.sleep(sleep_time)


ImportError: Missing optional dependency 'fsspec'.  Use pip or conda to install fsspec.