# In [1]:  (Jupyter notebook cell prompt; kept as a comment so the file parses as Python)
import requests
import pandas as pd
import sqlite3
import logging
from datetime import datetime
from apscheduler.schedulers.blocking import BlockingScheduler

# ----------------------------------------
# Logging Setup
# ----------------------------------------
# Route root-logger records at INFO level and above to pipeline.log, each
# line carrying a timestamp and the severity name ahead of the message.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="pipeline.log",
)


# ----------------------------------------
# STEP 1: EXTRACT
# ----------------------------------------
def extract_data(url="https://jsonplaceholder.typicode.com/users", timeout=10):
    """Fetch and decode the JSON document served at *url*.

    Args:
        url: Endpoint to query. Defaults to the JSONPlaceholder users list.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait forever, which would stall the
            blocking scheduler that drives this pipeline.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` on a network error, a non-200 status, or an undecodable
        body. Every failure is logged at ERROR level.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, DNS failures, timeouts, etc. all follow the same
        # "log and return None" contract as a bad status code.
        logging.error("Extraction failed. Request error: %s", exc)
        return None

    if response.status_code != 200:
        logging.error("Extraction failed. Status Code: %s", response.status_code)
        return None

    try:
        payload = response.json()
    except ValueError as exc:
        # A 200 response is not guaranteed to carry valid JSON.
        logging.error("Extraction failed. Invalid JSON: %s", exc)
        return None

    logging.info("Data extracted successfully.")
    return payload


# ----------------------------------------
# STEP 2: TRANSFORM
# ----------------------------------------
def transform_data(raw_data):
    """Build the users table from the raw extracted records.

    Args:
        raw_data: Records accepted by ``pd.DataFrame``; each must provide
            the ``id``, ``name`` and ``email`` fields.

    Returns:
        A DataFrame holding only those three fields, with ``id`` exposed
        under the column name ``user_id``.
    """
    wanted = ["id", "name", "email"]
    frame = pd.DataFrame(raw_data)[wanted].rename(columns={"id": "user_id"})

    logging.info("Data transformed successfully.")
    return frame


# ----------------------------------------
# STEP 3: LOAD
# ----------------------------------------
def load_data(df, db_path="users.db", table="users"):
    """Write *df* into a SQLite table, replacing any previous contents.

    Args:
        df: DataFrame to persist; its index is not written.
        db_path: SQLite database file to open. Defaults to ``users.db``.
        table: Destination table name. Defaults to ``users``.

    Raises:
        Whatever ``sqlite3.connect`` or ``DataFrame.to_sql`` raises; the
        connection is closed before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        df.to_sql(table, conn, if_exists="replace", index=False)
    finally:
        # Close even when the write fails so the handle is never leaked.
        conn.close()

    logging.info("Data loaded into SQLite database successfully.")


# ----------------------------------------
# Complete ETL Pipeline
# ----------------------------------------
def run_pipeline():
    """Run one extract -> transform -> load cycle.

    When extraction yields nothing (``None`` or an empty payload) the
    transform and load steps are skipped and a warning is logged, so the
    log never reports a completed run in which no data was loaded.
    """
    logging.info("Pipeline started.")

    raw_data = extract_data()
    if not raw_data:
        logging.warning("Pipeline aborted: no data extracted; nothing was loaded.")
        return

    df = transform_data(raw_data)
    load_data(df)

    logging.info("Pipeline completed.")


# ----------------------------------------
# Automate with a Scheduler
# ----------------------------------------
if __name__ == "__main__":
    scheduler = BlockingScheduler()

    # Run every 6 hours
    # NOTE(review): with an interval trigger the first run presumably happens
    # one full interval after start-up -- confirm whether an immediate first
    # run is wanted.
    scheduler.add_job(run_pipeline, "interval", hours=6)

    logging.info("Scheduler started. Pipeline will run every 6 hours.")
    try:
        # BlockingScheduler.start() occupies this thread until it is stopped.
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        # Ctrl+C or a termination request: exit without a traceback and
        # leave a record of the shutdown in the log.
        logging.info("Scheduler stopped.")


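# Notebook output when the cell was run:
#   ModuleNotFoundError: No module named 'apscheduler'
# APScheduler is a third-party package; install it first (e.g. `pip install apscheduler`).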