## Install dependencies

In [None]:
%%capture
!pip install "dlt[duckdb]"

## 🧪 Exercise 1: Extract Paginated Data from the GitHub API

In [None]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator


# GitHub organization whose repositories are listed by `paginated_getter`.
organization = "dlt-hub"


def paginated_getter():
    """Yield the pages returned for ``/orgs/{organization}/repos``.

    A ``RESTClient`` pointed at ``https://api.github.com`` and configured with
    a ``HeaderLinkPaginator`` is created when the generator starts running;
    every page produced by ``client.paginate`` is passed through unchanged.
    The organization is read from the module-level ``organization`` variable.
    """
    github_client = RESTClient(
        base_url="https://api.github.com",
        paginator=HeaderLinkPaginator(),
    )
    repos_path = f"/orgs/{organization}/repos"

    # Delegate straight to the client's page iterator.
    yield from github_client.paginate(repos_path)


# Preview only: pull the first page from the generator and print it, leaving
# any remaining pages unrequested.
page_data = next(paginated_getter(), None)
if page_data is not None:
    print(page_data)

## 🧪 Exercise 2: Loading GitHub issues

In [None]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient


# GitHub organization that owns the `dlt` repository queried below.
organization = "dlt-hub"


@dlt.resource(name="issues")
def get_issues():
    """dlt resource ``issues``: yield pages of ``repos/{organization}/dlt/issues``.

    The request goes to ``https://api.github.com`` through a ``RESTClient``
    created without an explicit paginator or authentication. Each page from
    ``client.paginate`` is yielded as-is.
    """
    github_client = RESTClient(base_url="https://api.github.com")
    issues_path = f"repos/{organization}/dlt/issues"

    yield from github_client.paginate(issues_path)

# Load the `issues` resource into DuckDB and print the load summary.
# The pipeline name is reproduced exactly as written; it differs from the
# "issues_pipeline" name used by the authenticated exercise further down.
pipeline = dlt.pipeline(
    pipeline_name="issuess_pipeline",
    destination="duckdb",
)
info = pipeline.run(get_issues)
print(info)

## 🧪 Exercise 3: Add authentication to GitHub issues

In [None]:
import os
import dlt
from google.colab import userdata
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth

# Token read from the Colab secret named SECRET_KEY; used as a bearer token below.
access_token = userdata.get("SECRET_KEY")

# GitHub organization that owns the `dlt` repository queried below.
organization = "dlt-hub"


@dlt.resource(name="issues")
def get_issues():
    """dlt resource ``issues``: yield pages of ``repos/{organization}/dlt/issues``.

    Requests are sent to ``https://api.github.com`` with ``BearerTokenAuth``
    built from the module-level ``access_token``. Each page from
    ``client.paginate`` is yielded as-is.
    """
    github_client = RESTClient(
        base_url="https://api.github.com",
        auth=BearerTokenAuth(token=access_token),
    )
    issues_path = f"repos/{organization}/dlt/issues"

    yield from github_client.paginate(issues_path)

# Load a limited sample of the authenticated `issues` resource into DuckDB.
pipeline = dlt.pipeline(
    pipeline_name="issues_pipeline",
    destination="duckdb",
)
# add_limit(2) caps how much the resource yields for this run
# (presumably two pages here -- confirm against the dlt documentation).
info = pipeline.run(get_issues.add_limit(2))
print(info)

## 🧪 Exercise 4: Load GitHub data into DuckDB with incremental loading

In [None]:
import os
import dlt
from google.colab import userdata
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import BearerTokenAuth

# Token read from the Colab secret named SECRET_KEY; used as a bearer token below.
access_token = userdata.get('SECRET_KEY')

# GitHub organization that owns the `dlt` repository queried below.
organization = "dlt-hub"


@dlt.resource(name="issues", write_disposition="merge", primary_key="id")
def get_issues(updated=dlt.sources.incremental("updated_at", initial_value="2025-02-28T00:00:00Z")):
    """dlt resource ``issues`` loaded incrementally on ``updated_at``.

    Rows are merged into the destination on primary key ``id``.

    Args:
        updated: dlt incremental cursor over the ``updated_at`` field, starting
            at ``2025-02-28T00:00:00Z``. Its ``last_value`` is sent to GitHub
            so that only issues updated at or after that time are requested.

    Yields:
        Pages of ``repos/{organization}/dlt/issues`` as returned by
        ``client.paginate``.
    """
    client = RESTClient(
        base_url="https://api.github.com",
        auth=BearerTokenAuth(token=access_token)
    )

    # GitHub's "list repository issues" endpoint filters by last-update time
    # through the `since` query parameter. It has no `updated_at` parameter,
    # so sending the cursor under that name left the request unfiltered and
    # every run downloaded all pages again.
    for page in client.paginate(f"repos/{organization}/dlt/issues", params={"since": updated.last_value}):
        yield page

# First run of the incremental pipeline. The `pipeline` object is reused by
# the following cells (second run, SQL query, dataframe count).
pipeline = dlt.pipeline(
    pipeline_name="issues_pipeline_incremental",
    destination="duckdb",
)
info = pipeline.run(get_issues)
print(info)

### Second run

In [None]:
# Run the same resource again on the existing `pipeline` object and print the
# load summary. Compare it with the first run's output to see the effect of
# the incremental cursor declared on `get_issues`.
info = pipeline.run(get_issues)
print(info)

### Min `updated_at`

In [None]:
# Query the smallest `updated_at` stored in the `issues` table.
with pipeline.sql_client() as client:
    res = client.execute_sql(
            """
            SELECT
            MIN(updated_at)
            FROM issues;
            """
    )

# The result has already been returned by execute_sql, so it can be printed
# after the SQL client context has exited.
print(res)

### Question: How many issues were created/updated in the last 2 months?

In [None]:
# Read the `issues` table from the pipeline's dataset into a dataframe; the
# row count is the cell's displayed result.
issues_relation = pipeline.dataset().issues
issues_table = issues_relation.df()
len(issues_table)

## 🧪 Exercise 5: Update GitHub source to declarative style

In [None]:
import dlt
from google.colab import userdata
from dlt.sources.rest_api import RESTAPIConfig, rest_api_resources

# Token read from the Colab secret named SECRET_KEY; passed to `github_source`
# by the cell code that runs the pipeline.
access_token = userdata.get("SECRET_KEY")


@dlt.source
def github_source(github_token=dlt.secrets.value):
    """Declarative dlt source for the ``dlt-hub/dlt`` repository on GitHub.

    Builds a ``RESTAPIConfig`` with three resources relative to
    ``https://api.github.com/repos/dlt-hub/dlt/`` and yields whatever
    ``rest_api_resources`` creates from it:

    * ``issues`` -- open issues sorted by ``updated`` descending, with an
      incremental ``since`` parameter whose cursor is ``updated_at``.
    * ``stargazers`` and ``commits`` -- declared by name only.

    Args:
        github_token: token placed in the client's ``auth`` config; defaults
            to ``dlt.secrets.value``.
    """
    # Incremental definition bound to the `since` query parameter.
    issues_since = {
        "type": "incremental",
        "cursor_path": "updated_at",
        "initial_value": "2024-01-25T11:21:28Z",
    }

    issues_resource = {
        "name": "issues",
        "endpoint": {
            "path": "issues",
            "params": {
                "sort": "updated",
                "direction": "desc",
                "state": "open",
                "since": issues_since,
            },
        },
    }

    config: RESTAPIConfig = {
        "client": {
            "base_url": "https://api.github.com/repos/dlt-hub/dlt/",
            "auth": {"token": github_token},
        },
        "resources": [issues_resource, "stargazers", "commits"],
    }

    yield from rest_api_resources(config)


# Pipeline writing to DuckDB under the dataset name "rest_api_data"; the
# `pipeline` object is reused by the exploration cell.
pipeline = dlt.pipeline(pipeline_name="rest_api_github", destination="duckdb", dataset_name="rest_api_data")

# Build the source with the Colab token and cap it with add_limit(2) so the
# demo run stays small.
limited_source = github_source(access_token).add_limit(2)
load_info = pipeline.run(limited_source)
print(load_info)

### Explore data

In [None]:
import duckdb
from google.colab import data_table

# Turn on Colab's dataframe formatter for dataframes displayed below.
data_table.enable_dataframe_formatter()

# Open the DuckDB file named after the pipeline.
database_path = f"{pipeline.pipeline_name}.duckdb"

with duckdb.connect(database_path) as conn:
    # Point unqualified table names at the pipeline's dataset.
    conn.sql(f"SET search_path = '{pipeline.dataset_name}'")

    # Collect the DESCRIBE output as a dataframe.
    df = conn.sql("DESCRIBE").df()

df