In [None]:
# install the package (run on Google Colab)
%%capture
!pip install dlt[bigquery]

In [None]:
# Scrape the parquet file URLs from the NYC TLC trip-record web page
import dlt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import BytesIO

# Page requested by get_parquet_urls() and searched for links ending in ".parquet".
url_parquet = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

def get_parquet_urls(timeout=30):
    """Scrape the TLC NYC trip-record page and return its parquet links.

    Args:
        timeout: Seconds to wait for the server to connect/respond before
            giving up. Without a timeout ``requests`` can block forever on
            a stalled connection.

    Returns:
        list[str]: Every anchor ``href`` on the page that ends in
        ``.parquet``, with surrounding whitespace removed, in page order.
        The list is empty when the page contains no such link.

    Raises:
        requests.HTTPError: If the page answers with a non-success status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url_parquet, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    parquet_urls = []
    for link in soup.find_all("a", href=True):
        # Strip before testing the suffix: an href with stray leading/trailing
        # whitespace in the markup would otherwise fail endswith() and be
        # dropped silently (or be returned as an unusable URL).
        href = link["href"].strip()
        if href.endswith(".parquet"):
            parquet_urls.append(href)

    return parquet_urls

In [None]:
# Scrape the parquet links from the page and keep them in urls_list
urls_list = get_parquet_urls()

# Display the first scraped URL as a quick sanity check
# (raises IndexError if the scrape returned no links)
urls_list[0]

In [None]:
# Yellow-taxi trip files for January through June 2024, one parquet file per month.
# The month is zero-padded with a format spec (:02d) instead of a hard-coded "0",
# so extending the range past September still yields valid file names
# (e.g. 2024-10 rather than 2024-010).
parquet_urls = [
    f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-{month:02d}.parquet'
    for month in range(1, 7)
]
parquet_urls

In [None]:
# dlt resource that downloads each monthly parquet file and yields its rows
@dlt.resource(name="ny_taxi_data", write_disposition="replace")
def ny_taxi():
    """Yield the rows of every file listed in ``parquet_urls``, one file at a time.

    Each URL is downloaded completely into memory, parsed with pandas, and
    yielded as a list of row dictionaries (one list per file).

    Yields:
        list[dict]: The records of one parquet file, as produced by
        ``DataFrame.to_dict(orient="records")``.

    Raises:
        requests.HTTPError: If a download answers with a non-success status.
        requests.Timeout: If the server stalls for longer than the timeout.
    """
    for url in parquet_urls:
        # The timeout bounds the connect time and each wait for data, not the
        # total download time; without it a stalled connection blocks forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        df = pd.read_parquet(BytesIO(response.content))
        # NOTE(review): to_dict() materialises every row as a Python dict,
        # which is memory-hungry for large monthly files. dlt reportedly
        # accepts DataFrames / Arrow tables directly -- consider yielding
        # `df` instead once the effect on the destination schema is confirmed.
        yield df.to_dict(orient="records")

In [None]:
# Expose the BigQuery credentials to the pipeline through an environment variable.
# The value is read from the Colab user secret named 'BIGQUERY_CRED', so the
# credentials never appear in the notebook itself.
import os
from google.colab import userdata

# NOTE(review): presumably dlt resolves the bigquery destination credentials
# from this variable name -- confirm against the dlt configuration docs.
os.environ["DESTINATION__BIGQUERY__CREDENTIALS"] = userdata.get('BIGQUERY_CRED')

In [None]:
# Create the pipeline that loads the taxi data into BigQuery
pipeline = dlt.pipeline(
    pipeline_name="taxi_data",
    destination="bigquery",
    dataset_name="demobigquery2612",
    # NOTE(review): dev_mode presumably makes dlt start from a clean state and
    # write to a freshly suffixed dataset on every run -- confirm, and turn it
    # off if the data should accumulate in one stable dataset.
    dev_mode=True,
)

# Run the ny_taxi resource through the pipeline and print the returned load info
info = pipeline.run(ny_taxi)
print(info)