In [1]:
# Install the asyncpg and pandas system packages through apt (shell commands run via IPython's `!`).
!apt-get install -y python3-asyncpg
!apt-get install -y python3-pandas

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  python3-async-timeout
The following NEW packages will be installed:
  python3-async-timeout python3-asyncpg
0 upgraded, 2 newly installed, 0 to remove and 12 not upgraded.
Need to get 527 kB of archives.
After this operation, 1832 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu noble/universe amd64 python3-async-timeout all 4.0.3-1 [6412 B]
Get:2 http://archive.ubuntu.com/ubuntu noble/universe amd64 python3-asyncpg amd64 0.29.0-1build1 [521 kB]
Fetched 527 kB in 0s (1128 kB/s)        
debconf: delaying package configuration, since apt-utils is not installed
Selecting previously unselected package python3-async-timeout.
(Reading database ... 34422 files and directories currently installed.)
Preparing to unpack .../python3-async-timeout_4.0.3-1_all.deb ...
Unpacking python3-async-timeout (4.0.3-1) ...
Sele

In [2]:
import pyspark
# Print the installed PySpark version.
print(pyspark.__version__)

3.5.5


In [None]:
import asyncio
import json
import os
import time

import asyncpg
import nest_asyncio
import requests
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType


# PostgreSQL connection settings. Every entry is read from the environment
# and falls back to the default shown when the variable is not set.
DB_CONFIG = dict(
    user=os.getenv("POSTGRES_USER", "admin"),
    password=os.getenv("POSTGRES_PASSWORD", "admin"),
    database=os.getenv("POSTGRES_DB", "tender"),
    host=os.getenv("POSTGRES_HOST", "postgre"),
    port=int(os.getenv("POSTGRES_PORT", 5432)),
)

# Patch asyncio to allow nested event loops (presumably so coroutines can be
# awaited inside the notebook's already-running loop — confirm if still needed).
nest_asyncio.apply()

async def create_table():
    """Create the ``tenders`` table if it does not already exist.

    Connects to PostgreSQL with ``DB_CONFIG``, executes a single
    ``CREATE TABLE IF NOT EXISTS`` statement and closes the connection.

    Raises:
        Any exception raised by ``asyncpg.connect`` or ``Connection.execute``.
        The connection is closed before such an exception propagates.
    """
    create_table_query = """
    CREATE TABLE IF NOT EXISTS tenders (
        id VARCHAR PRIMARY KEY,
        date VARCHAR,
        deadline_date VARCHAR,
        title VARCHAR,
        category VARCHAR,
        description VARCHAR,
        phase VARCHAR,
        place VARCHAR,
        awarded_value VARCHAR,
        awarded_currency VARCHAR,
        awarded_date VARCHAR,
        suppliers_name VARCHAR,
        count BIGINT,
        offers_count BIGINT,
        json_data JSONB
    );
    """
    conn = await asyncpg.connect(**DB_CONFIG)
    try:
        await conn.execute(create_table_query)
    finally:
        # Always release the connection, even when the DDL statement fails.
        await conn.close()

# Get (or reuse) the Spark session used to build DataFrames in this notebook.
spark = SparkSession.builder.appName("REST_API_with_PySpark_DF").getOrCreate()

# Restrict Spark's log output to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

def get_tenders(page=1, timeout=30):
    """Fetch one page of tenders from the tenders.guru API.

    Args:
        page: Page number placed in the ``page`` query parameter.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` when the request fails, the status is not 200, or the body
        is not valid JSON; the reason is printed in each case.
    """
    url = f"https://tenders.guru/api/hu/tenders?page={page}"

    try:
        # The request itself is inside the try block so that connection
        # errors and timeouts are reported instead of escaping to the caller.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Unexpected HTTP status {response.status_code} for {url}")
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures raised by response.json().
        print(e)
        return None


# Top-level tender attributes; all of them are nullable strings.
_TENDER_STRING_FIELDS = (
    "id",
    "date",
    "deadline_date",
    "title",
    "category",
    "description",
    "phase",
    "place",
    "awarded_value",
    "awarded_currency",
    "purchaser_name",
)

# One supplier entry inside an award.
_supplier_type = StructType([
    StructField("name", StringType(), True),
    StructField("id", LongType(), True),
    StructField("slug", StringType(), True),
])

# One element of the "awarded" array.
_award_type = StructType([
    StructField("date", StringType(), True),
    StructField("suppliers", ArrayType(_supplier_type), True),
    StructField("count", LongType(), True),
    StructField("offers_count", ArrayType(LongType()), True),
    StructField("suppliers_name", StringType(), True),
    StructField("value", LongType(), True),
])

# Full schema applied to each tender record: the string fields above,
# followed by the nullable "awarded" array of award structs.
schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in _TENDER_STRING_FIELDS]
    + [StructField("awarded", ArrayType(_award_type), True)]
)


def create_df(data, schema):
    """Build a flat Spark DataFrame from tender records.

    The records are loaded with ``schema``; four columns are then derived
    from the first element of the ``awarded`` array (``awarded_date``,
    ``suppliers_name``, ``count`` and the first entry of ``offers_count``),
    and only the fourteen flat columns are returned.
    """
    first_award = F.col("awarded").getItem(0)

    # Derived column name -> expression, in the order the columns are added.
    derived_columns = {
        "awarded_date": first_award.getField("date"),
        "suppliers_name": first_award.getField("suppliers_name"),
        "count": first_award.getField("count"),
        "offers_count": first_award.getField("offers_count").getItem(0),
    }

    frame = spark.createDataFrame(data, schema=schema)
    for column_name, expression in derived_columns.items():
        frame = frame.withColumn(column_name, expression)

    return frame.select(
        "id",
        "date",
        "deadline_date",
        "title",
        "category",
        "description",
        "phase",
        "place",
        "awarded_value",
        "awarded_currency",
        "awarded_date",
        "suppliers_name",
        "count",
        "offers_count",
    )

async def insert_to_postgres(records, json_data):
    """Insert tender rows into the ``tenders`` table.

    Args:
        records: Iterable of 14-value tuples matching the first fourteen
            columns of the INSERT statement, in order.
        json_data: JSON-serialisable object; its ``json.dumps`` text is
            appended to every record as the ``json_data`` column.

    Rows whose ``id`` already exists are skipped (``ON CONFLICT DO NOTHING``).
    Nothing is done, and no connection is opened, when ``records`` is empty.

    Raises:
        Any exception raised by ``asyncpg.connect`` or
        ``Connection.executemany``. The connection is closed before such an
        exception propagates.
    """
    serialized_json = json.dumps(json_data)
    records_with_json = [(*record, serialized_json) for record in records]
    if not records_with_json:
        return

    insert_query = """
    INSERT INTO tenders (id, date, deadline_date, title, category, description, phase, place, 
                         awarded_value, awarded_currency, awarded_date, suppliers_name, count, offers_count, json_data)
    VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
    ON CONFLICT (id) DO NOTHING;
    """

    conn = await asyncpg.connect(**DB_CONFIG)
    try:
        await conn.executemany(insert_query, records_with_json)
    finally:
        # Always release the connection, even when the insert fails.
        await conn.close()

async def main():
    """Fetch every page of tenders and store the rows in PostgreSQL.

    Creates the table, then loops over pages starting at 1: each page is
    fetched with ``get_tenders``, flattened with ``create_df``, previewed
    with ``show(5)`` and inserted with ``insert_to_postgres``. The loop stops
    after the page number reaches the ``page_count`` reported by the API, or
    as soon as a fetch returns ``None``. Consecutive requests are spaced
    10 seconds apart.
    """
    await create_table()
    page = 1

    while True:
        json_data = get_tenders(page)
        if json_data is None:
            print("No data fetched or an error occurred.")
            break

        data = json_data["data"]
        total_pages = json_data['page_count']

        df = create_df(data, schema)
        df.show(5)

        # Collect Spark rows directly rather than going through pandas:
        # toPandas() turns nullable integer columns into float NaN, which
        # cannot be bound to the BIGINT columns. collect() keeps Python ints
        # and None.
        records = [tuple(row) for row in df.collect()]

        if records:
            await insert_to_postgres(records, json_data)

        if page >= total_pages:
            break
        page += 1

        # Pause between requests without blocking the event loop; no pause
        # is needed once the last page has been processed.
        await asyncio.sleep(10)

# Run the pipeline: create the table, then fetch and store the tender pages.
await main()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/30 19:08:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+-----+----------+-------------+--------------------+-------------+--------------------+--------------------+--------------------+-------------+----------------+------------+--------------------+-----+------------+
|   id|      date|deadline_date|               title|     category|         description|               phase|               place|awarded_value|awarded_currency|awarded_date|      suppliers_name|count|offers_count|
+-----+----------+-------------+--------------------+-------------+--------------------+--------------------+--------------------+-------------+----------------+------------+--------------------+-----+------------+
|36077|2021-08-01|   2021-08-12|"Régészeti feltár...|constructions|Vállalkozási kere...|E60 - Szerződéskö...|     HU MAGYARORSZÁG| 299000000.00|             HUF|  2021-11-17|Salisbury Régésze...|    1|           2|
|36073|2021-09-26|   2021-10-07|Cserepes és Szélm...|constructions|1. részajánlat: K...|E60 - Szerződéskö...|HU322 Jász-Nagyku...|  56147040