In [5]:
import os
import nest_asyncio
import asyncpg
from pyspark.sql import SparkSession
import pandas as pd
from pathlib import Path

In [6]:
nest_asyncio.apply()

DB_CONFIG = {
    "user": os.getenv("POSTGRES_USER", "admin"),
    "password": os.getenv("POSTGRES_PASSWORD", "admin"),
    "database": os.getenv("POSTGRES_DB", "tender"),
    "host": os.getenv("POSTGRES_HOST", "postgre"),
    "port": int(os.getenv("POSTGRES_PORT", 5432))
}

async def read_table_as_df() -> pd.DataFrame:
    conn = await asyncpg.connect(**DB_CONFIG)
    rows = await conn.fetch("SELECT * FROM tenders")
    df = pd.DataFrame([dict(row) for row in rows])
    await conn.close()
    return df


In [7]:
tender_csv_name = "tender.csv"
tender_csv_path = Path.cwd() / tender_csv_name

if not tender_csv_path.is_file():
    df = await read_table_as_df()
    df.to_csv(tender_csv_path, index=False)
    print(f"Saved tender data to: {tender_csv_path}")
else:
    print(f"{tender_csv_name} already exists at: {tender_csv_path}")


tender.csv already exists at: /opt/workspace/tender.csv


In [8]:
spark = SparkSession.builder \
                    .appName("File_to_spark") \
                    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

25/04/12 13:55:09 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [14]:
df = spark.read.csv(tender_csv_path.as_posix(), header=True, inferSchema=True)

df.show()
df.printSchema()

+--------------------+--------------------+--------------------+-------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|                 _c0|                  id|                date|deadline_date|               title|     category|         description|               phase|               place|       awarded_value|   awarded_currency|       awarded_date|      suppliers_name|               count|        offers_count|        request_json|
+--------------------+--------------------+--------------------+-------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|                   0|            

In [23]:
df = df.drop("request_json")
df = df.dropna()

print(df.count())
df.show()

98
+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|                 _c0|                  id|                date|       deadline_date|               title|     category|         description|               phase|               place|       awarded_value|    awarded_currency|       awarded_date|      suppliers_name|               count|        offers_count|
+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|                   0|               35930|          2021-10-17|      