## Python environment libs

In [None]:
!pip install pandas numpy
!pip install Faker loguru

## Initial Tables Creation
### Copy CSV file into the container
`docker cp /home/user/random_dataset.csv postgres_container:/tmp/random_dataset.csv`

### Connect to PostgreSQL
`docker exec -it postgres_container psql -U postgres -d your_database`

### Once in psql
```
CREATE TABLE sales_data (
    date TIMESTAMP,
    name VARCHAR(50),
    market_area VARCHAR(100),
    number_of_sales INTEGER,
    pricing_unit INTEGER
);
```

`\copy sales_data FROM '/tmp/random_dataset.csv' WITH CSV HEADER;`

## Python Data Mock-up / Generator

In [None]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta
from loguru import logger as log

# Initialize Faker
fake = Faker("id_ID")

# Set the range for datetime
start_date = datetime(2023, 1, 1)
end_date = datetime(2024, 1, 1)


# Function to generate a random datetime between a given range
def random_date(start, end):
    """Return a random datetime in the half-open interval [start, end).

    The offset added to ``start`` is a whole number of seconds drawn with
    ``np.random.randint``, so ``end`` itself is never returned.
    """
    span_seconds = int((end - start).total_seconds())
    offset_seconds = np.random.randint(0, span_seconds)
    return start + timedelta(seconds=offset_seconds)


# Function to generate each row
def generateSalesRow(names: list, prices: list):
    """
    Build one synthetic sales record as a dict.

    param :
    - names = candidate salesperson names; one is picked at random
    - prices = candidate unit prices; one is picked at random

    The timestamp comes from random_date over the module-level
    start_date/end_date window; the market area and the sales count
    (1 to 75) come from the module-level Faker instance.
    """
    row = {}
    row["date"] = random_date(start_date, end_date)
    row["name"] = random.choice(names)
    row["market_area"] = fake.administrative_unit()
    row["number_of_sales"] = fake.random_int(min=1, max=75)
    row["pricing_unit"] = random.choice(prices)
    return row


# Function to generate whole dataset
def generateSales(num: int, output_path: str = "sales_generated.csv"):
    """
    Generate a mock sales dataset and save it as a CSV file.

    param :
    - num = number of rows to generate
    - output_path = destination CSV file (defaults to "sales_generated.csv")

    Any exception raised while generating or saving the data is logged
    and not re-raised, so the function always returns None.
    """
    # Fixed pools the random rows are drawn from; building them cannot fail,
    # so they live outside the try block.
    sales_list = [
        "Diah",
        "Wahyu",
        "Lisa",
        "Anton",
        "Malik",
        "Riana",
        "Rafi",
        "Bela",
        "Budi",
    ]
    pricing_units = [50000, 100000, 150000]

    try:
        log.info("Flow - Generating Data Rows")
        generated_data = [
            generateSalesRow(names=sales_list, prices=pricing_units) for _ in range(num)
        ]

        log.info("Flow - Saving dataframe into csv")
        generated_dataframe = pd.DataFrame(generated_data)
        # index=False keeps the pandas row index out of the file
        generated_dataframe.to_csv(output_path, index=False)
    except Exception as e:
        log.error(f"Flow error - error occured as {e}")


if __name__ == "__main__":
    # Script entry point: read the requested row count from the command line
    # and generate that many rows.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-n",
        "--numbers",
        type=int,
        help="Numbers of data to generate",
    )
    args = parser.parse_args()

    # A missing (None) or zero count is reported instead of generating data.
    if not args.numbers:
        log.error("Specify numbers")
    else:
        generateSales(num=args.numbers)


## Export data to parquet

In [None]:
!pip install psycopg2-binary
!pip install sqlalchemy pyarrow

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from sqlalchemy import create_engine
from loguru import logger as log

# Connection settings for the PostgreSQL instance
db_params = dict(
    dbname="postgres",
    user="postgres",
    password="password",
    host="localhost",
    port="5432",
)

# Assemble the connection URL from the settings and build the engine
engine = create_engine(
    "postgresql://{user}:{password}@{host}:{port}/{dbname}".format(**db_params)
)

# Only the rows whose name is 'Budi' are exported
query = "SELECT * FROM sales_data WHERE name = 'Budi'"

try:
    log.info("Flow - Connect and Fetch through DB")
    # Load the query result into a DataFrame; the connection is released
    # when the with-block exits
    with engine.connect() as conn:
        df = pd.read_sql_query(query, conn)

    # Quick sanity check of what was fetched
    print(f"DataFrame shape: {df.shape}")
    print(df.head())

    log.info("Flow - Converting to Parquet")
    # pandas DataFrame -> PyArrow Table -> Parquet file on disk
    table = pa.Table.from_pandas(df)
    pq.write_table(table, "sales_data.parquet")

    log.info("Flow - Parquet Exported")
except Exception as e:
    # Any failure in fetch, conversion or write is logged, not re-raised
    log.error(f"Flow error - occured as {e}")


## Ingest via API

In [None]:
!pip install pokebase
!pip install sqlalchemy

In [None]:
import pokebase as pb
from sqlalchemy import create_engine, text
from loguru import logger as log

# Database connection parameters
db_params = {
    'dbname': 'postgres',
    'user': 'postgres',
    'password': 'password',
    'host': 'localhost',
    'port': '5432'
}

# Create SQLAlchemy engine
engine = create_engine(f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['dbname']}")

# Function to insert a Pokemon into the database
def insert_pokemon(pokemon):
    """Insert one Pokemon into the ``pokemon`` table, or update it if its id exists.

    Reads ``id``, ``name``, ``height``, ``weight``, ``base_experience`` and
    ``types`` from ``pokemon``. Only the first entry of ``types`` is stored;
    when ``types`` is empty the type is recorded as 'Unknown'.
    """
    upsert_stmt = text("""
    INSERT INTO pokemon (id, name, height, weight, base_experience, type)
    VALUES (:id, :name, :height, :weight, :base_experience, :type)
    ON CONFLICT (id) DO UPDATE SET
        name = :name,
        height = :height,
        weight = :weight,
        base_experience = :base_experience,
        type = :type
    """)

    with engine.connect() as db:
        # Bind values are collected after the connection is opened
        row = {
            'id': pokemon.id,
            'name': pokemon.name,
            'height': pokemon.height,
            'weight': pokemon.weight,
            'base_experience': pokemon.base_experience,
        }
        if pokemon.types:
            row['type'] = pokemon.types[0].type.name
        else:
            row['type'] = 'Unknown'

        db.execute(upsert_stmt, row)
        # Explicit commit so the upsert is persisted before the connection closes
        db.commit()

# Fetch and insert Pokemon data
def fetch_and_insert_pokemon(start_id, end_id):
    """Fetch every Pokemon with an ID from start_id to end_id (inclusive) and store it.

    A failure for one ID is logged as a warning and the loop moves on to the
    next ID, so a single bad entry does not stop the run.
    """
    id_range = range(start_id, end_id + 1)  # +1 makes end_id inclusive
    for current_id in id_range:
        try:
            entry = pb.pokemon(current_id)
            insert_pokemon(entry)
            log.info(f"Inserted Pokemon: {entry.name}")
        except Exception as err:
            log.warning(f"Error fetching Pokemon with ID {current_id}: {err}")

if __name__ == "__main__":
    # Main entry point: fetch and insert the first 20 Pokemon (IDs 1 through 20).
    fetch_and_insert_pokemon(start_id=1, end_id=20)

"""
CREATE TABLE pokemon (
    id INTEGER PRIMARY KEY,
    name VARCHAR(100),
    height INTEGER,
    weight INTEGER,
    base_experience INTEGER,
    type VARCHAR(50)
);
"""