In [1]:
import psycopg2
import os
from dotenv import load_dotenv
load_dotenv()


True

In [13]:
conn = psycopg2.connect(
    dbname=os.getenv("DB_STREAM_NAME"),
    user=os.getenv("DB_STREAM_USER"),
    password=os.getenv("DB_STREAM_PASSWORD"),
    host=os.getenv("DB_STREAM_HOST"),
    port=os.getenv("DB_STREAM_PORT")
)

In [12]:
import pandas as pd

# Đọc file Parquet
file_path = "../../data/green_tripdata_2024-01.parquet"
df = pd.read_parquet(file_path)

# Hiển thị 5 dòng đầu tiên
# print(df.head())

In [4]:
len(df)

56551

In [5]:
DTYPE_MAPPING = {
    'int32': 'INTEGER',               # ánh xạ từ int32 sang INTEGER
    'int64': 'BIGINT',                # ánh xạ từ int64 sang BIGINT
    'float64': 'DOUBLE PRECISION',     # ánh xạ từ float64 sang DOUBLE PRECISION
    'object': 'TEXT',                  # ánh xạ từ object sang TEXT
    'datetime64[us]': 'TIMESTAMP', 
    'datetime64[ns]': 'TIMESTAMP', # ánh xạ từ datetime64 sang TIMESTAMP
}

def clear_all_tables(conn):
    cur = conn.cursor()
    cur.execute("SELECT tablename FROM pg_tables WHERE schemaname='public'")
    tables = cur.fetchall()
    for table in tables:
        cur.execute(f"DROP TABLE {table[0]} CASCADE")
    conn.commit()
    cur.close()

def create_table(conn, table_name, columns):
    cur = conn.cursor()
    columns_str = ', '.join([f"{col_name} {DTYPE_MAPPING[str(col_type)]}" for col_name, col_type in columns.items()])
    cur.execute(f"CREATE TABLE {table_name} ({columns_str})")
    conn.commit()
    cur.close()

In [15]:
clear_all_tables(conn)
# conn.rollback()

In [6]:
import requests
import json

def check_connector_status():
    try:
        # Kiểm tra connector có tồn tại
        response = requests.get('http://localhost:8083/connectors/taxi-nyc-cdc')
        print("Connector status:", response.json())
        
        # Kiểm tra trạng thái chi tiết
        response = requests.get('http://localhost:8083/connectors/taxi-nyc-cdc/status')
        print("Detailed status:", json.dumps(response.json(), indent=2))
        
    except Exception as e:
        print(f"Lỗi khi check connector: {e}")
check_connector_status()

Connector status: {'name': 'taxi-nyc-cdc', 'config': {'connector.class': 'io.debezium.connector.postgresql.PostgresConnector', 'database.user': 'postgres', 'database.dbname': 'streaming_db', 'topic.prefix': 'streaming', 'database.hostname': 'host.docker.internal', 'database.password': '12345', 'name': 'taxi-nyc-cdc', 'database.server.name': 'source', 'table.include.list': 'public.yellow_trip_raw,public.green_trip_raw,public.', 'database.port': '5432', 'plugin.name': 'pgoutput'}, 'tasks': [{'connector': 'taxi-nyc-cdc', 'task': 0}], 'type': 'source'}
Detailed status: {
  "name": "taxi-nyc-cdc",
  "connector": {
    "state": "RUNNING",
    "worker_id": "172.18.0.7:8083"
  },
  "tasks": [
    {
      "id": 0,
      "state": "RUNNING",
      "worker_id": "172.18.0.7:8083"
    }
  ],
  "type": "source"
}


In [7]:
def check_replication_slot():
    try:
        conn = psycopg2.connect(
            dbname="streaming_db",
            user="postgres",
            password="12345",
            host="localhost",
            port="5432"
        )
        cur = conn.cursor()
        
        # Kiểm tra replication slot
        cur.execute("SELECT * FROM pg_replication_slots;")
        slots = cur.fetchall()
        print("Replication slots:", slots)
        
        # Kiểm tra publication
        cur.execute("SELECT * FROM pg_publication;")
        publications = cur.fetchall()
        print("Publications:", publications)
        
        cur.close()
        conn.close()
        
    except Exception as e:
        print(f"Lỗi khi check replication: {e}")


In [8]:
check_replication_slot()

Replication slots: [('debezium', 'pgoutput', 'logical', 16388, 'streaming_db', False, True, 3956, None, '780', '0/1ABB450', '0/1ABB450', 'reserved', None, False, None, False, None, False, False), ('taxi_nyc_time_series', 'pgoutput', 'logical', 16388, 'streaming_db', False, False, None, None, '763', '0/1A56280', '0/1A905A0', 'reserved', None, False, datetime.datetime(2024, 12, 11, 10, 50, 44, 452919, tzinfo=datetime.timezone(datetime.timedelta(seconds=25200))), False, None, False, False)]
Publications: [(16394, 'dbz_publication', 10, True, True, True, True, True, False)]


In [9]:
def insert_to_db(conn, table_name, df):
    cur = conn.cursor()
    for i, row in df.iterrows():
        columns = ', '.join(row.keys())
        values = ', '.join([f"'{str(value)}'" for value in row.values])
        cur.execute(f"INSERT INTO {table_name} ({columns}) VALUES ({values})")
    conn.commit()
    cur.close()

In [16]:
create_table(conn, "green_trip_raw", df.dtypes)

In [20]:
insert_to_db(conn, 'green_trip_raw', df.iloc[0:15])
conn.close()

In [10]:
conn.rollback()

In [12]:
insert_to_db(conn, 'green_trip_raw', df.iloc[115:120])