In [14]:
import pandas as pd
import psycopg2
import sys

# --- 1. CONFIGURATION ---
DB_HOST = "localhost"
DB_NAME = "bank_reviews" 
DB_USER = "postgres"
DB_PASSWORD = "Hottiefy#1"
DB_PORT = 5433


# ----------------------------------------------------
# 2. DATA LOADING AND PREPARATION
# ----------------------------------------------------

# Load the single input file provided by the user
# Note: This file contains sentiment/theme data but LACKS the review_date column.
df = pd.read_csv("./data/processed/reviews_with_sentiment.csv") 

# Prepare the banks DataFrame from unique values in the reviews data
df_banks = df[['bank_code', 'bank_name']].drop_duplicates().reset_index(drop=True)

# Define placeholders for missing data
placeholder_app_name = None  # For banks table
placeholder_review_date = None # For reviews table (inserts SQL NULL)


# ----------------------------------------------------
# 3. DATABASE CONNECTION SETUP
# ----------------------------------------------------

try:
    conn = psycopg2.connect(
        host=DB_HOST,
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
        port=DB_PORT
    )
    cur = conn.cursor()
except psycopg2.Error as e:
    print(f"Error connecting to database: {e}")
    sys.exit(1)


# ----------------------------------------------------
# 4. DATA INSERTION (Banks First, then Reviews)
# ----------------------------------------------------
print("--- Starting Data Insertion ---")

# 4a. Insert into banks (Using public.banks)
print(f"Inserting {len(df)} reviews...")
for _, row in df.iterrows():
    cur.execute(
        """
        INSERT INTO public.reviews 
            (review_id, review_text,rating, bank_code,review_date,bank_name,  sentiment_label, sentiment_score, identified_themes)
        VALUES 
            (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (review_id) DO NOTHING;
        """,
        (
            row["review_id"],   
            row["review_text"],   
            int(row["rating"]),     
            row["bank_code"],           
          
            placeholder_review_date, 
                  # NULL for review_date
                     row["bank_name"],  
            row["sentiment_label"],
            float(row["sentiment_score"]),
            row["identified_themes"]
        )
    )


# 4b. Insert into reviews (Using public.reviews)
print(f"Inserting {len(df_banks)} unique banks...")
for _, row in df_banks.iterrows():
    cur.execute(
        """
        INSERT INTO public.banks (bank_code, bank_name, app_name)
        VALUES (%s, %s, %s)
        ON CONFLICT (bank_code) DO NOTHING;
        """,
        (
            row["bank_code"],
            row["bank_name"],
            placeholder_app_name # NULL for app_name
        )
    )

# ----------------------------------------------------
# 5. COMMIT & CLOSE
# ----------------------------------------------------
conn.commit()
print("Data insertion complete. Committing changes.")


# ----------------------------------------------------
# 6. VERIFICATION (KPIs)
# ----------------------------------------------------
print("\n--- Verification Queries (KPIs) ---")

# Helper function for quick querying
def run_query(sql):
    return pd.read_sql(sql, conn)

# KPI 1: Count reviews per bank
print("Review counts by bank:")
sql_count = """
SELECT b.bank_name, COUNT(r.review_id) AS total_reviews
FROM public.reviews r
JOIN public.banks b ON b.bank_code = r.bank_code
GROUP BY b.bank_name
ORDER BY total_reviews DESC;
"""
# Note: You will need to run this Python script to see the output data
display(run_query(sql_count))


# KPI 2: Average rating per bank
print("\nAverage rating by bank:")
sql_avg_rating = """
SELECT b.bank_name, ROUND(AVG(r.rating), 2) AS avg_rating
FROM public.reviews r
JOIN public.banks b ON b.bank_code = r.bank_code
GROUP BY b.bank_name
ORDER BY avg_rating DESC;
"""
# Note: You will need to run this Python script to see the output data
display(run_query(sql_avg_rating))

# ----------------------------------------------------
# 7. FINAL CLEANUP
# ----------------------------------------------------
cur.close()
conn.close()
print("\nDatabase connection closed.")

--- Starting Data Insertion ---
Inserting 1200 reviews...


UndefinedTable: relation "public.reviews" does not exist
LINE 2:         INSERT INTO public.reviews 
                            ^


In [None]:
CREATE TABLE Orders (
    order_id INTEGER PRIMARY KEY,
    order_date DATE,
    total_amount NUMERIC(10, 2),
    
    -- Defining the Foreign Key
    customer_id INTEGER,
    
    CONSTRAINT fk_customer
        FOREIGN KEY (customer_id)
        REFERENCES Customers (customer_id)
        -- ON DELETE RESTRICT prevents deleting a customer 
        -- if they have any orders in the Orders table.
        ON DELETE RESTRICT 
);

In [None]:
CREATE TABLE public.reviews (

review_id VARCHAR(50) PRIMARY KEY,



bank_code VARCHAR(50) REFERENCES public.banks (bank_code),

review_text TEXT,

rating INTEGER,

review_date DATE, -- Placeholder is NULL, but table should expect a DATE type

sentiment_label VARCHAR(50),

sentiment_score REAL,

identified_themes TEXT,

-- bank_name is redundant here as it's in the banks table,

-- but if you must include it, define it here.



);