In [2]:
# notebooks/03_database_storage.ipynb

import os
import pandas as pd
import oracledb # Import the Oracle DB driver

# --- 1. Database Configuration (using environment variables) ---
# Every setting is looked up in the environment first and falls back to the
# literal default shown here when the variable is not set.
# e.g., in your shell: export DB_USER="myuser"
# NOTE(review): the fallback user/password are hard-coded below; export
# DB_USER and DB_PASSWORD rather than relying on them.
DB_USER = os.environ.get("DB_USER", "system")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "123456")
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = os.environ.get("DB_PORT", "1521")
# Common for Oracle XE; replace with your actual service name or SID.
DB_SERVICE_NAME = os.environ.get("DB_SERVICE_NAME", "XEPDB1")

# DSN (Data Source Name) in the form host:port/service_name
DB_DSN = "{}:{}/{}".format(DB_HOST, DB_PORT, DB_SERVICE_NAME)

# --- 2. Data Loading (from your previous script) ---
# The analyzed CSV is looked up under <current directory>/../data/processed.
processed_data_dir = os.path.join(os.path.abspath(''), os.pardir, 'data', 'processed')
input_filepath = os.path.join(processed_data_dir, 'fintech_app_reviews_analyzed.csv')

# Start from an empty frame so later steps can test `df_analyzed.empty`
# whether or not the file was found.
df_analyzed = pd.DataFrame()
if os.path.exists(input_filepath):
    df_analyzed = pd.read_csv(input_filepath)
    print(f"Loaded {len(df_analyzed)} analyzed reviews from {input_filepath}")
    print("Columns in df_analyzed:", df_analyzed.columns.tolist())
else:
    print(f"Error: Analyzed data file not found at {input_filepath}.")
    print("Please ensure '02_sentiment_thematic_analysis.ipynb' has been run successfully to create this file.")


# --- 3. Database Connection Function ---
def get_db_connection():
    """Open a connection to the Oracle database described by the DB_* settings.

    Calls ``oracledb.connect`` with the module-level ``DB_USER``,
    ``DB_PASSWORD`` and ``DB_DSN`` values.

    Returns:
        The open connection on success, or ``None`` when the driver raises an
        ``oracledb.Error`` (the error text is printed in that case).
    """
    try:
        # For thin mode (recommended, no Instant Client needed for basic use)
        connection = oracledb.connect(user=DB_USER, password=DB_PASSWORD, dsn=DB_DSN)
    except oracledb.Error as e:
        # Prefer the driver's error object message, but fall back to the
        # exception text so that reporting a failure can never itself raise
        # (e.g. when args is empty or holds a plain string).
        detail = getattr(e.args[0], "message", e) if e.args else e
        print(f"Database connection error: {detail}")
        return None

    print("Successfully connected to Oracle Database!")
    return connection

# --- 4. Function to Create Table ---
def create_reviews_table(connection):
    """Create the FINTECH_APP_REVIEWS table unless it already exists.

    The column list mirrors the columns that ``insert_reviews_data`` writes.

    Args:
        connection: An open database connection, or ``None``.

    Returns:
        ``True`` when the table exists after the call (already present or
        newly created); ``False`` when no connection was supplied or the
        driver raised an ``oracledb.Error`` (the error text is printed).
    """
    if connection is None:
        print("No database connection available to create table.")
        return False

    table_name = "FINTECH_APP_REVIEWS"  # Use a consistent table name

    # Free-text columns (review text, tokens, keywords) are CLOBs because
    # their length is unbounded; adjust the other types/lengths to your data.
    create_table_sql = f"""
    CREATE TABLE {table_name} (
        REVIEW_ID VARCHAR2(50) PRIMARY KEY,
        USER_NAME VARCHAR2(255),
        RATING NUMBER(1,0),
        REVIEW_DATE DATE,
        REVIEW_TEXT CLOB,
        BANK_APP_NAME VARCHAR2(255),
        SOURCE VARCHAR2(100),
        SENTIMENT VARCHAR2(50),
        SENTIMENT_SCORE NUMBER(5,4),
        PROCESSED_REVIEWS_TOKENS CLOB, -- Storing as CLOB, consider if you need to parse this in DB
        EXTRACTED_KEYWORDS CLOB,       -- Storing as CLOB
        IDENTIFIED_THEME VARCHAR2(255)
    )
    """

    cursor = connection.cursor()
    try:
        # Existence check against the current schema's catalog view. The name
        # is passed as a bind variable instead of being spliced into the SQL.
        cursor.execute(
            "SELECT table_name FROM user_tables WHERE table_name = :tname",
            tname=table_name.upper(),
        )
        if cursor.fetchone():
            print(f"Table '{table_name}' already exists. Skipping creation.")
        else:
            cursor.execute(create_table_sql)
            print(f"Table '{table_name}' created successfully.")
        connection.commit()
        return True
    except oracledb.Error as e:
        # Fall back to the exception text when args does not hold a single
        # driver error object, so the handler itself cannot fail.
        detail = getattr(e.args[0], "message", e) if e.args else e
        print(f"Error creating table '{table_name}': {detail}")
        return False
    finally:
        cursor.close()

# --- 5. Function to Insert Data ---
def insert_reviews_data(connection, df):
    """Bulk-insert the rows of ``df`` into the FINTECH_APP_REVIEWS table.

    Each DataFrame row is converted to a dict of named bind values and the
    whole batch is sent with a single ``executemany`` call. Missing values
    (NaN/None/NaT) in any nullable column, including ``Date``, are bound as
    NULL. The batch is committed on success and rolled back when the driver
    raises an ``oracledb.Error`` (the error text is printed).

    Args:
        connection: An open database connection, or ``None``.
        df: DataFrame with the columns ``reviewId``, ``User Name``,
            ``Rating``, ``Date``, ``Review Text``, ``Bank/App Name``,
            ``Source``, ``Sentiment``, ``Sentiment_Score``,
            ``Processed_Reviews_Tokens``, ``Extracted_Keywords`` and
            ``Identified_Theme``.

    Returns:
        None.
    """
    if connection is None or df.empty:
        print("No database connection or empty DataFrame to insert data.")
        return

    def _or_none(value, cast=str):
        """Return ``cast(value)``, or None when pandas treats it as missing."""
        return cast(value) if pd.notna(value) else None

    def _date_or_none(value):
        """Parse ``value`` into a ``datetime.date``; missing values give None."""
        parsed = pd.to_datetime(value)
        return None if pd.isna(parsed) else parsed.date()

    table_name = "FINTECH_APP_REVIEWS"

    # Named placeholders: each dict built below supplies one value per name.
    insert_sql = f"""
    INSERT INTO {table_name} (
        REVIEW_ID, USER_NAME, RATING, REVIEW_DATE, REVIEW_TEXT,
        BANK_APP_NAME, SOURCE, SENTIMENT, SENTIMENT_SCORE,
        PROCESSED_REVIEWS_TOKENS, EXTRACTED_KEYWORDS, IDENTIFIED_THEME
    ) VALUES (
        :review_id, :user_name, :rating, :review_date, :review_text,
        :bank_app_name, :source, :sentiment, :sentiment_score,
        :processed_reviews_tokens, :extracted_keywords, :identified_theme
    )
    """

    data_to_insert = [
        {
            "review_id": str(row['reviewId']),  # Primary key, always a string
            "user_name": _or_none(row['User Name']),
            "rating": _or_none(row['Rating'], int),
            "review_date": _date_or_none(row['Date']),
            "review_text": _or_none(row['Review Text']),
            "bank_app_name": _or_none(row['Bank/App Name']),
            "source": _or_none(row['Source']),
            "sentiment": _or_none(row['Sentiment']),
            "sentiment_score": _or_none(row['Sentiment_Score'], float),
            "processed_reviews_tokens": _or_none(row['Processed_Reviews_Tokens']),
            "extracted_keywords": _or_none(row['Extracted_Keywords']),
            "identified_theme": _or_none(row['Identified_Theme']),
        }
        for _, row in df.iterrows()
    ]

    # Open the cursor only after the rows are prepared, so a conversion
    # failure above cannot leave a cursor unclosed.
    cursor = connection.cursor()
    try:
        cursor.executemany(insert_sql, data_to_insert)
        connection.commit()
        print(f"Successfully inserted {cursor.rowcount} rows into '{table_name}'.")
    except oracledb.Error as e:
        # Fall back to the exception text when args does not hold a single
        # driver error object, so the handler itself cannot fail.
        detail = getattr(e.args[0], "message", e) if e.args else e
        print(f"Error inserting data into '{table_name}': {detail}")
        # Rollback on error
        connection.rollback()
    finally:
        cursor.close()

# --- 6. Function to Read Data (Example) ---
def fetch_reviews_data(connection, limit=5):
    """Fetch up to ``limit`` rows from FINTECH_APP_REVIEWS as a DataFrame.

    Values that expose a ``read()`` method (such as LOB locators returned for
    CLOB columns) are read into plain values while the cursor is still open,
    so the returned DataFrame stays usable after the connection is closed.

    Args:
        connection: An open database connection, or ``None``.
        limit: Maximum number of rows to return (bound to ``ROWNUM``).

    Returns:
        A DataFrame whose columns are named after the cursor description, or
        an empty DataFrame when no connection was supplied or the driver
        raised an ``oracledb.Error`` (the error text is printed).
    """
    if connection is None:
        print("No database connection available to fetch data.")
        return pd.DataFrame()

    def _materialize(value):
        """Return ``value.read()`` for readable locators, else ``value``."""
        reader = getattr(value, "read", None)
        return reader() if callable(reader) else value

    table_name = "FINTECH_APP_REVIEWS"
    select_sql = f"SELECT * FROM {table_name} WHERE ROWNUM <= :limit_val" # Oracle specific LIMIT

    cursor = connection.cursor()
    try:
        cursor.execute(select_sql, limit_val=limit)
        columns = [col[0] for col in cursor.description]
        rows = [
            tuple(_materialize(value) for value in row)
            for row in cursor.fetchall()
        ]
        df_from_db = pd.DataFrame(rows, columns=columns)
        print(f"Successfully fetched {len(df_from_db)} rows from '{table_name}'.")
        return df_from_db
    except oracledb.Error as e:
        # Fall back to the exception text when args does not hold a single
        # driver error object, so the handler itself cannot fail.
        detail = getattr(e.args[0], "message", e) if e.args else e
        print(f"Error fetching data from '{table_name}': {detail}")
        return pd.DataFrame()
    finally:
        cursor.close()


# --- Main Execution Block for Database Operations ---
if df_analyzed.empty:
    print("DataFrame for database operations is empty. Skipping database connection.")
else:
    db_connection = None
    try:
        db_connection = get_db_connection()
        # Load rows only when a connection was obtained and the table is ready.
        if db_connection and create_reviews_table(db_connection):
            # REVIEW_ID is the primary key, so decide how re-runs should behave:
            # either refresh the table first or handle duplicate keys yourself.
            # This cell assumes a fresh insert. To clear the table beforehand,
            # uncomment:
            # print(f"Truncating table FINTECH_APP_REVIEWS...")
            # cursor = db_connection.cursor()
            # cursor.execute("TRUNCATE TABLE FINTECH_APP_REVIEWS")
            # db_connection.commit()
            # cursor.close()
            # print("Table truncated.")

            insert_reviews_data(db_connection, df_analyzed)

            # Optional sanity check: read a few rows back.
            # fetched_df = fetch_reviews_data(db_connection)
            # if not fetched_df.empty:
            #     print("\nSample data fetched from database:")
            #     print(fetched_df.head())
    finally:
        # Always release the connection, even if one of the steps above raised.
        if db_connection:
            db_connection.close()
            print("Database connection closed.")

Loaded 8986 analyzed reviews from d:\10academy\10acadamey\week_02_challenge\notebooks\..\data\processed\fintech_app_reviews_analyzed.csv
Columns in df_analyzed: ['reviewId', 'User Name', 'Rating', 'Date', 'Review Text', 'Bank/App Name', 'Source', 'Sentiment', 'Sentiment_Score', 'Processed_Reviews_Tokens', 'Extracted_Keywords', 'Identified_Theme']
Successfully connected to Oracle Database!
Table 'FINTECH_APP_REVIEWS' created successfully.
Successfully inserted 8986 rows into 'FINTECH_APP_REVIEWS'.
Database connection closed.
