In [None]:
# Worked locally but fails when encountering duplicates
import os
import pandas as pd
import cx_Oracle
from datetime import datetime
import glob

# Oracle Database connection details.
# Every connection setting can be overridden through an environment variable
# so that credentials do not have to be edited in this file; the fallbacks
# are the previously hard-coded values.
# Settings previously used for a local test database:
#   user "system", password "oracle", host "localhost", port "1521",
#   service name "XE"
# NOTE(review): the fallback password is still stored in plain text here;
# consider dropping it once the environment variables are set everywhere.
DB_USER = os.environ.get("EDI_DB_USER", "EDI")
DB_PASSWORD = os.environ.get("EDI_DB_PASSWORD", "edi")
DB_HOST = os.environ.get("EDI_DB_HOST", "172.16.2.122")
DB_PORT = os.environ.get("EDI_DB_PORT", "1521")
DB_SERVICE_NAME = os.environ.get("EDI_DB_SERVICE_NAME", "EDIPROD")
TABLE_NAME = "ITRS_URT_RECEIPTS"

# Directory path containing the .xls files
FILE_PATH = r"C:\edi_data\test"  # Update with your actual file path

# Names given to spreadsheet columns A:H, in order, when a workbook is read
COLUMNS = ['DESCRIPTIONNO', 'PURPOSE', 'PU_CODE', 'COUNTRY', 'SECTOR', 'CURRENCY', 'AMOUNT', 'DESCRIPTION']

# Function to open the Oracle database connection
def create_db_connection():
    """Open and return a cx_Oracle connection built from the module settings.

    The DSN (Data Source Name) is assembled from DB_HOST, DB_PORT and
    DB_SERVICE_NAME; the session is authenticated with DB_USER and
    DB_PASSWORD.  Closing the returned connection is left to the caller.
    """
    return cx_Oracle.connect(
        DB_USER,
        DB_PASSWORD,
        cx_Oracle.makedsn(DB_HOST, DB_PORT, service_name=DB_SERVICE_NAME),
    )

# Function to insert/update data from DataFrame
# def upsert_data(df, connection):
#     cursor = connection.cursor()
    
#     # Insert SQL statement for the ITRS_URT_RECEIPTS table
#     insert_sql = """
#         INSERT INTO ITRS_URT_RECEIPTS (INSTITUTIONCODE, REPORTINGDATE, DESCRIPTIONNO, PURPOSE, PU_CODE, 
#                                        COUNTRY, SECTOR, CURRENCY, AMOUNT, DESCRIPTION, STATUS)
#         VALUES (:INSTITUTIONCODE, TO_DATE(:REPORTINGDATE, 'DD-MON-YYYY'), :DESCRIPTIONNO, :PURPOSE, :PU_CODE, 
#                 :COUNTRY, :SECTOR, :CURRENCY, :AMOUNT, :DESCRIPTION, :STATUS)
#     """
    
#     # Iterating through DataFrame rows and executing insert query
#     for _, row in df.iterrows():
#         cursor.execute(insert_sql, row.to_dict())
    
#     # Commit the transaction
#     connection.commit()
#     cursor.close()
import cx_Oracle
import pandas as pd

def upsert_data(df, connection):
    """Merge every row of *df* into ITRS_URT_RECEIPTS as one batch.

    Rows are matched on (INSTITUTIONCODE, REPORTINGDATE, DESCRIPTIONNO):
    a matching table row is updated, otherwise a new row is inserted.

    Args:
        df: DataFrame holding the columns INSTITUTIONCODE, REPORTINGDATE,
            DESCRIPTIONNO, PURPOSE, PU_CODE, COUNTRY, SECTOR, CURRENCY,
            AMOUNT, DESCRIPTION and STATUS.  The caller's frame is not
            modified; normalisation happens on a copy.
        connection: open database connection; only ``cursor()``,
            ``commit()`` and ``rollback()`` are used.

    Database errors are reported on stdout instead of being raised.  When
    that happens the batch is rolled back, so a partially applied batch
    cannot be committed by a later call on the same connection.
    """
    # Convert data types to match the table definition
    df = df.fillna('')
    # The statement parses this value with TO_DATE(..., 'DD-MON-YYYY')
    df['REPORTINGDATE'] = pd.to_datetime(df['REPORTINGDATE'], errors='coerce').dt.strftime('%d-%b-%Y')

    # Non-numeric cells become 0 rather than failing the whole batch
    df['DESCRIPTIONNO'] = pd.to_numeric(df['DESCRIPTIONNO'], errors='coerce').fillna(0).astype(int)
    df['PU_CODE'] = pd.to_numeric(df['PU_CODE'], errors='coerce').fillna(0).astype(int)
    df['AMOUNT'] = pd.to_numeric(df['AMOUNT'], errors='coerce').fillna(0).astype(float)

    # Order must match the positional binds :1 .. :11 in the statement below
    bind_columns = [
        'INSTITUTIONCODE', 'REPORTINGDATE', 'DESCRIPTIONNO', 'PURPOSE', 'PU_CODE',
        'COUNTRY', 'SECTOR', 'CURRENCY', 'AMOUNT', 'DESCRIPTION', 'STATUS',
    ]
    data_tuples = list(df[bind_columns].itertuples(index=False, name=None))

    if not data_tuples:
        # Nothing to send; skip the round trip to the database
        print("Upserted 0 records successfully.")
        return

    # UPSERT (MERGE) SQL statement
    upsert_sql = """
        MERGE INTO ITRS_URT_RECEIPTS target
        USING (SELECT :1 AS INSTITUTIONCODE, TO_DATE(:2, 'DD-MON-YYYY') AS REPORTINGDATE, :3 AS DESCRIPTIONNO,
                      :4 AS PURPOSE, :5 AS PU_CODE, :6 AS COUNTRY, :7 AS SECTOR, :8 AS CURRENCY,
                      :9 AS AMOUNT, :10 AS DESCRIPTION, :11 AS STATUS FROM DUAL) source
        ON (target.INSTITUTIONCODE = source.INSTITUTIONCODE AND target.REPORTINGDATE = source.REPORTINGDATE AND
            target.DESCRIPTIONNO = source.DESCRIPTIONNO)
        WHEN MATCHED THEN
            UPDATE SET target.PURPOSE = source.PURPOSE, target.PU_CODE = source.PU_CODE,
                       target.COUNTRY = source.COUNTRY, target.SECTOR = source.SECTOR,
                       target.CURRENCY = source.CURRENCY, target.AMOUNT = source.AMOUNT,
                       target.DESCRIPTION = source.DESCRIPTION, target.STATUS = source.STATUS
        WHEN NOT MATCHED THEN
            INSERT (INSTITUTIONCODE, REPORTINGDATE, DESCRIPTIONNO, PURPOSE, PU_CODE,
                    COUNTRY, SECTOR, CURRENCY, AMOUNT, DESCRIPTION, STATUS)
            VALUES (source.INSTITUTIONCODE, source.REPORTINGDATE, source.DESCRIPTIONNO, source.PURPOSE,
                    source.PU_CODE, source.COUNTRY, source.SECTOR, source.CURRENCY,
                    source.AMOUNT, source.DESCRIPTION, source.STATUS)
    """

    cursor = connection.cursor()
    try:
        cursor.executemany(upsert_sql, data_tuples)  # Bulk UPSERT for efficiency
        connection.commit()
        print(f"Upserted {len(data_tuples)} records successfully.")
    except cx_Oracle.DatabaseError as e:
        # Discard whatever part of the batch was applied before the failure
        connection.rollback()
        print(f"Database error: {e}")
    finally:
        cursor.close()
      


# Function to read and process .xls files from directory
def read_data_from_file(file):
    """Load one receipts workbook and tag its rows with file-name metadata.

    The file name must start with a 4-character institution code and carry
    the reporting date as DDMMYY in characters 9-14, e.g.
    ``A1241078310124.xls`` -> institution ``A124``, date ``31-Jan-2024``.

    Args:
        file: path to the .xls workbook; both ``\\`` and ``/`` separators
            are accepted.

    Returns:
        A DataFrame with the COLUMNS read from sheet ``ITRS2_URT_RECEIPTS``
        plus REPORTINGDATE (``DD-Mon-YYYY`` string), STATUS (``'N'``) and
        INSTITUTIONCODE, or ``None`` when the date cannot be parsed from the
        file name or the file does not exist.
    """
    # Split on either separator so the bare name is found on any platform
    file_name = os.path.basename(file.replace("\\", "/"))

    print(f"file: {file}")
    print(f"file_name: {file_name}")

    # Fixed-position fields of the file name
    institution_code = file_name[:4]
    reporting_date_str = file_name[8:14]
    print(f"Extracted reporting date string: {reporting_date_str} {institution_code}")

    try:
        reporting_date = datetime.strptime(reporting_date_str, '%d%m%y').strftime('%d-%b-%Y')  # Format to DD-MON-YYYY
    except ValueError:
        print(f"Invalid reporting date format in file: {file_name}")
        return None
    print(f"Formatted reporting date: {reporting_date}")

    # Read the Excel file into a DataFrame
    normalized_file_name = os.path.normpath(file)
    print(f"Normalized file path: {normalized_file_name}")
    if not os.path.exists(normalized_file_name):
        print(f"File not found: {normalized_file_name}")
        return None

    df = pd.read_excel(normalized_file_name,
                       engine='xlrd',
                       sheet_name='ITRS2_URT_RECEIPTS',
                       usecols="A:H",
                       skiprows=7,
                       names=COLUMNS)

    # Keep only rows where PURPOSE through DESCRIPTION are all present.
    # The explicit copy makes the column assignments below act on an
    # independent frame.
    df = df.dropna(subset=['PURPOSE', 'PU_CODE', 'COUNTRY', 'SECTOR', 'CURRENCY', 'AMOUNT', 'DESCRIPTION']).copy()

    # Metadata taken from the file name, identical for every row
    df['REPORTINGDATE'] = reporting_date
    df['STATUS'] = 'N'
    df['INSTITUTIONCODE'] = institution_code

    df = df.rename(columns=lambda x: x.strip().upper())
    print(df.columns)
    return df


# Main function to orchestrate the process
def main():
    """Load every .xls file found in FILE_PATH into the database.

    One connection is opened for the whole run and always closed at the end.
    Files are handled independently: an error while reading or loading one
    file is reported and the run continues with the next file.
    """
    # Create connection to the Oracle database
    connection = create_db_connection()
    try:
        # Sorted so that the processing order is the same on every run
        for filex in sorted(glob.glob(os.path.join(FILE_PATH, "*.xls"))):
            print(f"Processing file: {filex}")
            try:
                # Read and preprocess data from the Excel file
                df = read_data_from_file(filex)
                if df is None:
                    continue

                # Insert data into the database
                upsert_data(df, connection)
                print(f"Data inserted/updated successfully from file: {filex}")
            except Exception as e:
                # Top-level boundary: report the failure, drop any uncommitted
                # work from this file, and move on to the next one.
                print(f"Error occurred while processing {filex}: {e}")
                connection.rollback()
    finally:
        # Close the connection to the database
        connection.close()

# Start the load only when this code is executed directly, not when imported
if __name__ == "__main__":
    main()


Processing file: C:\edi_data\test\A1241078310124.xls
file: C:\edi_data\test\A1241078310124.xls
file_name: A1241078310124.xls
Extracted reporting date string: 310124 A124
Formatted reporting date: 31-Jan-2024
Normalized file path: C:\edi_data\test\A1241078310124.xls
Index(['DESCRIPTIONNO', 'PURPOSE', 'PU_CODE', 'COUNTRY', 'SECTOR', 'CURRENCY',
       'AMOUNT', 'DESCRIPTION', 'REPORTINGDATE', 'STATUS', 'INSTITUTIONCODE'],
      dtype='object')
Upserted 263 records successfully.
Data inserted/updated successfully from file: C:\edi_data\test\A1241078310124.xls
Processing file: C:\edi_data\test\A1251078310124.xls
file: C:\edi_data\test\A1251078310124.xls
file_name: A1251078310124.xls
Extracted reporting date string: 310124 A125
Formatted reporting date: 31-Jan-2024
Normalized file path: C:\edi_data\test\A1251078310124.xls
Index(['DESCRIPTIONNO', 'PURPOSE', 'PU_CODE', 'COUNTRY', 'SECTOR', 'CURRENCY',
       'AMOUNT', 'DESCRIPTION', 'REPORTINGDATE', 'STATUS', 'INSTITUTIONCODE'],
      dtype='

In [2]:
print("running")

running


In [1]:
#fixing duplicate issues, improved version
import os
import pandas as pd
import cx_Oracle
from datetime import datetime
import glob

# Oracle Database connection details.
# Every connection setting can be overridden through an environment variable
# so that credentials do not have to be edited in this file; the fallbacks
# are the previously hard-coded values.
# Settings previously used for a local test database:
#   user "system", password "oracle", host "localhost", port "1521",
#   service name "XE"
# NOTE(review): the fallback password is still stored in plain text here;
# consider dropping it once the environment variables are set everywhere.
DB_USER = os.environ.get("EDI_DB_USER", "EDI")
DB_PASSWORD = os.environ.get("EDI_DB_PASSWORD", "edi")
DB_HOST = os.environ.get("EDI_DB_HOST", "172.16.2.122")
DB_PORT = os.environ.get("EDI_DB_PORT", "1521")
DB_SERVICE_NAME = os.environ.get("EDI_DB_SERVICE_NAME", "EDIPROD")
TABLE_NAME = "ITRS_URT_RECEIPTS"

# Directory path containing the .xls files
FILE_PATH = r"C:\edi_data\test"  # Update with your actual file path

# Names given to spreadsheet columns A:H, in order, when a workbook is read
COLUMNS = ['DESCRIPTIONNO', 'PURPOSE', 'PU_CODE', 'COUNTRY', 'SECTOR', 'CURRENCY', 'AMOUNT', 'DESCRIPTION']

# Function to open the Oracle database connection
def create_db_connection():
    """Return a new cx_Oracle connection configured from the module settings.

    DB_HOST, DB_PORT and DB_SERVICE_NAME are combined into the DSN
    (Data Source Name); DB_USER and DB_PASSWORD authenticate the session.
    The connection is handed back open; closing it is up to the caller.
    """
    data_source_name = cx_Oracle.makedsn(
        DB_HOST, DB_PORT, service_name=DB_SERVICE_NAME
    )
    return cx_Oracle.connect(DB_USER, DB_PASSWORD, data_source_name)

# Function to insert/update data from DataFrame
# def upsert_data(df, connection):
#     cursor = connection.cursor()
    
#     # Insert SQL statement for the ITRS_URT_RECEIPTS table
#     insert_sql = """
#         INSERT INTO ITRS_URT_RECEIPTS (INSTITUTIONCODE, REPORTINGDATE, DESCRIPTIONNO, PURPOSE, PU_CODE, 
#                                        COUNTRY, SECTOR, CURRENCY, AMOUNT, DESCRIPTION, STATUS)
#         VALUES (:INSTITUTIONCODE, TO_DATE(:REPORTINGDATE, 'DD-MON-YYYY'), :DESCRIPTIONNO, :PURPOSE, :PU_CODE, 
#                 :COUNTRY, :SECTOR, :CURRENCY, :AMOUNT, :DESCRIPTION, :STATUS)
#     """
    
#     # Iterating through DataFrame rows and executing insert query
#     for _, row in df.iterrows():
#         cursor.execute(insert_sql, row.to_dict())
    
#     # Commit the transaction
#     connection.commit()
#     cursor.close()
import cx_Oracle
import pandas as pd

def upsert_data(df, connection):
    """Merge the rows of *df* into ITRS_URT_RECEIPTS one statement per row.

    Rows are matched on (INSTITUTIONCODE, REPORTINGDATE, DESCRIPTIONNO):
    a matching table row is updated, otherwise a new row is inserted.
    Executing row by row lets a rejected row be skipped without losing the
    rest of the file.

    Args:
        df: DataFrame holding the columns INSTITUTIONCODE, REPORTINGDATE,
            DESCRIPTIONNO, PURPOSE, PU_CODE, COUNTRY, SECTOR, CURRENCY,
            AMOUNT, DESCRIPTION and STATUS.  The caller's frame is not
            modified; normalisation happens on a copy.
        connection: open database connection; only ``cursor()`` and
            ``commit()`` are used.

    Rows rejected with an integrity error are counted as duplicates and
    skipped; other database errors are printed and counted as failures.
    Neither is raised.  All accepted rows are committed together at the end.
    """
    df = df.fillna('')
    # The statement parses this value with TO_DATE(..., 'DD-MON-YYYY')
    df['REPORTINGDATE'] = pd.to_datetime(df['REPORTINGDATE'], errors='coerce').dt.strftime('%d-%b-%Y')

    # Non-numeric cells become 0 rather than failing the row
    df['DESCRIPTIONNO'] = pd.to_numeric(df['DESCRIPTIONNO'], errors='coerce').fillna(0).astype(int)
    df['PU_CODE'] = pd.to_numeric(df['PU_CODE'], errors='coerce').fillna(0).astype(int)
    df['AMOUNT'] = pd.to_numeric(df['AMOUNT'], errors='coerce').fillna(0).astype(float)

    upsert_sql = """
        MERGE INTO ITRS_URT_RECEIPTS target
        USING (SELECT :1 AS INSTITUTIONCODE, TO_DATE(:2, 'DD-MON-YYYY') AS REPORTINGDATE, :3 AS DESCRIPTIONNO,
                      :4 AS PURPOSE, :5 AS PU_CODE, :6 AS COUNTRY, :7 AS SECTOR, :8 AS CURRENCY,
                      :9 AS AMOUNT, :10 AS DESCRIPTION, :11 AS STATUS FROM DUAL) source
        ON (target.INSTITUTIONCODE = source.INSTITUTIONCODE AND target.REPORTINGDATE = source.REPORTINGDATE AND
            target.DESCRIPTIONNO = source.DESCRIPTIONNO)
        WHEN MATCHED THEN
            UPDATE SET target.PURPOSE = source.PURPOSE, target.PU_CODE = source.PU_CODE,
                       target.COUNTRY = source.COUNTRY, target.SECTOR = source.SECTOR,
                       target.CURRENCY = source.CURRENCY, target.AMOUNT = source.AMOUNT,
                       target.DESCRIPTION = source.DESCRIPTION, target.STATUS = source.STATUS
        WHEN NOT MATCHED THEN
            INSERT (INSTITUTIONCODE, REPORTINGDATE, DESCRIPTIONNO, PURPOSE, PU_CODE,
                    COUNTRY, SECTOR, CURRENCY, AMOUNT, DESCRIPTION, STATUS)
            VALUES (source.INSTITUTIONCODE, source.REPORTINGDATE, source.DESCRIPTIONNO, source.PURPOSE,
                    source.PU_CODE, source.COUNTRY, source.SECTOR, source.CURRENCY,
                    source.AMOUNT, source.DESCRIPTION, source.STATUS)
    """

    # Order must match the positional binds :1 .. :11 in the statement above
    bind_columns = [
        'INSTITUTIONCODE', 'REPORTINGDATE', 'DESCRIPTIONNO', 'PURPOSE', 'PU_CODE',
        'COUNTRY', 'SECTOR', 'CURRENCY', 'AMOUNT', 'DESCRIPTION', 'STATUS',
    ]
    description_no_pos = bind_columns.index('DESCRIPTIONNO')

    # Counters cover the whole file, so they are initialised once, before the loop
    upserted_records = 0
    duplicate_records = 0
    failed_records = 0

    cursor = connection.cursor()
    try:
        for params in df[bind_columns].itertuples(index=False, name=None):
            try:
                cursor.execute(upsert_sql, params)
                upserted_records += 1
            except cx_Oracle.IntegrityError:
                # Must be caught before DatabaseError, which is its base class
                duplicate_records += 1
            except cx_Oracle.DatabaseError as e:
                failed_records += 1
                print(f"Database error on {params[description_no_pos]}: {e}")

        connection.commit()
    finally:
        # Release the cursor even if an unexpected error escapes the loop
        cursor.close()

    print(f"Upserted {upserted_records} records successfully.")
    if duplicate_records:
        print(f"Skipped {duplicate_records} duplicate records.")
    if failed_records:
        print(f"Failed to upsert {failed_records} records.")


# Function to read and process .xls files from directory
def read_data_from_file(file):
    """Load one receipts workbook and tag its rows with file-name metadata.

    The file name must start with a 4-character institution code and carry
    the reporting date as DDMMYY in characters 9-14, e.g.
    ``A1241078310124.xls`` -> institution ``A124``, date ``31-Jan-2024``.

    Args:
        file: path to the .xls workbook; both ``\\`` and ``/`` separators
            are accepted.

    Returns:
        A DataFrame with the COLUMNS read from sheet ``ITRS2_URT_RECEIPTS``
        plus REPORTINGDATE (``DD-Mon-YYYY`` string), STATUS (``'N'``) and
        INSTITUTIONCODE, or ``None`` when the date cannot be parsed from the
        file name or the file does not exist.
    """
    # Split on either separator so the bare name is found on any platform
    file_name = os.path.basename(file.replace("\\", "/"))

    print(f"file: {file}")
    print(f"file_name: {file_name}")

    # Fixed-position fields of the file name
    institution_code = file_name[:4]
    reporting_date_str = file_name[8:14]
    print(f"Extracted reporting date string: {reporting_date_str} {institution_code}")

    try:
        reporting_date = datetime.strptime(reporting_date_str, '%d%m%y').strftime('%d-%b-%Y')  # Format to DD-MON-YYYY
    except ValueError:
        print(f"Invalid reporting date format in file: {file_name}")
        return None
    print(f"Formatted reporting date: {reporting_date}")

    # Read the Excel file into a DataFrame
    normalized_file_name = os.path.normpath(file)
    print(f"Normalized file path: {normalized_file_name}")
    if not os.path.exists(normalized_file_name):
        print(f"File not found: {normalized_file_name}")
        return None

    df = pd.read_excel(normalized_file_name,
                       engine='xlrd',
                       sheet_name='ITRS2_URT_RECEIPTS',
                       usecols="A:H",
                       skiprows=7,
                       names=COLUMNS)

    # Keep only rows where PURPOSE through DESCRIPTION are all present.
    # The explicit copy makes the column assignments below act on an
    # independent frame.
    df = df.dropna(subset=['PURPOSE', 'PU_CODE', 'COUNTRY', 'SECTOR', 'CURRENCY', 'AMOUNT', 'DESCRIPTION']).copy()

    # Metadata taken from the file name, identical for every row
    df['REPORTINGDATE'] = reporting_date
    df['STATUS'] = 'N'
    df['INSTITUTIONCODE'] = institution_code

    df = df.rename(columns=lambda x: x.strip().upper())
    print(df.columns)
    return df


# Main function to orchestrate the process
def main():
    """Load every .xls file found in FILE_PATH into the database.

    One connection is opened for the whole run and always closed at the end.
    Files are handled independently: an error while reading or loading one
    file is reported and the run continues with the next file.
    """
    # Create connection to the Oracle database
    connection = create_db_connection()
    try:
        # Sorted so that the processing order is the same on every run
        for filex in sorted(glob.glob(os.path.join(FILE_PATH, "*.xls"))):
            print(f"Processing file: {filex}")
            try:
                # Read and preprocess data from the Excel file
                df = read_data_from_file(filex)
                if df is None:
                    continue

                # Insert data into the database
                upsert_data(df, connection)
                print(f"Data inserted/updated successfully from file: {filex}")
            except Exception as e:
                # Top-level boundary: report the failure, drop any uncommitted
                # work from this file, and move on to the next one.
                print(f"Error occurred while processing {filex}: {e}")
                connection.rollback()
    finally:
        # Close the connection to the database
        connection.close()

# Start the load only when this code is executed directly, not when imported
if __name__ == "__main__":
    main()


Processing file: C:\edi_data\test\A1241078310124.xls
file: C:\edi_data\test\A1241078310124.xls
file_name: A1241078310124.xls
Extracted reporting date string: 310124 A124
Formatted reporting date: 31-Jan-2024
Normalized file path: C:\edi_data\test\A1241078310124.xls
Index(['DESCRIPTIONNO', 'PURPOSE', 'PU_CODE', 'COUNTRY', 'SECTOR', 'CURRENCY',
       'AMOUNT', 'DESCRIPTION', 'REPORTINGDATE', 'STATUS', 'INSTITUTIONCODE'],
      dtype='object')
Upserted 263 records successfully.
Data inserted/updated successfully from file: C:\edi_data\test\A1241078310124.xls
Processing file: C:\edi_data\test\A1251078310124.xls
file: C:\edi_data\test\A1251078310124.xls
file_name: A1251078310124.xls
Extracted reporting date string: 310124 A125
Formatted reporting date: 31-Jan-2024
Normalized file path: C:\edi_data\test\A1251078310124.xls
Index(['DESCRIPTIONNO', 'PURPOSE', 'PU_CODE', 'COUNTRY', 'SECTOR', 'CURRENCY',
       'AMOUNT', 'DESCRIPTION', 'REPORTINGDATE', 'STATUS', 'INSTITUTIONCODE'],
      dtype='

KeyboardInterrupt: 