In [31]:
import psycopg2
import pandas as pd
import logging
from dotenv import load_dotenv
import os

In [32]:
# Set up logging: console output via basicConfig plus a persistent log file.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger()

# Re-running this cell must not stack a second FileHandler on the root logger,
# otherwise every message is written to log.log once per run of the cell.
log_file_path = os.path.abspath("log.log")
fh = next(
    (
        handler
        for handler in logger.handlers
        if isinstance(handler, logging.FileHandler)
        and handler.baseFilename == log_file_path
    ),
    None,
)
if fh is None:
    # create a log file
    fh = logging.FileHandler("log.log")
    logger.addHandler(fh)
fh.setLevel(logging.INFO)
# Give file records the same timestamp/level prefix as the console output;
# a handler without a formatter writes the bare message only.
fh.setFormatter(logging.Formatter(LOG_FORMAT))

In [33]:
# Populate the process environment from the local .env file
load_dotenv()

# Database connection parameters, one environment variable each
db_env_keys = ("DB_NAME", "DB_USER", "DB_PASSWORD", "DB_HOST", "DB_PORT")
dbname, user, password, host, port = map(os.getenv, db_env_keys)

In [34]:
# Report the directory the kernel is running in, both to the log and to stdout
current_working_directory = os.getcwd()
cwd_message = f"Current Working Directory: {current_working_directory}"
logger.info(cwd_message)
print(cwd_message)

2024-06-08 22:37:17,461 - INFO - Current Working Directory: /Users/nicksolly/Dev/outreachOptimisation


Current Working Directory: /Users/nicksolly/Dev/outreachOptimisation


In [35]:
# Excel workbook to ingest, given relative to the working directory
file_path = "./inputData/PreviousWeeksEngagementLists/20240510 Engagement List.xlsx"
file_path_message = f"File Path: {file_path}"
logger.info(file_path_message)

2024-06-08 22:37:17,467 - INFO - File Path: ./inputData/PreviousWeeksEngagementLists/20240510 Engagement List.xlsx


In [36]:
# Open the PostgreSQL connection from the parameters read out of the environment
connection_params = {
    "dbname": dbname,
    "user": user,
    "password": password,
    "host": host,
    "port": port,
}
conn = psycopg2.connect(**connection_params)

In [37]:
# Read the workbook at file_path into a DataFrame
df_raw = pd.read_excel(file_path)
# Log the (rows, columns) shape of what was loaded
raw_shape = df_raw.shape
logger.info(f"Data loaded with shape: {raw_shape}")
# logger.info(f'Columns: {df_raw.columns}')

2024-06-08 22:37:38,181 - INFO - Data loaded with shape: (19327, 312)


In [38]:
# Keep only rows whose partner service line is "Consulting" and whose status is "Released"
is_consulting = df_raw["Engagement Partner Service Line"] == "Consulting"
is_released = df_raw["Engagement Status"] == "Released"
df_filtered = df_raw[is_consulting & is_released]
# Log the (rows, columns) shape remaining after the filter
filtered_shape = df_filtered.shape
logger.info(f"data filtered with shape: {filtered_shape}")

2024-06-08 22:37:38,189 - INFO - Data filtered with shape: (1094, 312)


In [39]:
# reduce columns to only the ones needed
keep_cols = [
    "Engagement ID",
    "Creation Date",
    "Release Date",
    "Last Time Charged Date",
    "Last Expenses Charged Date",
    "Last Active ETC-P Date",
    "Engagement",
    "Client",
    "Engagement Partner",
    "Engagement Partner GUI",
    "Engagement Manager",
    "Engagement Manager GUI",
]

# Take an explicit copy: a bare column subset is tracked by pandas as derived
# from another frame, and the column assignments made on df_filtered afterwards
# can then raise SettingWithCopyWarning.
df_filtered = df_filtered[keep_cols].copy()
# add to logger with information of data reduced (e.g. shape, columns, etc.)
logger.info(f"Data reduced with shape: {df_filtered.shape}")
# Last expression of the cell, so the preview is rendered as the cell output
df_filtered.head()

2024-06-08 22:37:38,193 - INFO - Data reduced with shape: (1094, 12)


Unnamed: 0,Engagement ID,Creation Date,Release Date,Last Time Charged Date,Last Expenses Charged Date,Last Active ETC-P Date,Engagement,Client,Engagement Partner,Engagement Partner GUI,Engagement Manager,Engagement Manager GUI
37,E-66318700,2021-11-01,2021-11-01,2021-11-19,NaT,2022-08-31,Project Longbow - Pensions,Cinven Capital Management (VII) General Partne...,"Mignault, Matthew",5044822,"Khan, Haroon",2335483
97,E-68148181,2024-03-14,2024-03-26,2024-04-02,NaT,2024-04-19,23 Dec ICE Europe Partners LP UK ITRA,ICE EUROPE PARTNERS LP,"Holt, Steve",5017155,"Baimakhanov, Yerbol",3403954
139,E-65556437,2020-02-20,2020-02-20,2023-11-24,2024-05-10,NaT,ECL+ investment code,Ernst & Young LLP,"Cade, Martin",2032814,"Cade, Martin",2032814
189,E-66704288,2022-06-10,2023-09-21,2024-04-05,NaT,2023-11-16,23 Sep Integrafin Holdings plc Actrl,INTEGRAFIN HOLDINGS PLC,"King, David",5014087,"Solomons, Ben",3245294
228,E-67334780,2023-03-27,2023-03-30,2023-06-02,2023-06-05,2023-06-26,Backfill and IFRS17 - Q123,Lloyds Banking Group PLC,"Edey, Brian",5008847,"Saundh, Raj",5052687


In [None]:
# Columns holding dates; each is coerced to datetime before the date arithmetic below
date_cols = [
    "Last Time Charged Date",
    "Creation Date",
    "Release Date",
    "Last Expenses Charged Date",
    "Last Active ETC-P Date",
]

# Convert to datetime (leaves already-converted columns unchanged in value)
for date_col in date_cols:
    df_filtered[date_col] = pd.to_datetime(df_filtered[date_col])


In [40]:
# Temporary calculated columns.

# "Last ETC Date": the active ETC-P date, falling back to the release date where it is missing
release_dates = df_filtered["Release Date"]
last_etc_dates = df_filtered["Last Active ETC-P Date"].fillna(release_dates)
df_filtered.loc[:, "Last ETC Date"] = last_etc_dates

# "Data Date": the latest "Last Time Charged Date" in the frame, applied to every row
latest_charge_date = df_filtered["Last Time Charged Date"].max()
df_filtered.loc[:, "Data Date"] = latest_charge_date

# Coerce the derived ETC date to datetime
df_filtered["Last ETC Date"] = pd.to_datetime(df_filtered["Last ETC Date"])

# Log the dtype of every column
logger.info(f"Data Types: {df_filtered.dtypes}")

2024-06-08 22:37:38,231 - INFO - Data Types: Engagement ID                         object
Creation Date                 datetime64[ns]
Release Date                  datetime64[ns]
Last Time Charged Date        datetime64[ns]
Last Expenses Charged Date    datetime64[ns]
Last Active ETC-P Date        datetime64[ns]
Engagement                            object
Client                                object
Engagement Partner                    object
Engagement Partner GUI                 int64
Engagement Manager                    object
Engagement Manager GUI                 int64
Last ETC Date                 datetime64[ns]
Data Date                     datetime64[ns]
dtype: object


In [41]:

# Calculate the age of ETC: whole days between "Data Date" and "Last ETC Date"
df_filtered.loc[:, "ETC Age"] = (
    df_filtered["Data Date"] - df_filtered["Last ETC Date"]
).dt.days

# "Data Date" is already populated before this point (it is an input to the
# subtraction above), so it is not recomputed here.

# reset index; reassigning instead of inplace=True keeps the step a plain
# transformation that yields a fresh frame
df_filtered = df_filtered.reset_index(drop=True)

df_filtered.head()

Unnamed: 0,Engagement ID,Creation Date,Release Date,Last Time Charged Date,Last Expenses Charged Date,Last Active ETC-P Date,Engagement,Client,Engagement Partner,Engagement Partner GUI,Engagement Manager,Engagement Manager GUI,Last ETC Date,Data Date,ETC Age
0,E-66318700,2021-11-01,2021-11-01,2021-11-19,NaT,2022-08-31,Project Longbow - Pensions,Cinven Capital Management (VII) General Partne...,"Mignault, Matthew",5044822,"Khan, Haroon",2335483,2022-08-31,2024-05-10,618
1,E-68148181,2024-03-14,2024-03-26,2024-04-02,NaT,2024-04-19,23 Dec ICE Europe Partners LP UK ITRA,ICE EUROPE PARTNERS LP,"Holt, Steve",5017155,"Baimakhanov, Yerbol",3403954,2024-04-19,2024-05-10,21
2,E-65556437,2020-02-20,2020-02-20,2023-11-24,2024-05-10,NaT,ECL+ investment code,Ernst & Young LLP,"Cade, Martin",2032814,"Cade, Martin",2032814,2020-02-20,2024-05-10,1541
3,E-66704288,2022-06-10,2023-09-21,2024-04-05,NaT,2023-11-16,23 Sep Integrafin Holdings plc Actrl,INTEGRAFIN HOLDINGS PLC,"King, David",5014087,"Solomons, Ben",3245294,2023-11-16,2024-05-10,176
4,E-67334780,2023-03-27,2023-03-30,2023-06-02,2023-06-05,2023-06-26,Backfill and IFRS17 - Q123,Lloyds Banking Group PLC,"Edey, Brian",5008847,"Saundh, Raj",5052687,2023-06-26,2024-05-10,319


In [42]:
# Log whether any "Last ETC Date" value is still null
has_null_last_etc = df_filtered['Last ETC Date'].isnull().values.any()
logger.info(f"Any null values in 'Last ETC Date' column: {has_null_last_etc}")

2024-06-08 22:37:38,243 - INFO - Any null values in 'Last ETC Date' column: False


In [43]:
# Cursor used for the statements issued against the database from here on
cur = conn.cursor()

# DDL for the target table; IF NOT EXISTS leaves an already-existing table untouched
create_table_query = """
CREATE TABLE IF NOT EXISTS engagement_list (
    engagement_id VARCHAR(255),
    creation_date DATE,
    release_date DATE,
    last_time_charged_date DATE,
    last_expenses_charged_date DATE,
    last_active_etc_p_date DATE,
    engagement VARCHAR(255),
    client VARCHAR(255),
    engagement_partner VARCHAR(255),
    engagement_partner_gui VARCHAR(255),
    engagement_manager VARCHAR(255),
    engagement_manager_gui VARCHAR(255),
    last_etc_date DATE,
    data_date DATE,
    etc_age INT
);
"""

# Run the DDL, then commit so the table creation is persisted
cur.execute(create_table_query)
conn.commit()

In [44]:
def handle_nat(value):
    """Map a missing scalar to None and pass anything else through unchanged.

    Args:
        value: A scalar cell value, e.g. a timestamp taken from a DataFrame row.

    Returns:
        None when ``pd.isna(value)`` is true (NaT, NaN, None); otherwise
        ``value`` itself.
    """
    return None if pd.isna(value) else value

# Insert filtered data into the table with logging.
#
# All rows share one transaction that is committed once after the loop. Each row
# therefore runs inside its own savepoint: a plain conn.rollback() on a failing
# row would also discard every earlier row that was already logged as inserted.
total_rows = len(df_filtered)
inserted_count = skipped_count = failed_count = 0

# enumerate gives a 1-based row number that does not depend on the index labels
for row_number, (_, row) in enumerate(df_filtered.iterrows(), start=1):
    engagement_id = row["Engagement ID"]
    # Same missing-value handling as the other date columns
    data_date = handle_nat(row["Data Date"])

    cur.execute("SAVEPOINT engagement_row")
    try:
        # Check if the row already exists
        cur.execute(
            """
            SELECT 1 FROM engagement_list
            WHERE engagement_id = %s AND data_date = %s
            """,
            (engagement_id, data_date),
        )
        if cur.fetchone():
            logger.info(
                f"Skipping row {row_number}/{total_rows}: {engagement_id} (already exists)"
            )
            skipped_count += 1
        else:
            # Insert the row if it does not exist
            cur.execute(
                """
                INSERT INTO engagement_list (
                    engagement_id, creation_date, release_date, last_time_charged_date, last_expenses_charged_date,
                    last_active_etc_p_date, engagement, client, engagement_partner, engagement_partner_gui,
                    engagement_manager, engagement_manager_gui, last_etc_date, data_date, etc_age
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """,
                (
                    engagement_id,
                    handle_nat(row["Creation Date"]),
                    handle_nat(row["Release Date"]),
                    handle_nat(row["Last Time Charged Date"]),
                    handle_nat(row["Last Expenses Charged Date"]),
                    handle_nat(row["Last Active ETC-P Date"]),
                    row["Engagement"],
                    row["Client"],
                    row["Engagement Partner"],
                    row["Engagement Partner GUI"],
                    row["Engagement Manager"],
                    row["Engagement Manager GUI"],
                    handle_nat(row["Last ETC Date"]),
                    data_date,
                    row["ETC Age"],
                ),
            )
            logger.info(f"Inserted row {row_number}/{total_rows}: {engagement_id}")
            inserted_count += 1
        cur.execute("RELEASE SAVEPOINT engagement_row")
    except psycopg2.Error as e:
        logger.error(f"Error inserting row {row_number}: {e}")
        # Undo only this row's work; earlier rows stay pending in the transaction
        cur.execute("ROLLBACK TO SAVEPOINT engagement_row")
        cur.execute("RELEASE SAVEPOINT engagement_row")
        failed_count += 1

# Commit changes; the connection is left open
conn.commit()

logger.info("Data insertion completed.")
logger.info(
    f"Rows inserted: {inserted_count}, skipped: {skipped_count}, failed: {failed_count}"
)

2024-06-08 22:37:38,291 - INFO - Inserted row 1/1094: E-66318700
2024-06-08 22:37:38,293 - INFO - Inserted row 2/1094: E-68148181
2024-06-08 22:37:38,294 - INFO - Inserted row 3/1094: E-65556437
2024-06-08 22:37:38,295 - INFO - Inserted row 4/1094: E-66704288
2024-06-08 22:37:38,297 - INFO - Inserted row 5/1094: E-67334780
2024-06-08 22:37:38,298 - INFO - Inserted row 6/1094: E-66186999
2024-06-08 22:37:38,300 - INFO - Inserted row 7/1094: E-67946477
2024-06-08 22:37:38,301 - INFO - Inserted row 8/1094: E-67416917
2024-06-08 22:37:38,303 - INFO - Inserted row 9/1094: E-67638141
2024-06-08 22:37:38,304 - INFO - Inserted row 10/1094: E-68168032
2024-06-08 22:37:38,305 - INFO - Inserted row 11/1094: E-67745897
2024-06-08 22:37:38,307 - INFO - Inserted row 12/1094: E-67684350
2024-06-08 22:37:38,308 - INFO - Inserted row 13/1094: E-67873655
2024-06-08 22:37:38,309 - INFO - Inserted row 14/1094: E-67482231
2024-06-08 22:37:38,310 - INFO - Inserted row 15/1094: E-68243861
2024-06-08 22:37:38

In [45]:
# Count the rows now present in engagement_list as a check on the load
cur = conn.cursor()
cur.execute("SELECT COUNT(*) FROM engagement_list")
rows = cur.fetchone()
table_row_count = rows[0]
logger.info(f"Number of rows in the table: {table_row_count}")

2024-06-08 22:37:40,126 - INFO - Number of rows in the table: 4269
