In [33]:
import pandas as pd
from faker import Faker
# Faker.seed(999)
import random
import os
import psycopg2
from sqlalchemy import create_engine
from datetime import datetime

In [34]:
# Initialize Faker
fake = Faker('en_GB')

datestamp = datetime.now().strftime("%Y%m%d")

# Define the number of rows you want to generate
num_rows = 2000

# Make dir if it does not exist
dir_name = 'dummyData'
if not os.path.exists(f'./{dir_name}'):
    os.makedirs(f'./{dir_name}')



In [35]:
def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("DB_NAME"),
        user=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
        host=os.getenv("DB_HOST"),
        port=os.getenv("DB_PORT"),
    )
def create_table_if_not_exists(connection, table_name, schema):
    cursor = connection.cursor()
    create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} ({schema});
    """
    cursor.execute(create_table_query)
    connection.commit()
    cursor.close()

def insert_data(connection, df, table_name):
    engine = create_engine(f'postgresql+psycopg2://{os.getenv("DB_USER")}:{os.getenv("DB_PASSWORD")}@{os.getenv("DB_HOST")}:{os.getenv("DB_PORT")}/{os.getenv("DB_NAME")}')
    df.to_sql(table_name, engine, if_exists='append', index=False)
    print(engine)

def load_data_to_db(df, table_name, schema):
    connection = get_db_connection()
    try:
        create_table_if_not_exists(connection, table_name, schema)
        insert_data(connection, df, table_name)
        connection.close()
        print(f"Data loaded into the {table_name} table successfully.")
    except Exception as e:
        print(f"Error loading data into database: {str(e)}")
        connection.close()
        

In [36]:


def check_database_connection():
    try:
        connection = psycopg2.connect(
            dbname=os.getenv("DB_NAME"),
            user=os.getenv("DB_USER"),
            password=os.getenv("DB_PASSWORD"),
            host=os.getenv("DB_HOST"),
            port=os.getenv("DB_PORT"),
        )
        connection.close()
        return True
    except Exception as e:
        print(f"Error connecting to the database: {str(e)}")
        return False

# Call the function to check the database connection
is_connected = check_database_connection()

if is_connected:
    print("Database connection is successful.")
else:
    print("Database connection failed.")

Database connection is successful.


# Engagement Data

Generate fake engagement data


In [37]:
# Define the columns with proper names and formatting
columns = [
    "Engagement ID",
    "Creation Date",
    "Release Date",
    "Last Time Charged Date",
    "Last Expenses Charged Date",
    "Last Active ETC-P Date",
    "Engagement",
    "Client",
    "Engagement Region",
    "Engagement Country",
    "Engagement Type",
    "Currency",
    "Engagement Partner",
    "Engagement Partner GUI",
    "Engagement Manager",
    "Engagement Manager GUI",
    "Engagement Partner Service Line",
    "Engagement Status",
]

def generate_partner_manager() -> tuple[str, str]:
    """Generate a consistent engagement partner and manager."""
    last_name = fake.last_name()
    first_name = fake.first_name()
    gui = fake.unique.pystr_format("#######")
    return f"{last_name}, {first_name}", gui

def generate_dummy_data(num_rows: int) -> list[list]:
    """Generate dummy data for the given number of rows."""
    data = []
    for _ in range(num_rows):
        engagement_id = fake.unique.pystr_format("E-########")
        creation_date = fake.date_between(start_date="-1y", end_date="today")
        release_date = fake.date_between(start_date=creation_date, end_date="today")
        last_time_charged_date = (
            fake.date_between(start_date=release_date, end_date="today")
            if random.choice([True, False])
            else None
        )
        last_expenses_charged_date = (
            fake.date_between(start_date=release_date, end_date="today")
            if random.choice([True, False])
            else None
        )
        last_active_etcp_date = (
            fake.date_between(start_date=release_date, end_date="today")
            if random.choice([True, False])
            else None
        )
        engagement = fake.bs().title()
        client = fake.company()
        engagement_region = "EMEA"
        engagement_country = fake.country()
        engagement_type = "External Project"
        currency = fake.currency_code()
        engagement_partner, engagement_partner_gui = generate_partner_manager()
        engagement_manager, engagement_manager_gui = generate_partner_manager()
        engagement_partner_service_line = random.choice(
            ["CBS & Elim", "Assurance", "Consulting", "Tax", "SaT"]
        )
        engagement_status = random.choice(["Released", "Active", "Pending"])

        data.append(
            [
                engagement_id,
                creation_date,
                release_date,
                last_time_charged_date,
                last_expenses_charged_date,
                last_active_etcp_date,
                engagement,
                client,
                engagement_region,
                engagement_country,
                engagement_type,
                currency,
                engagement_partner,
                engagement_partner_gui,
                engagement_manager,
                engagement_manager_gui,
                engagement_partner_service_line,
                engagement_status,
            ]
        )
    return data

def save_to_excel(data: list[list], columns: list[str], file_name: str, dir_name: str) -> None:
    """Save the data to an Excel file."""
    df = pd.DataFrame(data, columns=columns)
    try:
        df.to_excel(f"./{dir_name}/{file_name}", index=False)
        print(f"Data saved to {dir_name}/{file_name}")
    except Exception as e:
        print(f"An error occurred while saving the file: {e}")

def main(num_rows: int, dir_name: str) -> None:
    """Main function to generate data and save to Excel."""
    file_name = f"{datestamp}-test_engagement_data-{num_rows}.xlsx"
    data = generate_dummy_data(num_rows)
    save_to_excel(data, columns, file_name, dir_name)

    # Display the first few rows of the DataFrame
    df = pd.DataFrame(data, columns=columns)
    print(df.head())

if __name__ == "__main__":
    main(num_rows, dir_name)


Data saved to dummyData/20240701-test_engagement_data-2000.xlsx
  Engagement ID Creation Date Release Date Last Time Charged Date  \
0    E-39878182    2024-05-30   2024-06-23             2024-06-27   
1    E-55084827    2023-10-01   2023-10-31                   None   
2    E-96695189    2024-05-24   2024-06-26                   None   
3    E-86192794    2024-03-27   2024-05-18             2024-05-23   
4    E-94564765    2024-01-18   2024-03-14             2024-04-24   

  Last Expenses Charged Date Last Active ETC-P Date  \
0                       None             2024-06-23   
1                 2024-02-09                   None   
2                 2024-06-26             2024-06-30   
3                       None                   None   
4                 2024-05-16                   None   

                           Engagement                     Client  \
0  Revolutionize Efficient E-Commerce                Jones-Price   
1       Enable Customized Convergence             Coll

# Load Engagement data to the database

In [38]:

df_engagement = pd.read_excel(f"./{dir_name}/{datestamp}-test_engagement_data-{num_rows}.xlsx")

df_engagement.head()

Unnamed: 0,Engagement ID,Creation Date,Release Date,Last Time Charged Date,Last Expenses Charged Date,Last Active ETC-P Date,Engagement,Client,Engagement Region,Engagement Country,Engagement Type,Currency,Engagement Partner,Engagement Partner GUI,Engagement Manager,Engagement Manager GUI,Engagement Partner Service Line,Engagement Status
0,E-39878182,2024-05-30,2024-06-23,2024-06-27,NaT,2024-06-23,Revolutionize Efficient E-Commerce,Jones-Price,EMEA,Moldova,External Project,CVE,"Porter, Vanessa",3334473,"Gray, Howard",4197427,SaT,Active
1,E-55084827,2023-10-01,2023-10-31,NaT,2024-02-09,NaT,Enable Customized Convergence,Collins-Wilson,EMEA,Canada,External Project,IMP,"Rice, George",3977163,"Brown, Ross",1462644,Consulting,Pending
2,E-96695189,2024-05-24,2024-06-26,NaT,2024-06-26,2024-06-30,Incentivize Out-Of-The-Box Vortals,"Davies, Noble and Walters",EMEA,Latvia,External Project,SOS,"Davies, Lisa",4375632,"Schofield, Sharon",6793322,Assurance,Released
3,E-86192794,2024-03-27,2024-05-18,2024-05-23,NaT,NaT,E-Enable Granular Supply-Chains,Riley-Wall,EMEA,Equatorial Guinea,External Project,BTN,"Thompson, Bradley",6474315,"Fox, Danielle",58032,Consulting,Pending
4,E-94564765,2024-01-18,2024-03-14,2024-04-24,2024-05-16,NaT,Unleash Sticky Bandwidth,Smith Group,EMEA,Niger,External Project,MOP,"Lyons, Valerie",9284326,"Hayward, Abigail",8535165,Consulting,Active


In [39]:
# calculate the max and min length of each column
max_length = df_engagement.map(lambda x: len(str(x))).max()
min_length = df_engagement.map(lambda x: len(str(x))).min()
# print the max and min length of each column
print(max_length)
print(min_length)


Engagement ID                      10
Creation Date                      19
Release Date                       19
Last Time Charged Date             19
Last Expenses Charged Date         19
Last Active ETC-P Date             19
Engagement                         47
Client                             33
Engagement Region                   4
Engagement Country                 51
Engagement Type                    16
Currency                            3
Engagement Partner                 22
Engagement Partner GUI              7
Engagement Manager                 21
Engagement Manager GUI              7
Engagement Partner Service Line    10
Engagement Status                   8
dtype: int64
Engagement ID                      10
Creation Date                      19
Release Date                       19
Last Time Charged Date              3
Last Expenses Charged Date          3
Last Active ETC-P Date              3
Engagement                         15
Client                              7

In [40]:
# replace spaces and make lowercase for column names and remove special characters "-" and "/"
df_engagement.columns = df_engagement.columns.str.replace(" ", "_").str.lower().str.replace("-", "").str.replace("/", "")

# create a schema for the table
schema = """
engagement_id VARCHAR(10),
creation_date DATE,
release_date DATE,
last_time_charged_date DATE,
last_expenses_charged_date DATE,
last_active_etcp_date DATE,
engagement VARCHAR(255),
client VARCHAR(255),
engagement_region CHAR(4),
engagement_country VARCHAR(255),
engagement_type VARCHAR(255),
currency CHAR(3),
engagement_partner VARCHAR(255),
engagement_partner_gui VARCHAR(8),
engagement_manager VARCHAR(255),
engagement_manager_gui VARCHAR(8),
engagement_partner_service_line VARCHAR(10),
engagement_status VARCHAR(8)
"""

# load the data into the database
table_name = "engagement_data_raw"
load_data_to_db(df_engagement, table_name, schema)

Engine(postgresql+psycopg2://nicksolly:***@localhost:5433/etc_compliance_db)
Data loaded into the engagement_data_raw table successfully.


# Email List

In [42]:
# get list of unique values in a engagement_partner_gui and engagement_manager_gui in a single list
engagement_partner_gui = df_engagement['engagement_partner_gui'].unique().tolist()
engagement_manager_gui = df_engagement['engagement_manager_gui'].unique().tolist()
gui = engagement_partner_gui + engagement_manager_gui
# remove duplicates
gui = list(set(gui))
# print gui size
print(len(gui))
# add faker safe email address to each gui and move to a df
email = []
for i in gui:
    email.append(fake.safe_email())
df_email = pd.DataFrame(list(zip(gui, email)), columns=['gui', 'email'])
# sort by gui
df_email = df_email.sort_values(by=['gui']).reset_index(drop=True)

size = df_email.shape[0]

# Save the DataFrame to a CSV file
df_email.to_excel(f"./{dir_name}/test_emailList-{size}.xlsx", index=False)

# create schema for cols in the table
email_schema = """
    gui CHAR(7),
    email TEXT
"""

load_data_to_db(df_email, 'email', email_schema)

df_email.head()


4000
Engine(postgresql+psycopg2://nicksolly:***@localhost:5433/etc_compliance_db)
Data loaded into the email table successfully.


Unnamed: 0,gui,email
0,1316,bfrost@example.net
1,4329,susanpatterson@example.org
2,6897,teresa13@example.org
3,16757,louisehaynes@example.net
4,16987,mortongary@example.com


# Exception


In [43]:
# Sample engagement_id from df_engagement
engagement_id = df_engagement['engagement_id'].sample(n=50).tolist()

# Define columns for exception data
columns = [
    'engagement_id', 
    'exception_reason_category', 
    'exception_reason_description', 
    'exception_reason_status', 
    'exception_reason_start_date', 
    'exception_reason_status_user', 
    'exception_reason_status_user_gui', 
    'exception_reason_status_user_email'
]

# Generate exception data
data = [
    [
        eid,
        random.choice(["Tax", "Investment Code", "Invoice", "Other"]),
        fake.sentence(),
        random.choice(["Pending EP Approval", "Pending Finance Approval", "Approved", "Finance Rejected", "EP Rejected", "Expired", "New Time"]),
        fake.date_between(start_date='-4w', end_date='today'),
        f"{fake.last_name()}, {fake.first_name()}",
        fake.unique.pystr_format("#######"),
        fake.safe_email()
    ]
    for eid in engagement_id
]

# Create DataFrame
df_exception = pd.DataFrame(data, columns=columns)

# Define schema for exception data
exception_schema = """
    engagement_id CHAR(10),
    exception_reason_category TEXT,
    exception_reason_description TEXT,
    exception_reason_status TEXT,
    exception_reason_start_date DATE,
    exception_reason_status_user TEXT,
    exception_reason_status_user_gui CHAR(7),
    exception_reason_status_user_email TEXT
"""

size = df_exception.shape[0]

# Save DataFrame to an Excel file
file_name = f"{datestamp}-test_exception_data-{size}.xlsx"
df_exception.to_excel(f"./{dir_name}/{file_name}", index=False)

# Load data to the database (uncomment the line below when the function is available)
load_data_to_db(df_exception, 'exception', exception_schema)

# Display the first 25 rows of the DataFrame
df_exception.head(25)

Engine(postgresql+psycopg2://nicksolly:***@localhost:5433/etc_compliance_db)
Data loaded into the exception table successfully.


Unnamed: 0,engagement_id,exception_reason_category,exception_reason_description,exception_reason_status,exception_reason_start_date,exception_reason_status_user,exception_reason_status_user_gui,exception_reason_status_user_email
0,E-41072085,Investment Code,Dolor enim impedit aspernatur velit dolorum iste.,New Time,2024-06-04,"Wall, James",3447460,bethan80@example.org
1,E-53827349,Other,Quibusdam quas ex facere exercitationem vel.,Approved,2024-06-14,"Lord, Arthur",207154,sjones@example.net
2,E-57469365,Invoice,Libero placeat fugit id.,Expired,2024-06-25,"Godfrey, Patricia",1340243,ryan38@example.com
3,E-71785176,Tax,Inventore inventore voluptatem quibusdam aliqu...,Finance Rejected,2024-06-06,"Parker, Rosemary",7193257,martin07@example.net
4,E-50728953,Tax,Illo omnis temporibus rem quibusdam qui.,Approved,2024-06-13,"Smith, Karl",6540404,sammatthews@example.org
5,E-74876801,Tax,Quisquam eum dolores aliquid minima ullam.,Pending EP Approval,2024-06-27,"Smith, Megan",2496611,mandy90@example.com
6,E-56041625,Tax,Totam laborum sit earum.,Approved,2024-06-14,"Stewart, Pamela",714670,xjones@example.com
7,E-10340498,Investment Code,Magni non odio dolores magni nemo.,Pending EP Approval,2024-06-21,"Barnes, Georgina",5236881,marcgreen@example.net
8,E-09334275,Other,Laborum nemo ut animi delectus quidem deserunt.,Finance Rejected,2024-06-18,"Cook, Ian",7160432,frankcarter@example.org
9,E-93381907,Investment Code,Odit rerum dignissimos eos possimus nesciunt.,Expired,2024-06-09,"Kemp, Norman",8736047,wsmith@example.net


In [44]:
# Sample 25 unique engagement IDs
engagement_id = df_engagement['engagement_id'].sample(n=25, random_state=1).tolist()

# Create a list to hold delegate information
delegate_list = []

# Generate delegate information for each engagement ID
for eid in engagement_id:
    num_delegates = random.randint(1, 3)  # Randomly choose 1, 2, or 3 delegates
    for i in range(1, num_delegates + 1):
        delegate_name = fake.last_name() + ", " + fake.first_name()
        delegate_gui = fake.unique.pystr_format("#######")
        delegate_email = fake.safe_email()
        delegate_list.append({
            'engagement_id': eid,
            'delegate_number': i,
            'delegate_name': delegate_name,
            'delegate_gui': delegate_gui,
            'delegate_email': delegate_email
        })

# Create a DataFrame from the delegate list
df_delegate = pd.DataFrame(delegate_list)

# Sort the DataFrame by Engagement ID and Delegate Number
df_delegate = df_delegate.sort_values(by=['engagement_id', 'delegate_number'])

df_delegate.reset_index(drop=True, inplace=True)

# create schema for cols in the table
delegate_schema = """
    engagement_id CHAR(10),
    delegate_number INTEGER,
    delegate_name TEXT,
    delegate_gui CHAR(7),
    delegate_email TEXT
"""

load_data_to_db(df_delegate, 'delegate', delegate_schema)

size = df_delegate.shape[0]

# Save the DataFrame to an Excel file
df_delegate.to_excel(f"./{dir_name}/{datestamp}test_delegateData-{size}.xlsx", index=False)

df_delegate.head(25)

Engine(postgresql+psycopg2://nicksolly:***@localhost:5433/etc_compliance_db)
Data loaded into the delegate table successfully.


Unnamed: 0,engagement_id,delegate_number,delegate_name,delegate_gui,delegate_email
0,E-10641400,1,"Ahmed, Douglas",7583391,dward@example.org
1,E-10641400,2,"Ali, Trevor",5697949,victorpollard@example.net
2,E-16697108,1,"Johnson, Mohammed",9807149,sarahstone@example.net
3,E-16697108,2,"Kay, Jenna",4807301,fowlerpaula@example.com
4,E-16697108,3,"Pollard, Diana",3467177,gsmith@example.org
5,E-19129239,1,"Elliott, Lauren",1695832,leekatherine@example.org
6,E-19129239,2,"Barnett, Vanessa",8639127,dominic73@example.com
7,E-19129239,3,"Brown, Marc",7608502,kimberleyjohnson@example.net
8,E-26063954,1,"Dale, Frances",9776476,tracey89@example.net
9,E-28201816,1,"Baker, Dylan",6225383,leighlowe@example.com
