In [16]:
import pandas as pd
from faker import Faker
# Faker.seed(999)
import random
import os
import psycopg2
from sqlalchemy import create_engine
from datetime import datetime

In [17]:
# Shared Faker instance (en_GB locale) used by every data-generation cell below.
fake = Faker('en_GB')

# Number of engagement rows to generate.
num_rows = 2000

# Directory (relative to the working directory) that receives the Excel exports.
dir_name = 'dummy_data'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() before calling os.makedirs().
os.makedirs(f'./{dir_name}', exist_ok=True)



In [24]:
def get_db_connection():
    """Open a new psycopg2 connection configured from environment variables.

    The connection parameters are read with ``os.getenv`` from DB_NAME,
    DB_USER, DB_PASSWORD, DB_HOST and DB_PORT; a variable that is not set is
    passed to ``psycopg2.connect`` as ``None``.

    Returns:
        The connection object returned by ``psycopg2.connect``. This function
        does not close it; that is left to the caller.
    """
    # psycopg2 keyword argument -> environment variable that supplies it.
    env_by_kwarg = {
        "dbname": "DB_NAME",
        "user": "DB_USER",
        "password": "DB_PASSWORD",
        "host": "DB_HOST",
        "port": "DB_PORT",
    }
    connect_kwargs = {
        kwarg: os.getenv(env_name) for kwarg, env_name in env_by_kwarg.items()
    }
    return psycopg2.connect(**connect_kwargs)
def create_table_if_not_exists(connection, table_name, schema):
    """Create ``table_name`` with the given column definitions if it is missing.

    Args:
        connection: Open DB-API style connection exposing ``cursor()`` and
            ``commit()`` (a psycopg2 connection in this notebook).
        table_name: Name of the table. It is interpolated verbatim into the
            SQL text, so it must come from trusted notebook code only.
        schema: Column definitions placed between the parentheses of the
            ``CREATE TABLE`` statement; also interpolated verbatim.

    Raises:
        Any exception raised by ``cursor.execute`` or ``connection.commit``.
        The cursor is closed before the exception propagates.
    """
    create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} ({schema});
    """
    cursor = connection.cursor()
    try:
        cursor.execute(create_table_query)
        connection.commit()
    finally:
        # Always release the cursor, even when the statement or commit fails.
        cursor.close()

def insert_data(connection, df, table_name):
    """Append the rows of ``df`` to ``table_name`` through a SQLAlchemy engine.

    Args:
        connection: Not used by this function; kept so existing callers that
            pass their psycopg2 connection keep working.
        df: pandas DataFrame whose rows are appended (the index is not written).
        table_name: Target table name passed to ``DataFrame.to_sql``.

    Raises:
        ValueError: If DB_PORT is set but is not an integer.
        Any exception raised by SQLAlchemy / pandas while writing.
    """
    # Local import so this block is self-contained; sqlalchemy is already a
    # dependency of the notebook.
    from sqlalchemy.engine import URL

    db_port = os.getenv("DB_PORT")
    # URL.create escapes each component, so a password containing characters
    # such as '@', ':' or '/' no longer corrupts the connection URL the way
    # plain f-string interpolation does.
    url = URL.create(
        "postgresql+psycopg2",
        username=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
        host=os.getenv("DB_HOST"),
        port=int(db_port) if db_port else None,
        database=os.getenv("DB_NAME"),
    )
    engine = create_engine(url)
    try:
        df.to_sql(table_name, engine, if_exists='append', index=False)
        print(engine)
    finally:
        # A fresh engine is built on every call; dispose it so its connection
        # pool does not stay open after the insert.
        engine.dispose()

def load_data_to_db(df, table_name, schema):
    """Ensure ``table_name`` exists, then append ``df`` to it.

    Args:
        df: pandas DataFrame to load.
        table_name: Target table name.
        schema: Column definitions used if the table has to be created.

    Failures while creating the table or inserting rows are reported with
    ``print`` and are not re-raised, so the calling cell keeps running.
    Errors raised while opening the connection are not caught here.
    """
    connection = get_db_connection()
    try:
        create_table_if_not_exists(connection, table_name, schema)
        insert_data(connection, df, table_name)
        print(f"Data loaded into the {table_name} table successfully.")
    except Exception as e:
        print(f"Error loading data into database: {str(e)}")
    finally:
        # Single close on every path, instead of one close per branch.
        connection.close()
        

In [28]:
import psycopg2

def check_database_connection():
    """Report whether a database connection can be opened and closed.

    Uses ``get_db_connection`` so the connection parameters are defined in
    exactly one place instead of being repeated here.

    Returns:
        True if the connection was opened and closed without an exception;
        False otherwise (the error is printed, not raised).
    """
    try:
        connection = get_db_connection()
        connection.close()
        return True
    except Exception as e:
        print(f"Error connecting to the database: {str(e)}")
        return False

# Probe the database once and report the outcome.
is_connected = check_database_connection()

status_message = (
    "Database connection is successful."
    if is_connected
    else "Database connection failed."
)
print(status_message)

Database connection is successful.


# Engagement Data

Generate fake engagement data, export it to Excel, and load it into the database.


In [29]:
# Column names of the engagement table, in the order each generated row uses.
columns = [
    # identifier and lifecycle dates
    'engagement_id', 'creation_date', 'release_date',
    'last_time_charged_date', 'last_expenses_charged_date', 'last_active_etcp_date',
    # descriptive attributes
    'engagement', 'client',
    'engagement_region', 'engagement_country', 'engagement_type', 'currency',
    # people
    'engagement_partner', 'engagement_partner_gui',
    'engagement_manager', 'engagement_manager_gui',
    # classification
    'engagement_partner_service_line', 'engagement_status',
]

# Accumulates the generated rows, one list per engagement.
data = list()

# Builds one fake person record (display name plus GUI) for a partner or manager
def generate_partner_manager():
    """Return a ``(name, gui)`` tuple for a fake person.

    ``name`` is formatted as "Last, First"; ``gui`` comes from
    ``fake.unique.pystr_format("#######")``, so Faker's uniqueness tracking
    applies to it.
    """
    surname = fake.last_name()
    given_name = fake.first_name()
    person_gui = fake.unique.pystr_format("#######")
    display_name = ", ".join((surname, given_name))
    return display_name, person_gui

# Generate dummy data: one row per engagement, values in the same order as `columns`
def _date_since_or_none(start):
    """Return a date between ``start`` and today, or None, on a coin flip."""
    if random.choice([True, False]):
        return fake.date_between(start_date=start, end_date='today')
    return None


for _ in range(num_rows):
    engagement_id = fake.unique.pystr_format("E-########")
    # Created within the last year; released on or after creation.
    creation_date = fake.date_between(start_date='-1y', end_date='today')
    release_date = fake.date_between(start_date=creation_date, end_date='today')

    row = [
        engagement_id,
        creation_date,
        release_date,
        _date_since_or_none(release_date),  # last_time_charged_date
        _date_since_or_none(release_date),  # last_expenses_charged_date
        _date_since_or_none(release_date),  # last_active_etcp_date
        fake.bs().title(),                  # engagement
        fake.company(),                     # client
        "EMEA",                             # engagement_region
        fake.country(),                     # engagement_country
        "External Project",                 # engagement_type
        fake.currency_code(),               # currency
    ]
    row.extend(generate_partner_manager())  # engagement_partner, engagement_partner_gui
    row.extend(generate_partner_manager())  # engagement_manager, engagement_manager_gui
    row.append(random.choice(["CBS & Elim", "Assurance", "Consulting", "Tax", "SaT"]))  # service line
    row.append(random.choice(["Released", "Active", "Pending"]))  # engagement_status

    data.append(row)

# Build the engagement DataFrame from the generated rows; `columns` supplies
# the column names in the same order the row values were appended.
df_engagement = pd.DataFrame(data, columns=columns)

# Column definitions handed to load_data_to_db, which uses them in a
# CREATE TABLE IF NOT EXISTS statement. One entry per DataFrame column.
# engagement_id is CHAR(10) to fit the "E-########" format generated above;
# the *_gui columns are CHAR(7) to fit the "#######" format.
engagement_schema = """
    engagement_id CHAR(10),
    creation_date DATE,
    release_date DATE,
    last_time_charged_date DATE,
    last_expenses_charged_date DATE,
    last_active_etcp_date DATE,
    engagement TEXT,
    client TEXT,
    engagement_region VARCHAR(10),
    engagement_country TEXT,
    engagement_type TEXT,
    currency CHAR(3),
    engagement_partner TEXT,
    engagement_partner_gui CHAR(7),
    engagement_manager TEXT,
    engagement_manager_gui CHAR(7),
    engagement_partner_service_line TEXT,
    engagement_status VARCHAR(20)
"""

# Save the DataFrame to an Excel file inside the output directory
# (index=False keeps the pandas row index out of the sheet).
file_name = "test_engagement_data.xlsx"
df_engagement.to_excel(f"./{dir_name}/{file_name}", index=False)

# Load data to the database. Rows are appended to 'engagement_data_raw', so
# re-running this cell adds the newly generated rows on top of existing ones.
# NOTE: load_data_to_db prints load errors instead of raising them.
load_data_to_db(df_engagement, 'engagement_data_raw', engagement_schema)

# Display the first few rows of the DataFrame (last expression -> cell output)
df_engagement.head()

Engine(postgresql+psycopg2://nicksolly:***@localhost:5433/etc_compliance_db)
Data loaded into the engagement_data_test table successfully.


Unnamed: 0,engagement_id,creation_date,release_date,last_time_charged_date,last_expenses_charged_date,last_active_etcp_date,engagement,client,engagement_region,engagement_country,engagement_type,currency,engagement_partner,engagement_partner_gui,engagement_manager,engagement_manager_gui,engagement_partner_service_line,engagement_status
0,E-24066247,2023-09-28,2024-05-05,,2024-05-30,2024-06-01,Whiteboard Value-Added E-Tailers,Harris Inc,EMEA,Turkmenistan,External Project,RWF,"Martin, Sara",5486139,"Bennett, Max",4620752,CBS & Elim,Pending
1,E-02508091,2024-06-02,2024-06-03,2024-06-05,,2024-06-11,Revolutionize Synergistic Applications,Alexander Ltd,EMEA,Dominican Republic,External Project,NZD,"Holden, Jessica",9786027,"Robertson, Hazel",5395068,Assurance,Active
2,E-89293404,2023-12-29,2024-02-24,2024-04-18,,,Seize Vertical Architectures,Marsh-Reeves,EMEA,British Virgin Islands,External Project,PEN,"Taylor, Stephen",3102621,"Palmer, George",2125442,Assurance,Released
3,E-02301942,2024-03-20,2024-06-13,,,2024-06-13,Monetize Bleeding-Edge Infrastructures,Gill-Harrison,EMEA,Libyan Arab Jamahiriya,External Project,RUB,"Butler, Hollie",5191279,"Scott, Damian",3073553,Tax,Released
4,E-88892803,2024-03-13,2024-04-09,,,2024-06-04,Brand Robust Communities,Goddard Group,EMEA,United States Minor Outlying Islands,External Project,BDT,"Lord, Molly",6549490,"Moore, Clare",4336548,Assurance,Active


In [33]:
# Collect every distinct GUI that appears as an engagement partner or manager.
engagement_partner_gui = df_engagement['engagement_partner_gui'].unique().tolist()
engagement_manager_gui = df_engagement['engagement_manager_gui'].unique().tolist()
# De-duplicate across both columns and sort up front. Iteration order of a set
# of strings is not stable between interpreter runs, so sorting before the
# emails are generated keeps the GUI -> email pairing independent of it.
gui = sorted(set(engagement_partner_gui + engagement_manager_gui))
# Number of distinct GUIs
print(len(gui))

# One Faker "safe" email address per GUI, in GUI order.
email = [fake.safe_email() for _ in gui]
# `gui` is already sorted, so the frame is ordered by gui with a 0..n-1 index.
df_email = pd.DataFrame({'gui': gui, 'email': email})

# Save the DataFrame to an Excel file
df_email.to_excel(f"./{dir_name}/test_emailList.xlsx", index=False)

# Column definitions used if the 'email' table has to be created
email_schema = """
    gui CHAR(7),
    email TEXT
"""

# Append the GUI/email pairs to the 'email' table.
load_data_to_db(df_email, 'email', email_schema)

df_email.head()


4000
Engine(postgresql+psycopg2://nicksolly:***@localhost:5433/etc_compliance_db)
Data loaded into the email table successfully.


Unnamed: 0,gui,email
0,2252,megan43@example.org
1,6089,mauricereed@example.org
2,7581,donald88@example.org
3,7678,flyons@example.com
4,8356,katesmith@example.net


# Exception


In [36]:
# Sample up to 50 engagement IDs from df_engagement. The sample size is capped
# at the number of available rows so a small `num_rows` does not make
# DataFrame.sample raise.
engagement_id = df_engagement['engagement_id'].sample(
    n=min(50, len(df_engagement))
).tolist()

# Define columns for exception data
columns = [
    'engagement_id',
    'exception_reason_category',
    'exception_reason_description',
    'exception_reason_status',
    'exception_reason_start_date',
    'exception_reason_status_user',
    'exception_reason_status_user_gui',
    'exception_reason_status_user_email'
]

# Generate exception data: one exception row per sampled engagement, with the
# values in the same order as `columns`.
data = [
    [
        eid,
        random.choice(["Tax", "Investment Code", "Invoice", "Other"]),
        fake.sentence(),
        random.choice(["Pending EP Approval", "Pending Finance Approval", "Approved", "Finance Rejected", "EP Rejected", "Expired", "New Time"]),
        fake.date_between(start_date='-4w', end_date='today'),
        f"{fake.last_name()}, {fake.first_name()}",
        fake.unique.pystr_format("#######"),
        fake.safe_email()
    ]
    for eid in engagement_id
]

# Create DataFrame
df_exception = pd.DataFrame(data, columns=columns)

# Column definitions used if the 'exception' table has to be created
exception_schema = """
    engagement_id CHAR(10),
    exception_reason_category TEXT,
    exception_reason_description TEXT,
    exception_reason_status TEXT,
    exception_reason_start_date DATE,
    exception_reason_status_user TEXT,
    exception_reason_status_user_gui CHAR(7),
    exception_reason_status_user_email TEXT
"""

# Save DataFrame to an Excel file
file_name = "test_exception_data.xlsx"
df_exception.to_excel(f"./{dir_name}/{file_name}", index=False)

# Append the exception rows to the 'exception' table
load_data_to_db(df_exception, 'exception', exception_schema)

# Display the first 25 rows of the DataFrame
df_exception.head(25)

Engine(postgresql+psycopg2://nicksolly:***@localhost:5433/etc_compliance_db)
Data loaded into the exception table successfully.


Unnamed: 0,engagement_id,exception_reason_category,exception_reason_description,exception_reason_status,exception_reason_start_date,exception_reason_status_user,exception_reason_status_user_gui,exception_reason_status_user_email
0,E-04309870,Investment Code,Accusantium at magni vel consequatur facere no...,Finance Rejected,2024-05-23,"Smith, Marian",3124565,walkeranthony@example.net
1,E-55261918,Investment Code,Quidem amet perferendis aliquid.,Approved,2024-06-09,"Lewis, Victoria",3694863,fletchergareth@example.org
2,E-09875494,Invoice,Doloribus voluptates distinctio.,EP Rejected,2024-06-09,"Hudson, Max",7167606,vholmes@example.org
3,E-00679506,Other,Occaecati iusto animi in in quos laborum.,Finance Rejected,2024-05-26,"Campbell, Karen",783287,njones@example.com
4,E-98439983,Investment Code,Debitis quod beatae praesentium.,Finance Rejected,2024-06-03,"Lamb, Gail",1536715,bradshawdiane@example.com
5,E-24467247,Invoice,Quibusdam similique maiores aut enim reprehend...,Approved,2024-05-20,"Begum, Eleanor",8180663,sroberts@example.net
6,E-57749103,Investment Code,Unde assumenda ex dolorem nihil.,Approved,2024-05-23,"Taylor, Gemma",1913964,sianandrews@example.com
7,E-85261787,Other,Impedit corporis tenetur quod eos ea dolorem.,New Time,2024-05-19,"Yates, Garry",2017552,rrobinson@example.net
8,E-61253283,Invoice,Corporis maiores maiores distinctio veniam.,Finance Rejected,2024-05-26,"Adams, Victoria",5281673,maryyoung@example.net
9,E-29148919,Investment Code,Eum quis consequatur ad possimus.,Approved,2024-05-23,"Todd, Katy",4249545,tobygray@example.com


In [38]:
# Sample up to 25 engagement IDs (fixed random_state). The sample size is
# capped at the number of available rows so a small `num_rows` does not make
# DataFrame.sample raise.
engagement_id = df_engagement['engagement_id'].sample(
    n=min(25, len(df_engagement)), random_state=1
).tolist()

# Column order of the delegate table; passed explicitly so the frame has these
# columns even when no delegate rows were generated.
delegate_columns = [
    'engagement_id',
    'delegate_number',
    'delegate_name',
    'delegate_gui',
    'delegate_email',
]

# Create a list to hold delegate information
delegate_list = []

# Generate delegate information for each engagement ID
for eid in engagement_id:
    num_delegates = random.randint(1, 3)  # Randomly choose 1, 2, or 3 delegates
    for i in range(1, num_delegates + 1):
        delegate_name = fake.last_name() + ", " + fake.first_name()
        delegate_gui = fake.unique.pystr_format("#######")
        delegate_email = fake.safe_email()
        delegate_list.append({
            'engagement_id': eid,
            'delegate_number': i,
            'delegate_name': delegate_name,
            'delegate_gui': delegate_gui,
            'delegate_email': delegate_email
        })

# Create the DataFrame, sort it by engagement ID and delegate number, and
# renumber the index (chained instead of mutating in place).
df_delegate = (
    pd.DataFrame(delegate_list, columns=delegate_columns)
    .sort_values(by=['engagement_id', 'delegate_number'])
    .reset_index(drop=True)
)

# Column definitions used if the 'delegate' table has to be created
delegate_schema = """
    engagement_id CHAR(10),
    delegate_number INTEGER,
    delegate_name TEXT,
    delegate_gui CHAR(7),
    delegate_email TEXT
"""

# Append the delegate rows to the 'delegate' table
load_data_to_db(df_delegate, 'delegate', delegate_schema)

# Save the DataFrame to an Excel file
df_delegate.to_excel(f"./{dir_name}/test_delegateData.xlsx", index=False)

df_delegate.head(25)

Engine(postgresql+psycopg2://nicksolly:***@localhost:5433/etc_compliance_db)
Data loaded into the delegate table successfully.


Unnamed: 0,engagement_id,delegate_number,delegate_name,delegate_gui,delegate_email
0,E-01878396,1,"Brown, Paula",6272723,nmoss@example.net
1,E-01878396,2,"Davis, Bethany",6220031,maureengreen@example.net
2,E-06231304,1,"Ahmed, Joshua",9230206,jfarrell@example.com
3,E-07508517,1,"Jennings, Connor",7148402,nicolagill@example.com
4,E-12686767,1,"Warren, Jasmine",6762465,glenakhtar@example.org
5,E-12686767,2,"Sanders, Jemma",7501656,simon14@example.org
6,E-15108910,1,"Goddard, Harry",6647990,khammond@example.com
7,E-15108910,2,"Johnson, Frank",8452915,woodmark@example.net
8,E-17327959,1,"Walker, Gerald",7213016,chandleralex@example.org
9,E-17327959,2,"Hartley, Brandon",482651,leigh83@example.org
