In [1]:
import pandas as pd
from faker import Faker
# Faker.seed(999)
import random
import os
import psycopg2
from sqlalchemy import create_engine
from datetime import datetime

In [2]:
# Initialize Faker with the en_GB locale.
# NOTE: no seed is set (Faker.seed is commented out in the import cell),
# so every run generates different data.
fake = Faker('en_GB')

# Define the number of rows you want to generate
num_rows = 1000

# Make the output dir if it does not exist. exist_ok=True replaces the
# separate os.path.exists() check, which could race with another process
# creating the directory between the check and the makedirs call.
dir_name = 'dummy_data'
os.makedirs(f'./{dir_name}', exist_ok=True)



In [3]:
def get_db_connection():
    """Open a psycopg2 connection configured from the DB_* environment variables.

    DB_NAME, DB_USER, DB_PASSWORD, DB_HOST and DB_PORT are read with
    ``os.getenv``; a variable that is not set is passed on as ``None``.

    Returns:
        The object returned by ``psycopg2.connect``.
    """
    # psycopg2 keyword argument -> environment variable that supplies it
    env_by_param = {
        "dbname": "DB_NAME",
        "user": "DB_USER",
        "password": "DB_PASSWORD",
        "host": "DB_HOST",
        "port": "DB_PORT",
    }
    connect_kwargs = {param: os.getenv(env) for param, env in env_by_param.items()}
    return psycopg2.connect(**connect_kwargs)
def create_table_if_not_exists(connection, table_name, schema):
    """Create ``table_name`` with the given columns unless it already exists.

    Args:
        connection: DB-API connection providing ``cursor()`` and ``commit()``.
        table_name: Table name; interpolated into the SQL as-is.
        schema: Either a ready-made SQL column-definition string
            (``"col_a TYPE, col_b TYPE"``) or a dict mapping column name to
            SQL type. Dict keys are double-quoted, so names containing
            spaces or upper-case letters produce valid SQL.

    Any exception from ``cursor.execute`` or ``connection.commit``
    propagates to the caller; the cursor is closed either way.
    """
    if isinstance(schema, dict):
        # Interpolating the dict itself would put its Python repr into the SQL.
        schema = ", ".join(
            '"{}" {}'.format(str(column).replace('"', '""'), sql_type)
            for column, sql_type in schema.items()
        )
    # NOTE: identifiers cannot be sent as bound parameters, so table_name and
    # schema must come from trusted code, never from user input.
    create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} ({schema});
    """
    cursor = connection.cursor()
    try:
        cursor.execute(create_table_query)
        connection.commit()
    finally:
        # Previously the cursor stayed open when execute/commit raised.
        cursor.close()

def insert_data(connection, df, table_name):
    """Append ``df`` to ``table_name`` through a short-lived SQLAlchemy engine.

    Args:
        connection: Unused; kept so existing callers keep working. Rows are
            written through a separate engine built from the DB_*
            environment variables.
        df: Object exposing pandas' ``DataFrame.to_sql``.
        table_name: Target table; rows are appended (``if_exists='append'``).

    An unset DB_USER / DB_PASSWORD is treated as an empty string.
    """
    from urllib.parse import quote_plus  # local import: not in the notebook's import cell

    # Percent-encode the credentials: characters such as '@', ':' or '/' in a
    # raw password would otherwise corrupt the connection URL.
    user = quote_plus(os.getenv("DB_USER") or "")
    password = quote_plus(os.getenv("DB_PASSWORD") or "")
    engine = create_engine(
        f'postgresql+psycopg2://{user}:{password}'
        f'@{os.getenv("DB_HOST")}:{os.getenv("DB_PORT")}/{os.getenv("DB_NAME")}'
    )
    try:
        df.to_sql(table_name, engine, if_exists='append', index=False)
    finally:
        # Release pooled connections instead of leaking one pool per call.
        engine.dispose()

def load_data_to_db(df, table_name, schema):
    """Create ``table_name`` if needed and append ``df`` to it.

    Errors raised while creating the table or inserting are caught and
    printed rather than re-raised, so a failed load does not abort the
    notebook run.

    Args:
        df: DataFrame to load.
        table_name: Target table name.
        schema: Column definitions forwarded to ``create_table_if_not_exists``.
    """
    connection = get_db_connection()
    try:
        create_table_if_not_exists(connection, table_name, schema)
        insert_data(connection, df, table_name)
        print(f"Data loaded into the {table_name} table successfully.")
    except Exception as e:
        print(f"Error loading data into database: {str(e)}")
    finally:
        # Close exactly once on every path, including KeyboardInterrupt
        # (kernel interrupt), which `except Exception` does not catch.
        connection.close()

# Engagement Data

Generate fake engagement data with Faker, save it to an Excel file, and optionally load it into the database.


In [4]:
# Column order of the engagement table; each generated row follows it
columns = [
    # identifiers and dates
    'engagement_id', 'creation_date', 'release_date',
    'last_time_charged_date', 'last_expenses_charged_date', 'last_active_etcp_date',
    # descriptive attributes
    'engagement', 'client',
    'engagement_region', 'engagement_country',
    'engagement_type', 'currency',
    # people
    'engagement_partner', 'engagement_partner_gui',
    'engagement_manager', 'engagement_manager_gui',
    'engagement_partner_service_line',
    # lifecycle
    'engagement_status',
]

# Generated rows accumulate here, one list per engagement
data = list()

def generate_partner_manager():
    """Return a fake person as a ``("Last, First", gui)`` tuple.

    The same helper is used for both engagement partners and managers. The
    GUI is a seven-character string drawn through ``fake.unique``.
    """
    # Draw order (last name, first name, GUI) is kept stable.
    surname = fake.last_name()
    given_name = fake.first_name()
    display_name = ", ".join((surname, given_name))
    return display_name, fake.unique.pystr_format("#######")

def _maybe_date_since(earliest):
    """Flip a coin: return a fake date no earlier than ``earliest``, or None."""
    if not random.choice([True, False]):
        return None
    return fake.date_between(start_date=earliest, end_date='today')


def _build_engagement_row():
    """Generate one fake engagement as a list ordered like ``columns``."""
    engagement_id = fake.unique.pystr_format("E-########")
    created = fake.date_between(start_date='-1y', end_date='today')
    released = fake.date_between(start_date=created, end_date='today')
    # last_time_charged, last_expenses_charged, last_active_etcp: each is
    # independently either missing or a date starting from the release date
    last_dates = [_maybe_date_since(released) for _ in range(3)]
    title = fake.bs().title()
    client = fake.company()
    country = fake.country()
    currency = fake.currency_code()
    partner, partner_gui = generate_partner_manager()
    manager, manager_gui = generate_partner_manager()
    service_line = random.choice(["CBS & Elim", "Assurance", "Consulting", "Tax", "SaT"])
    status = random.choice(["Released", "Active", "Pending"])
    return [
        engagement_id, created, released, *last_dates,
        title, client,
        "EMEA", country,              # region is fixed, country is random
        "External Project", currency,  # engagement type is fixed
        partner, partner_gui,
        manager, manager_gui,
        service_line, status,
    ]


# Generate dummy data
for _ in range(num_rows):
    data.append(_build_engagement_row())

# Create a DataFrame
df_engagement = pd.DataFrame(data, columns=columns)

# SQL column definitions for the engagement table
engagement_schema = """
    engagement_id CHAR(10),
    creation_date DATE,
    release_date DATE,
    last_time_charged_date DATE,
    last_expenses_charged_date DATE,
    last_active_etcp_date DATE,
    engagement TEXT,
    client TEXT,
    engagement_region VARCHAR(10),
    engagement_country TEXT,
    engagement_type TEXT,
    currency CHAR(3),
    engagement_partner TEXT,
    engagement_partner_gui CHAR(7),
    engagement_manager TEXT,
    engagement_manager_gui CHAR(7),
    engagement_partner_service_line TEXT,
    engagement_status VARCHAR(20)
"""

# Write the engagement rows to an Excel workbook inside the output directory
file_name = "test_engagement_data.xlsx"
engagement_xlsx_path = f"./{dir_name}/{file_name}"
df_engagement.to_excel(engagement_xlsx_path, index=False)

# Database load is disabled here; uncomment to run it
# load_data_to_db(df_engagement, 'engagement_data', engagement_schema)

# Preview the first few rows
df_engagement.head()

Unnamed: 0,engagement_id,creation_date,release_date,last_time_charged_date,last_expenses_charged_date,last_active_etcp_date,engagement,client,engagement_region,engagement_country,engagement_type,currency,engagement_partner,engagement_partner_gui,engagement_manager,engagement_manager_gui,engagement_partner_service_line,engagement_status
0,E-23313916,2023-10-09,2024-01-09,,2024-05-21,,Scale Real-Time Functionalities,Lambert-Stewart,EMEA,Vanuatu,External Project,ILS,"Ball, Jasmine",9743836,"Kelly, Jordan",2732773,Assurance,Released
1,E-82652400,2023-11-15,2024-01-30,,,2024-04-01,Generate Dot-Com Applications,King Group,EMEA,Djibouti,External Project,ANG,"Hill, Rita",4200322,"Ball, Shane",8297959,CBS & Elim,Pending
2,E-89749027,2024-02-24,2024-04-16,2024-05-23,2024-05-25,,Seize Synergistic Models,Bibi and Sons,EMEA,Bahrain,External Project,GGP,"Kay, Jeffrey",5808488,"Doherty, Edward",4971055,Tax,Pending
3,E-65366123,2024-01-15,2024-04-05,,2024-05-21,,Implement Extensible Technologies,Gregory and Sons,EMEA,Malawi,External Project,JPY,"Page, Ellie",4371015,"Robinson, Georgia",5269173,CBS & Elim,Released
4,E-31340802,2024-04-20,2024-05-06,,,,Monetize Interactive Initiatives,Moore-Cameron,EMEA,French Polynesia,External Project,BRL,"Smith, Annette",7028266,"Blake, Chloe",4760900,SaT,Pending


In [5]:
# Collect every GUI used as engagement partner or manager, without duplicates
engagement_partner_gui = df_engagement['engagement_partner_gui'].unique().tolist()
engagement_manager_gui = df_engagement['engagement_manager_gui'].unique().tolist()
gui = list(set(engagement_partner_gui + engagement_manager_gui))
# print gui size
print(len(gui))

# Pair each GUI with a Faker safe email address
email = [fake.safe_email() for _ in gui]
df_email = pd.DataFrame({'GUI': gui, 'Email': email})
# sort by gui
df_email = df_email.sort_values(by=['GUI']).reset_index(drop=True)

# Save the DataFrame to an Excel file
df_email.to_excel(f"./{dir_name}/test_emailList.xlsx", index=False)

# Schema as a SQL column-definition string, the same form as engagement_schema.
# create_table_if_not_exists interpolates it straight into CREATE TABLE, so a
# dict would end up in the SQL as its Python repr. The names are double-quoted
# to match the DataFrame's mixed-case columns exactly (PostgreSQL folds
# unquoted identifiers to lower case).
email_schema = """
    "GUI" CHAR(7),
    "Email" TEXT
"""

# load_data_to_db(df_email, 'email', email_schema)

df_email.head()


2000


Unnamed: 0,GUI,Email
0,6117,woodjack@example.net
1,6181,eleanorcarter@example.org
2,19400,butlergillian@example.com
3,27495,scottkatherine@example.com
4,31624,joseph38@example.com


# Exception Data


In [9]:
# Sample up to 50 engagement_ids from df_engagement. Capping n at the frame
# size keeps the cell working when num_rows is set below 50 (sample() raises
# if n exceeds the population and replace is False).
engagement_id = df_engagement['engagement_id'].sample(n=min(50, len(df_engagement))).tolist()

# Define columns for exception data
columns = [
    'engagement_id',
    'exception_reason_category',
    'exception_reason_description',
    'exception_reason_status',
    'exception_reason_start_date',
    'exception_reason_status_user',
    'exception_reason_status_user_gui',
    'exception_reason_status_user_email'
]

# Generate exception data: one row per sampled engagement
data = [
    [
        eid,
        random.choice(["Tax", "Investment Code", "Invoice", "Other"]),
        fake.sentence(),
        random.choice(["Pending EP Approval", "Pending Finance Approval", "Approved", "Finance Rejected", "EP Rejected", "Expired", "New Time"]),
        fake.date_between(start_date='-4w', end_date='today'),
        f"{fake.last_name()}, {fake.first_name()}",
        fake.unique.pystr_format("#######"),
        fake.safe_email()
    ]
    for eid in engagement_id
]

# Create DataFrame
df_exception = pd.DataFrame(data, columns=columns)

# Schema as a SQL column-definition string, the same form as engagement_schema.
# create_table_if_not_exists interpolates it straight into CREATE TABLE, so a
# dict would end up in the SQL as its Python repr.
exception_schema = """
    engagement_id CHAR(10),
    exception_reason_category TEXT,
    exception_reason_description TEXT,
    exception_reason_status TEXT,
    exception_reason_start_date DATE,
    exception_reason_status_user TEXT,
    exception_reason_status_user_gui CHAR(7),
    exception_reason_status_user_email TEXT
"""

# Save DataFrame to an Excel file
file_name = "test_exception_data.xlsx"
df_exception.to_excel(f"./{dir_name}/{file_name}", index=False)

# Load data to the database (uncomment to run; requires the DB_* environment variables)
# load_data_to_db(df_exception, 'exception', exception_schema)

# Display the first 25 rows of the DataFrame
df_exception.head(25)

Unnamed: 0,engagement_id,exception_reason_category,exception_reason_description,exception_reason_status,exception_reason_start_date,exception_reason_status_user,exception_reason_status_user_gui,exception_reason_status_user_email
0,E-30701606,Other,Consequuntur laboriosam recusandae repellat.,Approved,2024-06-10,"Grant, Jean",5631387,dianepeacock@example.com
1,E-15619013,Other,Sapiente doloremque quae quae vitae.,Expired,2024-05-23,"Wilson, Charlotte",3547329,garrycharlton@example.com
2,E-68039002,Other,Illo ad iste praesentium.,New Time,2024-06-13,"May, Nathan",5616800,mohammad35@example.net
3,E-80808709,Investment Code,Eveniet labore earum culpa porro ipsam.,New Time,2024-06-07,"Stevens, Carolyn",3012337,astokes@example.com
4,E-26371593,Invoice,Nisi aperiam excepturi.,Expired,2024-05-19,"Nicholls, Abdul",2708402,tony38@example.net
5,E-01692719,Tax,Dolore eos repudiandae dolorem nesciunt.,Approved,2024-06-03,"Turnbull, Connor",3148509,maureenfoster@example.org
6,E-04484186,Tax,Beatae nesciunt animi libero.,New Time,2024-05-18,"Dean, Stephanie",1234534,ladams@example.org
7,E-60339957,Invoice,Dolorem possimus perferendis voluptates dolor ...,Finance Rejected,2024-05-18,"Gordon, Sara",2612822,alisoncooper@example.net
8,E-33867744,Investment Code,Necessitatibus expedita a deserunt vitae.,Finance Rejected,2024-06-02,"Matthews, Lynn",3490129,parsonsarthur@example.com
9,E-69815798,Tax,Recusandae voluptatibus quidem magnam officia ...,Pending EP Approval,2024-06-10,"Hussain, Marion",8894042,wyattabdul@example.com


In [7]:
# Sample up to 25 engagement IDs. Capping n at the frame size keeps the cell
# working when num_rows is set below 25 (sample() raises if n exceeds the
# population and replace is False).
engagement_id = df_engagement['engagement_id'].sample(
    n=min(25, len(df_engagement)), random_state=1
).tolist()

# Create a list to hold delegate information
delegate_list = []

# Generate delegate information for each engagement ID
for eid in engagement_id:
    num_delegates = random.randint(1, 3)  # Randomly choose 1, 2, or 3 delegates
    for i in range(1, num_delegates + 1):
        delegate_name = fake.last_name() + ", " + fake.first_name()
        delegate_gui = fake.unique.pystr_format("#######")
        delegate_email = fake.safe_email()
        delegate_list.append({
            'Engagement ID': eid,
            'Delegate Number': i,
            'Delegate Name': delegate_name,
            'Delegate GUI': delegate_gui,
            'Delegate Email': delegate_email
        })

# Create a DataFrame from the delegate list, sorted by Engagement ID and
# Delegate Number. Naming the columns explicitly keeps sort_values working
# even when delegate_list is empty; chaining avoids in-place mutation.
delegate_columns = ['Engagement ID', 'Delegate Number', 'Delegate Name', 'Delegate GUI', 'Delegate Email']
df_delegate = (
    pd.DataFrame(delegate_list, columns=delegate_columns)
    .sort_values(by=['Engagement ID', 'Delegate Number'])
    .reset_index(drop=True)
)

# Schema as a SQL column-definition string, the same form as engagement_schema.
# create_table_if_not_exists interpolates it straight into CREATE TABLE, so a
# dict would end up in the SQL as its Python repr. The names contain spaces
# and upper-case letters, so they must be double-quoted to be valid SQL.
delegate_schema = """
    "Engagement ID" CHAR(10),
    "Delegate Number" INTEGER,
    "Delegate Name" TEXT,
    "Delegate GUI" CHAR(7),
    "Delegate Email" TEXT
"""

# load_data_to_db(df_delegate, 'delegate', delegate_schema)

# Save the DataFrame to an Excel file
df_delegate.to_excel(f"./{dir_name}/test_delegateData.xlsx", index=False)

df_delegate.head(25)

Unnamed: 0,Engagement ID,Delegate Number,Delegate Name,Delegate GUI,Delegate Email
0,E-03392511,1,"Fowler, Vanessa",8159458,harrisamber@example.com
1,E-03392511,2,"Wilkins, Jodie",1367872,mccarthybryan@example.org
2,E-03392511,3,"Gibson, Brett",5901112,hlee@example.org
3,E-09136190,1,"Rose, Kirsty",6976322,patricia16@example.com
4,E-09136190,2,"Robinson, Gordon",1877488,allanben@example.org
5,E-16872978,1,"Barber, Sam",4663082,jemma97@example.org
6,E-16872978,2,"Morrison, Jack",9913527,richardwebb@example.org
7,E-18153558,1,"Hayward, Martin",5037909,dixoncharlotte@example.net
8,E-18153558,2,"Clements, Allan",550660,dtaylor@example.com
9,E-20797940,1,"Payne, Elliot",6143483,eileen87@example.org
