In [41]:
import pandas as pd
import random
import datetime


In [42]:
# number of records in the ola table
num_rows = 50

# number of dates to generate data for
num_dates = 7

# number of jobs having more than one run for each date
num_dup_jobs = 2

In [43]:
def generate_ola_records(num_rows):
    """Build a DataFrame of randomly generated job definitions.

    Args:
        num_rows: Number of job rows to create. Jobs are named ``JOB_1`` to
            ``JOB_<num_rows>``; a value of 0 or less gives an empty frame
            that still carries all columns.

    Returns:
        A pandas DataFrame with the columns 'Application', 'Category',
        'Description', 'Job Name', 'Cutoff', 'Active', 'Source' and
        'Daymatch'. 'Cutoff' is a ``datetime.time`` on a whole hour from
        00:00 to 06:00; 'Active' and 'Daymatch' are True with weight 90
        against 10 for False.
    """
    column_names = ['Application', 'Category', 'Description', 'Job Name', 'Cutoff', 'Active', 'Source', 'Daymatch']
    records = []

    for job_number in range(1, num_rows + 1):
        # The draws stay in this fixed order so that a seeded `random`
        # reproduces the same table.
        app = random.choice(['APP_1', 'APP_2'])
        cat = random.choice(['CAT_1', 'CAT_2', 'CAT_3'])
        desc = random.choice(['DESC_1', 'DESC_2'])
        cutoff_hour = random.randint(0, 6)
        (is_active,) = random.choices([True, False], weights=[90, 10], k=1)
        src = random.choice(['SRC_1', 'SRC_2'])
        (day_matches,) = random.choices([True, False], weights=[90, 10], k=1)

        records.append([
            app,
            cat,
            desc,
            f'JOB_{job_number}',
            datetime.time(cutoff_hour, 0),
            is_active,
            src,
            day_matches,
        ])

    return pd.DataFrame(records, columns=column_names)

# Build the ola table with num_rows randomly generated jobs and keep it as ola_records
ola_records = generate_ola_records(num_rows)

In [44]:

def get_dates(num_dates):
    """Return the most recent weekdays, newest first, starting two days back.

    Args:
        num_dates: How many weekday dates to collect. A value of 0 or less
            gives an empty list.

    Returns:
        A list of midnight-normalized ``pd.Timestamp`` values in descending
        order. Saturdays and Sundays are skipped.
    """
    today = pd.Timestamp.today().normalize()

    weekdays = []
    days_back = 2  # the walk begins two days before today
    while len(weekdays) < num_dates:
        candidate = today - pd.DateOffset(days=days_back)
        days_back += 1
        if candidate.dayofweek >= 5:
            # dayofweek 5 and 6 are Saturday and Sunday
            continue
        weekdays.append(candidate)

    return weekdays


def get_start_time(date):
    """Draw a random start time for a run belonging to ``date``.

    With probability of about one half the start falls between 20:00 and
    23:59 on ``date`` itself; otherwise it falls between 00:00 and 01:59 on
    the following day.

    Args:
        date: The run date. It must support ``+ pd.DateOffset`` and be
            accepted by ``datetime.datetime.combine``; this notebook passes
            ``pd.Timestamp`` values.

    Returns:
        A ``datetime.datetime`` with minute resolution.
    """
    starts_before_midnight = random.random() < 0.5
    if starts_before_midnight:
        hour = random.randint(20, 23)
        minute = random.randint(0, 59)
        return datetime.datetime.combine(date, datetime.time(hour, minute))

    next_day = date + pd.DateOffset(days=1)
    hour = random.randint(0, 1)
    minute = random.randint(0, 59)
    return datetime.datetime.combine(next_day, datetime.time(hour, minute))

def get_cutoff_time(date):
    """Draw a random cutoff on the day after ``date``.

    Args:
        date: The run date. It must support ``+ pd.DateOffset``; this
            notebook passes ``pd.Timestamp`` values.

    Returns:
        A ``datetime.datetime`` on the following day, on a whole hour from
        00:00 to 05:00.
    """
    next_day = date + pd.DateOffset(days=1)
    cutoff_hour = random.randint(0, 5)
    return datetime.datetime.combine(next_day, datetime.time(cutoff_hour, 0))

def get_end_time(date):
    """Draw a random end time on the day after ``date``.

    Args:
        date: The run date. It must support ``+ pd.DateOffset``; this
            notebook passes ``pd.Timestamp`` values.

    Returns:
        A ``datetime.datetime`` on the following day between 00:00 and
        07:59, with minute resolution.
    """
    next_day = date + pd.DateOffset(days=1)
    end_hour = random.randint(0, 7)
    end_minute = random.randint(0, 59)
    return datetime.datetime.combine(next_day, datetime.time(end_hour, end_minute))

In [45]:
def generate_data(ola_records, num_dates, num_rows, num_dup_jobs):
    """Generate synthetic job-run records for the most recent weekdays.

    For each of the ``num_dates`` dates returned by ``get_dates``,
    ``num_rows + num_dup_jobs`` runs are created. Job names are drawn
    without replacement from the unique 'Job Name' values of
    ``ola_records``. Once every name has been used for a date the pool is
    refilled, so the draws beyond the pool size produce repeated
    (Date, Job Name) pairs.

    Args:
        ola_records: DataFrame with a 'Job Name' column.
        num_dates: Number of weekday dates to generate runs for.
        num_rows: Base number of runs per date.
        num_dup_jobs: Additional runs per date on top of ``num_rows``.

    Returns:
        A pandas DataFrame with the columns 'Date', 'Start Time',
        'Cutoff Time', 'End Time', 'Buffer' and 'Job Name'. 'Buffer' is
        ``cutoff - end`` in hours, so it is negative when the run ended
        after its cutoff. 'End Time' is never earlier than 'Start Time'.

    Raises:
        IndexError: If runs are requested but ``ola_records`` has no job
            names (``random.choice`` on an empty list).
    """
    # Create a list of unique job names
    job_names = ola_records['Job Name'].unique().tolist()
    runs_per_date = num_rows + num_dup_jobs
    # Upper bound on end-time redraws so a change to the helper ranges can
    # never make the loop below spin forever.
    max_end_redraws = 100

    data = []
    for date in get_dates(num_dates):
        available_job_names = job_names.copy()  # Fresh pool of job names for each date
        for _ in range(runs_per_date):
            start_time = get_start_time(date)
            cutoff_time = get_cutoff_time(date)
            end_time = get_end_time(date)

            # get_end_time draws independently of the start, so a run that
            # starts after midnight can be handed an earlier end. Redraw
            # until the end is not before the start; clamp as a last resort.
            for _attempt in range(max_end_redraws):
                if end_time >= start_time:
                    break
                end_time = get_end_time(date)
            else:
                end_time = max(end_time, start_time)

            # Hours between the end of the run and its cutoff
            buffer = (cutoff_time - end_time).total_seconds() / 3600

            # If all job names are used for this date, refresh the available job names
            if not available_job_names:
                available_job_names = job_names.copy()

            # Select a job name from the available job names and remove it from the list
            job_name = random.choice(available_job_names)
            available_job_names.remove(job_name)

            # Append the data to the list
            data.append([date, start_time, cutoff_time, end_time, buffer, job_name])

    # Create and return the DataFrame
    return pd.DataFrame(data, columns=['Date', 'Start Time', 'Cutoff Time', 'End Time', 'Buffer', 'Job Name'])
# Generate the per-date job runs from the job names in ola_records
data_df = generate_data(ola_records,num_dates, num_rows, num_dup_jobs)

In [46]:

# Show the generated runs as the cell output
data_df

Unnamed: 0,Date,Start Time,Cutoff Time,End Time,Buffer,Job Name
0,2023-12-15,2023-12-16 00:53:00,2023-12-16 00:00:00,2023-12-16 02:37:00,-2.616667,JOB_7
1,2023-12-15,2023-12-15 21:49:00,2023-12-16 05:00:00,2023-12-16 07:04:00,-2.066667,JOB_42
2,2023-12-15,2023-12-15 20:04:00,2023-12-16 04:00:00,2023-12-16 03:38:00,0.366667,JOB_37
3,2023-12-15,2023-12-15 23:51:00,2023-12-16 00:00:00,2023-12-16 05:55:00,-5.916667,JOB_14
4,2023-12-15,2023-12-16 00:59:00,2023-12-16 00:00:00,2023-12-16 05:29:00,-5.483333,JOB_23
...,...,...,...,...,...,...
359,2023-12-07,2023-12-07 20:21:00,2023-12-08 05:00:00,2023-12-08 02:03:00,2.950000,JOB_8
360,2023-12-07,2023-12-08 00:25:00,2023-12-08 04:00:00,2023-12-08 06:12:00,-2.200000,JOB_35
361,2023-12-07,2023-12-07 23:53:00,2023-12-08 05:00:00,2023-12-08 05:56:00,-0.933333,JOB_40
362,2023-12-07,2023-12-07 21:16:00,2023-12-08 00:00:00,2023-12-08 03:43:00,-3.716667,JOB_40


In [47]:
# Write the generated runs to data.csv, leaving out the index column
data_df.to_csv('data.csv', index=False)

In [50]:
# Identify duplicates based on 'Date' and 'Job Name'.
# keep=False marks every row of a repeated (Date, Job Name) pair,
# including the first occurrence.
duplicates = data_df[data_df.duplicated(['Date', 'Job Name'], keep=False)]

# Write the duplicated rows to duplicates.csv, leaving out the index column
duplicates.to_csv('duplicates.csv', index=False)

# Print the duplicates
print(duplicates)

          Date          Start Time         Cutoff Time            End Time  \
32  2023-12-15 2023-12-16 01:42:00 2023-12-16 01:00:00 2023-12-16 01:51:00   
41  2023-12-15 2023-12-15 23:01:00 2023-12-16 02:00:00 2023-12-16 06:05:00   
50  2023-12-15 2023-12-15 22:17:00 2023-12-16 02:00:00 2023-12-16 01:12:00   
51  2023-12-15 2023-12-16 01:19:00 2023-12-16 00:00:00 2023-12-16 05:40:00   
72  2023-12-14 2023-12-14 23:47:00 2023-12-15 00:00:00 2023-12-15 06:16:00   
95  2023-12-14 2023-12-14 23:08:00 2023-12-15 00:00:00 2023-12-15 07:53:00   
102 2023-12-14 2023-12-15 01:40:00 2023-12-15 05:00:00 2023-12-15 01:55:00   
103 2023-12-14 2023-12-14 21:20:00 2023-12-15 03:00:00 2023-12-15 03:05:00   
120 2023-12-13 2023-12-14 00:52:00 2023-12-14 01:00:00 2023-12-14 03:59:00   
145 2023-12-13 2023-12-14 00:01:00 2023-12-14 00:00:00 2023-12-14 02:35:00   
154 2023-12-13 2023-12-13 23:13:00 2023-12-14 04:00:00 2023-12-14 02:37:00   
155 2023-12-13 2023-12-14 00:21:00 2023-12-14 01:00:00 2023-12-1