In [173]:
import pandas as pd
import random
import datetime


In [174]:
# number of records in the ola tablr
num_rows = 50

# number of dates to generate data for
num_dates = 7

# number of jobs having more that one run for each date
num_dup_jobs = 2

# define aggregation keys for the reporting - that will be specific columns of the ola table
key1 = 'Application'
key2 = 'Category'
key3 = 'Description'

In [175]:
def generate_ola_records(num_rows):
    data = []

    for i in range(num_rows):
        application = random.choice(['APP_1', 'APP_2'])
        category = random.choice(['CAT_1', 'CAT_2', 'CAT_3'])
        description = random.choice(['DESC_1', 'DESC_2'])
        job_name = f'JOB_{i+1}'
        cutoff = datetime.time(random.randint(0, 6), 0)
        active = random.choices([True, False], weights=[90, 10], k=1)[0]
        source = random.choice(['SRC_1', 'SRC_2'])
        daymatch = random.choices([True, False], weights=[90, 10], k=1)[0]
        nextday = random.choices([True, False], weights=[90, 10], k=1)[0]
        saturday = random.choices([True, False], weights=[10, 90], k=1)[0]

        # Append the data to the list
        data.append([application, category, description, job_name, cutoff, active, source, daymatch, nextday, saturday])

    # Create the DataFrame
    df = pd.DataFrame(data, columns=['Application', 'Category', 'Description', 'Job Name', 'Cutoff', 'Active', 'Source', 'Daymatch', 'Nextday', 'Saturday'])

    # Return the DataFrame
    return df

# Call the function and assign the returned DataFrame to ola_records
ola_records = generate_ola_records(num_rows)

In [176]:

def get_dates(num_dates):
    # Get today's date
    today = pd.Timestamp.today().normalize()

    # Initialize a list to store the dates
    dates = []

    # Start from 2 days ago and keep going back until we have enough dates
    i = 2
    while len(dates) < num_dates:
        date = today - pd.DateOffset(days=i)
        if date.dayofweek < 5:  # If the date is not on a weekend
            dates.append(date)
        i += 1

    return dates


def get_start_time(date):
    # Create a start time
    return datetime.datetime.combine(date, datetime.time(random.randint(20, 23), random.randint(0, 59))) if random.random() < 0.5 else datetime.datetime.combine(date + pd.DateOffset(days=1), datetime.time(random.randint(0, 1), random.randint(0, 59)))


def get_end_time(date):
    # Create an end time
    return datetime.datetime.combine(date + pd.DateOffset(days=1), datetime.time(random.randint(0, 7), random.randint(0, 59)))

In [177]:
def generate_data(ola_records, num_dates, num_rows,num_dup_jobs):
    # Create a list of unique job names
    job_names = ola_records['Job Name'].unique().tolist()

    data = []
    dates = get_dates(num_dates)
    for i in range(num_dates):
        date = dates[i]
        available_job_names = job_names.copy()  # Create a copy of job names for each date
        for j in range(num_rows + num_dup_jobs):
            start_time = get_start_time(date)
            end_time = get_end_time(date)

            # If all job names are used for this date, refresh the available job names
            if not available_job_names:
                available_job_names = job_names.copy()

            # Select a job name from the available job names and remove it from the list
            job_name = random.choice(available_job_names)
            available_job_names.remove(job_name)

            # Append the data to the list
            data.append([date, job_name,start_time, end_time])

    # Create the DataFrame
    df = pd.DataFrame(data, columns=['Date',  'Job Name','Start Time', 'End Time'])

    # Return the DataFrame
    return df
# Call the function with the ola_records DataFrame
data_df = generate_data(ola_records,num_dates, num_rows, num_dup_jobs)

In [178]:

data_df

Unnamed: 0,Date,Job Name,Start Time,End Time
0,2023-12-15,JOB_5,2023-12-15 21:25:00,2023-12-16 01:16:00
1,2023-12-15,JOB_21,2023-12-16 00:21:00,2023-12-16 02:43:00
2,2023-12-15,JOB_15,2023-12-15 23:05:00,2023-12-16 01:22:00
3,2023-12-15,JOB_41,2023-12-15 23:13:00,2023-12-16 04:46:00
4,2023-12-15,JOB_26,2023-12-15 22:16:00,2023-12-16 01:46:00
...,...,...,...,...
359,2023-12-07,JOB_34,2023-12-07 20:57:00,2023-12-08 05:08:00
360,2023-12-07,JOB_20,2023-12-08 00:15:00,2023-12-08 04:07:00
361,2023-12-07,JOB_40,2023-12-08 01:31:00,2023-12-08 05:06:00
362,2023-12-07,JOB_2,2023-12-08 01:25:00,2023-12-08 05:17:00


In [179]:
data_df.to_csv('data.csv', index=False)

In [180]:
# Identify duplicates based on 'Date' and 'Job Name'
duplicates = data_df[data_df.duplicated(['Date', 'Job Name'], keep=False)]

duplicates.to_csv('duplicates.csv', index=False)

# Print the duplicates
print(duplicates)

          Date Job Name          Start Time            End Time
18  2023-12-15   JOB_29 2023-12-15 23:19:00 2023-12-16 04:00:00
28  2023-12-15    JOB_2 2023-12-15 20:42:00 2023-12-16 06:55:00
50  2023-12-15   JOB_29 2023-12-15 21:40:00 2023-12-16 03:32:00
51  2023-12-15    JOB_2 2023-12-16 01:45:00 2023-12-16 03:55:00
66  2023-12-14   JOB_32 2023-12-14 23:42:00 2023-12-15 06:15:00
70  2023-12-14    JOB_9 2023-12-14 20:27:00 2023-12-15 04:48:00
102 2023-12-14    JOB_9 2023-12-15 01:56:00 2023-12-15 03:07:00
103 2023-12-14   JOB_32 2023-12-15 00:50:00 2023-12-15 04:39:00
118 2023-12-13   JOB_29 2023-12-14 01:19:00 2023-12-14 04:39:00
136 2023-12-13   JOB_37 2023-12-13 20:10:00 2023-12-14 00:37:00
154 2023-12-13   JOB_29 2023-12-13 20:11:00 2023-12-14 04:15:00
155 2023-12-13   JOB_37 2023-12-14 00:56:00 2023-12-14 01:07:00
177 2023-12-12   JOB_35 2023-12-12 23:23:00 2023-12-13 04:55:00
189 2023-12-12   JOB_43 2023-12-12 21:30:00 2023-12-13 06:16:00
206 2023-12-12   JOB_35 2023-12-12 23:51

In [181]:
merged_df = data_df.merge(ola_records, on='Job Name', how='inner')

merged_df

Unnamed: 0,Date,Job Name,Start Time,End Time,Application,Category,Description,Cutoff,Active,Source,Daymatch,Nextday,Saturday
0,2023-12-15,JOB_5,2023-12-15 21:25:00,2023-12-16 01:16:00,APP_1,CAT_2,DESC_1,05:00:00,True,SRC_1,True,True,False
1,2023-12-14,JOB_5,2023-12-14 21:32:00,2023-12-15 01:33:00,APP_1,CAT_2,DESC_1,05:00:00,True,SRC_1,True,True,False
2,2023-12-13,JOB_5,2023-12-14 00:42:00,2023-12-14 05:18:00,APP_1,CAT_2,DESC_1,05:00:00,True,SRC_1,True,True,False
3,2023-12-12,JOB_5,2023-12-13 00:27:00,2023-12-13 01:45:00,APP_1,CAT_2,DESC_1,05:00:00,True,SRC_1,True,True,False
4,2023-12-11,JOB_5,2023-12-11 21:03:00,2023-12-12 07:35:00,APP_1,CAT_2,DESC_1,05:00:00,True,SRC_1,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,2023-12-13,JOB_10,2023-12-13 23:56:00,2023-12-14 05:30:00,APP_2,CAT_2,DESC_2,01:00:00,True,SRC_1,False,False,False
360,2023-12-12,JOB_10,2023-12-13 01:15:00,2023-12-13 02:23:00,APP_2,CAT_2,DESC_2,01:00:00,True,SRC_1,False,False,False
361,2023-12-11,JOB_10,2023-12-12 01:53:00,2023-12-12 03:07:00,APP_2,CAT_2,DESC_2,01:00:00,True,SRC_1,False,False,False
362,2023-12-08,JOB_10,2023-12-08 20:39:00,2023-12-09 05:05:00,APP_2,CAT_2,DESC_2,01:00:00,True,SRC_1,False,False,False


In [182]:
# Ask the user if they want to remove records before a specified date
remove_records = input("Do you want to remove records before a specified date? (yes/no): ")

if remove_records.lower() == "yes":
    # Ask the user to enter the specified date
    specified_date = input("Enter the specified date (YYYY-MM-DD): ")

    # Convert the specified date to a pandas Timestamp object
    specified_date = pd.Timestamp(specified_date)

    # Filter the merged_df table to remove records before the specified date
    merged_df = merged_df[merged_df['Date'] >= specified_date]


In [183]:
merged_df

Unnamed: 0,Date,Job Name,Start Time,End Time,Application,Category,Description,Cutoff,Active,Source,Daymatch,Nextday,Saturday
0,2023-12-15,JOB_5,2023-12-15 21:25:00,2023-12-16 01:16:00,APP_1,CAT_2,DESC_1,05:00:00,True,SRC_1,True,True,False
1,2023-12-14,JOB_5,2023-12-14 21:32:00,2023-12-15 01:33:00,APP_1,CAT_2,DESC_1,05:00:00,True,SRC_1,True,True,False
2,2023-12-13,JOB_5,2023-12-14 00:42:00,2023-12-14 05:18:00,APP_1,CAT_2,DESC_1,05:00:00,True,SRC_1,True,True,False
3,2023-12-12,JOB_5,2023-12-13 00:27:00,2023-12-13 01:45:00,APP_1,CAT_2,DESC_1,05:00:00,True,SRC_1,True,True,False
4,2023-12-11,JOB_5,2023-12-11 21:03:00,2023-12-12 07:35:00,APP_1,CAT_2,DESC_1,05:00:00,True,SRC_1,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,2023-12-13,JOB_10,2023-12-13 23:56:00,2023-12-14 05:30:00,APP_2,CAT_2,DESC_2,01:00:00,True,SRC_1,False,False,False
360,2023-12-12,JOB_10,2023-12-13 01:15:00,2023-12-13 02:23:00,APP_2,CAT_2,DESC_2,01:00:00,True,SRC_1,False,False,False
361,2023-12-11,JOB_10,2023-12-12 01:53:00,2023-12-12 03:07:00,APP_2,CAT_2,DESC_2,01:00:00,True,SRC_1,False,False,False
362,2023-12-08,JOB_10,2023-12-08 20:39:00,2023-12-09 05:05:00,APP_2,CAT_2,DESC_2,01:00:00,True,SRC_1,False,False,False


In [184]:
# Create a new column 'Cutoff Datetime' in the merged_df DataFrame
merged_df['Cutoff Datetime'] = merged_df.apply(lambda row: row['Date'] + pd.DateOffset(days=1) if row['Nextday'] else row['Date'], axis=1)

# Adjust the cutoff datetime based on the day of the week and the 'Saturday' column
merged_df['Cutoff Datetime'] = merged_df.apply(lambda row: row['Cutoff Datetime'] + pd.DateOffset(days=2) if row['Date'].dayofweek == 4 and not row['Saturday'] else row['Cutoff Datetime'], axis=1)

# Convert 'Cutoff' column from datetime.time to number of hours past midnight
merged_df['Cutoff'] = merged_df['Cutoff'].apply(lambda t: t.hour + t.minute/60 + t.second/3600)

# Add the number of hours in the 'Cutoff' column to the 'Cutoff Datetime' column
merged_df['Cutoff Datetime'] = merged_df['Cutoff Datetime'] + pd.to_timedelta(merged_df['Cutoff'], unit='h')

# Print the updated DataFrame
merged_df


Unnamed: 0,Date,Job Name,Start Time,End Time,Application,Category,Description,Cutoff,Active,Source,Daymatch,Nextday,Saturday,Cutoff Datetime
0,2023-12-15,JOB_5,2023-12-15 21:25:00,2023-12-16 01:16:00,APP_1,CAT_2,DESC_1,5.0,True,SRC_1,True,True,False,2023-12-18 05:00:00
1,2023-12-14,JOB_5,2023-12-14 21:32:00,2023-12-15 01:33:00,APP_1,CAT_2,DESC_1,5.0,True,SRC_1,True,True,False,2023-12-15 05:00:00
2,2023-12-13,JOB_5,2023-12-14 00:42:00,2023-12-14 05:18:00,APP_1,CAT_2,DESC_1,5.0,True,SRC_1,True,True,False,2023-12-14 05:00:00
3,2023-12-12,JOB_5,2023-12-13 00:27:00,2023-12-13 01:45:00,APP_1,CAT_2,DESC_1,5.0,True,SRC_1,True,True,False,2023-12-13 05:00:00
4,2023-12-11,JOB_5,2023-12-11 21:03:00,2023-12-12 07:35:00,APP_1,CAT_2,DESC_1,5.0,True,SRC_1,True,True,False,2023-12-12 05:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,2023-12-13,JOB_10,2023-12-13 23:56:00,2023-12-14 05:30:00,APP_2,CAT_2,DESC_2,1.0,True,SRC_1,False,False,False,2023-12-13 01:00:00
360,2023-12-12,JOB_10,2023-12-13 01:15:00,2023-12-13 02:23:00,APP_2,CAT_2,DESC_2,1.0,True,SRC_1,False,False,False,2023-12-12 01:00:00
361,2023-12-11,JOB_10,2023-12-12 01:53:00,2023-12-12 03:07:00,APP_2,CAT_2,DESC_2,1.0,True,SRC_1,False,False,False,2023-12-11 01:00:00
362,2023-12-08,JOB_10,2023-12-08 20:39:00,2023-12-09 05:05:00,APP_2,CAT_2,DESC_2,1.0,True,SRC_1,False,False,False,2023-12-10 01:00:00


In [185]:
# Calculate the buffer column
merged_df['Buffer'] = (merged_df['Cutoff Datetime'] - merged_df['End Time']).dt.total_seconds() / 3600

# Sort the table by ascending dates, application, category, description, and job names
merged_df.sort_values(by=['Date', 'Application', 'Category', 'Description', 'Job Name'], inplace=True)

# Drop duplicates and keep the row with the smallest buffer
merged_df.drop_duplicates(subset=['Date', 'Application', 'Category', 'Description', 'Job Name'], keep='first', inplace=True)

# Reset the index
merged_df.reset_index(drop=True, inplace=True)

# Print the updated DataFrame
merged_df


Unnamed: 0,Date,Job Name,Start Time,End Time,Application,Category,Description,Cutoff,Active,Source,Daymatch,Nextday,Saturday,Cutoff Datetime,Buffer
0,2023-12-07,JOB_17,2023-12-08 01:08:00,2023-12-08 03:51:00,APP_1,CAT_1,DESC_1,4.0,True,SRC_1,True,True,False,2023-12-08 04:00:00,0.150000
1,2023-12-07,JOB_18,2023-12-08 00:08:00,2023-12-08 06:40:00,APP_1,CAT_1,DESC_1,0.0,True,SRC_1,True,True,False,2023-12-08 00:00:00,-6.666667
2,2023-12-07,JOB_25,2023-12-08 00:44:00,2023-12-08 05:18:00,APP_1,CAT_1,DESC_1,0.0,True,SRC_2,True,True,False,2023-12-08 00:00:00,-5.300000
3,2023-12-07,JOB_3,2023-12-07 22:45:00,2023-12-08 04:45:00,APP_1,CAT_1,DESC_1,4.0,True,SRC_1,True,True,False,2023-12-08 04:00:00,-0.750000
4,2023-12-07,JOB_4,2023-12-07 21:39:00,2023-12-08 04:20:00,APP_1,CAT_1,DESC_1,5.0,True,SRC_1,False,True,False,2023-12-08 05:00:00,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,2023-12-15,JOB_22,2023-12-15 22:06:00,2023-12-16 03:57:00,APP_2,CAT_3,DESC_1,3.0,True,SRC_2,False,True,False,2023-12-18 03:00:00,47.050000
346,2023-12-15,JOB_27,2023-12-15 20:32:00,2023-12-16 02:33:00,APP_2,CAT_3,DESC_1,3.0,True,SRC_2,True,True,False,2023-12-18 03:00:00,48.450000
347,2023-12-15,JOB_46,2023-12-15 22:00:00,2023-12-16 01:13:00,APP_2,CAT_3,DESC_1,0.0,True,SRC_2,True,True,False,2023-12-18 00:00:00,46.783333
348,2023-12-15,JOB_13,2023-12-16 00:54:00,2023-12-16 00:44:00,APP_2,CAT_3,DESC_2,5.0,True,SRC_2,True,True,False,2023-12-18 05:00:00,52.266667


In [186]:

pivot_table = merged_df.pivot_table(index=[key1, key2, key3], columns='Date', values='Buffer', aggfunc='min').reset_index()

pivot_table


Date,Application,Category,Description,2023-12-07 00:00:00,2023-12-08 00:00:00,2023-12-11 00:00:00,2023-12-12 00:00:00,2023-12-13 00:00:00,2023-12-14 00:00:00,2023-12-15 00:00:00
0,APP_1,CAT_1,DESC_1,-6.666667,44.083333,-4.666667,-4.4,-7.216667,-4.6,41.533333
1,APP_1,CAT_1,DESC_2,-5.283333,-7.65,-7.666667,-5.933333,-5.266667,-5.966667,-4.183333
2,APP_1,CAT_2,DESC_1,-21.266667,22.083333,-19.083333,-26.85,-26.6,-19.0,23.05
3,APP_1,CAT_2,DESC_2,-1.616667,46.7,-1.416667,-6.233333,-4.55,-2.216667,45.233333
4,APP_1,CAT_3,DESC_1,-3.9,44.05,1.066667,-4.0,-2.183333,-3.133333,49.916667
5,APP_1,CAT_3,DESC_2,-25.166667,18.566667,-31.433333,-26.583333,-25.2,-25.75,21.2
6,APP_2,CAT_1,DESC_1,-3.383333,-3.816667,-4.966667,-6.266667,-5.2,-5.283333,1.55
7,APP_2,CAT_1,DESC_2,-5.366667,41.85,-7.2,-4.916667,-7.35,-0.966667,43.916667
8,APP_2,CAT_2,DESC_1,-22.916667,21.4,-22.666667,-20.533333,-24.416667,-25.483333,22.15
9,APP_2,CAT_2,DESC_2,-30.433333,19.916667,-26.116667,-25.383333,-28.5,-28.5,19.533333
