In [37]:
import pandas as pd
import random

# Load the data from the provided Excel file
moderator_data = pd.read_excel("./EDA/Datasets/moderator-data-cleaned.xlsx")

# Filter out moderators with a handling time of 0
moderator_data = moderator_data[moderator_data['handling time'] >= 1000]

In [38]:
# Normalize accuracy and productivity
moderator_data['normalized_accuracy'] = (moderator_data['accuracy'] - moderator_data['accuracy'].min()) / \
                                        (moderator_data['accuracy'].max() - moderator_data['accuracy'].min())

moderator_data['normalized_productivity'] = (moderator_data['Productivity'] - moderator_data['Productivity'].min()) / \
                                           (moderator_data['Productivity'].max() - moderator_data['Productivity'].min())

# Assigning the average accuracy of other moderators to those with NaN values
average_accuracy = moderator_data['accuracy'].mean()
moderator_data['accuracy'].fillna(average_accuracy, inplace=True)

# Recalculating the normalized_accuracy and moderator_score
moderator_data['normalized_accuracy'] = (moderator_data['accuracy'] - moderator_data['accuracy'].min()) / \
                                        (moderator_data['accuracy'].max() - moderator_data['accuracy'].min())
moderator_data['moderator_score'] = (moderator_data['normalized_accuracy'] + moderator_data['normalized_productivity']) / 2                                           


# Compute moderator score as the average of normalized accuracy and normalized productivity
moderator_data['moderator_score'] = (moderator_data['normalized_accuracy'] + moderator_data['normalized_productivity']) / 2

# Display the first few rows with the new columns
moderator_data[['moderator', 'normalized_accuracy', 'normalized_productivity', 'moderator_score']].head()

Unnamed: 0,moderator,normalized_accuracy,normalized_productivity,moderator_score
0,1704427801912322,0.906667,0.20271,0.554688
1,1712377365906433,0.828,0.329741,0.57887
2,1705699742139394,0.826667,0.272345,0.549506
3,1759969798094866,0.708,0.29833,0.503165
5,1695096148334594,0.884,0.35298,0.61849


In [39]:
# Assumed paid hours per day for TikTok moderators
PAID_HOURS_PER_DAY = 8

# Calculate the maximum number of tasks each moderator can handle in a day based on a 10% increase in utilization
moderator_data['max_tasks_per_day'] = (0.1 * PAID_HOURS_PER_DAY * 60 * 60 * 1000) / moderator_data['handling time']

# Display the first few rows with the updated max_tasks_per_day
moderator_data[['moderator', 'handling time', 'Utilisation %', 'max_tasks_per_day']].head()

Unnamed: 0,moderator,handling time,Utilisation %,max_tasks_per_day
0,1704427801912322,119688,1.28725,24.062563
1,1712377365906433,102324,1.157927,28.14589
2,1705699742139394,76773,1.150042,37.513188
3,1759969798094866,100732,1.146969,28.590716
5,1695096148334594,76199,1.115514,37.795772


In [40]:
# Extract unique markets from the moderator data
all_markets = set()
for market_list in moderator_data['market']:
    markets = eval(market_list)
    all_markets.update(markets)

# Generate 5000 sample ads with random queue_market attributes and random ad scores
sample_ads_50 = []
for _ in range(500):
    ad = {
        'ad_id': f"ad_{_ + 1}",
        'queue_market': random.choice(list(all_markets)),
        'ad_score': random.random()  # Random score between 0 and 1
    }
    sample_ads_50.append(ad)

# Reset the allocated_tasks dictionary for reallocation using the filtered moderators and new max tasks per day
allocated_tasks_filtered = {moderator: 0 for moderator in moderator_data['moderator']}

# Modified Allocation function to prioritize moderators with lower utilization %
def allocate_ad_v4(ad, filtered_moderator_data, allocated_tasks):
    # Filter moderators based on ad's queue_market
    valid_moderators = filtered_moderator_data[filtered_moderator_data['market'].apply(lambda x: ad['queue_market'] in eval(x))].copy()
    
    # Sort these moderators based on utilization % (ascending) and then by the difference between ad score and moderator score
    valid_moderators['score_diff'] = abs(valid_moderators['moderator_score'] - ad['ad_score'])
    valid_moderators = valid_moderators.sort_values(by=['Utilisation %', 'score_diff'])
    
    for _, moderator_row in valid_moderators.iterrows():
        # Check if the moderator can handle more tasks
        if allocated_tasks[moderator_row['moderator']] < moderator_row['max_tasks_per_day']:
            allocated_tasks[moderator_row['moderator']] += 1  # Allocate the task
            return moderator_row['moderator'], moderator_row['moderator_score']  # Return the allocated moderator's ID and score
    
    return None, None  # If no moderator is available

# Re-allocate the 5000 sample ads using the updated function and filtered moderators
allocations_50_prioritized = {}
for ad in sample_ads_50:
    allocated_moderator, moderator_score = allocate_ad_v4(ad, moderator_data, allocated_tasks_filtered)
    allocations_50_prioritized[ad['ad_id']] = {
        'moderator': allocated_moderator,
        'ad_score': ad['ad_score'],
        'moderator_score': moderator_score
    }

# First few allocations for inspection after prioritizing
list(allocations_50_prioritized.items())[:5]


[('ad_1',
  {'moderator': 1295012,
   'ad_score': 0.2808018111840296,
   'moderator_score': 0.6033293524849894}),
 ('ad_2',
  {'moderator': 3823019,
   'ad_score': 0.4802840663773682,
   'moderator_score': 0.43489353701289407}),
 ('ad_3',
  {'moderator': 1295012,
   'ad_score': 0.04581332991287268,
   'moderator_score': 0.6033293524849894}),
 ('ad_4',
  {'moderator': 1700711913253889,
   'ad_score': 0.9892815771329717,
   'moderator_score': 0.46110365029349576}),
 ('ad_5',
  {'moderator': 2658129,
   'ad_score': 0.36249360220352866,
   'moderator_score': 0.5807326712100538})]

In [41]:
# Convert allocated tasks from dictionary to Series
allocated_tasks_series = pd.Series(allocated_tasks_filtered)

# Calculate the increase in utilization % for each moderator
moderator_data['allocated_tasks'] = moderator_data['moderator'].map(allocated_tasks_series)
moderator_data['increase_in_utilisation'] = (moderator_data['allocated_tasks'] * moderator_data['handling time']) / (PAID_HOURS_PER_DAY * 60 * 60 * 1000)   # Convert handling time to percentage of a day
moderator_data['new_utilisation'] = moderator_data['Utilisation %'] + moderator_data['increase_in_utilisation']

# Extract relevant columns for display
utilisation_changes_corrected = moderator_data[['moderator', 'allocated_tasks', 'increase_in_utilisation', 'new_utilisation']]

# Only display moderators with non-zero allocations for brevity
utilisation_changes_corrected = utilisation_changes_corrected[utilisation_changes_corrected['allocated_tasks'] > 0]

utilisation_changes_corrected

Unnamed: 0,moderator,allocated_tasks,increase_in_utilisation,new_utilisation
298,1700711878316033,9,0.030283,0.885392
391,2658129,9,0.014924,0.849205
454,1700711913253889,5,0.013861,0.836882
455,6715607,8,0.024593,0.84751
560,1753333041068065,9,0.035615,0.833859
622,5769891,5,0.013718,0.798369
1088,5026590,10,0.017081,0.560963
1096,1767847119851554,7,0.018687,0.554114
1143,6585903,3,0.001328,0.456432
1152,9752939,4,0.015293,0.451113


In [80]:
from gurobipy import Model, GRB

# Initialize the Gurobi model
m = Model("AdTaskAllocation")

# Create the decision variables
x = {}
for ad in sample_ads_50:
    for _, mod_row in moderator_data.iterrows():
        mod = mod_row['moderator']
        x[ad['ad_id'], mod] = m.addVar(vtype=GRB.BINARY, name=f"x_{ad['ad_id']}_{mod}")

In [81]:
# Set the objective function
m.setObjective(sum(x[ad['ad_id'], mod] * abs(ad['ad_score'] - mod_row['moderator_score']) 
                for ad in sample_ads_50 for _, mod_row in moderator_data.iterrows()), GRB.MINIMIZE)

In [82]:
# Add the constraints

# Each ad should be allocated to only one moderator
for ad in sample_ads_50:
    m.addConstr(sum(x[ad['ad_id'], mod_row['moderator']] for _, mod_row in moderator_data.iterrows()) == 1)

# The total tasks assigned to a moderator should not exceed their max tasks per day
for _, mod_row in moderator_data.iterrows():
    mod = mod_row['moderator']
    m.addConstr(sum(x[ad['ad_id'], mod_row['moderator']] for ad in sample_ads_50) <= mod_row['max_tasks_per_day'])

# Only assign an ad to a moderator if the ad's market matches the moderator's market
for ad in sample_ads_50:
    for _, mod_row in moderator_data.iterrows():
        mod = mod_row['moderator']
        if ad['queue_market'] not in eval(mod_row['market']):
            m.addConstr(x[ad['ad_id'], mod] == 0)

In [86]:
# Solve the model
m.optimize()

Gurobi Optimizer version 10.0.2 build v10.0.2rc0 (mac64[rosetta2])

CPU model: Apple M2
Thread count: 8 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 617077 rows, 638500 columns and 1892300 nonzeros
Model fingerprint: 0x19fb5d9a
Variable types: 0 continuous, 638500 integer (638500 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  Objective range  [1e+02, 7e+02]
  Bounds range     [1e+00, 1e+00]
  RHS range        [3e-01, 1e+03]
Found heuristic solution: objective 260.8537681
Presolve removed 617077 rows and 638500 columns
Presolve time: 0.44s
Presolve: All rows and columns removed

Explored 0 nodes (0 simplex iterations) in 0.78 seconds (0.52 work units)
Thread count was 1 (of 8 available processors)

Solution count 2: 0 260.854 

Optimal solution found (tolerance 1.00e-04)
Best objective 0.000000000000e+00, best bound 0.000000000000e+00, gap 0.0000%


In [87]:
# Extract the assignments from the solution
assignments = {}
for ad in sample_ads_50:
    for _, mod_row in moderator_data.iterrows():
        if x[ad['ad_id'], mod_row['moderator']].x > 0.5:  # If this ad is assigned to this moderator
            assignments[ad['ad_id']] = mod_row['moderator']

assignments

{'ad_1': 1708440532375554,
 'ad_2': 9558788,
 'ad_3': 1708440532375554,
 'ad_4': 1700711913253889,
 'ad_5': 2658129,
 'ad_6': 9558788,
 'ad_7': 3215286,
 'ad_8': 1527988,
 'ad_9': 1736487777311745,
 'ad_10': 2658129,
 'ad_11': 9558788,
 'ad_12': 1908685,
 'ad_13': 1673359260380165,
 'ad_14': 7378621,
 'ad_15': 1708440532375554,
 'ad_16': 1714052438738946,
 'ad_17': 9558788,
 'ad_18': 1717766044988418,
 'ad_19': 1908685,
 'ad_20': 1736487777311745,
 'ad_21': 1749551102733314,
 'ad_22': 2502751,
 'ad_23': 1714052438738946,
 'ad_24': 1708440532375554,
 'ad_25': 1708440532375554,
 'ad_26': 9558788,
 'ad_27': 1708440532375554,
 'ad_28': 1755541489126450,
 'ad_29': 1708440532375554,
 'ad_30': 1673359260380165,
 'ad_31': 1673359260380165,
 'ad_32': 1752090693569553,
 'ad_33': 1960716,
 'ad_34': 2502751,
 'ad_35': 1695096148334594,
 'ad_36': 2502751,
 'ad_37': 6715607,
 'ad_38': 1708440532375554,
 'ad_39': 1960716,
 'ad_40': 3318612,
 'ad_41': 1724428408471553,
 'ad_42': 2318859,
 'ad_43': 167

In [93]:
# Convert the dictionary of assignments into a DataFrame
assignments_df = pd.DataFrame(list(assignments.items()), columns=['ad_id', 'moderator'])

# Count the number of ads assigned to each moderator
assigned_counts = assignments_df.groupby('moderator').size().reset_index(name='num_ads_assigned')

# Merge with the original moderator_data
merged_data = pd.merge(moderator_data, assigned_counts, left_on='moderator', right_on='moderator', how='left')

# Fill NaN values with 0 (for moderators with no assignments)
merged_data['num_ads_assigned'].fillna(0, inplace=True)

# Calculate the increase in utilization % and new utilization % for each moderator
merged_data['increase_in_utilisation'] = (merged_data['num_ads_assigned'] * merged_data['handling time']) / (PAID_HOURS_PER_DAY * 60 * 60 * 1000)
merged_data['new_utilisation'] = merged_data['Utilisation %'] + merged_data['increase_in_utilisation']

# Extract the relevant columns for display
output_table = merged_data[['moderator', 'num_ads_assigned', 'increase_in_utilisation', 'new_utilisation', 'max_tasks_per_day']].sort_values(['new_utilisation'])

output_table = output_table[output_table['num_ads_assigned'] > 0]

output_table

Unnamed: 0,moderator,num_ads_assigned,increase_in_utilisation,new_utilisation,max_tasks_per_day
1212,7767295,5.0,0.00332,0.293414,150.611861
1139,6585903,3.0,0.001328,0.456432,225.970969
604,1527988,4.0,0.010154,0.797654,39.394851
618,5769891,5.0,0.013718,0.798369,36.447392
731,2502751,17.0,0.051228,0.803603,33.185076
512,6233278,3.0,0.007468,0.813461,40.169605
472,3318612,4.0,0.012749,0.829582,31.375625
450,1700711913253889,5.0,0.013861,0.836882,36.072596
641,1764761367947282,5.0,0.059669,0.838248,8.379547
451,6715607,8.0,0.024593,0.84751,32.529508
