In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [40]:
train = pd.read_csv('combined_dataset.csv',sep=',')
train.info()

test = pd.read_csv('enhanced_combined_dataset.csv',sep=',')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Technician ID          50 non-null     object
 1   Task ID                50 non-null     object
 2   Expertise Match        50 non-null     int64 
 3   Task Priority          50 non-null     int64 
 4   Task Duration          50 non-null     int64 
 5   Distance to Task (km)  50 non-null     int64 
 6   Task Completed         50 non-null     int64 
dtypes: int64(5), object(2)
memory usage: 2.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Distance to Task (km)  200 non-null    int64  
 1   Priority               200 non-null    int64  
 2   Task Complexity        200 non-null    int64  
 3   Max Working Hours   

### Drop Unnecessary Columns & Rename Columns

In [41]:
train_drop = train.drop(columns=['Task Duration'])

test_drop = test.drop(columns=["Max Working Hours","Travel Time (minutes)","Overtime Cost ($)"])
test_drop = test_drop.rename(columns={'Priority': 'Task Priority', 'Penalty for Delay ($)': 'Penalty Cost'})

train_drop.info()
test_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Technician ID          50 non-null     object
 1   Task ID                50 non-null     object
 2   Expertise Match        50 non-null     int64 
 3   Task Priority          50 non-null     int64 
 4   Distance to Task (km)  50 non-null     int64 
 5   Task Completed         50 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 2.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Distance to Task (km)  200 non-null    int64
 1   Task Priority          200 non-null    int64
 2   Task Complexity        200 non-null    int64
 3   Equipment Required     200 non-null    int64
 4   Customer Rating        200 non-

### Calculate Penalty Cost in Test Data

In [42]:
penalty_ranges = {
    1: (10, 50),
    2: (20, 100),
    3: (30, 150),
    4: (40, 200),
    5: (50, 250)
}

def calculate_penalty(priority):
    low, high = penalty_ranges[priority]
    return np.random.randint(low, high)

train_drop['Penalty Cost'] = train_drop['Task Priority'].apply(calculate_penalty)

train_drop.head()
train_drop.info()

Unnamed: 0,Technician ID,Task ID,Expertise Match,Task Priority,Distance to Task (km),Task Completed,Penalty Cost
0,T005,J001,0,2,1,1,21
1,T006,J002,1,1,17,0,33
2,T005,J003,0,2,19,1,49
3,T009,J004,1,3,15,1,67
4,T003,J005,1,3,14,1,31


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Technician ID          50 non-null     object
 1   Task ID                50 non-null     object
 2   Expertise Match        50 non-null     int64 
 3   Task Priority          50 non-null     int64 
 4   Distance to Task (km)  50 non-null     int64 
 5   Task Completed         50 non-null     int64 
 6   Penalty Cost           50 non-null     int64 
dtypes: int64(5), object(2)
memory usage: 2.9+ KB


### Derive New Columns

<p>
    logic:

expertise = 0 if 

	eqpt trained-1, eqpt required-1 and task comp-h, tech comp-l
	eqpt trained-0, eqpt required-0 and task comp-h, tech comp-l

	eqpt trained-0, eqpt required-1 and task comp-l, tech comp-h

	eqpt trained-1, eqpt required-0 and task comp-h, tech comp-l
	eqpt trained-0, eqpt required-1 and task comp-h, tech comp-l


expertise = 1 if 

	eqpt trained-1, eqpt required-1 and task comp-l, tech comp-h
    eqpt trained-1, eqpt required-0 and task comp-l, tech comp-h
	eqpt trained-0, eqpt required-0 and task comp-l, tech comp-h
</p>

In [43]:
merged_df = pd.merge(
    train_drop, test_drop, on=['Task Priority', 'Distance to Task (km)']
)
merged_df = merged_df.sort_values(by=['Technician ID','Expertise Match'])
merged_df = merged_df[['Technician ID', 'Expertise Match', 'Equipment Required', 'Task Complexity']]
merged_df

Unnamed: 0,Technician ID,Expertise Match,Equipment Required,Task Complexity
25,T001,0,0,7
26,T001,0,1,6
22,T001,1,0,9
5,T002,0,0,6
9,T002,0,0,9
13,T002,1,0,5
23,T003,0,0,5
1,T003,1,1,5
4,T004,0,0,3
11,T004,1,0,5


In [44]:
np.random.seed(42)

# created an empty tech dictionary
techdata = [
    {'Technician ID': 'T001', 'Eqpt Trained': None, 'Tech Complexity': None},
    {'Technician ID': 'T002', 'Eqpt Trained': None, 'Tech Complexity': None},
    {'Technician ID': 'T003', 'Eqpt Trained': None, 'Tech Complexity': None},
    {'Technician ID': 'T004', 'Eqpt Trained': None, 'Tech Complexity': None},
    {'Technician ID': 'T005', 'Eqpt Trained': None, 'Tech Complexity': None},
    {'Technician ID': 'T006', 'Eqpt Trained': None, 'Tech Complexity': None},
    {'Technician ID': 'T007', 'Eqpt Trained': None, 'Tech Complexity': None},
    {'Technician ID': 'T008', 'Eqpt Trained': None, 'Tech Complexity': None},
    {'Technician ID': 'T009', 'Eqpt Trained': None, 'Tech Complexity': None},
    {'Technician ID': 'T010', 'Eqpt Trained': None, 'Tech Complexity': None}
]

tech_df = pd.DataFrame(techdata)


df = merged_df[
    (merged_df['Expertise Match'] == 1)
    & (merged_df['Equipment Required'] == 1)
].copy()


df['Eqpt Trained'] = 1
df['Tech Complexity'] = df['Task Complexity'].apply(lambda x: np.random.randint(x, 11))

findtrainedtech = tech_df.merge(df, on='Technician ID', how='left', suffixes=('', '_new'))

findtrainedtech['Eqpt Trained'] = findtrainedtech['Eqpt Trained_new'].combine_first(findtrainedtech['Eqpt Trained'])
findtrainedtech['Tech Complexity'] = findtrainedtech['Tech Complexity_new'].combine_first(findtrainedtech['Tech Complexity'])

mask = findtrainedtech['Eqpt Trained'].isna()
findtrainedtech.loc[mask, 'Eqpt Trained'] = np.random.randint(0, 2, size=mask.sum())


findtrainedtech.drop(columns=['Eqpt Trained_new', 'Tech Complexity_new', 'Expertise Match' ,'Equipment Required','Task Complexity'], inplace=True)
findtrainedtech


Unnamed: 0,Technician ID,Eqpt Trained,Tech Complexity
0,T001,0.0,
1,T002,0.0,
2,T003,1.0,8.0
3,T004,1.0,
4,T005,0.0,
5,T006,0.0,
6,T007,1.0,10.0
7,T008,0.0,
8,T009,1.0,7.0
9,T009,1.0,7.0


In [45]:
np.random.seed(42)

findtechcomp = merged_df.merge(findtrainedtech, on='Technician ID', how='left')
# findtechcomp

MAX_COMPLEXITY = 10  # define an upper limit

# --- STEP 1: Define helper function ---
def assign_tech_comp(row):
    """
    Assign Tech Complexity based on:
      - Eqpt Trained
      - Eqpt Required
      - Expertise Match
      - Task Complexity
    """
    # skip if Tech Complexity already filled
    if pd.notna(row['Tech Complexity']):
        return row['Tech Complexity']

    t_train = row['Eqpt Trained']
    t_req = row['Equipment Required']
    expertise = row['Expertise Match']
    task_comp = row['Task Complexity']

    # --- Logic from your description ---
    # EXPERTISE = 0 → task high, tech low  (usually underqualified)
    # EXPERTISE = 1 → task low, tech high (overqualified / expert)
    if expertise == 0:
        # Random number LOWER than task complexity (but >= 1)
        lower_bound = 1
        upper_bound = max(2, task_comp)
        return np.random.randint(lower_bound, upper_bound)

    elif expertise == 1:
        # Random number HIGHER than task complexity
        lower_bound = task_comp
        upper_bound = MAX_COMPLEXITY + 1
        return np.random.randint(lower_bound, upper_bound)

    # Default fallback if no match
    return task_comp


# --- STEP 2: Apply the function ---
findtechcomp['Tech Complexity'] = findtechcomp.apply(assign_tech_comp, axis=1)

# findtechcomp

avg_comp = findtechcomp.groupby('Technician ID')['Tech Complexity'].mean().reset_index().round()

tech_df = findtrainedtech.merge(avg_comp, on='Technician ID', suffixes=('', '_avg'))
tech_df = tech_df[['Technician ID', 'Eqpt Trained', 'Tech Complexity_avg']]
tech_df = tech_df.rename(columns={'Tech Complexity_avg': 'Tech Complexity'})
tech_df['Tech Complexity'] = tech_df['Tech Complexity'].astype(int)
tech_df['Eqpt Trained'] = tech_df['Eqpt Trained'].astype(int)
tech_df


Unnamed: 0,Technician ID,Eqpt Trained,Tech Complexity
0,T001,0,6
1,T002,0,7
2,T003,1,8
3,T004,1,4
4,T005,0,6
5,T006,0,6
6,T007,1,10
7,T008,0,5
8,T009,1,7
9,T009,1,7


### One Hot Encode Technician ID

In [46]:
# train_encoded = pd.get_dummies(train_clean, columns=["Technician ID"], dtype=int)