In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last
# one, so cells below can list several frames/previews and have each rendered.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# Load the first source file (comma-separated, which is the read_csv default)
# and print its column/dtype summary.
train = pd.read_csv('combined_dataset.csv')
train.info()

# Same for the second, wider source file.
test = pd.read_csv('enhanced_combined_dataset.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Technician ID          50 non-null     object
 1   Task ID                50 non-null     object
 2   Expertise Match        50 non-null     int64 
 3   Task Priority          50 non-null     int64 
 4   Task Duration          50 non-null     int64 
 5   Distance to Task (km)  50 non-null     int64 
 6   Task Completed         50 non-null     int64 
dtypes: int64(5), object(2)
memory usage: 2.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Distance to Task (km)  200 non-null    int64  
 1   Priority               200 non-null    int64  
 2   Task Complexity        200 non-null    int64  
 3   Max Working Hours   

### Drop Unnecessary Columns & Rename Columns

In [3]:
# Columns that are not carried forward from either frame.
unused_train_columns = ['Task Duration']
unused_test_columns = ["Max Working Hours", "Travel Time (minutes)", "Overtime Cost ($)"]

# Align the second frame's column names with the names used in the first frame.
test_column_names = {'Priority': 'Task Priority', 'Penalty for Delay ($)': 'Penalty Cost'}

train_drop = train.drop(columns=unused_train_columns)
test_drop = (
    test
    .drop(columns=unused_test_columns)
    .rename(columns=test_column_names)
)

# Confirm the resulting schemas.
train_drop.info()
test_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Technician ID          50 non-null     object
 1   Task ID                50 non-null     object
 2   Expertise Match        50 non-null     int64 
 3   Task Priority          50 non-null     int64 
 4   Distance to Task (km)  50 non-null     int64 
 5   Task Completed         50 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 2.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Distance to Task (km)  200 non-null    int64
 1   Task Priority          200 non-null    int64
 2   Task Complexity        200 non-null    int64
 3   Equipment Required     200 non-null    int64
 4   Customer Rating        200 non-

### Calculate Penalty Cost in Train Data

In [4]:
# Penalty bounds per task priority, stored as (low, high). When sampling,
# `low` is inclusive and `high` is exclusive (see calculate_penalty).
penalty_ranges = {
    1: (10, 50),
    2: (20, 100),
    3: (30, 150),
    4: (40, 200),
    5: (50, 250)
}

# Fixed seed so that Restart & Run All reproduces the same synthetic penalties.
# Re-running this cell re-creates the generator, which restarts the sequence.
PENALTY_SEED = 42
_penalty_rng = np.random.default_rng(PENALTY_SEED)


def calculate_penalty(priority, rng=None):
    """Draw a random integer penalty for the given task priority.

    Parameters
    ----------
    priority : int
        Key into ``penalty_ranges``.
    rng : numpy.random.Generator, optional
        Source of randomness. Defaults to the seeded notebook-level generator,
        so results are reproducible across fresh runs.

    Returns
    -------
    int
        A value ``v`` with ``low <= v < high`` for the priority's range.

    Raises
    ------
    KeyError
        If ``priority`` has no entry in ``penalty_ranges``.
    """
    low, high = penalty_ranges[priority]
    generator = _penalty_rng if rng is None else rng
    # Generator.integers uses a half-open interval by default, matching the
    # behaviour of the np.random.randint call it replaces.
    return int(generator.integers(low, high))

# Draw one penalty per training row, keyed by that row's task priority,
# and store it as a new column on the training frame.
task_priorities = train_drop['Task Priority']
train_drop['Penalty Cost'] = task_priorities.apply(calculate_penalty)

# Preview the first rows and confirm the new column's dtype / non-null count.
train_drop.head()
train_drop.info()

Unnamed: 0,Technician ID,Task ID,Expertise Match,Task Priority,Distance to Task (km),Task Completed,Penalty Cost
0,T005,J001,0,2,1,1,92
1,T006,J002,1,1,17,0,18
2,T005,J003,0,2,19,1,35
3,T009,J004,1,3,15,1,133
4,T003,J005,1,3,14,1,33


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Technician ID          50 non-null     object
 1   Task ID                50 non-null     object
 2   Expertise Match        50 non-null     int64 
 3   Task Priority          50 non-null     int64 
 4   Distance to Task (km)  50 non-null     int64 
 5   Task Completed         50 non-null     int64 
 6   Penalty Cost           50 non-null     int64 
dtypes: int64(5), object(2)
memory usage: 2.9+ KB


### Derive New Columns

<p>
    Logic (eqpt = equipment, comp = complexity, h = high, l = low):

expertise = 0 if 

	eqpt trained-1, eqpt required-1 and task comp-h, tech comp-l
	eqpt trained-0, eqpt required-0 and task comp-h, tech comp-l

	eqpt trained-0, eqpt required-1 and task comp-l, tech comp-h

	eqpt trained-1, eqpt required-0 and task comp-h, tech comp-l
	eqpt trained-0, eqpt required-1 and task comp-h, tech comp-l


expertise = 1 if 

	eqpt trained-1, eqpt required-1 and task comp-l, tech comp-h
    eqpt trained-1, eqpt required-0 and task comp-l, tech comp-h
	eqpt trained-0, eqpt required-0 and task comp-l, tech comp-h
</p>

In [62]:
# Inner-join the two prepared frames on the columns they have in common as
# join keys, order the rows by technician and expertise flag, and keep only
# the columns needed to derive the technician-level attributes.
join_keys = ['Task Priority', 'Distance to Task (km)']
kept_columns = ['Technician ID', 'Expertise Match', 'Equipment Required', 'Task Complexity']

merged_df = (
    train_drop
    .merge(test_drop, on=join_keys)
    .sort_values(by=['Technician ID', 'Expertise Match'])
    [kept_columns]
)
merged_df

Unnamed: 0,Technician ID,Expertise Match,Equipment Required,Task Complexity
25,T001,0,0,7
26,T001,0,1,6
22,T001,1,0,9
5,T002,0,0,6
9,T002,0,0,9
13,T002,1,0,5
23,T003,0,0,5
1,T003,1,1,5
4,T004,0,0,3
11,T004,1,0,5


In [88]:
# Rows where the technician matched the task's expertise AND the task required
# equipment. Per the logic notes above, expertise = 1 with equipment required
# only occurs when the technician is equipment-trained, so these rows are
# marked 'Eqpt Trained' = 1.
is_trained_row = (
    (merged_df['Expertise Match'] == 1)
    & (merged_df['Equipment Required'] == 1)
)
trained_tech_ids = merged_df.loc[is_trained_row, 'Technician ID'].tolist()

# Column-wise technician table, built in one step instead of appending row by
# row with iterrows(). 'Tech Complexity' is only a placeholder (None) here.
# NOTE(review): a technician appears once per matching row, so IDs can repeat
# — confirm whether duplicates should be dropped.
data = {
    'Technician ID': trained_tech_ids,
    'Eqpt Trained': [1] * len(trained_tech_ids),
    'Tech Complexity': [None] * len(trained_tech_ids),
}

df = pd.DataFrame(data)
print(df)

  Technician ID  Eqpt Trained Tech Complexity
0          T003             1            None
1          T007             1            None
2          T009             1            None
3          T009             1            None
4          T010             1            None


### One Hot Encode Technician ID

In [6]:
# NOTE(review): `train_clean` is not defined in any visible cell (the recorded
# output of this cell is a NameError); the prepared training frame in this
# notebook is `train_drop` — confirm the intended input before re-enabling.
# train_encoded = pd.get_dummies(train_clean, columns=["Technician ID"], dtype=int)

NameError: name 'train_clean' is not defined