In [1]:
#Import the libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import torch as T
import time
from scipy import stats
import wandb

device = T.device("cpu")  # Apply device to Tensor or Module

In [2]:
# Use GPU if available, else use CPU
if T.cuda.is_available():
    print(f"GPU: {T.cuda.get_device_name(0)} is available.")
    device = T.device("cuda")
else:
    print("No GPU available. Training will run on CPU.")

GPU: Quadro P2000 is available.


In [3]:
# Define the Incident dataset
class IncidentDataset(T.utils.data.Dataset):

    def __init__(self, x, y):
        # Convert x and y to 2D PyTorch tensors
        self.x = T.tensor(x.values, dtype=T.float32).to(device)
        # Display x shape
        display(self.x.shape)
        self.y = T.tensor(y.values, dtype=T.float32).to(device)
        # Display y shape
        display(self.y.shape)

    def __len__(self):
        # Return size of the dataset
        return len(self.x)

    def __getitem__(self, idx):
        # Return the dataset item at requested index
        return self.x[idx], self.y[idx]


In [4]:
# GPU's and TPU's (Google Cloud's Tensor processing unit built into IC) are generally tuned for Float 32 operations.
# Float 64 operations are 2X+ times slower and need 2X memory to store the Tensors.
# In actual practice, Users in pytorch forum are reporting 40X to 50X slowdown with float 64.
# References on the Web say that considering Speed vs Accuracy tradeoff, float 32 works best and gives adequate precision.
# Float 32 is recommended as default choice for most Applications.
# If you specifically need to use float 64, high end data center devices that support faster processing are recommended.
# Given the above, we can continue with float 32.
#
# Define the Neural Network
class Net(T.nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    # Define 3 hidden layers
    # Hidden layer 1. 17 inputs, 64 outputs
    self.hid1 = T.nn.Linear(17, 64)
    # Hidden layer 2. 64 inputs, 32 outputs
    self.hid2 = T.nn.Linear(64, 32)
    # Hidden layer 3. 32 inputs, 16 outputs
    self.hid3 = T.nn.Linear(32, 16)
    # Define output Layer. 16 inputs, 1 output
    self.oupt = T.nn.Linear(16, 1)

    # Initialize weights and bias  of each Layer
    # Kaiming initialization is recommended over Xavier initialization
    T.nn.init.kaiming_normal_(self.hid1.weight, mode='fan_in', nonlinearity='relu')
    T.nn.init.zeros_(self.hid1.bias)
      
    T.nn.init.kaiming_normal_(self.hid2.weight, mode='fan_in', nonlinearity='relu')
    T.nn.init.zeros_(self.hid2.bias)
      
    T.nn.init.kaiming_normal_(self.hid3.weight, mode='fan_in', nonlinearity='relu')
    T.nn.init.zeros_(self.hid3.bias)
      
    T.nn.init.kaiming_normal_(self.oupt.weight, mode='fan_in', nonlinearity='linear')
    T.nn.init.zeros_(self.oupt.bias)
      
  # Define forward pass for the Neural Network
  def forward(self, x):
    # Invoke Hidden 1 layer and do RELU activation
    z = T.relu(self.hid1(x))
    # Invoke Hidden 2 layer and do RELU activation
    z = T.relu(self.hid2(z))
    # Invoke Hidden 3 layer and do RELU activation
    z = T.relu(self.hid3(z))
    # Invoke Output layer and no activation
    z = self.oupt(z)
    return z


In [5]:
# Calculate accuracy
def accuracy(model, ds, pct):
  # Assumes model.eval()
  # Percent correct within pct of true Resolution Time
  n_correct = 0; n_wrong = 0; totalSqError = 0;

  for i in range(len(ds)):
    (X, Y) = ds[i]            # (predictors, target)
    with T.no_grad():
      oupt = model(X)         # Computed Resolution Time

    # Calculate absolute delta (Predicted Resolution Time - Actual Resolution Time)
    abs_delta = np.abs(oupt.item() - Y.item())
    max_allow = np.abs(pct * Y.item())
    # Add to total squared error
    totalSqError = (abs_delta)**2
    # Calculate number of correct and number of wrong
    if abs_delta < max_allow:
      n_correct +=1
    else:
      n_wrong += 1

  # Calculate the mean squared error
  meanSqError = totalSqError / len(ds)
  # Calculate RMSE 
  rmse = (meanSqError) ** (1/2)

  # Calculate accuracy
  acc = (n_correct * 100.0) / (n_correct + n_wrong)

  # Return number of correct, number of wrong, rmse, accuracy
  return n_correct, n_wrong, rmse, acc


In [6]:
# wandb login
wandb.login(key="d55bdd553d57be9e7bf9f7b938b879c5b675027a", verify=True)
# Initialize wandb
wrun = wandb.init(project='IncResTimePred2')

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m2023bs04022[0m ([33m2023bs04022-bits-pilani[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\arajwade\_netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

In [7]:
# Set Torch manual seed
T.manual_seed(4)
# Set numpy random seed
np.random.seed(4)

In [8]:
# Read the Incident dataset
df = pd.read_csv("Inc_Data_Selective.csv")
# Print dataframe first few rows
df.head()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date
0,11000.0,1701389000.0,3.0,4000.0,4000.0,3.0,2000.0,0.0,1701823000.0
1,11000.0,1701389000.0,3.0,4000.0,2000.0,3.0,2000.0,0.0,1701836000.0
2,12002.0,1701389000.0,0.0,4000.0,4000.0,3.0,2000.0,0.0,1701854000.0
3,12002.0,1701389000.0,0.0,4000.0,4000.0,3.0,2000.0,0.0,1701828000.0
4,4200.0,1701389000.0,1.0,4000.0,4000.0,3.0,2000.0,0.0,1701861000.0


In [9]:
# Print dataframe last few rows
df.tail()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date
8893,11000.0,1701583000.0,3.0,4000.0,4000.0,3.0,2000.0,0.0,1702015000.0
8894,12002.0,1701583000.0,0.0,4000.0,4000.0,3.0,2000.0,0.0,1702017000.0
8895,4200.0,1701583000.0,0.0,4000.0,4000.0,3.0,2000.0,0.0,1702027000.0
8896,11000.0,1701583000.0,3.0,4000.0,4000.0,3.0,2000.0,0.0,1702015000.0
8897,11000.0,1701583000.0,3.0,4000.0,4000.0,3.0,2000.0,0.0,1702015000.0


In [10]:
# Dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8898 entries, 0 to 8897
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reported Source  7998 non-null   float64
 1   Reported Date    7998 non-null   float64
 2   Service Type     8000 non-null   float64
 3   Urgency          7998 non-null   float64
 4   Impact           7998 non-null   float64
 5   Priority         7998 non-null   float64
 6   Closure Source   7351 non-null   float64
 7   Ticket Type      8005 non-null   float64
 8   Closed Date      7403 non-null   float64
dtypes: float64(9)
memory usage: 625.8 KB


In [11]:
# Describe dataframe
df.describe()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date
count,7998.0,7998.0,8000.0,7998.0,7998.0,7998.0,7351.0,8005.0,7403.0
mean,8370.495124,1701535000.0,1.813375,3756.189047,3496.499125,2.755689,2000.0,0.001499,1702052000.0
std,3545.821182,99019.0,1.251296,514.461205,771.134385,0.511053,0.0,0.063212,162436.3
min,1000.0,1701389000.0,0.0,1000.0,1000.0,0.0,2000.0,0.0,1701394000.0
25%,4200.0,1701477000.0,1.0,4000.0,3000.0,3.0,2000.0,0.0,1701936000.0
50%,11000.0,1701519000.0,3.0,4000.0,4000.0,3.0,2000.0,0.0,1702025000.0
75%,11000.0,1701649000.0,3.0,4000.0,4000.0,3.0,2000.0,0.0,1702127000.0
max,12002.0,1701695000.0,3.0,4000.0,4000.0,3.0,2000.0,3.0,1702707000.0


In [12]:
# Check for null values
df.isnull().sum()

Reported Source     900
Reported Date       900
Service Type        898
Urgency             900
Impact              900
Priority            900
Closure Source     1547
Ticket Type         893
Closed Date        1495
dtype: int64

In [13]:
# Drop tows with null columns
df = df.dropna(subset=df.columns.values)

In [14]:
#  Dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7351 entries, 0 to 8897
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reported Source  7351 non-null   float64
 1   Reported Date    7351 non-null   float64
 2   Service Type     7351 non-null   float64
 3   Urgency          7351 non-null   float64
 4   Impact           7351 non-null   float64
 5   Priority         7351 non-null   float64
 6   Closure Source   7351 non-null   float64
 7   Ticket Type      7351 non-null   float64
 8   Closed Date      7351 non-null   float64
dtypes: float64(9)
memory usage: 574.3 KB


In [15]:
# Check for null values
df.isnull().sum()

Reported Source    0
Reported Date      0
Service Type       0
Urgency            0
Impact             0
Priority           0
Closure Source     0
Ticket Type        0
Closed Date        0
dtype: int64

In [16]:
# Convert all cols to int64
df = df.astype('int64')

In [17]:
#  Dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7351 entries, 0 to 8897
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Reported Source  7351 non-null   int64
 1   Reported Date    7351 non-null   int64
 2   Service Type     7351 non-null   int64
 3   Urgency          7351 non-null   int64
 4   Impact           7351 non-null   int64
 5   Priority         7351 non-null   int64
 6   Closure Source   7351 non-null   int64
 7   Ticket Type      7351 non-null   int64
 8   Closed Date      7351 non-null   int64
dtypes: int64(9)
memory usage: 574.3 KB


In [18]:
# Print dataframe first few rows
df.head()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date
0,11000,1701388814,3,4000,4000,3,2000,0,1701823127
1,11000,1701388824,3,4000,2000,3,2000,0,1701835737
2,12002,1701388827,0,4000,4000,3,2000,0,1701854404
3,12002,1701388847,0,4000,4000,3,2000,0,1701828000
4,4200,1701388866,1,4000,4000,3,2000,0,1701861304


In [19]:
df.tail()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date
8893,11000,1701582900,3,4000,4000,3,2000,0,1702015489
8894,12002,1701582919,0,4000,4000,3,2000,0,1702016739
8895,4200,1701582920,0,4000,4000,3,2000,0,1702026760
8896,11000,1701582961,3,4000,4000,3,2000,0,1702015489
8897,11000,1701583031,3,4000,4000,3,2000,0,1702015489


In [20]:
df.describe()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date
count,7351.0,7351.0,7351.0,7351.0,7351.0,7351.0,7351.0,7351.0,7351.0
mean,8619.628078,1701533000.0,1.905591,3750.238063,3468.779758,2.749422,2000.0,0.0,1702055000.0
std,3431.789666,97342.61,1.230811,515.796337,783.380513,0.512754,0.0,0.0,158601.6
min,1000.0,1701389000.0,0.0,1000.0,1000.0,0.0,2000.0,0.0,1701822000.0
25%,5000.0,1701477000.0,1.0,4000.0,3000.0,3.0,2000.0,0.0,1701937000.0
50%,11000.0,1701517000.0,3.0,4000.0,4000.0,3.0,2000.0,0.0,1702029000.0
75%,11000.0,1701580000.0,3.0,4000.0,4000.0,3.0,2000.0,0.0,1702127000.0
max,12002.0,1701695000.0,3.0,4000.0,4000.0,3.0,2000.0,0.0,1702707000.0


In [21]:
# Calculate Resolution time in Hours = (Closed Date - Reported Date) / 3600
df['ResTime'] = (df['Closed Date'] - df['Reported Date']) / 3600

In [22]:
# Print dataframe first few rows
df.head()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date,ResTime
0,11000,1701388814,3,4000,4000,3,2000,0,1701823127,120.6425
1,11000,1701388824,3,4000,2000,3,2000,0,1701835737,124.1425
2,12002,1701388827,0,4000,4000,3,2000,0,1701854404,129.326944
3,12002,1701388847,0,4000,4000,3,2000,0,1701828000,121.986944
4,4200,1701388866,1,4000,4000,3,2000,0,1701861304,131.232778


In [23]:
# Print dataframe first few rows
df.tail()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date,ResTime
8893,11000,1701582900,3,4000,4000,3,2000,0,1702015489,120.163611
8894,12002,1701582919,0,4000,4000,3,2000,0,1702016739,120.505556
8895,4200,1701582920,0,4000,4000,3,2000,0,1702026760,123.288889
8896,11000,1701582961,3,4000,4000,3,2000,0,1702015489,120.146667
8897,11000,1701583031,3,4000,4000,3,2000,0,1702015489,120.127222


In [24]:
# Do ordinal encoding for Impact as there is clear ordering between the values
#   1000 Extensive / Widespread : 4
#   2000 Significant / Large    : 3
#   3000 Moderate / Limited     : 2
#   4000 Minor / Localized      : 1
impact_mapping = {4000: 1, 3000: 2, 2000: 3, 1000: 4}
df['Impact'] = df['Impact'].map(impact_mapping)
df.head()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date,ResTime
0,11000,1701388814,3,4000,1,3,2000,0,1701823127,120.6425
1,11000,1701388824,3,4000,3,3,2000,0,1701835737,124.1425
2,12002,1701388827,0,4000,1,3,2000,0,1701854404,129.326944
3,12002,1701388847,0,4000,1,3,2000,0,1701828000,121.986944
4,4200,1701388866,1,4000,1,3,2000,0,1701861304,131.232778


In [25]:
# Do ordinal encoding for Priority as there is clear ordering between the values
#   0 Critical : 4
#   1 High     : 3
#   2 Medium   : 2
#   3 Low      : 1
priority_mapping = {3: 1, 2: 2, 1: 3, 0: 4}
df['Priority'] = df['Priority'].map(priority_mapping)
df.head()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date,ResTime
0,11000,1701388814,3,4000,1,1,2000,0,1701823127,120.6425
1,11000,1701388824,3,4000,3,1,2000,0,1701835737,124.1425
2,12002,1701388827,0,4000,1,1,2000,0,1701854404,129.326944
3,12002,1701388847,0,4000,1,1,2000,0,1701828000,121.986944
4,4200,1701388866,1,4000,1,1,2000,0,1701861304,131.232778


In [26]:
# Do ordinal encoding for Urgency as there is clear ordering between the values
#  1000 Critical : 4
#  2000 High     : 3
#  3000 Medium   : 2
#  4000 Low      : 1
urgency_mapping = {4000: 1, 3000: 2, 2000: 3, 1000: 4}
df['Urgency'] = df['Urgency'].map(urgency_mapping)
df.head()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date,ResTime
0,11000,1701388814,3,1,1,1,2000,0,1701823127,120.6425
1,11000,1701388824,3,1,3,1,2000,0,1701835737,124.1425
2,12002,1701388827,0,1,1,1,2000,0,1701854404,129.326944
3,12002,1701388847,0,1,1,1,2000,0,1701828000,121.986944
4,4200,1701388866,1,1,1,1,2000,0,1701861304,131.232778


In [27]:
df['ResTime'].describe()

count    7351.000000
mean      144.961381
std        38.205842
min       120.003333
25%       120.716111
50%       125.083611
75%       152.743333
max       361.210833
Name: ResTime, dtype: float64

In [28]:
outliers = df[(df['ResTime'] > 120.0) & (df['ResTime'] < 200.0)]
outliers_count = outliers.count()
print(outliers_count)

Reported Source    6616
Reported Date      6616
Service Type       6616
Urgency            6616
Impact             6616
Priority           6616
Closure Source     6616
Ticket Type        6616
Closed Date        6616
ResTime            6616
dtype: int64


In [29]:
# Calculate IQR for column Resolution Time
Q1 = df['ResTime'].quantile(0.25)
Q3 = df['ResTime'].quantile(0.75)
IQR = Q3 - Q1

# Identify outliers
# < Q1 - 1.5 x IQR or > Q3 + 15. IQR
threshold = 1.5
outliers = df[(df['ResTime'] < Q1 - threshold * IQR) | (df['ResTime'] > Q3 + threshold * IQR)]
print(outliers)

      Reported Source  Reported Date  Service Type  Urgency  Impact  Priority  \
5                4200     1701388903             1        1       1         1   
11              11000     1701389181             3        2       2         2   
32              11000     1701390367             3        2       2         2   
33              11000     1701390440             3        2       2         2   
40              11000     1701390858             3        2       2         2   
...               ...            ...           ...      ...     ...       ...   
8730            12002     1701570529             0        1       1         1   
8758            11000     1701573993             3        2       2         2   
8802            11000     1701578048             3        1       2         1   
8804            11000     1701578071             3        1       2         1   
8885            12002     1701582127             0        1       1         1   

      Closure Source  Ticke

In [30]:
# Copy dataframe df to df_clean
df_clean = df

# Drop rows containing outliers so that outliers are not considered for Resolution Time prediction
df_clean = df.drop(outliers.index)

In [31]:
df_clean.head()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date,ResTime
0,11000,1701388814,3,1,1,1,2000,0,1701823127,120.6425
1,11000,1701388824,3,1,3,1,2000,0,1701835737,124.1425
2,12002,1701388827,0,1,1,1,2000,0,1701854404,129.326944
3,12002,1701388847,0,1,1,1,2000,0,1701828000,121.986944
4,4200,1701388866,1,1,1,1,2000,0,1701861304,131.232778


In [32]:
df_clean.tail()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date,ResTime
8893,11000,1701582900,3,1,1,1,2000,0,1702015489,120.163611
8894,12002,1701582919,0,1,1,1,2000,0,1702016739,120.505556
8895,4200,1701582920,0,1,1,1,2000,0,1702026760,123.288889
8896,11000,1701582961,3,1,1,1,2000,0,1702015489,120.146667
8897,11000,1701583031,3,1,1,1,2000,0,1702015489,120.127222


In [33]:
df_clean.describe()

Unnamed: 0,Reported Source,Reported Date,Service Type,Urgency,Impact,Priority,Closure Source,Ticket Type,Closed Date,ResTime
count,6622.0,6622.0,6622.0,6622.0,6622.0,6622.0,6622.0,6622.0,6622.0,6622.0
mean,8781.623679,1701538000.0,1.968137,1.242827,1.551193,1.244186,2000.0,0.0,1702022000.0,134.49065
std,3355.029291,96602.37,1.218735,0.510209,0.798744,0.508818,0.0,0.0,123197.5,20.101021
min,1000.0,1701389000.0,0.0,1.0,1.0,1.0,2000.0,0.0,1701822000.0,120.003333
25%,6000.0,1701482000.0,1.0,1.0,1.0,1.0,2000.0,0.0,1701931000.0,120.578819
50%,11000.0,1701521000.0,3.0,1.0,1.0,1.0,2000.0,0.0,1702013000.0,123.700556
75%,11000.0,1701583000.0,3.0,1.0,2.0,1.0,2000.0,0.0,1702115000.0,143.202708
max,12002.0,1701695000.0,3.0,4.0,4.0,4.0,2000.0,0.0,1702411000.0,200.775278


In [34]:
# Drop columns not required for Resolution Time prediction: Reported Date, Closure Source, Ticket Type, Closed Date
df_clean = df_clean.drop(['Reported Date', 'Closure Source', 'Ticket Type', 'Closed Date'], axis=1)

In [35]:
# 1 Hot encoding categorical nominal features: Reported Source, Service Type
df_encoded = pd.get_dummies(df_clean, columns=['Reported Source', 'Service Type'])
print(df_encoded)

      Urgency  Impact  Priority     ResTime  Reported Source_1000  \
0           1       1         1  120.642500                 False   
1           1       3         1  124.142500                 False   
2           1       1         1  129.326944                 False   
3           1       1         1  121.986944                 False   
4           1       1         1  131.232778                 False   
...       ...     ...       ...         ...                   ...   
8893        1       1         1  120.163611                 False   
8894        1       1         1  120.505556                 False   
8895        1       1         1  123.288889                 False   
8896        1       1         1  120.146667                 False   
8897        1       1         1  120.127222                 False   

      Reported Source_2000  Reported Source_3000  Reported Source_4200  \
0                    False                 False                 False   
1                    Fa

In [36]:
# Convert all dataframe columns to float 32
df_encoded = df_encoded.astype('float32')

In [37]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6622 entries, 0 to 8897
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Urgency                6622 non-null   float32
 1   Impact                 6622 non-null   float32
 2   Priority               6622 non-null   float32
 3   ResTime                6622 non-null   float32
 4   Reported Source_1000   6622 non-null   float32
 5   Reported Source_2000   6622 non-null   float32
 6   Reported Source_3000   6622 non-null   float32
 7   Reported Source_4200   6622 non-null   float32
 8   Reported Source_5000   6622 non-null   float32
 9   Reported Source_6000   6622 non-null   float32
 10  Reported Source_8000   6622 non-null   float32
 11  Reported Source_9000   6622 non-null   float32
 12  Reported Source_10000  6622 non-null   float32
 13  Reported Source_11000  6622 non-null   float32
 14  Reported Source_12002  6622 non-null   float32
 15  Service T

In [38]:
display(df_encoded)

Unnamed: 0,Urgency,Impact,Priority,ResTime,Reported Source_1000,Reported Source_2000,Reported Source_3000,Reported Source_4200,Reported Source_5000,Reported Source_6000,Reported Source_8000,Reported Source_9000,Reported Source_10000,Reported Source_11000,Reported Source_12002,Service Type_0,Service Type_1,Service Type_3
0,1.0,1.0,1.0,120.642502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,3.0,1.0,124.142502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1.0,1.0,1.0,129.326950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1.0,1.0,1.0,121.986946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,1.0,1.0,1.0,131.232773,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8893,1.0,1.0,1.0,120.163612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
8894,1.0,1.0,1.0,120.505554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
8895,1.0,1.0,1.0,123.288887,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8896,1.0,1.0,1.0,120.146667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [39]:
# Copy the data 
#df_z_scaled = df.copy() 
  
# Apply Z scaling normalization techniques 
#for column in df_z_scaled.columns: 
#    if (df_z_scaled[column].std() != 0):
#        df_z_scaled[column] = (df_z_scaled[column] -
#                           df_z_scaled[column].mean()) / df_z_scaled[column].std()     
  
# View Z scaled normalized data    
#display(df_z_scaled)

In [40]:
# Create a MinMaxScaler object
#scaler = MinMaxScaler()

# Fit and transform the data
#normalized_data = scaler.fit_transform(df)

# Create a new DataFrame with the minmax normalized data
#normalized_df = pd.DataFrame(normalized_data, columns=df.columns)

# View MinMax scaled normalized data    
#display(normalized_df)

In [41]:
# X: Reported Source, Service Type, Urgency, Impact, Priority
X = df_encoded.iloc[:,[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]]
# Y: Resolution Time
y = df_encoded.iloc[:, [3]]

In [42]:
# Display dataframe X
display(X)

Unnamed: 0,Urgency,Impact,Priority,Reported Source_1000,Reported Source_2000,Reported Source_3000,Reported Source_4200,Reported Source_5000,Reported Source_6000,Reported Source_8000,Reported Source_9000,Reported Source_10000,Reported Source_11000,Reported Source_12002,Service Type_0,Service Type_1,Service Type_3
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8893,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
8894,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
8895,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8896,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [43]:
# Display dataframe Y
display(y)

Unnamed: 0,ResTime
0,120.642502
1,124.142502
2,129.326950
3,121.986946
4,131.232773
...,...
8893,120.163612
8894,120.505554
8895,123.288887
8896,120.146667


In [44]:
# Do Train Test split 80:20 split
# Train set 80%
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.20, random_state=1)

In [45]:
# Display frame X_train
display(X_train)

Unnamed: 0,Urgency,Impact,Priority,Reported Source_1000,Reported Source_2000,Reported Source_3000,Reported Source_4200,Reported Source_5000,Reported Source_6000,Reported Source_8000,Reported Source_9000,Reported Source_10000,Reported Source_11000,Reported Source_12002,Service Type_0,Service Type_1,Service Type_3
7101,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3213,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
8270,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7051,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2875,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2067,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7272,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5802,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
809,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [46]:
# Display frame X_rest
display(X_rest)

Unnamed: 0,Urgency,Impact,Priority,Reported Source_1000,Reported Source_2000,Reported Source_3000,Reported Source_4200,Reported Source_5000,Reported Source_6000,Reported Source_8000,Reported Source_9000,Reported Source_10000,Reported Source_11000,Reported Source_12002,Service Type_0,Service Type_1,Service Type_3
5843,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7899,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
903,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4628,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
648,2.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6615,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6483,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4128,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3175,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [47]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5297 entries, 7101 to 7237
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Urgency                5297 non-null   float32
 1   Impact                 5297 non-null   float32
 2   Priority               5297 non-null   float32
 3   Reported Source_1000   5297 non-null   float32
 4   Reported Source_2000   5297 non-null   float32
 5   Reported Source_3000   5297 non-null   float32
 6   Reported Source_4200   5297 non-null   float32
 7   Reported Source_5000   5297 non-null   float32
 8   Reported Source_6000   5297 non-null   float32
 9   Reported Source_8000   5297 non-null   float32
 10  Reported Source_9000   5297 non-null   float32
 11  Reported Source_10000  5297 non-null   float32
 12  Reported Source_11000  5297 non-null   float32
 13  Reported Source_12002  5297 non-null   float32
 14  Service Type_0         5297 non-null   float32
 15  Servic

In [48]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5297 entries, 7101 to 7237
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ResTime  5297 non-null   float32
dtypes: float32(1)
memory usage: 62.1 KB


In [49]:
# Do Train Test split 50:50 split
# Validation 10%
# Test 10%
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.50, random_state=1)

In [50]:
# Convert Train, Validation & Test to 2D PyTorch tensors
#TX_train = T.tensor(X_train.values, dtype=T.float64).to(device)
#Ty_train = T.tensor(y_train.values, dtype=T.float64).to(device).reshape(-1, 1)
#TX_val = T.tensor(X_val.values, dtype=T.float64).to(device)
#Ty_val = T.tensor(y_val.values, dtype=T.float64).to(device).reshape(-1, 1)
#TX_test = T.tensor(X_test.values, dtype=T.float64).to(device)
#Ty_test = T.tensor(y_test.values, dtype=T.float64).to(device).reshape(-1, 1)

In [51]:
# Display TX_train shape
#display(TX_train.shape)

In [52]:
# Display Ty_train shape
#display(Ty_train.shape)

In [53]:
# Display TX_val shape
#display(TX_val.shape)

In [54]:
# Display Ty_val shape
#display(Ty_val.shape)

In [55]:
# Display TX_test shape
#display(TX_test.shape)

In [56]:
# Display Ty_test shape
#display(Ty_test.shape)

In [57]:
# Create Train, Validation & Test datasets
train_ds = IncidentDataset(X_train, y_train)
val_ds = IncidentDataset(X_val, y_val)
test_ds = IncidentDataset(X_test, y_test)

torch.Size([5297, 17])

torch.Size([5297, 1])

torch.Size([662, 17])

torch.Size([662, 1])

torch.Size([663, 17])

torch.Size([663, 1])

In [58]:
# Training parameters
batch_size = 10 # Batch size
max_epochs = 500 # Maximum epochs
ep_log_interval = 50 # Epoch log interval
lrn_rate = 0.001 # Learning rate for Gradient descent
weight_decay = 0.0001 # Weight decay for Adam/AdamW optimizer L2 regularization

# Create Training Dataloader
train_loader = T.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# Calculate Training dataset length
train_ds_length = len(train_ds)

# Calculate batches per epoch
batches_per_epoch = round(train_ds_length / batch_size)

# Create network
net = Net().to(device)

# Define loss function as Mean squared error loss (MSE)
loss_func = T.nn.MSELoss()

# Define optimizer as Adam
# optimizer = T.optim.SGD(net.parameters(), lr=lrn_rate)
#
# Use L2 regularizarion with AdamW by specifying weight_decay parameter
# Also check using AdamW optimizer instead of Adam optimizer which does better regularization/generalization
optimizer = T.optim.AdamW(net.parameters(), lr=lrn_rate, weight_decay=weight_decay)

print("\nbatch_size = %3d " % batch_size)
print("loss = " + str(loss_func))
print("optimizer = Adam")
print("max_epochs = %3d " % max_epochs)
print("lrn_rate = %0.3f " % lrn_rate)
print("weight_decay = %0.4f " % weight_decay)
print("train_ds_length = %4d " % train_ds_length)
print("batches_per_epoch = %3d " % batches_per_epoch)

# Set network in Training mode
net.train()


batch_size =  10 
loss = MSELoss()
optimizer = Adam
max_epochs = 500 
lrn_rate = 0.001 
weight_decay = 0.0001 
train_ds_length = 5297 
batches_per_epoch = 530 


Net(
  (hid1): Linear(in_features=17, out_features=64, bias=True)
  (hid2): Linear(in_features=64, out_features=32, bias=True)
  (hid3): Linear(in_features=32, out_features=16, bias=True)
  (oupt): Linear(in_features=16, out_features=1, bias=True)
)

In [59]:
#Training loop
for epoch in range(0, max_epochs):
    T.manual_seed(1+epoch)  # Recovery reproducibility
    epoch_loss = 0  # Epoch loss for one full epoch

    for (batch_idx, batch) in enumerate(train_loader):
        (X, Y) = batch                 # (predictors, targets)
        optimizer.zero_grad()          # Prepare gradients
        oupt = net(X)                  # Predicted Incident Resolution Time
        loss_val = loss_func(oupt, Y)  # Avgerate per item in batch
        epoch_loss += loss_val.item()  # Accumulate averages
        loss_val.backward()            # Compute gradients
        optimizer.step()               # Update weigths
        
        if epoch % ep_log_interval == 0:
            print("epoch = %4d   batch= %4d  loss = %0.4f" % (epoch, batch_idx, epoch_loss))
            # save checkpoint
            dt = time.strftime("%Y_%m_%d-%H_%M_%S")
            fn = ".\\Log\\" + str(dt) + str("-") + str(epoch) + "_checkpoint.pt"
            
            info_dict = { 
                'epoch' : epoch,
                'net_state' : net.state_dict(),
                'optimizer_state' : optimizer.state_dict()
            }
            T.save(info_dict, fn)

    current_step = epoch+1
    # wandb log epoch and epoch loss
    wandb.log({'epoch': epoch+1, 'epoch loss': epoch_loss/batches_per_epoch}, step=current_step)

    # Set network in Evaluation mode
    net.eval()

    # Validation dataset calculate Accuracy and RMSE
    n_correct, n_wrong, rmse, acc  = accuracy(net, val_ds, 0.10)

    # Set network in Training mode
    net.train()

    # wandb log epoch and Validation RMSE
    wandb.log({'epoch': epoch+1, 'val RMSE': rmse}, step=current_step)
    # wandb log epoch and Validation 
    wandb.log({'epoch': epoch+1, 'val accuracy': acc}, step=current_step)

print("Done ")

epoch =    0   batch=    0  loss = 19281.4531
epoch =    0   batch=    1  loss = 36699.5039
epoch =    0   batch=    2  loss = 58475.2520
epoch =    0   batch=    3  loss = 75289.9375
epoch =    0   batch=    4  loss = 92649.6270
epoch =    0   batch=    5  loss = 109158.9648
epoch =    0   batch=    6  loss = 127128.0000
epoch =    0   batch=    7  loss = 144244.3066
epoch =    0   batch=    8  loss = 162350.1094
epoch =    0   batch=    9  loss = 182301.6270
epoch =    0   batch=   10  loss = 198452.2393
epoch =    0   batch=   11  loss = 216846.8857
epoch =    0   batch=   12  loss = 238341.6709
epoch =    0   batch=   13  loss = 255687.3369
epoch =    0   batch=   14  loss = 272774.8936
epoch =    0   batch=   15  loss = 290056.8252
epoch =    0   batch=   16  loss = 312346.9814
epoch =    0   batch=   17  loss = 330444.2646
epoch =    0   batch=   18  loss = 346339.9834
epoch =    0   batch=   19  loss = 365315.4014
epoch =    0   batch=   20  loss = 381921.5361
epoch =    0   bat

In [60]:
# Evaluate model accuracy
print("\nComputing model accuracy")
# Set network in Evaluation mode
net.eval()

# Training dataset calculate Accuracy and RMSE
n_correct, n_wrong, rmse, acc = accuracy(net, train_ds, 0.10)
print("*** Training dataset ***")
# Print number of correct, number of wrong
print("n_correct = %3d " % n_correct)
print("n_wrong = %3d " % n_wrong)
# Print RMSE
print("RMSE = %0.4f " % rmse)
# Print accuracy
print("Accuracy (within 0.10) = %0.4f" % acc)

# Validation dataset calculate Accuracy and RMSE
n_correct, n_wrong, rmse, acc  = accuracy(net, val_ds, 0.10)
print("*** Validation dataset ***")
# Print number of correct, number of wrong
print("n_correct = %3d " % n_correct)
print("n_wrong = %3d " % n_wrong)
# Print RMSE
print("RMSE = %0.4f " % rmse)
# Print accuracy
print("Accuracy (within 0.10) = %0.4f" % acc)

#acc_test = accuracy(net, test_ds, 0.10) 
#print("Accuracy (within 0.10) on test dataset  = %0.4f" % acc_test)


Computing model accuracy
*** Training dataset ***
n_correct = 3232 
n_wrong = 2065 
RMSE = 0.0267 
Accuracy (within 0.10) = 61.0157
*** Validation dataset ***
n_correct = 426 
n_wrong = 236 
RMSE = 0.4389 
Accuracy (within 0.10) = 64.3505


In [61]:
# Finish wandb run
wrun.finish()

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇██
epoch loss,█▄▂▃▃▃▃▃▃▃▃▂▂▂▃▂▃▂▂▂▂▂▃▃▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▂
val RMSE,▄▁▄▅▇█▂▆▅▂▆█▃▂▅▃▆▄▅▂▃▁▆▂▃▅▄▇▄▄▅▂▅▂▆▁▇▅▄▄
val accuracy,▆▆▃▁▇▇▄▄▅▅▆▇▄▅▃▂▄▅▅▇▅▆▆▅▇▇▇▇▄█▅▂▅█▅▇▅▅▆▆

0,1
epoch,500.0
epoch loss,344.71511
val RMSE,0.43893
val accuracy,64.35045
