In [9]:
import math
import os
import random

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification

In [10]:
# Folder that holds the project's Excel files. Set the CR4CR_DATA_DIR
# environment variable to point the notebook at another machine's copy; the
# default is the original location. os.path.join(..., "") guarantees the
# trailing separator that the `dataset_location + "<file name>"` concatenations
# in later cells rely on.
dataset_location = os.path.join(
    os.environ.get("CR4CR_DATA_DIR", "/Users/Xutao/Documents/CR4CR/data/"), ""
)

In [11]:
# Read the scored responses, drop the first spreadsheet column and replace
# every missing cell with an empty string, then preview the result.
df = (
    pd.read_excel(dataset_location + "output.xlsx")
    .iloc[:, 1:]
    .fillna('')
)
df.head()

Unnamed: 0,Score,Market.00a_OE,Market.00b_OE,Market.00c_OE
0,3,"Number of people inside the shop, number of pe...",Number of people inside the shop and number of...,We will definitively need to know the amount o...
1,0,what causes a door to open automatically.,what can cause a door to open automatically,i think this piece of information is important...
2,1,How many people are going through the doors. H...,How long it will take for everyone to go throu...,We must know this information to decrease the ...
3,1,"The groups of customers that might go in, the ...",customers that go in and leave,"Because we need to how many customers go in, a..."
4,1,"How big is the store, as a team of constructio...",What does the owner want. How tall the store i...,This can impact how the finished product will ...


In [12]:
from transformers import BertTokenizer, BertModel, DistilBertPreTrainedModel, RobertaForSequenceClassification
#from transformers.modeling_distilbert import DistilBertModel, DistilBertPreTrainedModel
from torch import nn
from sklearn.preprocessing import OneHotEncoder


class MultimodalRoberta(torch.nn.Module):
    """Classifier built from three separate ``distilroberta-base`` encoders.

    Each of the three inputs (a, b, c) is encoded by its own RobertaModel; the
    three ``pooler_output`` vectors are concatenated along the feature axis and
    mapped to ``num_labels`` logits by a single linear layer.

    Args:
        num_labels: number of output classes (size of the logit vector).
    """

    def __init__(self, num_labels=5):
        super().__init__()
        self.num_labels = num_labels

        # One independent encoder per response; weights are not shared.
        self.roberta1 = RobertaModel.from_pretrained('distilroberta-base')
        self.roberta2 = RobertaModel.from_pretrained('distilroberta-base')
        self.roberta3 = RobertaModel.from_pretrained('distilroberta-base')

        # The linear head consumes the concatenation of the three pooled vectors.
        fused_width = sum(
            encoder.config.hidden_size
            for encoder in (self.roberta1, self.roberta2, self.roberta3)
        )
        self.classifier = nn.Linear(fused_width, num_labels)

    def forward(self, input_ids_a, attention_mask_a, input_ids_b, attention_mask_b, input_ids_c, attention_mask_c):
        """Return the classifier logits for one batch of (a, b, c) inputs."""
        towers = (
            (self.roberta1, input_ids_a, attention_mask_a),
            (self.roberta2, input_ids_b, attention_mask_b),
            (self.roberta3, input_ids_c, attention_mask_c),
        )
        pooled = [
            encoder(input_ids=ids, attention_mask=mask).pooler_output
            for encoder, ids, mask in towers
        ]
        return self.classifier(torch.cat(pooled, 1))


class MultimodalRobertaDataset(Dataset):
    """Dataset that tokenizes the three responses of one row and returns its score.

    Args:
        dataframe: frame with a ``Score`` column and the three response columns
            listed in ``self.columns``.
        tokenizer: object exposing ``encode_plus`` that returns ``input_ids``
            and ``attention_mask`` (here a RobertaTokenizer).
        max_length: every response is padded or truncated to this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Response columns, in the order of the a / b / c model inputs.
        self.columns = ["Market.00a_OE", "Market.00b_OE", "Market.00c_OE"]
        self.labels = self.dataframe['Score'].values

    def __len__(self):
        return len(self.dataframe)

    def _encode(self, value):
        """Tokenize one response cell, padded/truncated to ``max_length``."""
        # Missing cells become an empty string (same convention as the
        # training frame's fillna('')); any other non-string cell, e.g. a
        # number read from Excel, is converted to text so the tokenizer
        # always receives a str.
        text = "" if pd.isna(value) else str(value)
        return self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
        )

    def __getitem__(self, index):
        """Return token ids and attention masks for responses a, b, c plus the score.

        Keys: ``input_ids_{a,b,c}``, ``attention_mask_{a,b,c}`` (long tensors)
        and ``score`` (float tensor).
        """
        row = self.dataframe.iloc[index]

        item = {}
        for suffix, column in zip("abc", self.columns):
            encoding = self._encode(row[column])
            item['input_ids_' + suffix] = torch.tensor(encoding['input_ids'], dtype=torch.long)
            item['attention_mask_' + suffix] = torch.tensor(encoding['attention_mask'], dtype=torch.long)

        item['score'] = torch.tensor(row['Score'], dtype=torch.float)
        return item

In [13]:


# Seed every RNG in use so the randomly initialised classifier head and the
# shuffling order of the training batches are reproducible after a kernel
# restart.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base', output_attentions=False)
train_df, val_df = train_test_split(df, test_size=0.1, random_state = 42)

# Create datasets
train_dataset = MultimodalRobertaDataset(train_df, tokenizer)
val_dataset = MultimodalRobertaDataset(val_df, tokenizer)

# Create DataLoaders. Training batches are reshuffled every epoch; the
# validation loader keeps the row order of val_df so that predictions can be
# written back to that frame positionally.
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

model = MultimodalRoberta()
# Use the first GPU when there is one, otherwise fall back to the CPU instead
# of failing on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

MultimodalRoberta(
  (roberta1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [14]:
# Report whether PyTorch can see a CUDA device on this machine.
torch.cuda.is_available()

True

In [15]:
# Cross-entropy over the classifier logits; the MSE alternative stays disabled.
#loss_function = nn.MSELoss()
loss_function = torch.nn.CrossEntropyLoss()

# Adam updates every parameter of the model (all three encoders and the head).
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [16]:
# Fine-tune for 10 epochs and report the mean batch loss after each epoch.
for epoch in range(10):
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Move the whole batch to the device at once; what is left after
        # removing the target has exactly the keyword names of forward().
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        targets = batch.pop('score').long()

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print("Average training loss: {0:.2f}".format(avg_train_loss))

Average training loss: 1.23
Average training loss: 0.72
Average training loss: 0.44
Average training loss: 0.30
Average training loss: 0.19
Average training loss: 0.09
Average training loss: 0.05
Average training loss: 0.02
Average training loss: 0.01
Average training loss: 0.01


In [None]:
# Collect the predicted class and the reference score of every validation row.
model.eval()

all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        # Everything except the target is a model input; keys match forward().
        inputs = {
            name: tensor.to(device)
            for name, tensor in batch.items()
            if name != 'score'
        }
        logits = model(**inputs)

        # The index of the largest logit is the predicted class.
        predicted_classes = torch.max(logits, 1).indices
        all_predictions.extend(predicted_classes.cpu().numpy())
        all_labels.extend(batch['score'].numpy())

# When the predictions are float numbers, such as 2.3 and 3.5, they are rounded
# to whole scores before the accuracy is calculated, with halves rounded up:
# 2.5 -> 3; 2.3 -> 2.

def arrayround(arr, n=0):
    """Round ``arr`` to ``n`` decimals, rounding halves away from zero.

    ``np.around`` alone rounds halves to the nearest even value (2.5 -> 2.0);
    this helper gives 2.5 -> 3.0 and -2.5 -> -3.0 instead.

    Args:
        arr: scalar, list, tuple or ndarray of numbers.
        n: number of decimals to keep.

    Returns:
        The rounded values as a NumPy array (or NumPy scalar for scalar input).
    """
    # Accept plain Python sequences as well as arrays.
    values = np.asarray(arr)
    sign = np.where(values >= 0, 1, -1)
    scale = 10 ** (n + 1)

    # Magnitudes with one digit beyond the requested precision kept, the rest cut.
    shifted = np.floor(np.abs(values) * scale)

    # When that extra digit is 5, bump it to 6 so that np.around rounds the
    # magnitude up rather than to the nearest even value.
    nudged = np.where(shifted % 10 == 5, (shifted + 1) / scale, shifted / scale)

    return np.around(nudged, n) * sign

# Turn the collected predictions into one flat array.
all_predictions = np.array(all_predictions).flatten()

# Accuracy = share of samples whose predicted class equals the reference score.
matches = all_predictions == np.array(all_labels)
correct_predictions = matches.sum()
total_predictions = all_predictions.size
test_accuracy = correct_predictions / total_predictions

print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))

In [None]:
# load a new dataset for prediction
import re
# Read the workbook to score and rename its columns to the names that
# MultimodalRobertaDataset reads.
prediction_columns = {
    "Market.00bc_OE": "Market.00b_OE",
    "Market.00bc_OE follow up": 'Market.00c_OE',
    "Waypoints": "Score",
}
prediction_data = pd.read_excel(
    dataset_location + "Fall 23 Market_roderick.xlsx"
).rename(columns=prediction_columns)
def preprocess_text(text):
    """Return ``text`` with newlines and whitespace runs collapsed to single spaces.

    Case is left unchanged and leading/trailing whitespace is collapsed to one
    space rather than stripped.
    """
    # Newlines first, then any run of whitespace; each match becomes one space.
    for pattern in (r'\n', r'\s+'):
        text = re.sub(pattern, ' ', text)
    return text


# Ensure the responses are strings. Missing cells become '' first (as in the
# training frame) so that they are not turned into the literal text "nan".
for col in ["Market.00a_OE", "Market.00b_OE", "Market.00c_OE"]:
    prediction_data[col] = prediction_data[col].fillna('').astype(str).apply(preprocess_text)

prediction_dataset = MultimodalRobertaDataset(prediction_data, tokenizer)
# No shuffling: predictions are written back to prediction_data by position.
prediction_dataloader = DataLoader(prediction_dataset, batch_size=2)

# Make sure dropout is disabled even if this cell runs right after training.
model.eval()

all_predictions = []

with torch.no_grad():
    for batch in prediction_dataloader:
        # Forward pass
        outputs = model(
            batch['input_ids_a'].to(device), batch['attention_mask_a'].to(device),
            batch['input_ids_b'].to(device), batch['attention_mask_b'].to(device),
            batch['input_ids_c'].to(device), batch['attention_mask_c'].to(device)
        )

        # Predicted class = index of the largest logit. Only predictions are
        # collected here; the validation labels in `all_labels` are not touched.
        _, outputs = torch.max(outputs, 1)
        all_predictions.extend(outputs.cpu().numpy())

all_predictions = np.array(all_predictions).flatten()

# add one more column to the dataset
prediction_data['predict_by_model'] = all_predictions

In [None]:
# Write the prediction frame to a new workbook in the data folder.
prediction_data.to_excel(dataset_location + "Fall 2023 Market_roderick and model comparison.xlsx")

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Define the hyperparameter search space
learning_rates = [1e-5]  # List of learning rates to try
batch_sizes = [4, 8, 16, 32]  # List of batch sizes to try
epoch_sizes = [5,6,7,8,9,10]

# Validation accuracy per combination. `combination_accuracies` keeps its
# (epoch_size, batch_size) keys; with several learning rates it holds the
# result of the last one, and the full picture is in
# `combination_accuracies_by_lr`, keyed by (learning_rate, epoch_size, batch_size).
combination_accuracies = {}
combination_accuracies_by_lr = {}

# Use the GPU when there is one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

search_space = [
    (learning_rate, epoch_size, batch_size)
    for learning_rate in learning_rates
    for epoch_size in epoch_sizes
    for batch_size in batch_sizes
]

for learning_rate, epoch_size, batch_size in search_space:
    # Re-training the model from scratch for each combination of hyperparameters
    model = MultimodalRoberta()
    model.to(device)
    # Training batches are reshuffled every epoch; validation order is kept so
    # that `all_predictions` lines up with the rows of val_df.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    print([epoch_size, batch_size]) # prints out the hyperparameter combination being tested

    model.train()
    for epoch in range(epoch_size):
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(
                batch['input_ids_a'].to(device), batch['attention_mask_a'].to(device),
                batch['input_ids_b'].to(device), batch['attention_mask_b'].to(device),
                batch['input_ids_c'].to(device), batch['attention_mask_c'].to(device)
            )
            loss = loss_function(outputs, batch['score'].to(device).long())
            loss.backward()
            optimizer.step()

    # Put the model in evaluation mode
    model.eval()

    # Initialize variables to keep track of predictions and ground truth labels
    all_predictions = []
    all_labels = []

    # Evaluate on the validation split
    with torch.no_grad():
        for batch in val_loader:
            # Forward pass
            outputs = model(
                batch['input_ids_a'].to(device), batch['attention_mask_a'].to(device),
                batch['input_ids_b'].to(device), batch['attention_mask_b'].to(device),
                batch['input_ids_c'].to(device), batch['attention_mask_c'].to(device)
            )

            # Predicted class = index of the largest logit
            _, outputs = torch.max(outputs, 1)
            all_predictions.extend(outputs.cpu().numpy())
            all_labels.extend(batch['score'].numpy())

    # Calculate accuracy
    all_predictions = np.array(all_predictions).flatten()

    correct_predictions = sum(all_predictions == np.array(all_labels))
    total_predictions = len(all_predictions)
    test_accuracy = correct_predictions / total_predictions

    print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))

    combination_accuracies[(epoch_size, batch_size)] = test_accuracy
    combination_accuracies_by_lr[(learning_rate, epoch_size, batch_size)] = test_accuracy

    # Drop the references that keep the weights alive (the optimizer holds the
    # parameters too) BEFORE emptying the cache; otherwise the memory is still
    # in use and cannot be released.
    del model, optimizer
    torch.cuda.empty_cache()

In [None]:
# The hyperparameter search deletes `model` at the end of every iteration, so
# the name may no longer exist when this cell runs; tolerate that instead of
# failing with NameError.
try:
    del model
except NameError:
    pass

# Empty the cache only after the model reference is gone, so that its memory
# can actually be released.
torch.cuda.empty_cache()

In [None]:
# Show the reference scores of the validation split.
val_df[['Score']]

In [None]:
# Attach the predictions to the validation rows by position.
# NOTE(review): `all_predictions` is whatever the most recently executed cell
# left behind; it must come from an unshuffled pass over val_loader for the
# rows to line up - confirm before trusting this column.
val_df['predicted_score'] = all_predictions

In [None]:
# Display the validation split.
val_df

In [None]:
# Keep the validation rows whose predicted score differs from the reference
# score. The frame has to be built before it is exported, so both steps live
# in one cell in that order.
output_df = val_df[val_df["Score"] != val_df["predicted_score"]]
output_df.to_excel("/workspace/difference.xlsx")
output_df

In [None]:
# Number of rows in the validation split.
len(val_df)