In [1]:
import math
import numpy as np
import pandas as pd
import random
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification

In [2]:
df = pd.read_excel("/workspace/output.xlsx")
df = df.iloc[:,1:]
df.fillna('', inplace=True)
df.head()

Unnamed: 0,Score,Market.01a_OE,Market.01b_OE,Market.01c_OE
0,3,"Number of people inside the shop, number of pe...",Number of people inside the shop and number of...,We will definitively need to know the amount o...
1,0,what causes a door to open automatically.,what can cause a door to open automatically,i think this piece of information is important...
2,1,How many people are going through the doors. H...,How long it will take for everyone to go throu...,We must know this information to decrease the ...
3,1,"The groups of customers that might go in, the ...",customers that go in and leave,"Because we need to how many customers go in, a..."
4,1,"How big is the store, as a team of constructio...",What does the owner want. How tall the store i...,This can impact how the finished product will ...


In [4]:
df['concat_response'] = df.iloc[:,1] + ' [SEP] ' + df.iloc[:,2] + ' [SEP] ' + df.iloc[:,3]

In [12]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, DistilBertPreTrainedModel, RobertaForSequenceClassification
#from transformers.modeling_distilbert import DistilBertModel, DistilBertPreTrainedModel
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


class DistilRobertaClassifier(torch.nn.Module):
    def __init__(self, num_classes):
        super(DistilRobertaClassifier, self).__init__()
        self.roberta = RobertaModel.from_pretrained('distilroberta-base')
        self.classifier = torch.nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        roberta_output = outputs[0]

        cls_output = roberta_output[:, 0]
        logits = self.classifier(cls_output)

        return logits
    


class RobertaDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.columns = ["concat_response"]
        self.labels = self.dataframe['Score'].values

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        # Selecting sentence1 and sentence2 at the specified index in the data frame
        row = self.dataframe.iloc[index]
        response = row['concat_response']
        score = row['Score']

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoding_a = self.tokenizer.encode_plus(response, add_special_tokens=True, max_length=self.max_length, padding='max_length', return_attention_mask=True, truncation=True)

        return {
            'input_ids_a': torch.tensor(encoding_a['input_ids'], dtype=torch.long),
            'attention_mask_a': torch.tensor(encoding_a['attention_mask'], dtype=torch.long),
            'score': torch.tensor(score, dtype=torch.float)
        }

In [19]:
# Use GPU
from torch.utils.data import Dataset, DataLoader

tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base', output_attentions=False)
train_df, val_df = train_test_split(df, test_size=0.2, random_state = 42)

# Create datasets
train_dataset = RobertaDataset(train_df, tokenizer)
val_dataset = RobertaDataset(val_df, tokenizer)
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=8)

model = DistilRobertaClassifier(5)
device = torch.device('cuda')
model.to(device)

DistilRobertaClassifier(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm

In [20]:
#loss_function = nn.MSELoss()
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [21]:
for epoch in range(10):
    total_loss = 0
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(
            batch['input_ids_a'].to(device), batch['attention_mask_a'].to(device)
        )
        loss = loss_function(outputs, batch['score'].to(device).long())
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print("Average training loss: {0:.2f}".format(avg_train_loss))

Average training loss: 1.41
Average training loss: 0.94
Average training loss: 0.72
Average training loss: 0.58
Average training loss: 0.43
Average training loss: 0.30
Average training loss: 0.30
Average training loss: 0.18
Average training loss: 0.20
Average training loss: 0.31


In [22]:
model.eval()

all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
      # Forward pass
      outputs = model(
          batch['input_ids_a'].to(device), batch['attention_mask_a'].to(device)
      )
      #all_predictions.extend(outputs.cpu().numpy())
      #all_labels.extend(batch['score'].numpy())

      _, outputs = torch.max(outputs, 1)
      all_predictions.extend(outputs.cpu().numpy())
      all_labels.extend(batch['score'].numpy())

# Since my predictions return float number, such as 2.3 and 3.5, I decide to round
# or map the number in the following way:
# 2.5 -> 3; 2.3 -> 2; to get a better algorithm to calculate the accuracy

def arrayround(arr,n=0):
    import numpy as np
    flag = np.where(arr>=0,1,-1)
    arr = np.abs(arr)
    arr10 = arr*10**(n+1)
    arr20 = np.floor(arr10)
    arr30 = np.where(arr20%10==5,(arr20+1)/10**(n+1),arr20/10**(n+1))
    result = np.around(arr30,n)
    return result*flag

all_predictions = np.array(all_predictions).flatten()
#all_predictions = arrayround(all_predictions)



# Compute the average accuracy over all batches
correct_predictions = sum(all_predictions == np.array(all_labels))
total_predictions = len(all_predictions)
test_accuracy = correct_predictions / total_predictions

print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))

Test Accuracy: 69.17%


In [18]:
torch.cuda.empty_cache()
del model