In [28]:
import math
import numpy as np
import pandas as pd
import random
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification

In [29]:
dataset_location = "/Users/Xutao/Documents/CR4CR/dataset/"

In [30]:
df = pd.read_excel(dataset_location + "Market 00abc_240310.xlsx")
# Since there are some null responses will a score of 0, we want to replace them with empty strings
df.fillna("", inplace=True)

# select only the three responses columns and the score column
response_columns = ["Market.00a_OE", "Market.00bc_OE", "Market.00bc_OE follow up"]
score_column = ["Score"]
df = df[["Respondent Id", "Administration"] + response_columns + score_column]

In [31]:
import re
def preprocess_text(text):
    # Lowercase the text
    # text = text.lower()

    text = re.sub(r'\n', ' ', text)
    #text = re.sub(r'^\w+\s*$', '',text)

    text = re.sub(r'[^a-zA-Z0-9\+\-\s]', ' ', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    return text

def preprocess_text1(text):
    # Lowercase the text
    text = text.lower()

    # text = re.sub(r'\n', ' ', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Strip leading and trailing whitespace
    text = text.strip()

    return text

# Preprocess the text in the response columns
for column in response_columns:
    df[column] = df[column].astype(str)

In [32]:
df.head()

Unnamed: 0,Respondent Id,Administration,Market.00a_OE,Market.00bc_OE,Market.00bc_OE follow up,Score
0,24977,Spr 23,"Number of people inside the shop, number of pe...",Number of people inside the shop and number of...,We will definitively need to know the amount o...,3
1,25149,Spr 23,what causes a door to open automatically.,what can cause a door to open automatically,i think this piece of information is important...,0
2,22791,Spr 23,How many people are going through the doors. H...,How long it will take for everyone to go throu...,We must know this information to decrease the ...,1
3,22796,Spr 23,"The groups of customers that might go in, the ...",customers that go in and leave,"Because we need to how many customers go in, a...",1
4,23062,Spr 23,"How big is the store, as a team of constructio...",What does the owner want.\nHow tall the store ...,This can impact how the finished product will ...,1


In [33]:
df['concat_response'] = df.iloc[:,2] + ' [SEP] ' + df.iloc[:,3] + ' [SEP] ' + df.iloc[:,4]

In [34]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, DistilBertPreTrainedModel, RobertaForSequenceClassification
#from transformers.modeling_distilbert import DistilBertModel, DistilBertPreTrainedModel
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


class DistilRobertaClassifier(torch.nn.Module):
    def __init__(self, num_classes):
        super(DistilRobertaClassifier, self).__init__()
        self.roberta = RobertaModel.from_pretrained('distilroberta-base')
        self.classifier = torch.nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        roberta_output = outputs[0]

        cls_output = roberta_output[:, 0]
        logits = self.classifier(cls_output)

        return logits
    


class RobertaDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.columns = ["concat_response"]
        self.labels = self.dataframe['Score'].values

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        # Selecting sentence1 and sentence2 at the specified index in the data frame
        row = self.dataframe.iloc[index]
        response = row['concat_response']
        score = row['Score']

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoding_a = self.tokenizer.encode_plus(response, add_special_tokens=True, max_length=self.max_length, padding='max_length', return_attention_mask=True, truncation=True)

        return {
            'input_ids_a': torch.tensor(encoding_a['input_ids'], dtype=torch.long),
            'attention_mask_a': torch.tensor(encoding_a['attention_mask'], dtype=torch.long),
            'score': torch.tensor(score, dtype=torch.float)
        }

In [35]:
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base', output_attentions=False)

# First split
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['Score'], random_state=42)

#
# Create datasets
train_dataset = RobertaDataset(train_df, tokenizer)
test_dataset = RobertaDataset(test_df, tokenizer)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8)  # changed batch_size to 3
test_loader = DataLoader(test_dataset, batch_size=8)  # new line, changed batch_size to 3

model = DistilRobertaClassifier(5)
device = torch.device('cuda:0')
model.to(device)

DistilRobertaClassifier(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm

In [36]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 478 entries, 550 to 328
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Respondent Id             478 non-null    int64 
 1   Administration            478 non-null    object
 2   Market.00a_OE             478 non-null    object
 3   Market.00bc_OE            478 non-null    object
 4   Market.00bc_OE follow up  478 non-null    object
 5   Score                     478 non-null    int64 
 6   concat_response           478 non-null    object
dtypes: int64(2), object(5)
memory usage: 29.9+ KB


In [37]:
#loss_function = nn.MSELoss()
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [38]:
for epoch in range(15):
    total_loss = 0
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(
            batch['input_ids_a'].to(device), batch['attention_mask_a'].to(device)
        )
        loss = loss_function(outputs, batch['score'].to(device).long())
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print("Average training loss: {0:.2f}".format(avg_train_loss))

Average training loss: 1.33
Average training loss: 0.84
Average training loss: 0.63
Average training loss: 0.48
Average training loss: 0.37
Average training loss: 0.31
Average training loss: 0.26
Average training loss: 0.17
Average training loss: 0.11
Average training loss: 0.11
Average training loss: 0.09
Average training loss: 0.10
Average training loss: 0.06
Average training loss: 0.11
Average training loss: 0.04


In [40]:
model.eval()

all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
      # Forward pass
      outputs = model(
          batch['input_ids_a'].to(device), batch['attention_mask_a'].to(device)
      )
      #all_predictions.extend(outputs.cpu().numpy())
      #all_labels.extend(batch['score'].numpy())

      _, outputs = torch.max(outputs, 1)
      all_predictions.extend(outputs.cpu().numpy())
      all_labels.extend(batch['score'].numpy())

# Since my predictions return float number, such as 2.3 and 3.5, I decide to round
# or map the number in the following way:
# 2.5 -> 3; 2.3 -> 2; to get a better algorithm to calculate the accuracy

def arrayround(arr,n=0):
    import numpy as np
    flag = np.where(arr>=0,1,-1)
    arr = np.abs(arr)
    arr10 = arr*10**(n+1)
    arr20 = np.floor(arr10)
    arr30 = np.where(arr20%10==5,(arr20+1)/10**(n+1),arr20/10**(n+1))
    result = np.around(arr30,n)
    return result*flag

all_predictions = np.array(all_predictions).flatten()
#all_predictions = arrayround(all_predictions)



# Compute the average accuracy over all batches
correct_predictions = sum(all_predictions == np.array(all_labels))
total_predictions = len(all_predictions)
test_accuracy = correct_predictions / total_predictions

print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))

Test Accuracy: 58.33%


In [27]:
torch.cuda.empty_cache()
del model