In [None]:
import os

# Directory that holds the assignment CSV files. Set the NLP_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset, the original
# local location is used, so train_path/test_path resolve to the same strings as before.
DATA_DIR = os.environ.get(
    "NLP_DATA_DIR",
    "/Users/pierre/Documents/MSc_AI/NLP/Assignment/nlp_assignment/data",
)
train_path = os.path.join(DATA_DIR, "traindata.csv")
test_path = os.path.join(DATA_DIR, "devdata.csv")

In [None]:
import pandas as pd
# Read the tab-separated training file into a DataFrame.
# NOTE(review): header=0 consumes the first line of the file as column names —
# confirm the file really starts with a header row, otherwise one sample is lost.
df_train = pd.read_csv(
    train_path,
    sep='\t',
    header=0,
    index_col=False,
)
print(df_train.shape)
df_train.head()

In [None]:
# Column 0 is taken as the target; every remaining column is kept as a feature.
y_train = df_train.iloc[:, 0]
X_train = df_train.iloc[:, 1:]
print(y_train.shape, X_train.shape)

---

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

aspect_encoder = LabelEncoder()
polarity_encoder = LabelEncoder()

# Name of the derived column that this cell appends to df_train.
encoded_aspect_col = 'aspect_category_encoded'

# Select the sentences by position with the derived column excluded. Once
# 'aspect_category_encoded' has been appended, df_train.iloc[:, -1] is that integer
# column, not the text, so the selection must ignore it. errors='ignore' makes the
# drop a no-op on the first run, when the column does not exist yet, and keeps the
# cell correct when it is re-run.
# The sentence is assumed to be the last column of the raw file (per the original note).
X_train_sentences = df_train.drop(columns=encoded_aspect_col, errors='ignore').iloc[:, -1]

# Integer-encode column 1 and keep the result alongside the raw data.
df_train[encoded_aspect_col] = aspect_encoder.fit_transform(df_train.iloc[:, 1])

# Integer-encode the target column; polarity_encoder.classes_ keeps the original labels.
y_train_encoded = polarity_encoder.fit_transform(y_train)

print(f"Encoded labels shape: {y_train_encoded.shape}")
print(f"Input sentences shape: {X_train_sentences.shape}")

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

# Load the tokenizer for the 'bert-base-uncased' checkpoint — the same checkpoint
# name that SentimentClassifier loads its BertModel from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Args:
        sentences: Texts to classify. A pandas Series is indexed positionally
            (``.iloc``); any other container is indexed with ``[]``.
        labels: Integer class ids aligned position-by-position with ``sentences``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face ``BertTokenizer``).
        max_len: Token length every example is padded or truncated to.
    """

    def __init__(self, sentences, labels, tokenizer, max_len=128):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    @staticmethod
    def _positional(container, idx):
        """Return the ``idx``-th element by position, whatever the container's index is."""
        if hasattr(container, 'iloc'):
            return container.iloc[idx]
        return container[idx]

    def __getitem__(self, idx):
        """Return the raw text, its encoded tensors and its label for position ``idx``.

        Returns:
            dict with keys ``review_text`` (str), ``input_ids`` and ``attention_mask``
            (flattened tensors from the tokenizer) and ``labels`` (``torch.long`` scalar).
        """
        sentence = str(self._positional(self.sentences, idx))
        label = self._positional(self.labels, idx)

        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            # Without truncation, a sentence longer than max_len yields a longer tensor
            # than its neighbours and the DataLoader cannot stack the batch.
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': sentence,
            # The tokenizer returns a leading batch dimension of 1; flatten removes it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }


# Wrap the training sentences and encoded labels, then serve them in shuffled batches of 16.
dataset = ReviewDataset(
    sentences=X_train_sentences,
    labels=y_train_encoded,
    tokenizer=tokenizer,
)
loader = DataLoader(dataset=dataset, batch_size=16, shuffle=True)


In [None]:
from transformers import BertModel
import torch
import torch.nn as nn

class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification layer."""

    def __init__(self, n_classes):
        """Load 'bert-base-uncased' and size the output layer for ``n_classes`` classes."""
        super().__init__()
        encoder = BertModel.from_pretrained('bert-base-uncased')
        self.bert = encoder
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(encoder.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's output (one score per class) for the given batch."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) feeds the classification layer.
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        _sequence_output, pooled = bert_outputs
        return self.out(self.drop(pooled))

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of raising on .to('cuda').
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One output unit per distinct label seen by polarity_encoder.
model = SentimentClassifier(len(polarity_encoder.classes_)).to(device)
