In [11]:
import pandas as pd
raw_data = pd.read_csv('train_2022.csv')
raw_data

Unnamed: 0,row_id,TEXT,LABEL
0,0,director dirk shafer and co-writer greg hinton...,0
1,1,"a charming , quirky and leisurely paced scotti...",1
2,2,"the price was good , and came quickly though ...",1
3,3,i was looking forward to this game for a coupl...,0
4,4,arguably the year 's silliest and most incoher...,0
...,...,...,...
1995,1995,an imaginative comedy\/thriller .,1
1996,1996,a savvy exploration of paranoia and insecurity...,1
1997,1997,on the other hand for power grating you ve got...,1
1998,1998,"like dickens with his passages , mcgrath craft...",1


In [2]:
import pandas as pd
test_data = pd.read_csv('test_no_answer_2022.csv')
test_data

Unnamed: 0,row_id,TEXT
0,0,good to know if you can t find these elsewhere .
1,1,love it ! the grill plates come out and pop i...
2,2,i m convinced this was a poorly executed refur...
3,3,i would never have complained about that if it...
4,4,"the photo shows the same whole , large candie..."
...,...,...
10995,10995,i didn t quite get it the first time .
10996,10996,i ve tried installing with and without the oem...
10997,10997,i was parked at a truck stop in the cincinnati...
10998,10998,i recently bought this case after seeing some ...


# T5

In [None]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# 加載數據
raw_data = pd.read_csv('train_2022.csv')

# 轉換 LABEL 到文本形式
raw_data['LABEL'] = raw_data['LABEL'].map({0: "negative", 1: "positive"})

# 創建 T5 格式的輸入
raw_data['T5_INPUT'] = "classify sentiment: " + raw_data['TEXT']
raw_data['T5_OUTPUT'] = raw_data['LABEL']

# 切分數據為訓練集和驗證集
train_data, val_data = train_test_split(raw_data, test_size=0.1, random_state=42)  # 以10%的數據作為驗證集

class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        input_ids = self.tokenizer.encode(item['T5_INPUT'], 
                                          max_length=self.max_length, 
                                          truncation=True, 
                                          padding='max_length', 
                                          return_tensors='pt').squeeze()
        labels = self.tokenizer.encode(item['T5_OUTPUT'], 
                                       max_length=self.max_length, 
                                       truncation=True, 
                                       padding='max_length', 
                                       return_tensors='pt').squeeze()
        return input_ids, labels

# 根據切分後的數據集創建 DataLoader
train_dataset = SentimentDataset(train_data, tokenizer)
val_dataset = SentimentDataset(val_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# 設定設備
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"
model = model.to(device)

# 訓練循環
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# 訓練模型
model.train()
for epoch in range(3):  # 使用3個epoch
    for input_ids, labels in train_loader:
        input_ids, labels = input_ids.to(device), labels.to(device)

        # 前向傳播
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss

        # 反向傳播和優化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print(f"Epoch {epoch}, Loss: {loss.item()}")

def evaluate(model, val_loader):
    model.eval()
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for input_ids, labels in val_loader:
            input_ids, labels = input_ids.to(device), labels.to(device)
            outputs = model.generate(input_ids=input_ids)
            preds = torch.argmax(outputs, dim=-1)
            
            predicted_labels.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return classification_report(true_labels, predicted_labels, target_names=['negative', 'positive'], output_dict=True)

# 在驗證集上評估模型
report = evaluate(model, val_loader)
print(report)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 0, Loss: 2.0679283142089844
Epoch 0, Loss: 2.128352403640747
Epoch 0, Loss: 2.1275112628936768
Epoch 0, Loss: 1.7556647062301636
Epoch 0, Loss: 1.7121444940567017
Epoch 0, Loss: 1.5263192653656006
Epoch 0, Loss: 1.6478596925735474
Epoch 0, Loss: 1.6046369075775146
Epoch 0, Loss: 1.4518781900405884
Epoch 0, Loss: 1.420722246170044
Epoch 0, Loss: 1.2442291975021362
Epoch 0, Loss: 1.2705384492874146
Epoch 0, Loss: 0.9988387227058411
Epoch 0, Loss: 1.0272718667984009
Epoch 0, Loss: 0.9811387658119202
Epoch 0, Loss: 0.9191348552703857
Epoch 0, Loss: 0.7155396342277527
Epoch 0, Loss: 0.7217323780059814
Epoch 0, Loss: 0.6201804876327515
Epoch 0, Loss: 0.5397351980209351
Epoch 0, Loss: 0.432305246591568
Epoch 0, Loss: 0.3552534282207489
Epoch 0, Loss: 0.32101693749427795
Epoch 0, Loss: 0.2773313820362091
Epoch 0, Loss: 0.2618369460105896
Epoch 0, Loss: 0.21358676254749298
Epoch 0, Loss: 0.18035222589969635
Epoch 0, Loss: 0.1817421019077301
Epoch 0, Loss: 0.1652391254901886
Epoch 0, Loss:

In [None]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset

# 加載測試數據
test_data = pd.read_csv('test_no_answer_2022.csv')

# 創建 T5 格式的輸入
test_data['T5_INPUT'] = "classify sentiment: " + test_data['TEXT']

# 定義一個 Dataset 用於測試數據
class PredictionDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        input_ids = self.tokenizer.encode(item['T5_INPUT'], 
                                          max_length=self.max_length, 
                                          truncation=True, 
                                          padding='max_length', 
                                          return_tensors='pt').squeeze()
        return input_ids

# 創建 DataLoader
test_dataset = PredictionDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [None]:
def generate_predictions(model, test_loader):
    model.eval()
    predictions = []

    with torch.no_grad():
        for input_ids in test_loader:
            input_ids = input_ids.to(device)
            outputs = model.generate(input_ids=input_ids)
            decoded_preds = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            predictions.extend(decoded_preds)

    return predictions

# 生成預測結果
predictions = generate_predictions(model, test_loader)

# 打印一些預測結果看看
for i, text in enumerate(test_data['TEXT'].head(5)):
    print(f"Text: {text}")
    print(f"Predicted Sentiment: {predictions[i]}\n")