In [1]:
import pandas as pd
train_2022 = pd.read_csv('train_2022.csv')
data_augmentation_chatGPT = pd.read_csv('data_augmentation_chatGPT.csv')
data_augmentation_random_2_words = pd.read_csv('data_augmentation_random_2_words.csv')
data_augmentation_random_3_words = pd.read_csv('data_augmentation_random_3_words.csv')
translated_en_data = pd.read_csv('translated_en_data.csv')
amazon_short_text_data = pd.read_csv('amazon_short_text_data.csv')

In [20]:
random = 4000
amazon_short_text_zero_data = amazon_short_text_data[amazon_short_text_data['LABEL'] == 0]
amazon_short_text_zero_data = amazon_short_text_zero_data.sample(n=random, random_state=42)
amazon_short_text_one_data = amazon_short_text_data[amazon_short_text_data['LABEL'] == 1]
amazon_short_text_one_data = amazon_short_text_one_data.sample(n=random, random_state=42)

In [21]:
amazon = pd.concat([amazon_short_text_zero_data, amazon_short_text_one_data], ignore_index=True)
amazon = amazon.reset_index(drop=True)
amazon = amazon.drop(columns=['row_id'])
amazon['row_id'] = amazon.index
amazon

Unnamed: 0,LABEL,TEXT,row_id
0,0,"Hole on the bottom? I like the glasses, but I ...",0
1,0,Great Authors Low Rate Stories These are four ...,1
2,0,TERRIBLE This book is as boring as watching pa...,2
3,0,Just did not get there Don't even waste your m...,3
4,0,Be Very Careful In three days of using this pr...,4
...,...,...,...
7995,1,one of the best shows on tv i have never laugh...,7995
7996,1,fun movie the kids and the adults all liked th...,7996
7997,1,"It was the perfect addition to my homemade ""Ch...",7997
7998,1,Awesome This is an awesome product. It will wi...,7998


In [23]:
merged_data = pd.concat([train_2022, data_augmentation_chatGPT, amazon], ignore_index=True)
merged_data = merged_data.reset_index(drop=True)
merged_data = merged_data.drop(columns=['row_id'])
merged_data['row_id'] = merged_data.index
merged_data

Unnamed: 0,TEXT,LABEL,row_id
0,director dirk shafer and co-writer greg hinton...,0,0
1,"a charming , quirky and leisurely paced scotti...",1,1
2,"the price was good , and came quickly though ...",1,2
3,i was looking forward to this game for a coupl...,0,3
4,arguably the year 's silliest and most incoher...,0,4
...,...,...,...
11995,one of the best shows on tv i have never laugh...,1,11995
11996,fun movie the kids and the adults all liked th...,1,11996
11997,"It was the perfect addition to my homemade ""Ch...",1,11997
11998,Awesome This is an awesome product. It will wi...,1,11998


In [24]:
%%time
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2).to(device)  # 2 for binary classification

# Convert to DataFrame (assuming you've loaded raw_data from CSV)
train_data = merged_data.copy()

# Tokenize text using tokenizer
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt', padding=True, truncation=True)['input_ids'].squeeze(0)

train_data['input_ids'] = train_data['TEXT'].apply(tokenize_text)

# Convert to model input format
inputs = pad_sequence(train_data['input_ids'].tolist(), batch_first=True).to(device)

# Split data into train and test sets
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, train_data['LABEL'].tolist(), test_size=0.2)
train_labels, test_labels = torch.tensor(train_labels).to(device), torch.tensor(test_labels).to(device)

# Create PyTorch DataLoader
train_dataset = TensorDataset(train_inputs, train_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model
model.train()
for epoch in range(5):
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)[0]
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Evaluate the model
model.eval()
with torch.no_grad():
    outputs = model(test_inputs)[0]
    predicted_labels = torch.argmax(outputs, dim=1).cpu().tolist()

# Calculate classification report
report = classification_report(test_labels.cpu(), predicted_labels)
print(report)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1200/1200 [04:47<00:00,  4.17it/s]


Epoch 1, Loss: 0.18661649525165558


Epoch 2: 100%|██████████| 1200/1200 [04:54<00:00,  4.07it/s]


Epoch 2, Loss: 0.19383612275123596


Epoch 3: 100%|██████████| 1200/1200 [04:52<00:00,  4.11it/s]


Epoch 3, Loss: 0.05387023836374283


Epoch 4: 100%|██████████| 1200/1200 [04:56<00:00,  4.04it/s]


Epoch 4, Loss: 0.09578323364257812


Epoch 5: 100%|██████████| 1200/1200 [04:52<00:00,  4.10it/s]


Epoch 5, Loss: 0.004567377734929323


OutOfMemoryError: CUDA out of memory. Tried to allocate 634.00 MiB. GPU 

In [27]:
torch.save(model.state_dict(), 'models/roberta_12000.pth')

In [31]:
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
import torch

# Load the model
model.load_state_dict(torch.load('models/roberta_12000.pth'))
model.to(device)
model.eval()

# Prepare the test data
# 確保test_labels是張量，並且不會改變其梯度跟蹤設置
if not isinstance(test_labels, torch.Tensor):
    test_labels = torch.tensor(test_labels)
elif test_labels.dtype != torch.int64:
    test_labels = test_labels.to(torch.int64)

# 確保尺寸一致
assert test_inputs.size(0) == test_labels.size(0), "Mismatch in number of inputs and labels"

test_dataset = TensorDataset(test_inputs, test_labels)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=8)

# Initiate a list to store predictions
predictions = []

# Evaluate the model in batches
with torch.no_grad():
    for batch in test_dataloader:
        batch_inputs, _ = batch
        batch_inputs = batch_inputs.to(device)
        outputs = model(batch_inputs)[0]
        batch_predicted_labels = torch.argmax(outputs, dim=1).cpu().tolist()
        predictions.extend(batch_predicted_labels)

# Calculate and print classification report
from sklearn.metrics import classification_report
report = classification_report(test_labels.cpu(), predictions)
print(report)

AssertionError: Mismatch in number of inputs and labels

In [None]:

model.eval()
with torch.no_grad():
    outputs = model(test_inputs)[0]
    predicted_labels = torch.argmax(outputs, dim=1).cpu().tolist()

# Calculate classification report
report = classification_report(test_labels.cpu(), predicted_labels)
print(report)

In [32]:
import datetime
import pytz
import pandas as pd
def export_csv(df,name):
  now = datetime.datetime.now().astimezone(pytz.timezone('Asia/Taipei'))
  formatted_time = now.strftime('%Y%m%d')
  df.to_csv('result/'+ formatted_time + '_' + name + ".csv", index=False,encoding="utf_8_sig")

In [33]:
%%time
# 設定 device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# 載入預測資料集
test_data = pd.read_csv('test_no_answer_2022.csv')

# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

test_data['input_ids'] = test_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式
test_inputs = pad_sequence(test_data['input_ids'].tolist(), batch_first=True).to(device)

# 創建 PyTorch DataLoader
test_dataset = TensorDataset(test_inputs)
test_loader = DataLoader(test_dataset, batch_size=8)

# 使用模型進行預測
model.eval()
predictions = []
with torch.no_grad():
    for inputs in test_loader:
        outputs = model(inputs[0])  # 確保 inputs[0] 已在 GPU 上
        predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()
        predictions.extend(predicted_labels)

# 將預測結果添加到測試數據集中
test_data['LABEL'] = predictions
# 保存預測結果到 CSV 文件
# 需要你自己定義 export_csv 函數，或使用 pandas 的 to_csv 方法
export_csv(test_data[['row_id', 'LABEL']], 'RoBERTa_12000')
# 打印預測結果
print(test_data[['row_id','TEXT', 'LABEL']])

       row_id                                               TEXT  LABEL
0           0   good to know if you can t find these elsewhere .      1
1           1  love it !  the grill plates come out and pop i...      1
2           2  i m convinced this was a poorly executed refur...      0
3           3  i would never have complained about that if it...      0
4           4  the photo shows the same whole ,  large candie...      0
...       ...                                                ...    ...
10995   10995             i didn t quite get it the first time .      1
10996   10996  i ve tried installing with and without the oem...      1
10997   10997  i was parked at a truck stop in the cincinnati...      0
10998   10998  i recently bought this case after seeing some ...      1
10999   10999  the keyboard types only % of the time and the ...      0

[11000 rows x 3 columns]
CPU times: total: 25.3 s
Wall time: 52.4 s
