In [16]:
import pandas as pd
train_2022 = pd.read_csv('train_2022.csv')
data_augmentation_chatGPT = pd.read_csv('data_augmentation_chatGPT.csv')
data_augmentation_random_2_words = pd.read_csv('data_augmentation_random_2_words.csv')
data_augmentation_random_3_words = pd.read_csv('data_augmentation_random_3_words.csv')
translated_en_data = pd.read_csv('translated_en_data.csv')

In [24]:
merged_data = pd.concat([train_2022,data_augmentation_chatGPT,data_augmentation_random_2_words,data_augmentation_random_3_words,translated_en_data], ignore_index=True)
merged_data

Unnamed: 0,row_id,TEXT,LABEL
0,0,director dirk shafer and co-writer greg hinton...,0
1,1,"a charming , quirky and leisurely paced scotti...",1
2,2,"the price was good , and came quickly though ...",1
3,3,i was looking forward to this game for a coupl...,0
4,4,arguably the year 's silliest and most incoher...,0
...,...,...,...
9995,1995,An imaginative comedy/thriller.,1
9996,1996,A savvy exploration of paranoia and insecurity...,1
9997,1997,"On the other hand, with the power grating, you...",1
9998,1998,"As in Dickens's novel, McGrath creates many mo...",1


In [20]:
%%time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence

# 確認是否有可用的 CUDA 設備，並設定使用的設備
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = "cpu"

# 載入預訓練的 tokenizer 和模型，這裡指定了 assemblyai 提供的 BERT large 模型
tokenizer = AutoTokenizer.from_pretrained('assemblyai/bert-large-uncased-sst2')
model = AutoModelForSequenceClassification.from_pretrained('assemblyai/bert-large-uncased-sst2')
model.to(device)

# 轉換文本為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt', padding=True, truncation=True)['input_ids'][0]

merged_data['input_ids'] = merged_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式，並移動到 CUDA 設備上
inputs = pad_sequence(merged_data['input_ids'].tolist(), batch_first=True).to(device)
labels = torch.tensor(merged_data['LABEL'].tolist()).to(device)

# 將資料拆分為訓練集和測試集
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, labels, test_size=0.2, random_state=42)

# 創建 PyTorch DataLoader
train_dataset = TensorDataset(train_inputs, train_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# 定義 optimizer 和損失函數
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 訓練模型
model.train()
for epoch in range(5):  # 進行三個 epoch 的訓練
    for batch in train_loader:
        b_input_ids, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
# 評估模型
model.eval()
with torch.no_grad():
    test_outputs = model(test_inputs)
    predicted_labels = torch.argmax(test_outputs.logits, dim=1)

# 產生分類報告
report = classification_report(test_labels.cpu(), predicted_labels.cpu())
print(report)

Epoch 1, Loss: 1.038996696472168
Epoch 2, Loss: 0.415835440158844
Epoch 3, Loss: 0.19637218117713928
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         6
           1       0.50      0.50      0.50         4

    accuracy                           0.60        10
   macro avg       0.58      0.58      0.58        10
weighted avg       0.60      0.60      0.60        10

CPU times: total: 3min 28s
Wall time: 52.4 s


In [21]:
import datetime
import pytz
import pandas as pd
def export_csv(df,name):
  now = datetime.datetime.now().astimezone(pytz.timezone('Asia/Taipei'))
  formatted_time = now.strftime('%Y%m%d')
  df.to_csv('result/'+ formatted_time + '_' + name + ".csv", index=False,encoding="utf_8_sig")

In [23]:
%%time
# 設定 device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# 載入預測資料集
test_data = pd.read_csv('test_no_answer_2022.csv')

# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

test_data['input_ids'] = test_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式
test_inputs = pad_sequence(test_data['input_ids'].tolist(), batch_first=True).to(device)

# 創建 PyTorch DataLoader
test_dataset = TensorDataset(test_inputs)
test_loader = DataLoader(test_dataset, batch_size=8)

# 使用模型進行預測
model.eval()
predictions = []
with torch.no_grad():
    for inputs in test_loader:
        outputs = model(inputs[0])  # 確保 inputs[0] 已在 GPU 上
        predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()
        predictions.extend(predicted_labels)

# 將預測結果添加到測試數據集中
test_data['LABEL'] = predictions
# 保存預測結果到 CSV 文件
# 需要你自己定義 export_csv 函數，或使用 pandas 的 to_csv 方法
export_csv(test_data[['row_id', 'LABEL']], 'bert_large_uncased_sst2')
# 打印預測結果
print(test_data[['row_id','TEXT', 'LABEL']])

       row_id                                               TEXT  LABEL
0           0   good to know if you can t find these elsewhere .      1
1           1  love it !  the grill plates come out and pop i...      1
2           2  i m convinced this was a poorly executed refur...      0
3           3  i would never have complained about that if it...      1
4           4  the photo shows the same whole ,  large candie...      1
...       ...                                                ...    ...
10995   10995             i didn t quite get it the first time .      0
10996   10996  i ve tried installing with and without the oem...      0
10997   10997  i was parked at a truck stop in the cincinnati...      0
10998   10998  i recently bought this case after seeing some ...      1
10999   10999  the keyboard types only % of the time and the ...      0

[11000 rows x 3 columns]
CPU times: total: 29.9 s
Wall time: 1min 44s


# 多加一層MLP 用那層tune

In [11]:
# import torch
# import torch.nn as nn
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
# from torch.utils.data import DataLoader, TensorDataset
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report
# from torch.nn.utils.rnn import pad_sequence

# # 設定設備
# device = "cpu"

# # 載入預訓練的 tokenizer 和模型
# tokenizer = AutoTokenizer.from_pretrained('assemblyai/bert-large-uncased-sst2')
# model = AutoModelForSequenceClassification.from_pretrained('assemblyai/bert-large-uncased-sst2')

# # 凍結預訓練層的權重
# for param in model.parameters():
#     param.requires_grad = False

# # 新增一個自定義層，只有這層是可訓練的
# num_labels = 2  # 假設是一個二分類問題
# custom_classifier = nn.Sequential(
#     nn.Linear(model.classifier.in_features, num_labels)
# )
# custom_classifier.to(device)

# # 將自定義層加到模型上
# model.classifier = custom_classifier

# # 轉換文本為 token IDs
# def tokenize_text(text):
#     return tokenizer(text, return_tensors='pt', padding=True, truncation=True)['input_ids'][0]

# merged_data['input_ids'] = merged_data['TEXT'].apply(tokenize_text)
# inputs = pad_sequence(merged_data['input_ids'].tolist(), batch_first=True).to(device)
# labels = torch.tensor(merged_data['LABEL'].tolist()).to(device)

# # 將資料拆分為訓練集和測試集
# train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, labels, test_size=0.2, random_state=42)

# # 創建 DataLoader
# train_dataset = TensorDataset(train_inputs, train_labels)
# train_loader = DataLoader(train_dataset, batch_size=5, shuffle=True)

# # 定義 optimizer 只針對新層
# optimizer = AdamW(model.classifier.parameters(), lr=2e-5)
# loss_fn = torch.nn.CrossEntropyLoss()

# # 訓練模型
# model.train()
# for epoch in range(3):
#     for b_input_ids, b_labels in train_loader:
#         model.zero_grad()
#         outputs = model(b_input_ids, labels=b_labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#     print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# # 評估模型
# model.eval()
# with torch.no_grad():
#     test_outputs = model(test_inputs)
#     predicted_labels = torch.argmax(test_outputs.logits, dim=1)

# # 產生分類報告
# report = classification_report(test_labels.cpu(), predicted_labels.cpu())
# print(report)



Epoch 1, Loss: 0.29902413487434387
Epoch 2, Loss: 0.3729929029941559
Epoch 3, Loss: 0.9795500636100769
              precision    recall  f1-score   support

           0       0.81      0.81      0.81       416
           1       0.79      0.79      0.79       384

    accuracy                           0.80       800
   macro avg       0.80      0.80      0.80       800
weighted avg       0.80      0.80      0.80       800



# 直接使用 model 做預測

In [7]:
# import pandas as pd
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# import torch
# from torch.nn.functional import softmax
# from sklearn.metrics import classification_report

# # 設置設備
# device = "cpu"

# # 載入預訓練的 tokenizer 和模型
# tokenizer = AutoTokenizer.from_pretrained('assemblyai/bert-large-uncased-sst2')
# model = AutoModelForSequenceClassification.from_pretrained('assemblyai/bert-large-uncased-sst2')
# model.to(device)
# model.eval()

# # 轉換文本為模型輸入格式
# def prepare_data(texts):
#     encoding = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
#     return encoding['input_ids'], encoding['attention_mask']

# # 使用模型進行批次預測
# def predict(texts):
#     input_ids, attention_mask = prepare_data(texts)
#     input_ids = input_ids.to(device)
#     attention_mask = attention_mask.to(device)
#     with torch.no_grad():
#         outputs = model(input_ids, attention_mask=attention_mask)
#     logits = outputs.logits
#     probabilities = softmax(logits, dim=1)
#     return probabilities

# # 預測情感
# probabilities = predict(merged_data['TEXT'].tolist())
# predicted_labels = torch.argmax(probabilities, dim=1).numpy()

# # 實際標籤
# real_labels = merged_data['LABEL'].tolist()

# # 計算分類報告
# report = classification_report(real_labels, predicted_labels)
# print(report)

              precision    recall  f1-score   support

           0       0.78      0.80      0.79        49
           1       0.80      0.78      0.79        51

    accuracy                           0.79       100
   macro avg       0.79      0.79      0.79       100
weighted avg       0.79      0.79      0.79       100

