In [1]:
import pandas as pd
train_2022 = pd.read_csv('train_2022.csv')
data_augmentation_chatGPT = pd.read_csv('data_augmentation_chatGPT.csv')
data_augmentation_random_2_words = pd.read_csv('data_augmentation_random_2_words.csv')
data_augmentation_random_3_words = pd.read_csv('data_augmentation_random_3_words.csv')
translated_en_data = pd.read_csv('translated_en_data.csv')
amazon_short_text_data = pd.read_csv('amazon_short_text_data.csv')

In [2]:
amazon = amazon_short_text_data.groupby('LABEL').head(3000)
amazon

Unnamed: 0,LABEL,TEXT,row_id
0,0,The Worst! A complete waste of time. Typograph...,0
1,0,sizes recomended in the size chart are not rea...,1
2,0,Nothing you don't already know If you have eve...,2
3,0,Doesn't work on a Mac It clearly says on line ...,3
4,1,Alaska sourdough REad most of the book while v...,4
...,...,...,...
7106,0,Too hard to read. I returned it. The print is ...,7106
7107,0,LARGE PRINT BIBLE THE LARGE PRINT WORDS VERY L...,7107
7110,0,"Lol the Bible sucks Responsible for hate, bigo...",7110
7111,0,"boring book It's a really boring book, I am su...",7111


In [3]:
merged_data = pd.concat([train_2022, data_augmentation_chatGPT, amazon], ignore_index=True)
merged_data = merged_data.reset_index(drop=True)
merged_data = merged_data.drop(columns=['row_id'])
merged_data['row_id'] = merged_data.index
merged_data

Unnamed: 0,TEXT,LABEL,row_id
0,director dirk shafer and co-writer greg hinton...,0,0
1,"a charming , quirky and leisurely paced scotti...",1,1
2,"the price was good , and came quickly though ...",1,2
3,i was looking forward to this game for a coupl...,0,3
4,arguably the year 's silliest and most incoher...,0,4
...,...,...,...
9995,Too hard to read. I returned it. The print is ...,0,9995
9996,LARGE PRINT BIBLE THE LARGE PRINT WORDS VERY L...,0,9996
9997,"Lol the Bible sucks Responsible for hate, bigo...",0,9997
9998,"boring book It's a really boring book, I am su...",0,9998


In [4]:
%%time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# 確認是否有可用的 CUDA 設備，並設定使用的設備
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = "cpu"

# 載入預訓練的 tokenizer 和模型，這裡指定了 assemblyai 提供的 BERT large 模型
tokenizer = AutoTokenizer.from_pretrained('assemblyai/bert-large-uncased-sst2')
model = AutoModelForSequenceClassification.from_pretrained('assemblyai/bert-large-uncased-sst2')
model.to(device)

# 轉換文本為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt', padding=True, truncation=True)['input_ids'][0]

merged_data['input_ids'] = merged_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式，並移動到 CUDA 設備上
inputs = pad_sequence(merged_data['input_ids'].tolist(), batch_first=True).to(device)
labels = torch.tensor(merged_data['LABEL'].tolist()).to(device)

# 將資料拆分為訓練集和測試集
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, labels, test_size=0.2, random_state=42)

# 創建 PyTorch DataLoader
train_dataset = TensorDataset(train_inputs, train_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# 定義 optimizer 和損失函數
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 訓練模型
model.train()
for epoch in range(5):  # 進行個 epoch 的訓練
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):  # 使用tqdm包裹train_loader
        b_input_ids, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
# 評估模型
model.eval()
with torch.no_grad():
    test_outputs = model(test_inputs)
    predicted_labels = torch.argmax(test_outputs.logits, dim=1)

# 產生分類報告
report = classification_report(test_labels.cpu(), predicted_labels.cpu())
print(report)

Epoch 1:   0%|                                                                                | 0/1000 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch 1: 100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [05:05<00:00,  3.28it/s]


Epoch 1, Loss: 0.13762561976909637


Epoch 2: 100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [05:11<00:00,  3.21it/s]


Epoch 2, Loss: 0.04803113639354706


Epoch 3: 100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [05:12<00:00,  3.20it/s]


Epoch 3, Loss: 0.07380876690149307


Epoch 4: 100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [05:12<00:00,  3.20it/s]


Epoch 4, Loss: 0.08239731192588806


Epoch 5: 100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [05:12<00:00,  3.20it/s]


Epoch 5, Loss: 0.01663840189576149
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       987
           1       0.94      0.91      0.93      1013

    accuracy                           0.93      2000
   macro avg       0.93      0.93      0.93      2000
weighted avg       0.93      0.93      0.93      2000

CPU times: total: 7min 24s
Wall time: 27min 46s


In [5]:
import datetime
import pytz
import pandas as pd
def export_csv(df,name):
  now = datetime.datetime.now().astimezone(pytz.timezone('Asia/Taipei'))
  formatted_time = now.strftime('%Y%m%d')
  df.to_csv('result/'+ formatted_time + '_' + name + ".csv", index=False,encoding="utf_8_sig")

In [6]:
%%time
# 設定 device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# 載入預測資料集
test_data = pd.read_csv('test_no_answer_2022.csv')

# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

test_data['input_ids'] = test_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式
test_inputs = pad_sequence(test_data['input_ids'].tolist(), batch_first=True).to(device)

# 創建 PyTorch DataLoader
test_dataset = TensorDataset(test_inputs)
test_loader = DataLoader(test_dataset, batch_size=8)

# 使用模型進行預測
model.eval()
predictions = []
with torch.no_grad():
    for inputs in test_loader:
        outputs = model(inputs[0])  # 確保 inputs[0] 已在 GPU 上
        predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()
        predictions.extend(predicted_labels)

# 將預測結果添加到測試數據集中
test_data['LABEL'] = predictions
# 保存預測結果到 CSV 文件
# 需要你自己定義 export_csv 函數，或使用 pandas 的 to_csv 方法
export_csv(test_data[['row_id', 'LABEL']], 'bert_large_uncased_sst2_Amazon_10000')
# 打印預測結果
print(test_data[['row_id','TEXT', 'LABEL']])

       row_id                                               TEXT  LABEL
0           0   good to know if you can t find these elsewhere .      1
1           1  love it !  the grill plates come out and pop i...      1
2           2  i m convinced this was a poorly executed refur...      0
3           3  i would never have complained about that if it...      1
4           4  the photo shows the same whole ,  large candie...      0
...       ...                                                ...    ...
10995   10995             i didn t quite get it the first time .      0
10996   10996  i ve tried installing with and without the oem...      1
10997   10997  i was parked at a truck stop in the cincinnati...      0
10998   10998  i recently bought this case after seeing some ...      1
10999   10999  the keyboard types only % of the time and the ...      0

[11000 rows x 3 columns]
CPU times: total: 58.2 s
Wall time: 1min


# 多加一層MLP 用那層tune

In [7]:
# import torch
# import torch.nn as nn
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
# from torch.utils.data import DataLoader, TensorDataset
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report
# from torch.nn.utils.rnn import pad_sequence

# # 設定設備
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # 載入預訓練的 tokenizer 和模型
# tokenizer = AutoTokenizer.from_pretrained('assemblyai/bert-large-uncased-sst2')
# model = AutoModelForSequenceClassification.from_pretrained('assemblyai/bert-large-uncased-sst2')

# # 凍結預訓練層的權重
# for param in model.parameters():
#     param.requires_grad = False

# # 新增一個自定義層，只有這層是可訓練的
# num_labels = 2  # 假設是一個二分類問題
# custom_classifier = nn.Sequential(
#     nn.Linear(model.classifier.in_features, num_labels)
# )

# # 將自定義層加到模型上
# model.classifier = custom_classifier
# model.to(device)  # 確保整個模型都在同一設備上

# # 轉換文本為 token IDs
# def tokenize_text(text):
#     return tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)['input_ids'][0]

# merged_data['input_ids'] = merged_data['TEXT'].apply(tokenize_text)
# inputs = pad_sequence(merged_data['input_ids'].tolist(), batch_first=True).to(device)
# labels = torch.tensor(merged_data['LABEL'].tolist()).to(device)

# # 將資料拆分為訓練集和測試集
# train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, labels, test_size=0.2, random_state=42)

# # 創建 DataLoader
# train_dataset = TensorDataset(train_inputs, train_labels)
# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# # 定義 optimizer 只針對新層
# optimizer = AdamW(model.classifier.parameters(), lr=2e-5)
# loss_fn = torch.nn.CrossEntropyLoss()

# # 訓練模型
# model.train()
# for epoch in range(5):
#     for b_input_ids, b_labels in train_loader:
#         model.zero_grad()
#         outputs = model(b_input_ids, labels=b_labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#     print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# # 評估模型
# model.eval()
# with torch.no_grad():
#     test_outputs = model(test_inputs)
#     predicted_labels = torch.argmax(test_outputs.logits, dim=1)

# # 產生分類報告
# report = classification_report(test_labels.cpu(), predicted_labels.cpu())
# print(report)

# 直接使用 model 做預測

In [8]:
# import pandas as pd
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# import torch
# from torch.nn.functional import softmax
# from sklearn.metrics import classification_report

# # 設置設備
# device = "cpu"

# # 載入預訓練的 tokenizer 和模型
# tokenizer = AutoTokenizer.from_pretrained('assemblyai/bert-large-uncased-sst2')
# model = AutoModelForSequenceClassification.from_pretrained('assemblyai/bert-large-uncased-sst2')
# model.to(device)
# model.eval()

# # 轉換文本為模型輸入格式
# def prepare_data(texts):
#     encoding = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
#     return encoding['input_ids'], encoding['attention_mask']

# # 使用模型進行批次預測
# def predict(texts):
#     input_ids, attention_mask = prepare_data(texts)
#     input_ids = input_ids.to(device)
#     attention_mask = attention_mask.to(device)
#     with torch.no_grad():
#         outputs = model(input_ids, attention_mask=attention_mask)
#     logits = outputs.logits
#     probabilities = softmax(logits, dim=1)
#     return probabilities

# # 預測情感
# probabilities = predict(merged_data['TEXT'].tolist())
# predicted_labels = torch.argmax(probabilities, dim=1).numpy()

# # 實際標籤
# real_labels = merged_data['LABEL'].tolist()

# # 計算分類報告
# report = classification_report(real_labels, predicted_labels)
# print(report)