In [1]:
import pandas as pd
train_2022 = pd.read_csv('train_2022.csv')
data_augmentation_chatGPT = pd.read_csv('data_augmentation_chatGPT.csv')
data_augmentation_random_2_words = pd.read_csv('data_augmentation_random_2_words.csv')
data_augmentation_random_3_words = pd.read_csv('data_augmentation_random_3_words.csv')
translated_en_data = pd.read_csv('translated_en_data.csv')

In [6]:
merged_data = pd.concat([train_2022,data_augmentation_chatGPT], ignore_index=True)
# merged_data = merged_data.sample(n=100, random_state=42)
merged_data

Unnamed: 0,row_id,TEXT,LABEL
0,0,director dirk shafer and co-writer greg hinton...,0
1,1,"a charming , quirky and leisurely paced scotti...",1
2,2,"the price was good , and came quickly though ...",1
3,3,i was looking forward to this game for a coupl...,0
4,4,arguably the year 's silliest and most incoher...,0
...,...,...,...
3995,1995,A creative comedy/thriller.,1
3996,1996,Explores paranoia and insecurity in America's ...,1
3997,1997,Good for power grating.,1
3998,1998,McGrath's variation on the novel crafts moving...,1


In [7]:
%%time
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence

# 確認CUDA是否可用
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
# 載入預訓練的 DistilBERT tokenizer 和模型，並將它們移動到CUDA設備上
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english', num_labels=2)
model.to(device)

# 讀取數據
train_data = merged_data.copy()

# 使用 tokenizer 將文本轉換為 token IDs 和注意力遮罩
def tokenize_text(text):
    return tokenizer.encode_plus(
        text, 
        max_length=128, 
        padding='max_length', 
        truncation=True, 
        return_tensors="pt"
    )

train_data['encoding'] = train_data['TEXT'].apply(tokenize_text)
train_data['input_ids'] = train_data['encoding'].apply(lambda x: x['input_ids'].squeeze(0))
train_data['attention_mask'] = train_data['encoding'].apply(lambda x: x['attention_mask'].squeeze(0))

# 將資料拆分為訓練集和測試集
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(
    torch.stack(train_data['input_ids'].tolist()),
    torch.stack(train_data['attention_mask'].tolist()),
    train_data['LABEL'].tolist(), 
    test_size=0.2,
    random_state=42
)

# 創建 PyTorch DataLoader
train_dataset = TensorDataset(train_inputs, train_masks, torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# 定義 optimizer 和損失函數
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 訓練模型
model.train()
for epoch in range(3):
    for batch in train_loader:
        inputs, masks, labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# 評估模型
model.eval()
with torch.no_grad():
    test_inputs, test_masks = test_inputs.to(device), test_masks.to(device)
    outputs = model(input_ids=test_inputs, attention_mask=test_masks)
    predicted_labels = torch.argmax(outputs.logits, dim=1).cpu().numpy()

# 計算分類報告
report = classification_report(test_labels, predicted_labels)
print(report)



Epoch 1, Loss: 0.24324165284633636
Epoch 2, Loss: 0.13849379122257233
Epoch 3, Loss: 0.007841243408620358
              precision    recall  f1-score   support

           0       0.92      0.76      0.83       416
           1       0.78      0.93      0.85       384

    accuracy                           0.84       800
   macro avg       0.85      0.84      0.84       800
weighted avg       0.85      0.84      0.84       800

CPU times: total: 3h 15min 53s
Wall time: 1h 44min 33s


In [8]:
import datetime
import pytz
import pandas as pd
def export_csv(df,name):
  now = datetime.datetime.now().astimezone(pytz.timezone('Asia/Taipei'))
  formatted_time = now.strftime('%Y%m%d')
  df.to_csv('result/'+ formatted_time + '_' + name + ".csv", index=False,encoding="utf_8_sig")

In [9]:
%%time
# 預處理測試數據
test_data = pd.read_csv('test_no_answer_2022.csv')
result = test_data.copy()
def tokenize_text(text):
    return tokenizer.encode_plus(
        text, 
        max_length=128, 
        padding='max_length', 
        truncation=True, 
        return_tensors="pt"
    )

test_data['encoding'] = test_data['TEXT'].apply(tokenize_text)
test_data['input_ids'] = test_data['encoding'].apply(lambda x: x['input_ids'].squeeze(0))
test_data['attention_mask'] = test_data['encoding'].apply(lambda x: x['attention_mask'].squeeze(0))

# 將處理好的數據轉換為 DataLoader
test_inputs = torch.stack(test_data['input_ids'].tolist())
test_masks = torch.stack(test_data['attention_mask'].tolist())
test_dataset = TensorDataset(test_inputs, test_masks)
test_loader = DataLoader(test_dataset, batch_size=32)  # 可以調整batch size

# 使用模型進行預測
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        inputs, masks = tuple(t.to(device) for t in batch)
        outputs = model(input_ids=inputs, attention_mask=masks)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())

# 將預測結果附加到 DataFrame 並保存或打印結果
result['LABEL'] = predictions
print(result[['row_id', 'TEXT', 'LABEL']])

# 可選：將預測結果保存到 CSV 文件
export_csv(result.drop(columns=['TEXT']),'Distill_BERT_FineTune_SST2_4000_data')

       row_id                                               TEXT  LABEL
0           0   good to know if you can t find these elsewhere .      0
1           1  love it !  the grill plates come out and pop i...      1
2           2  i m convinced this was a poorly executed refur...      0
3           3  i would never have complained about that if it...      0
4           4  the photo shows the same whole ,  large candie...      1
...       ...                                                ...    ...
10995   10995             i didn t quite get it the first time .      0
10996   10996  i ve tried installing with and without the oem...      1
10997   10997  i was parked at a truck stop in the cincinnati...      0
10998   10998  i recently bought this case after seeing some ...      1
10999   10999  the keyboard types only % of the time and the ...      0

[11000 rows x 3 columns]
CPU times: total: 1h 30min 20s
Wall time: 31min 4s


In [6]:
test_predictions = pd.read_csv('test_predictions.csv')
test_predictions

Unnamed: 0,row_id,TEXT,encoding,input_ids,attention_mask,predicted_labels
0,0,good to know if you can t find these elsewhere .,"{'input_ids': tensor([[ 101, 2204, 2000, 2113,...","tensor([ 101, 2204, 2000, 2113, 2065, 2017, 20...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1
1,1,love it ! the grill plates come out and pop i...,"{'input_ids': tensor([[ 101, 2293, 2009, ...","tensor([ 101, 2293, 2009, 999, 1996, 186...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1
2,2,i m convinced this was a poorly executed refur...,"{'input_ids': tensor([[ 101, 1045, 1049, 6...","tensor([ 101, 1045, 1049, 6427, 2023, 20...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",0
3,3,i would never have complained about that if it...,"{'input_ids': tensor([[ 101, 1045, 2052, 2...","tensor([ 101, 1045, 2052, 2196, 2031, 108...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1
4,4,"the photo shows the same whole , large candie...","{'input_ids': tensor([[ 101, 1996, 6302, 3...","tensor([ 101, 1996, 6302, 3065, 1996, 21...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1
...,...,...,...,...,...,...
10995,10995,i didn t quite get it the first time .,"{'input_ids': tensor([[ 101, 1045, 2134, 1056,...","tensor([ 101, 1045, 2134, 1056, 3243, 2131, 20...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,...",0
10996,10996,i ve tried installing with and without the oem...,"{'input_ids': tensor([[ 101, 1045, 2310, 2...","tensor([ 101, 1045, 2310, 2699, 23658, 20...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",0
10997,10997,i was parked at a truck stop in the cincinnati...,"{'input_ids': tensor([[ 101, 1045, 2001, 9083,...","tensor([ 101, 1045, 2001, 9083, 2012, 1037, 47...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",0
10998,10998,i recently bought this case after seeing some ...,"{'input_ids': tensor([[ 101, 1045, 3728, 4149,...","tensor([ 101, 1045, 3728, 4149, 2023, 2553, 20...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1


In [1]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


'POSITIVE'