In [11]:
import pandas as pd
train_2022 = pd.read_csv('train_2022.csv')
data_augmentation_chatGPT = pd.read_csv('data_augmentation_chatGPT.csv')
data_augmentation_random_2_words = pd.read_csv('data_augmentation_random_2_words.csv')
data_augmentation_random_3_words = pd.read_csv('data_augmentation_random_3_words.csv')
translated_en_data = pd.read_csv('translated_en_data.csv')
amazon_short_text_data = pd.read_csv('amazon_short_text_data.csv')

In [14]:
random = 9000
amazon_short_text_zero_data = amazon_short_text_data[amazon_short_text_data['LABEL'] == 0]
amazon_short_text_zero_data = amazon_short_text_zero_data.sample(n=random, random_state=42)
amazon_short_text_one_data = amazon_short_text_data[amazon_short_text_data['LABEL'] == 1]
amazon_short_text_one_data = amazon_short_text_one_data.sample(n=random, random_state=42)

In [15]:
merged_data = pd.concat([train_2022, amazon_short_text_zero_data, amazon_short_text_one_data], ignore_index=True)
merged_data = merged_data.reset_index(drop=True)
merged_data = merged_data.drop(columns=['row_id'])
merged_data['row_id'] = merged_data.index
merged_data

Unnamed: 0,TEXT,LABEL,row_id
0,director dirk shafer and co-writer greg hinton...,0,0
1,"a charming , quirky and leisurely paced scotti...",1,1
2,"the price was good , and came quickly though ...",1,2
3,i was looking forward to this game for a coupl...,0,3
4,arguably the year 's silliest and most incoher...,0,4
...,...,...,...
19995,Great piano music Great music if you like clas...,1,19995
19996,Excellent Album Another fantastic album by Arm...,1,19996
19997,iPod Dock and Speaker Station PRODUCT WAS JUST...,1,19997
19998,Yea Man - is the Best Put the Top down and cra...,1,19998


In [16]:
%%time
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# 確認CUDA是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
# 載入預訓練的 DistilBERT tokenizer 和模型，並將它們移動到CUDA設備上
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english', num_labels=2)
model.to(device)

# 讀取數據
train_data = merged_data.copy()

# 使用 tokenizer 將文本轉換為 token IDs 和注意力遮罩
def tokenize_text(text):
    return tokenizer.encode_plus(
        text, 
        max_length=128, 
        padding='max_length', 
        truncation=True, 
        return_tensors="pt"
    )

train_data['encoding'] = train_data['TEXT'].apply(tokenize_text)
train_data['input_ids'] = train_data['encoding'].apply(lambda x: x['input_ids'].squeeze(0))
train_data['attention_mask'] = train_data['encoding'].apply(lambda x: x['attention_mask'].squeeze(0))

# 將資料拆分為訓練集和測試集
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(
    torch.stack(train_data['input_ids'].tolist()),
    torch.stack(train_data['attention_mask'].tolist()),
    train_data['LABEL'].tolist(), 
    test_size=0.2,
    random_state=42
)

# 創建 PyTorch DataLoader
train_dataset = TensorDataset(train_inputs, train_masks, torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=5, shuffle=True)

# 定義 optimizer 和損失函數
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 訓練模型
model.train()
for epoch in range(5):
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):  # 使用tqdm包裹train_loader
        inputs, masks, labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# 評估模型
model.eval()
with torch.no_grad():
    test_inputs, test_masks = test_inputs.to(device), test_masks.to(device)
    outputs = model(input_ids=test_inputs, attention_mask=test_masks)
    predicted_labels = torch.argmax(outputs.logits, dim=1).cpu().numpy()

# 計算分類報告
report = classification_report(test_labels, predicted_labels)
print(report)

Epoch 1: 100%|██████████| 3200/3200 [06:17<00:00,  8.47it/s]


Epoch 1, Loss: 0.009683920070528984


Epoch 2: 100%|██████████| 3200/3200 [06:05<00:00,  8.74it/s]


Epoch 2, Loss: 0.032490458339452744


Epoch 3: 100%|██████████| 3200/3200 [06:00<00:00,  8.88it/s]


Epoch 3, Loss: 0.0017600416904315352


Epoch 4: 100%|██████████| 3200/3200 [05:48<00:00,  9.19it/s]


Epoch 4, Loss: 0.010685762390494347


Epoch 5: 100%|██████████| 3200/3200 [06:03<00:00,  8.81it/s]


Epoch 5, Loss: 0.0009067291393876076


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.46 GiB. GPU 

In [17]:
torch.save(model.state_dict(), 'models/DistilBERT_20000.pth')

In [18]:
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
model.load_state_dict(torch.load('models/DistilBERT_20000.pth'))
# 定義一個函數來處理分批預測
def evaluate_model(model, dataloader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(predicted_labels)
    return predictions

# 創建數據加載器，您可以調整batch_size以避免OOM
batch_size = 16  # 您可以根據GPU的容量進行調整
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

# 計算分類報告
predicted_labels = evaluate_model(model, test_dataloader)
report = classification_report(test_labels, predicted_labels)
print(report)

Evaluating: 100%|██████████| 250/250 [00:16<00:00, 15.34it/s]

              precision    recall  f1-score   support

           0       0.93      0.94      0.93      2045
           1       0.94      0.93      0.93      1955

    accuracy                           0.93      4000
   macro avg       0.93      0.93      0.93      4000
weighted avg       0.93      0.93      0.93      4000






In [19]:
import datetime
import pytz
import pandas as pd
def export_csv(df,name):
  now = datetime.datetime.now().astimezone(pytz.timezone('Asia/Taipei'))
  formatted_time = now.strftime('%Y%m%d')
  df.to_csv('result/'+ formatted_time + '_' + name + ".csv", index=False,encoding="utf_8_sig")

In [21]:
%%time
# 預處理測試數據
test_data = pd.read_csv('test_no_answer_2022.csv')
result = test_data.copy()
def tokenize_text(text):
    return tokenizer.encode_plus(
        text, 
        max_length=128, 
        padding='max_length', 
        truncation=True, 
        return_tensors="pt"
    )

test_data['encoding'] = test_data['TEXT'].apply(tokenize_text)
test_data['input_ids'] = test_data['encoding'].apply(lambda x: x['input_ids'].squeeze(0))
test_data['attention_mask'] = test_data['encoding'].apply(lambda x: x['attention_mask'].squeeze(0))

# 將處理好的數據轉換為 DataLoader
test_inputs = torch.stack(test_data['input_ids'].tolist())
test_masks = torch.stack(test_data['attention_mask'].tolist())
test_dataset = TensorDataset(test_inputs, test_masks)
test_loader = DataLoader(test_dataset, batch_size=32)  # 可以調整batch size

# 使用模型進行預測
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        inputs, masks = tuple(t.to(device) for t in batch)
        outputs = model(input_ids=inputs, attention_mask=masks)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())

# 將預測結果附加到 DataFrame 並保存或打印結果
result['LABEL'] = predictions
print(result[['row_id', 'TEXT', 'LABEL']])

# 可選：將預測結果保存到 CSV 文件
export_csv(result.drop(columns=['TEXT']),'Distill_BERT_FineTune_SST2_20000_data')

       row_id                                               TEXT  LABEL
0           0   good to know if you can t find these elsewhere .      0
1           1  love it !  the grill plates come out and pop i...      1
2           2  i m convinced this was a poorly executed refur...      0
3           3  i would never have complained about that if it...      1
4           4  the photo shows the same whole ,  large candie...      1
...       ...                                                ...    ...
10995   10995             i didn t quite get it the first time .      0
10996   10996  i ve tried installing with and without the oem...      1
10997   10997  i was parked at a truck stop in the cincinnati...      0
10998   10998  i recently bought this case after seeing some ...      1
10999   10999  the keyboard types only % of the time and the ...      0

[11000 rows x 3 columns]
CPU times: total: 26 s
Wall time: 51.5 s


In [6]:
test_predictions = pd.read_csv('test_predictions.csv')
test_predictions

Unnamed: 0,row_id,TEXT,encoding,input_ids,attention_mask,predicted_labels
0,0,good to know if you can t find these elsewhere .,"{'input_ids': tensor([[ 101, 2204, 2000, 2113,...","tensor([ 101, 2204, 2000, 2113, 2065, 2017, 20...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1
1,1,love it ! the grill plates come out and pop i...,"{'input_ids': tensor([[ 101, 2293, 2009, ...","tensor([ 101, 2293, 2009, 999, 1996, 186...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1
2,2,i m convinced this was a poorly executed refur...,"{'input_ids': tensor([[ 101, 1045, 1049, 6...","tensor([ 101, 1045, 1049, 6427, 2023, 20...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",0
3,3,i would never have complained about that if it...,"{'input_ids': tensor([[ 101, 1045, 2052, 2...","tensor([ 101, 1045, 2052, 2196, 2031, 108...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1
4,4,"the photo shows the same whole , large candie...","{'input_ids': tensor([[ 101, 1996, 6302, 3...","tensor([ 101, 1996, 6302, 3065, 1996, 21...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1
...,...,...,...,...,...,...
10995,10995,i didn t quite get it the first time .,"{'input_ids': tensor([[ 101, 1045, 2134, 1056,...","tensor([ 101, 1045, 2134, 1056, 3243, 2131, 20...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,...",0
10996,10996,i ve tried installing with and without the oem...,"{'input_ids': tensor([[ 101, 1045, 2310, 2...","tensor([ 101, 1045, 2310, 2699, 23658, 20...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",0
10997,10997,i was parked at a truck stop in the cincinnati...,"{'input_ids': tensor([[ 101, 1045, 2001, 9083,...","tensor([ 101, 1045, 2001, 9083, 2012, 1037, 47...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",0
10998,10998,i recently bought this case after seeing some ...,"{'input_ids': tensor([[ 101, 1045, 3728, 4149,...","tensor([ 101, 1045, 3728, 4149, 2023, 2553, 20...","tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",1


In [1]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


'POSITIVE'