In [1]:
import pandas as pd
raw_data = pd.read_csv('train_2022.csv')
raw_data

Unnamed: 0,row_id,TEXT,LABEL
0,0,director dirk shafer and co-writer greg hinton...,0
1,1,"a charming , quirky and leisurely paced scotti...",1
2,2,"the price was good , and came quickly though ...",1
3,3,i was looking forward to this game for a coupl...,0
4,4,arguably the year 's silliest and most incoher...,0
...,...,...,...
1995,1995,an imaginative comedy\/thriller .,1
1996,1996,a savvy exploration of paranoia and insecurity...,1
1997,1997,on the other hand for power grating you ve got...,1
1998,1998,"like dickens with his passages , mcgrath craft...",1


In [2]:
import pandas as pd
data_augmentation_random_2_words = pd.read_csv('data_augmentation_random_2_words.csv')
data_augmentation_random_2_words

Unnamed: 0,row_id,TEXT,LABEL
0,0,director dirk broke and co-writer greg hinton ...,0
1,1,"a charming , quirky and leisurely paced scotti...",1
2,2,"the price was my , and came quickly by my prim...",1
3,3,", was great forward to this game for a couple ...",0
4,4,arguably the year 's silliest and most terribl...,0
...,...,...,...
1995,1995,of imaginative comedy\/thriller and,1
1996,1996,"a savvy exploration of paranoia , insecurity i...",1
1997,1997,", the other hand for only grating you ve got s...",1
1998,1998,"like 10-inch with his passages , mcgrath craft...",1


In [3]:
import pandas as pd
data_augmentation_random_3_words = pd.read_csv('data_augmentation_random_3_words.csv')
data_augmentation_random_3_words

Unnamed: 0,row_id,TEXT,LABEL
0,0,director stunts shafer and co-writer greg hint...,0
1,1,"a charming , quirky and company paced spy come...",1
2,2,"the price was good , and came you though the p...",1
3,3,", was looking forward to . game for a couple f...",0
4,4,"arguably the year 's half-hearted , to incoher...",0
...,...,...,...
1995,1995,of piece comedy\/thriller of,1
1996,1996,a savvy kind of paranoia and insecurity in ame...,1
1997,1997,", the other hand for power hundred you ve got ...",1
1998,1998,"like 10-inch . his passages , mcgrath crafts q...",1


In [4]:
import pandas as pd
data_augmentation_chatGPT = pd.read_csv('data_augmentation_chatGPT.csv')
data_augmentation_chatGPT

Unnamed: 0,TEXT,LABEL,row_id
0,Director Dirk Shafer and co-writer Greg Hinton...,0,0
1,"This Scottish comedy is delightful, quirky, an...",1,1
2,"The price was reasonable, and thanks to my Pri...",1,2
3,I had been eagerly anticipating this game for ...,0,3
4,"Arguably, this is the silliest and most confus...",0,4
...,...,...,...
1995,A creative comedy/thriller.,1,1995
1996,Explores paranoia and insecurity in America's ...,1,1996
1997,Good for power grating.,1,1997
1998,McGrath's variation on the novel crafts moving...,1,1998


In [5]:
merged_data = pd.concat([raw_data, data_augmentation_random_2_words, data_augmentation_random_3_words, data_augmentation_chatGPT], ignore_index=True)
merged_data

Unnamed: 0,row_id,TEXT,LABEL
0,0,director dirk shafer and co-writer greg hinton...,0
1,1,"a charming , quirky and leisurely paced scotti...",1
2,2,"the price was good , and came quickly though ...",1
3,3,i was looking forward to this game for a coupl...,0
4,4,arguably the year 's silliest and most incoher...,0
...,...,...,...
7995,1995,A creative comedy/thriller.,1
7996,1996,Explores paranoia and insecurity in America's ...,1
7997,1997,Good for power grating.,1
7998,1998,McGrath's variation on the novel crafts moving...,1


# cuda

In [6]:
%%time
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence

# 確認CUDA是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 載入預訓練的 DistilBERT tokenizer 和模型，並將它們移動到CUDA設備上
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)  # 2 表示二分類，正向和負向情感
model.to(device)

# 轉換成 DataFrame
train_data = merged_data.copy()

# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

train_data['input_ids'] = train_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式，並移動到 CUDA 設備上
inputs = pad_sequence(train_data['input_ids'].tolist(), batch_first=True).to(device)

# 將資料拆分為訓練集和測試集
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, train_data['LABEL'].tolist(), test_size=0.2)

# 創建 PyTorch DataLoader，並設定為使用 CUDA
train_dataset = TensorDataset(train_inputs, torch.tensor(train_labels).to(device))
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# 定義 optimizer 和損失函數
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 訓練模型
model.train()
for epoch in range(5):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        inputs, labels = inputs.to(device), labels.to(device)  # 將資料移動到 CUDA 設備上
        outputs = model(inputs)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

# 評估模型
model.eval()
with torch.no_grad():
    test_inputs = test_inputs.to(device)  # 將測試資料移動到 CUDA 設備上
    outputs = model(test_inputs)
    predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# 計算分類報告
report = classification_report(test_labels, predicted_labels)
print(report)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.17 GiB. GPU 

In [7]:
import datetime
import pytz
import pandas as pd
def export_csv(df,name):
  now = datetime.datetime.now().astimezone(pytz.timezone('Asia/Taipei'))
  formatted_time = now.strftime('%Y%m%d')
  df.to_csv('result/'+ formatted_time + '_' + name + ".csv", index=False,encoding="utf_8_sig")

In [8]:
%%time
# 載入預測資料集
test_data = pd.read_csv('test_no_answer_2022.csv')

# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

test_data['input_ids'] = test_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式
test_inputs = pad_sequence(test_data['input_ids'].tolist(), batch_first=True)

# 創建 PyTorch DataLoader
test_dataset = TensorDataset(test_inputs)
test_loader = DataLoader(test_dataset, batch_size=8)

# 使用模型進行預測
model.eval()
predictions = []
with torch.no_grad():
    for inputs in test_loader:
        outputs = model(inputs[0])  # inputs[0] 是 token IDs
        predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()
        predictions.extend(predicted_labels)

# 將預測結果添加到測試數據集中
test_data['LABEL'] = predictions
# 保存預測結果到 CSV 文件
export_csv(test_data.drop(columns=['TEXT','input_ids']),'Distill_BERT_FineTune_DataAugmentation_ChatGPT_Random_2words_3words')

# 打印預測結果
print(test_data[['TEXT', 'LABEL']])

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [6]:
import torch

# 檢查CUDA是否可用
cuda_available = torch.cuda.is_available()

if cuda_available:
    print("CUDA is available.")
else:
    print("CUDA is not available.")

CUDA is available.
