In [1]:
import pandas as pd
raw_data = pd.read_csv('train_2022.csv')
raw_data

Unnamed: 0,row_id,TEXT,LABEL
0,0,director dirk shafer and co-writer greg hinton...,0
1,1,"a charming , quirky and leisurely paced scotti...",1
2,2,"the price was good , and came quickly though ...",1
3,3,i was looking forward to this game for a coupl...,0
4,4,arguably the year 's silliest and most incoher...,0
...,...,...,...
1995,1995,an imaginative comedy\/thriller .,1
1996,1996,a savvy exploration of paranoia and insecurity...,1
1997,1997,on the other hand for power grating you ve got...,1
1998,1998,"like dickens with his passages , mcgrath craft...",1


In [2]:
import pandas as pd
test_data = pd.read_csv('test_no_answer_2022.csv')
test_data

Unnamed: 0,row_id,TEXT
0,0,good to know if you can t find these elsewhere .
1,1,love it ! the grill plates come out and pop i...
2,2,i m convinced this was a poorly executed refur...
3,3,i would never have complained about that if it...
4,4,"the photo shows the same whole , large candie..."
...,...,...
10995,10995,i didn t quite get it the first time .
10996,10996,i ve tried installing with and without the oem...
10997,10997,i was parked at a truck stop in the cincinnati...
10998,10998,i recently bought this case after seeing some ...


# 資料前處理

In [2]:
def data_lowercase(df):
    new_df = df.copy();
    new_df['TEXT'] = new_df['TEXT'].str.lower()
    return new_df
import contractions
def data_contraction(df):
    new_df = df.copy();
    new_df['TEXT'] = new_df['TEXT'].apply(lambda x: contractions.fix(x))
    return new_df
import re
def data_remove_tags_punctuations_numbers(df):
    new_df = df.copy();
    def remove_tags_punctuations_numbers(sentense):
        sentense = re.sub(r'<[^>]+>', '', sentense)
        sentense = re.sub(r'[^\w\s]', '', sentense)
        sentense = re.sub(r'\d+', '', sentense)
        return sentense;
    new_df['TEXT'] = new_df['TEXT'].apply(lambda x: remove_tags_punctuations_numbers(x))
    return new_df
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def data_remove_stopword(df):
    new_df = df.copy();
    def remove_stopword(sentence):
        sentence_arr = sentence.split()
        filtered_sentence = [word for word in sentence_arr if word.lower() not in stop_words]
        filtered_sentence = ' '.join(filtered_sentence)
        return filtered_sentence
    new_df['TEXT'] = new_df['TEXT'].apply(lambda x: remove_stopword(x))
    return new_df
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
lemmatizer = WordNetLemmatizer()

def data_lemmatize_text(df):
    new_df = df.copy();
    def lemmatize_text(text):
        tokens = word_tokenize(text)
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        return ' '.join(lemmatized_tokens)
    new_df['TEXT'] = new_df['TEXT'].apply(lambda x: lemmatize_text(x))
    return new_df
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer(language='english')

def data_stem(df):
    new_df = df.copy();
    new_df['TEXT'] = new_df['TEXT'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
    return new_df
def data_preprocess(df):
    df = data_lowercase(df)
    df = data_contraction(df)
    df = data_remove_tags_punctuations_numbers(df)
    df = data_remove_stopword(df)
    return df

# BERT

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report
import torch
import pandas as pd
from torch.nn.utils.rnn import pad_sequence

# 載入預訓練的 BERT tokenizer 和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 表示二分類，正向和負向情感

# 轉換成 DataFrame
train_data = raw_data.copy()
# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

train_data['input_ids'] = train_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式，對文本進行填充
inputs = pad_sequence(train_data['input_ids'].tolist(), batch_first=True)

# 使用模型進行預測
with torch.no_grad():
    outputs = model(inputs)

# 從模型輸出中獲取預測結果
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# 真實標籤
true_labels = train_data['LABEL'].tolist()

# 計算分類報告
report = classification_report(true_labels, predicted_labels)
print(report)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


              precision    recall  f1-score   support

           0       0.40      0.05      0.09      1000
           1       0.49      0.92      0.64      1000

    accuracy                           0.49      2000
   macro avg       0.45      0.49      0.37      2000
weighted avg       0.45      0.49      0.37      2000



# 前處理 + BERT

In [6]:
%%time
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report
import torch
import pandas as pd
from torch.nn.utils.rnn import pad_sequence

# 載入預訓練的 BERT tokenizer 和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 表示二分類，正向和負向情感

# 資料
data = {
    'TEXT': [
        "director dirk shafer and co-writer greg hinton ride the dubious divide where gay porn reaches for serious drama .",
        "a charming , quirky and leisurely paced scottish comedy -- except with an outrageous central gimmick that could have been a reject from monty python 's meaning of life .",
        # 其他資料行
    ],
    'LABEL': [
        0,
        1,
        # 其他標籤
    ]
}

# 轉換成 DataFrame
train_data = raw_data.copy()
train_data= data_preprocess(train_data)
# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

train_data['input_ids'] = train_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式，對文本進行填充
inputs = pad_sequence(train_data['input_ids'].tolist(), batch_first=True)

# 使用模型進行預測
with torch.no_grad():
    outputs = model(inputs)

# 從模型輸出中獲取預測結果
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# 真實標籤
true_labels = train_data['LABEL'].tolist()

# 計算分類報告
report = classification_report(true_labels, predicted_labels)
print(report)

  torch.utils._pytree._register_pytree_node(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


KeyboardInterrupt: 

# BERT + Fine Tune

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence

# 載入預訓練的 BERT tokenizer 和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 表示二分類，正向和負向情感


# 轉換成 DataFrame
train_data = raw_data.copy()

# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

train_data['input_ids'] = train_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式
inputs = pad_sequence(train_data['input_ids'].tolist(), batch_first=True)

# 將資料拆分為訓練集和測試集
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, train_data['LABEL'].tolist(), test_size=0.2)

# 創建 PyTorch DataLoader
train_dataset = TensorDataset(train_inputs, torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# 定義 optimizer 和損失函數
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 訓練模型
model.train()
for epoch in range(5):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

# 評估模型
model.eval()
with torch.no_grad():
    outputs = model(test_inputs)
    predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# 計算分類報告
report = classification_report(test_labels, predicted_labels)
print(report)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


              precision    recall  f1-score   support

           0       0.73      0.77      0.75       189
           1       0.78      0.75      0.77       211

    accuracy                           0.76       400
   macro avg       0.76      0.76      0.76       400
weighted avg       0.76      0.76      0.76       400



# BERT + Fine Tune + 調整參數

In [8]:
%%time
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence

# Configurable parameters
batch_size = 128  # Default batch size
learning_rate = 5e-5  # Default learning rate
num_train_epochs = 10  # Default number of training epochs
weight_decay = 0.00  # Default weight decay

# Load the pre-trained BERT tokenizer and model with two labels for binary classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Assuming 'raw_data' is a DataFrame with 'TEXT' and 'LABEL' columns
train_data = raw_data.copy()

# Tokenize text and convert it to token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

train_data['input_ids'] = train_data['TEXT'].apply(tokenize_text)

# Convert to a format that can be input into the model
inputs = pad_sequence(train_data['input_ids'].tolist(), batch_first=True)

# Split data into training and testing sets
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, train_data['LABEL'].tolist(), test_size=0.2)

# Create PyTorch DataLoader
train_dataset = TensorDataset(train_inputs, torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define optimizer with weight decay and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model
model.train()
for epoch in range(num_train_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    outputs = model(test_inputs)
    predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# Calculate and print the classification report
report = classification_report(test_labels, predicted_labels)
print(report)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


              precision    recall  f1-score   support

           0       0.84      0.69      0.76       212
           1       0.71      0.86      0.78       188

    accuracy                           0.77       400
   macro avg       0.78      0.77      0.77       400
weighted avg       0.78      0.77      0.77       400

CPU times: total: 5h 41min 42s
Wall time: 1h 52min 22s


# preprocess + BERT + Fine Tune

In [3]:
%time
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence

# 載入預訓練的 BERT tokenizer 和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 表示二分類，正向和負向情感


# 轉換成 DataFrame
train_data = raw_data.copy()
train_data= data_preprocess(train_data)

# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

train_data['input_ids'] = train_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式
inputs = pad_sequence(train_data['input_ids'].tolist(), batch_first=True)

# 將資料拆分為訓練集和測試集
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, train_data['LABEL'].tolist(), test_size=0.2)

# 創建 PyTorch DataLoader
train_dataset = TensorDataset(train_inputs, torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# 定義 optimizer 和損失函數
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 訓練模型
model.train()
for epoch in range(5):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

# 評估模型
model.eval()
with torch.no_grad():
    outputs = model(test_inputs)
    predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# 計算分類報告
report = classification_report(test_labels, predicted_labels)
print(report)

  torch.utils._pytree._register_pytree_node(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


              precision    recall  f1-score   support

           0       0.68      0.66      0.67       183
           1       0.72      0.74      0.73       217

    accuracy                           0.70       400
   macro avg       0.70      0.70      0.70       400
weighted avg       0.70      0.70      0.70       400

CPU times: total: 1h 8min 16s
Wall time: 54min 56s


# RoBERTa (Robustly Optimized BERT Pretraining Approach)

In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence

# Load pre-trained RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)  # 2 for binary classification, positive and negative sentiment

# Convert to DataFrame (assuming you've loaded train_data from CSV)
train_data = raw_data.copy()

# Tokenize text using tokenizer
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

train_data['input_ids'] = train_data['TEXT'].apply(tokenize_text)

# Convert to model input format
inputs = pad_sequence(train_data['input_ids'].tolist(), batch_first=True)

# Split data into train and test sets
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, train_data['LABEL'].tolist(), test_size=0.2)

# Create PyTorch DataLoader
train_dataset = TensorDataset(train_inputs, torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model
model.train()
for epoch in range(5):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)[0]  # RoBERTa's model returns a tuple, so we take the first element
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    outputs = model(torch.tensor(test_inputs))[0]
    predicted_labels = torch.argmax(outputs, dim=1).tolist()

# Calculate classification report
report = classification_report(test_labels, predicted_labels)
print(report)

  torch.utils._pytree._register_pytree_node(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  outputs = model(torch.tensor(test_inputs))[0]


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       206
           1       0.48      1.00      0.65       194

    accuracy                           0.48       400
   macro avg       0.24      0.50      0.33       400
weighted avg       0.24      0.48      0.32       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# DistilBERT + Fine Tune

In [2]:
%%time
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence

# 載入預訓練的 DistilBERT tokenizer 和模型
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)  # 2 表示二分類，正向和負向情感

# 轉換成 DataFrame
train_data = raw_data.copy()

# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

train_data['input_ids'] = train_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式
inputs = pad_sequence(train_data['input_ids'].tolist(), batch_first=True)

# 將資料拆分為訓練集和測試集
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, train_data['LABEL'].tolist(), test_size=0.2)

# 創建 PyTorch DataLoader
train_dataset = TensorDataset(train_inputs, torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# 定義 optimizer 和損失函數
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 訓練模型
model.train()
for epoch in range(5):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

# 評估模型
model.eval()
with torch.no_grad():
    outputs = model(test_inputs)
    predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# 計算分類報告
report = classification_report(test_labels, predicted_labels)
print(report)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


              precision    recall  f1-score   support

           0       0.78      0.79      0.79       215
           1       0.75      0.75      0.75       185

    accuracy                           0.77       400
   macro avg       0.77      0.77      0.77       400
weighted avg       0.77      0.77      0.77       400

CPU times: total: 1h 42min 58s
Wall time: 37min 36s


# bert-large-uncased + fine tune

In [3]:
%%time

from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence

# 載入預訓練的 BERT tokenizer 和模型
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)  # 2 表示二分類，正向和負向情感

# 轉換成 DataFrame
train_data = raw_data.copy()

# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

train_data['input_ids'] = train_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式
inputs = pad_sequence(train_data['input_ids'].tolist(), batch_first=True)

# 將資料拆分為訓練集和測試集
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, train_data['LABEL'].tolist(), test_size=0.2)

# 創建 PyTorch DataLoader
train_dataset = TensorDataset(train_inputs, torch.tensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# 定義 optimizer 和損失函數
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 訓練模型
model.train()
for epoch in range(5):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

# 評估模型
model.eval()
with torch.no_grad():
    outputs = model(test_inputs)
    predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# 計算分類報告
report = classification_report(test_labels, predicted_labels)
print(report)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


              precision    recall  f1-score   support

           0       0.83      0.88      0.85       194
           1       0.88      0.83      0.86       206

    accuracy                           0.85       400
   macro avg       0.86      0.86      0.85       400
weighted avg       0.86      0.85      0.86       400

CPU times: total: 35min 53s
Wall time: 1h 3min 35s


# Test 資料集

In [10]:
raw_data

Unnamed: 0,row_id,TEXT,LABEL
0,0,director dirk shafer and co-writer greg hinton...,0
1,1,"a charming , quirky and leisurely paced scotti...",1
2,2,"the price was good , and came quickly though ...",1
3,3,i was looking forward to this game for a coupl...,0
4,4,arguably the year 's silliest and most incoher...,0
...,...,...,...
1995,1995,an imaginative comedy\/thriller .,1
1996,1996,a savvy exploration of paranoia and insecurity...,1
1997,1997,on the other hand for power grating you ve got...,1
1998,1998,"like dickens with his passages , mcgrath craft...",1


In [5]:
import datetime
import pytz
import pandas as pd
def export_csv(df,name):
  now = datetime.datetime.now().astimezone(pytz.timezone('Asia/Taipei'))
  formatted_time = now.strftime('%Y%m%d')
  df.to_csv('result/'+ formatted_time + '_' + name + ".csv", index=False,encoding="utf_8_sig")

In [4]:
%%time
# 載入預測資料集
test_data = pd.read_csv('test_no_answer_2022.csv')

# 使用 tokenizer 將文本轉換為 token IDs
def tokenize_text(text):
    return tokenizer(text, return_tensors='pt')['input_ids'][0]

test_data['input_ids'] = test_data['TEXT'].apply(tokenize_text)

# 轉換成可以輸入模型的格式
test_inputs = pad_sequence(test_data['input_ids'].tolist(), batch_first=True)

# 創建 PyTorch DataLoader
test_dataset = TensorDataset(test_inputs)
test_loader = DataLoader(test_dataset, batch_size=8)

# 使用模型進行預測
model.eval()
predictions = []
with torch.no_grad():
    for inputs in test_loader:
        outputs = model(inputs[0])  # inputs[0] 是 token IDs
        predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()
        predictions.extend(predicted_labels)

# 將預測結果添加到測試數據集中
test_data['LABEL'] = predictions
# 保存預測結果到 CSV 文件
export_csv(test_data.drop(columns=['TEXT','input_ids']),'naming')

# 打印預測結果
print(test_data[['TEXT', 'PREDICTED_LABEL']])

                                                    TEXT  PREDICTED_LABEL
0       good to know if you can t find these elsewhere .                0
1      love it !  the grill plates come out and pop i...                1
2      i m convinced this was a poorly executed refur...                0
3      i would never have complained about that if it...                0
4      the photo shows the same whole ,  large candie...                0
...                                                  ...              ...
10995             i didn t quite get it the first time .                0
10996  i ve tried installing with and without the oem...                1
10997  i was parked at a truck stop in the cincinnati...                0
10998  i recently bought this case after seeing some ...                1
10999  the keyboard types only % of the time and the ...                0

[11000 rows x 2 columns]
CPU times: total: 6min 47s
Wall time: 14min 35s
