In [29]:
import pandas as pd
raw_data = pd.read_csv('train_2022.csv', index_col=False).reset_index(drop=True)
add_classify_prefix(raw_data,'TEXT')
raw_data

Unnamed: 0,row_id,TEXT,LABEL
0,0,classify: director dirk shafer and co-writer g...,0
1,1,"classify: a charming , quirky and leisurely pa...",1
2,2,"classify: the price was good , and came quick...",1
3,3,classify: i was looking forward to this game f...,0
4,4,classify: arguably the year 's silliest and mo...,0
...,...,...,...
1995,1995,classify: an imaginative comedy\/thriller .,1
1996,1996,classify: a savvy exploration of paranoia and ...,1
1997,1997,classify: on the other hand for power grating ...,1
1998,1998,"classify: like dickens with his passages , mcg...",1


In [2]:
import pandas as pd
test_data = pd.read_csv('test_no_answer_2022.csv')
test_data

Unnamed: 0,row_id,TEXT
0,0,good to know if you can t find these elsewhere .
1,1,love it ! the grill plates come out and pop i...
2,2,i m convinced this was a poorly executed refur...
3,3,i would never have complained about that if it...
4,4,"the photo shows the same whole , large candie..."
...,...,...
10995,10995,i didn t quite get it the first time .
10996,10996,i ve tried installing with and without the oem...
10997,10997,i was parked at a truck stop in the cincinnati...
10998,10998,i recently bought this case after seeing some ...


In [21]:
def add_classify_prefix(dataframe, text_column):
    dataframe[text_column] = "classify: " + dataframe[text_column].astype(str)
    return dataframe

In [18]:
def data_lowercase(df):
    new_df = df.copy();
    new_df['TEXT'] = new_df['TEXT'].str.lower()
    return new_df
import contractions
def data_contraction(df):
    new_df = df.copy();
    new_df['TEXT'] = new_df['TEXT'].apply(lambda x: contractions.fix(x))
    return new_df
import re
def data_remove_tags_punctuations_numbers(df):
    new_df = df.copy();
    def remove_tags_punctuations_numbers(sentense):
        sentense = re.sub(r'<[^>]+>', '', sentense)
        sentense = re.sub(r'[^\w\s]', '', sentense)
        sentense = re.sub(r'\d+', '', sentense)
        return sentense;
    new_df['TEXT'] = new_df['TEXT'].apply(lambda x: remove_tags_punctuations_numbers(x))
    return new_df
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def data_remove_stopword(df):
    new_df = df.copy();
    def remove_stopword(sentence):
        sentence_arr = sentence.split()
        filtered_sentence = [word for word in sentence_arr if word.lower() not in stop_words]
        filtered_sentence = ' '.join(filtered_sentence)
        return filtered_sentence
    new_df['TEXT'] = new_df['TEXT'].apply(lambda x: remove_stopword(x))
    return new_df
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
lemmatizer = WordNetLemmatizer()

def data_lemmatize_text(df):
    new_df = df.copy();
    def lemmatize_text(text):
        tokens = word_tokenize(text)
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        return ' '.join(lemmatized_tokens)
    new_df['TEXT'] = new_df['TEXT'].apply(lambda x: lemmatize_text(x))
    return new_df
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer(language='english')

def data_stem(df):
    new_df = df.copy();
    new_df['TEXT'] = new_df['TEXT'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
    return new_df
def data_preprocess(df):
    df = data_lowercase(df)
    df = data_contraction(df)
    df = data_remove_tags_punctuations_numbers(df)
    df = data_remove_stopword(df)
    return df

In [32]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sklearn.model_selection import train_test_split
from transformers import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, Dataset
import torch

class MyDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['TEXT']
        label = self.data.iloc[idx]['LABEL']
        inputs = self.tokenizer(
            "classify: " + text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        inputs['labels'] = torch.tensor(label)
        return inputs
        
# 讀取資料
raw_data = pd.read_csv('train_2022.csv', index_col=False)

# 在 TEXT 欄位加上 classify: 前綴
def add_classify_prefix(data, column_name):
    data[column_name] = "classify: " + data[column_name].astype(str)

add_classify_prefix(raw_data, 'TEXT')

# 載入 T5 模型和 tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 切分訓練集和驗證集
train_df, eval_df = train_test_split(raw_data, test_size=0.2, random_state=42)


# 定義訓練函式
def train_model(model, tokenizer, train_df, eval_df, epochs=3, batch_size=8):
    optimizer = AdamW(model.parameters(), lr=5e-5)

    train_dataset = tokenizer(train_df['TEXT'].tolist(), truncation=True, padding=True)
    eval_dataset = tokenizer(eval_df['TEXT'].tolist(), truncation=True, padding=True)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch, labels=batch["input_ids"])
            loss = outputs.loss
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        avg_train_loss = total_loss / len(train_loader)
        print(f"Avg Train Loss: {avg_train_loss}")

        model.eval()
        total_eval_loss = 0
        with torch.no_grad():
            for batch in tqdm(eval_loader, desc=f"Epoch {epoch+1} Evaluation"):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch, labels=batch["input_ids"])
                loss = outputs.loss
                total_eval_loss += loss.item()
            avg_eval_loss = total_eval_loss / len(eval_loader)
            print(f"Avg Eval Loss: {avg_eval_loss}")

# 訓練模型
train_model(model, tokenizer, train_df, eval_df)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Epoch 1 Training:   0%|          | 0/1 [00:00<?, ?it/s]


KeyError: 'Invalid key. Only three types of key are available: (1) string, (2) integers for backend Encoding, and (3) slices for data subsetting.'

In [None]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer

# 讀取測試資料
test_data = pd.read_csv('test_no_answer_2022.csv')

# 初始化 T5 模型和 tokenizer
model = T5ForConditionalGeneration.from_pretrained('./results')  # 指定訓練完成的模型目錄路徑
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# 資料處理
test_inputs = test_data['TEXT'].apply(lambda x: f"classify: {x}")  # 加入前綴 "classify: "
test_encodings = tokenizer(test_inputs.tolist(), truncation=True, padding=True)

# 使用模型預測
input_ids = test_encodings['input_ids']
attention_mask = test_encodings['attention_mask']

with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

pred_labels = outputs.logits.argmax(dim=-1)

# 將預測結果加入測試資料集中
test_data['PREDICTED_LABEL'] = pred_labels.tolist()

# 儲存包含預測結果的測試資料集
test_data.to_csv('test_predicted.csv', index=False)