In [None]:
!pip install transformers
!pip install pytorch-pretrained-bert
!pip install spacy ftfy==4.4.3
!python -m spacy download en
!pip install accelerate -U
!pip install wandb

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Authenticate this runtime with Weights & Biases (prompts for an API key).
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

import argparse
import logging
from tqdm import trange

import torch
import torch.nn.functional as F
import numpy as np


from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments

In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive so the dataset files are reachable from this runtime
drive.mount('/content/drive')

# Dataset folder and the three split files (train / test / dev) inside it
dataset_path = '/content/drive/MyDrive/aiffelthon_mine/GiTi/MIntRec'
train_file_path, test_file_path, dev_file_path = (
    f"{dataset_path}/{split}.tsv" for split in ("train", "test", "dev")
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training split (tab-separated file) and show its last rows
train = pd.read_table(train_file_path)

train.tail()  # 1334 rows in total

Unnamed: 0,season,episode,clip,text,label
1329,S05,E15,113,"speaking of cereal, you sound just like tony t...",Joke
1330,S06,E04,396,this job is hard enough.,Complain
1331,S05,E11,151,and i need a floor supervisor,Ask for help
1332,S06,E03,458,glenn gave me ultimate executive power.,Flaunt
1333,S04,E11,85,it lowers your blood pressure and increases yo...,Introduce


In [None]:
# Load the test split (tab-separated file) and show its last rows
test = pd.read_table(test_file_path)

test.tail()  # 445 rows in total

Unnamed: 0,season,episode,clip,text,label
440,S05,E18,366,"i mean, i do kind of have to go.",Leave
441,S05,E18,395,"oh, okay, well, yeah. don't--do not give him t...",Prevent
442,S05,E19,389,"guys, i sent carol to lunch so that we could t...",Inform
443,S05,E08,90,"i'm sorry, garrett",Apologise
444,S05,E09,540,your fault because the two of you don't know,Criticize


In [None]:
# Load the dev split (tab-separated file) and show its last rows
dev = pd.read_table(dev_file_path)

dev.tail()  # 445 rows in total

Unnamed: 0,season,episode,clip,text,label
440,S04,E05,134,thank you. happy 0th birthday.,Thank
441,S05,E10,115,hey.,Greet
442,S04,E16,418,cool shoes.,Praise
443,S05,E13,52,"jonah, i'm not taking you off the schedule.",Inform
444,S04,E14,507,"yes, except i already told corporate that i ga...",Inform


In [None]:
# Binary target used by this notebook: each of the 20 intent label strings
# listed below is collapsed into class 0 or class 1.
# NOTE(review): the grouping looks like a coarse "express emotion/attitude" (0)
# vs. "achieve a goal" (1) split — confirm against the dataset documentation.
CLASS_0_INTENTS = (
    'Complain', 'Praise', 'Apologise', 'Thank', 'Criticize', 'Care',
    'Agree', 'Taunt', 'Flaunt', 'Oppose', 'Joke',
)
CLASS_1_INTENTS = (
    'Inform', 'Advise', 'Arrange', 'Introduce', 'Comfort', 'Leave',
    'Prevent', 'Greet', 'Ask for help',
)

# Built from the two tuples so each label appears exactly once; a dict literal
# with a repeated key would silently keep only the last value for that key.
label_to_index = {
    **dict.fromkeys(CLASS_0_INTENTS, 0),
    **dict.fromkeys(CLASS_1_INTENTS, 1),
}

# The mapping is many-to-one, so it has no meaningful inverse (index -> label).

In [None]:
# Keep the dataset's own splits instead of calling train_test_split:
# train is the training data, test is the test data, and dev is used so that
# performance can be evaluated (objectively).
train_texts, train_labels = (train[column].tolist() for column in ('text', 'label'))
test_texts, test_labels = (test[column].tolist() for column in ('text', 'label'))
dev_texts, dev_labels = (dev[column].tolist() for column in ('text', 'label'))

In [None]:
# Number of distinct target classes produced by label_to_index
num_labels = len({*label_to_index.values()})
print('Label 개수 :', num_labels)

Label 개수 : 2


In [None]:
# Encode the string intent labels as class ids.
# The source is the DataFrame column, not the *_labels list being assigned:
# overwriting a list with its own encoded version makes a second run of this
# cell fail with KeyError (the ints are not keys of label_to_index).
# An unknown label still raises KeyError, as before.
train_labels = [label_to_index[label] for label in train['label']]
test_labels = [label_to_index[label] for label in test['label']]
dev_labels = [label_to_index[label] for label in dev['label']]

In [None]:
# Sanity checks: every text must have exactly one encoded label.
# (Split sizes noted in the loading cells: train 1334, test 445, dev 445.)
assert len(train_labels) == len(train_texts)
assert len(test_labels) == len(test_texts)
assert len(dev_labels) == len(dev_texts)

assert isinstance(train_labels, list)

type(train_labels[0])  # int

int

In [None]:
# Initialise the GPT-2 tokenizer and the sequence-classification model.
# Seed torch first: the 'gpt2' checkpoint has no classification head, so
# `score.weight` is newly (randomly) initialised and would otherwise differ
# between runs.
torch.manual_seed(42)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the EOS token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
# num_labels is 2 here (binary classification).
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=num_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tell the model which token id is padding: the EOS id, matching the tokenizer,
# whose pad_token was set to its eos_token.
model.config.pad_token_id = model.config.eos_token_id

In [None]:
# Tokenize each split with truncation and padding enabled, producing the
# encodings that the dataset wrapper consumes.
encode_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(train_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)
dev_encodings = tokenizer(dev_texts, **encode_options)

In [None]:
class MIntRecDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: mapping from field name to a per-example indexable
            collection (anything supporting ``.items()`` and ``value[idx]``).
        labels: indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap each split's encodings and class ids in a torch Dataset
train_dataset = MIntRecDataset(encodings=train_encodings, labels=train_labels)
test_dataset = MIntRecDataset(encodings=test_encodings, labels=test_labels)
dev_dataset = MIntRecDataset(encodings=dev_encodings, labels=dev_labels)

In [None]:
# Work from the project folder on Drive so that relative paths used later
# (./results, ./logs) resolve inside it.
%cd /content/drive/MyDrive/aiffelthon_mine/GiTi

/content/drive/MyDrive/aiffelthon_mine/GiTi


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for a Trainer evaluation or prediction pass.

    Args:
        eval_pred: object exposing ``predictions`` (class logits, or a tuple
            whose first element is the logits) and ``label_ids`` (reference
            class ids), as handed over by ``Trainer``.

    Returns:
        dict with a single float entry, ``"accuracy"``.
    """
    logits = eval_pred.predictions
    if isinstance(logits, tuple):  # keep only the logits if extra outputs are returned
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


# Training configuration
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=100,
    report_to="wandb",  # send training logs to Weights & Biases
)

# Build the Trainer. The dev split is the evaluation set; compute_metrics makes
# evaluate()/predict() report accuracy in addition to the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Train (fine-tune) the model
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33myoojinshin9918[0m ([33m5fingers[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,0.8229
200,0.6192
300,0.7036
400,0.5073
500,0.3788
600,0.4308
700,0.3086
800,0.3114
900,0.2237
1000,0.2188


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1002, training_loss=0.45161552120189735, metrics={'train_runtime': 136.7216, 'train_samples_per_second': 29.271, 'train_steps_per_second': 7.329, 'total_flos': 65356849741824.0, 'train_loss': 0.45161552120189735, 'epoch': 3.0})

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the validation (dev) dataset
predictions = trainer.predict(dev_dataset)
# Previously recorded accuracy:
# 57% - test dataset
# 88% - dev dataset

# Predicted class = index of the largest logit
predicted_labels = predictions.predictions.argmax(axis=1)

# Reference labels are taken from the prediction output itself, so they always
# belong to the dataset that was just scored. The test and dev splits both have
# 445 rows, so pairing one split's predictions with the other split's label
# list would not raise an error — it would only report a wrong score.
true_labels = predictions.label_ids

# Accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(true_labels, predicted_labels))

Accuracy: 0.8808988764044944
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.89       249
           1       0.86      0.87      0.87       196

    accuracy                           0.88       445
   macro avg       0.88      0.88      0.88       445
weighted avg       0.88      0.88      0.88       445



In [None]:
import wandb

In [None]:
# Initialise the W&B project / run
# NOTE(review): the Trainer is configured with report_to="wandb", so a run may
# already be active at this point; this cell's output shows a train/* run
# summary, which suggests wandb.init() closes that run and opens a new one.
# Confirm that logging the evaluation into a separate run is intended.
wandb.init(project='gpt2_mintrec', name='3rd_try')

# Evaluate on the Trainer's eval dataset (the dev split)
eval_results = trainer.evaluate()

# Record the evaluation results in W&B
wandb.log(eval_results)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▂▃▃▄▅▆▆▇██
train/global_step,▁▂▃▃▄▅▆▆▇██
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▇▄▃▃▂▂▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,3.0
train/global_step,1002.0
train/learning_rate,0.0
train/loss,0.2188
train/total_flos,65356849741824.0
train/train_loss,0.45162
train/train_runtime,136.7216
train/train_samples_per_second,29.271
train/train_steps_per_second,7.329
