In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, ElectraTokenizer, BertForSequenceClassification, ElectraForSequenceClassification
from tqdm.notebook import tqdm
import re

# dialog-koelectra

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import ElectraTokenizer, ElectraForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [3]:
train=pd.read_csv('/aiffel/aiffel/dktc/train3.csv')
test=pd.read_json('/aiffel/aiffel/dktc/data/test.json').transpose()
submission = pd.read_csv('/aiffel/aiffel/dktc/data/new_submission.csv')

In [5]:
def clean_text(text):

    text = text.lower()
    text = re.sub(r'[^가-힣a-z\s]', ' ', text)
    
    text = ' '.join(text.split())
    
    # 한국어 불용어 리스트
    stopwords = [
        '이', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나', '사람', '주', '아니', 
        '등', '같', '우리', '때', '년', '가', '한', '지', '대하', '오', '말', '일', '그렇', '위하', 
        '때문', '그것', '두', '말하', '알', '그러나', '받', '못하', '일', '그런', '또', '문제', '더', '사회', 
        '많', '그리고', '좋', '크', '따르', '중', '나오', '가지', '씨', '시키', '만들', '지금', '생각하', 
        '그러', '속', '하나', '집', '살', '모르', '적', '월', '데', '자신', '안', '어떤', '내', '내', '경우',
        '명', '생각', '시간', '그녀', '다시', '이런', '앞', '보이', '번', '나', '다른', '어떻', '여자', '개',
        '전', '들', '사실', '이렇', '점', '싶', '말', '정도', '좀', '원', '잘', '통하', '소리', '놓'
    ]
    
    # 불용어 제거
    text = ' '.join(word for word in text.split() if word not in stopwords)
    
    return text

In [6]:
train['conversation'] = train['conversation'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

#label encoding
label_dict = {
    '협박 대화': 0,
    '갈취 대화': 1,
    '직장 내 괴롭힘 대화': 2,
    '기타 괴롭힘 대화': 3,
    '일반 대화':4
}
train['label_encoded'] = train['class'].map(label_dict)

In [8]:
train=train[['conversation','label_encoded']]
train=train.rename(columns={'label_encoded':'class'})

In [9]:
train

Unnamed: 0,conversation,class
0,당장 뉴스 기사 내가 불러준 대로 보도 해 팩트 체크가 되지 않은 기사는 낼 없습니...,0
1,버러지 같은게 너 내가 누군줄알아 손님 욕하시면 안됩니다 어디서 말대꾸야 미친년이 ...,3
2,공책 돌려받길 원하면 빨리 뛰어봐 굼벵아 빨리 내놔 빨릐 내놔아 빨리 내놓으래 웃기...,3
3,사장님 저기 말할게 있는데요 뭔데 임마 게임회사는 전체이용가 게임이잖아요 그래서 그...,2
4,죽고 싶어서 환장했어 왜 이렇게 말을 들어 죄송해요 번만 봐주세요 시키는 대로 하라...,0
...,...,...
4901,이대리는 남자친구 있나 네 네 있습니다 아 진짜 어떻게 네 그냥 학교에서 만났어요 ...,2
4902,새로 나온 책 읽어봤어 응 정말 재밌었어 내용이야 미스터리 소설인데 반전이 많아 새...,4
4903,어이 너희 어딘지 이제 아는데 선생님 제발 집에는 오지 마세요 그럼 내가 얘기한 거...,0
4904,니가 연락 안받는다고 내가 못찾을 줄 알았어 일부러 안받은건 아니었습니다 죄송합니다...,0


In [10]:
# train, valid data 분리
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)

In [11]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification
  
tokenizer = ElectraTokenizer.from_pretrained("skplanet/dialog-koelectra-small-discriminator")

model = ElectraForSequenceClassification.from_pretrained("skplanet/dialog-koelectra-small-discriminator")

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at skplanet/dialog-koelectra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128, is_test=False):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        if not self.is_test:
            conversation = str(self.data.iloc[index]['conversation'])
            label = self.data.iloc[index]['class']
        else:
            conversation = str(self.data.iloc[index]['text'])
            label = None

        encoding = self.tokenizer.encode_plus(
            conversation,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length' if not self.is_test else 'max_length',  # 테스트 데이터셋도 padding 적용
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        if not self.is_test:
            return {
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'labels': torch.tensor(label, dtype=torch.long)
            }
        else:
            return {
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten()
            }


In [24]:
train_dataset = CustomDataset(train_df, tokenizer,max_length=128,is_test=False)
val_dataset = CustomDataset(val_df, tokenizer,max_length=128,is_test=False)

test_dataset = CustomDataset(test, tokenizer, max_length=128, is_test=True)

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ElectraForSequenceClassification.from_pretrained("skplanet/dialog-koelectra-small-discriminator", num_labels=5)
model.to(device)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at skplanet/dialog-koelectra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(40000, 128, padding_idx=0)
      (position_embeddings): Embedding(128, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Li

In [26]:
# 하이퍼파라미터 설정
batch_size = 16
epochs = 3
learning_rate = 2e-5

# 옵티마이저 및 스케줄러 설정
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataset) * epochs)

# 데이터로더 설정
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle=False)

# 손실 함수 정의
loss_fn = nn.CrossEntropyLoss()

In [27]:
# 모델 훈련
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()
        
    # Validation
    model.eval()
    val_accuracy = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predictions = torch.max(logits, dim=1)
            val_accuracy += accuracy_score(predictions.cpu().numpy(), labels.cpu().numpy())
    
    print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss / len(train_loader)}, Val Accuracy: {val_accuracy / len(val_loader)}')

Epoch 1/3, Train Loss: 1.2209652659854269, Val Accuracy: 0.8256048387096774
Epoch 2/3, Train Loss: 0.5569019668349405, Val Accuracy: 0.8991935483870968
Epoch 3/3, Train Loss: 0.28300675789151736, Val Accuracy: 0.9143145161290323


In [28]:
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        _, batch_predictions = torch.max(logits, dim=1)
        predictions.extend(batch_predictions.cpu().numpy())

# Submission 
submission['class']=predictions
submission.head(10)

Unnamed: 0,file_name,class
0,t_000,1
1,t_001,2
2,t_002,2
3,t_003,3
4,t_004,3
5,t_005,0
6,t_006,2
7,t_007,1
8,t_008,3
9,t_009,1


In [34]:
submission.to_csv('submission_dialogKobert2.csv',index=False)

In [30]:
submission[submission['class']==4]

Unnamed: 0,file_name,class
221,t_221,4
