# DLTHON

## DKTC (Dataset of Korean Threatening Conversations)

- 텍스트 다중분류 프로젝트

## 데이터셋 정보

train.csv

    1. idx = 인덱스
    2. class = 0~4
        class 0; 협박 대화
        class 1; 갈취 대화
        class 2; 직장 내 괴롭힘 대화
        class 3; 기타 괴롭힘 대화
        class 4; 일반 대화 (별도로 만들어 추가하는 데이터)
    3. conversation = \n으로 구분된 멀티턴 텍스트 대화

test.json

    1. t_### = 인덱스
    2. text = 대화

submission.csv

    1. file_name = 인덱스
    2. class = 예측값

## 평가기준
> - 데이터 EDA와 데이터 전처리가 적절하게 이뤄졌는가?
> - Task에 알맞게 적절한 모델을 찾아보고 선정했는가?
> - 성능향상을 위해 논리적으로 접근했는가?
> - 결과 도출을 위해 여러가지 시도를 진행했는가?
> - 도출된 결론에 충분한 설득력이 있는가?
> - 적절한 metric을 설정하고 그 사용 근거 및 결과를 분석하였는가?
> - 발표가 매끄럽게 진행되었고 발표시간을 준수하였는가? (발표 10분-15분)

## TO-DO-LIST
- 일반 대화 데이터셋을 만들어야 함 (800~1000개 정도)
- ppt 제작
- 평가지표 : f1-score

## EDA


In [156]:
import matplotlib.pyplot as plt
import seaborn as sns

In [157]:
import os
import pandas as pd

train_data_path ="./data/train.csv"
train_data = pd.read_csv(train_data_path)
train_data.head()

In [158]:
train_data.shape

In [159]:
add_data = pd.read_csv('pred/normal_df.csv')
add_data.head()

In [160]:
add_data.shape

In [161]:
train_data.drop('idx', axis=1, inplace=True)

In [162]:
train_data.head()

In [164]:
add = add_data

In [165]:
train_data = pd.concat([train_data, add])

In [166]:
train_data['class'].value_counts()

In [167]:
import json
# Load the test set. The file is opened as UTF-8 explicitly so that reading
# the Korean text does not depend on the platform's default encoding.
with open('./data/test.json', encoding='utf-8') as f:
    test_data = json.load(f)

# The JSON maps each id to its record, so transpose to get one row per
# conversation, then replace the id index with a plain 0..n-1 index.
test_data = pd.DataFrame(test_data).T
test_data.reset_index(drop=True, inplace=True)

In [168]:
# Normalize every test conversation with clean().
# NOTE(review): clean() is defined in a later cell of this notebook; that cell
# has to be run first, otherwise this line raises NameError.
test_data['cleaned'] = test_data.text.apply(lambda x: clean(x))

In [169]:
test_data

In [170]:
b = pd.read_csv('pred/n.csv')
b.shape

In [171]:
test_data['class'] = b['class']

In [172]:
test_data['class'].value_counts()

In [173]:
test_data

## 데이터 전처리

In [174]:
# !pip install soynlp

In [175]:
import re
# import emoji
from soynlp.normalizer import repeat_normalize

In [176]:
# Compiled once instead of on every call: clean() is applied row by row over
# whole DataFrame columns.
_CLEAN_DISALLOWED = re.compile(r'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣]+')
_CLEAN_URL = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')


def clean(x: str) -> str:
    """Normalize one conversation string for tokenization.

    Steps, in order:
      1. Replace runs of characters outside the allow-list (space, a few
         punctuation marks, ASCII, Hangul ``ㄱ-힣``) with a space.
      2. Remove http(s) URLs.
      3. Shorten repeated characters with soynlp's ``repeat_normalize``
         (``num_repeats=2``).
      4. Put spaces around ``? . ! ,``.
      5. Replace everything except Hangul syllables (``가-힣``), ASCII letters
         and ``. ? ! ,`` with a space. Digits and stand-alone jamo are
         dropped here.
      6. Collapse all whitespace to single spaces.

    Args:
        x: Raw conversation text.

    Returns:
        The cleaned text, with tokens separated by single spaces.
    """
    x = _CLEAN_DISALLOWED.sub(' ', x)
    x = _CLEAN_URL.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)

    # Detach sentence punctuation from the neighbouring words.
    x = re.sub(r"([?.!,])", r" \1 ", x)
    # Collapses runs of spaces and double-quote characters into one space.
    x = re.sub(r'[" "]+', " ", x)

    # Keep only Hangul syllables, ASCII letters and . ? ! , — everything else
    # becomes a space.
    x = re.sub(r'[^가-힣a-zA-Z.?!,]', ' ', x)

    # str.split() never yields None or empty strings, so no extra filtering
    # is needed; this also trims leading and trailing whitespace.
    return ' '.join(x.split())

In [177]:
train_data['cleaned'] = train_data.conversation.apply(lambda x: clean(x))

In [178]:
train_data['cleaned'].tail()

### 한국어 문장 분리기

In [179]:
# !pip install kss

In [180]:
samples = train_data.sample(5)

In [181]:
samples

In [182]:
samples.cleaned.iloc[0]

In [183]:
# from kss import split_sentences

# split_sentences(samples.cleaned.iloc[0])

In [184]:
# split_sentences(samples.cleaned.iloc[2])

In [185]:
# split_sentences(samples.cleaned.iloc[-1])

## BERT

In [186]:
# !pip install -U transformers

In [188]:
# from sklearn.preprocessing import LabelEncoder
# 
# # LabelEncoder를 객체로 생성
# encoder = LabelEncoder()
# 
# # fit, transform 메소드를 통한 레이블 인코딩
# encoder.fit(train_data['class'])
# 
# train_data["label"] = encoder.transform(train_data['class'])

In [189]:
# Class name -> integer label; the position in the tuple is the label id.
label_encode = dict(zip(
    (
        "협박 대화",
        "갈취 대화",
        "직장 내 괴롭힘 대화",
        "기타 괴롭힘 대화",
        "일반 대화",
    ),
    range(5),
))

In [190]:
train_data["label"] = train_data['class'].map(label_encode)

In [191]:
dataset = train_data[['cleaned', 'label']]

In [192]:
dataset.sample(n=10)

In [193]:
# Length statistics of the cleaned conversations.
# NOTE: these are character counts (len of the string), not token counts.
lengths = [len(text) for text in dataset['cleaned']]

max_seq_len = max(lengths, default=0)

# mylist[n] = number of conversations that are exactly n characters long.
# The histogram is sized from the data (never below the original 10000 slots)
# so that a very long conversation cannot raise IndexError.
mylist = [0] * max(10000, max_seq_len + 1)
for length in lengths:
    mylist[length] += 1

len_64_128 = sum(64 <= length < 128 for length in lengths)
len_128_256 = sum(128 <= length < 256 for length in lengths)

print(f'가장 긴 문장 길이: {max_seq_len}')
print(f'64과 128 사이 길이 문장 개수: {len_64_128}')
print(f'128과 256 사이 길이 문장 개수: {len_128_256}')

In [194]:
max_seq_len = 800

In [195]:
dataset

In [196]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import ElectraTokenizer, ElectraModel, ElectraConfig, AdamW
import torch.nn as nn
import wandb
from sklearn.metrics import f1_score

# Initialize Weights & Biases experiment tracking
wandb.init(project="koelectra-multi-label-classification4", entity="seongyeonkim")

# Build the training and validation frames (text column first, label second)
train_df = train_data[['cleaned', 'label']]
# NOTE(review): the validation labels are test_data['class'], which an earlier
# cell fills from 'pred/n.csv' — confirm these are trustworthy labels.
test_data['label'] = test_data['class'].astype('int')
val_df = test_data[['cleaned', 'label']]

# Load the KoELECTRA tokenizer
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

# Integer label -> 5-slot one-hot target vector.
# Label 5 has no slot of its own and therefore maps to the all-zero vector.
label_map = {
    label: [1 if slot == label else 0 for slot in range(5)]
    for label in range(6)
}

In [197]:
# Dataset wrapper used for fine-tuning
class ViolenceDataset(Dataset):
    """Serve tokenized conversations and their target vectors from a DataFrame.

    Column 0 of ``dataframe`` is read as the text and column 1 as the integer
    label; the label is turned into a target vector through the notebook-level
    ``label_map``.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe, self.tokenizer, self.max_length = dataframe, tokenizer, max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        frame = self.dataframe
        sentence = frame.iloc[idx, 0]
        target = label_map[frame.iloc[idx, 1]]

        # Every sample is padded or truncated to exactly max_length tokens.
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sentence, **tokenizer_kwargs)

        # Flatten the encoder tensors to 1-D so a DataLoader can stack them.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.float)
        return item

In [198]:
# Build the datasets and data loaders
max_length = 128
train_dataset = ViolenceDataset(train_df, tokenizer, max_length)
val_dataset = ViolenceDataset(val_df, tokenizer, max_length)

# Only the training loader shuffles; the validation loader keeps row order.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)

In [199]:
# Classifier model
class ElectraForMultiLabelClassification(nn.Module):
    """Pretrained ELECTRA encoder followed by one linear layer.

    The linear layer reads the hidden state at sequence position 0 and emits
    ``num_labels`` raw logits (no activation is applied here).
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.electra = ElectraModel.from_pretrained(model_name)
        hidden_size = self.electra.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        encoder_out = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        first_token = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(first_token)

model = ElectraForMultiLabelClassification("monologg/koelectra-base-v3-discriminator", num_labels=5)
# Binary cross-entropy on the raw logits, one independent term per class.
criterion = nn.BCEWithLogitsLoss()
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are pinned to the old transformers defaults so the
# optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Register the model with W&B so it can be logged during training
wandb.watch(model, log="all")

# Training loop: 3 epochs, each followed by a pass over the validation loader
model.train()
for epoch in range(3):
    # Per-epoch buffers for the training F1 score
    all_preds = []
    all_labels = []
    for batch in train_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Turn logits into per-class probabilities for the metric
        preds = torch.sigmoid(outputs).detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()

        all_preds.append(preds)
        all_labels.append(labels)

        # Loss is logged and printed once per batch
        wandb.log({"epoch": epoch + 1, "train_loss": loss.item()})
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

    # Compute the F1 score for this epoch (each class thresholded at 0.5, micro-averaged)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    f1 = f1_score(all_labels, (all_preds > 0.5).astype(int), average='micro')

    wandb.log({"epoch": epoch + 1, "train_f1_score": f1})
    print(f"Epoch {epoch+1}, Train F1 Score: {f1}")

    # Validation loop
    model.eval()
    val_preds = []
    val_labels = []
    val_losses = []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_losses.append(loss.item())

            preds = torch.sigmoid(outputs).detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()

            val_preds.append(preds)
            val_labels.append(labels)

    val_preds = np.concatenate(val_preds, axis=0)
    val_labels = np.concatenate(val_labels, axis=0)
    val_f1 = f1_score(val_labels, (val_preds > 0.5).astype(int), average='micro')
    # Mean of the per-batch validation losses
    val_loss = np.mean(val_losses)

    wandb.log({"epoch": epoch + 1, "val_f1_score": val_f1, "val_loss": val_loss})
    print(f"Epoch {epoch+1}, Validation F1 Score: {val_f1}, Validation Loss: {val_loss}")

    # Back to training mode for the next epoch
    model.train()

In [200]:
# Save the fine-tuned weights and the tokenizer
model_save_path = './koelectra_multilabel'
# torch.save does not create missing directories, and the directory would
# otherwise only be created by save_pretrained below — create it up front.
os.makedirs(model_save_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_save_path, 'pytorch_model3.bin'))
tokenizer.save_pretrained(model_save_path)

# Upload the saved directory to W&B as a model artifact
artifact = wandb.Artifact('koelectra-multi-label-classification4', type='model')
artifact.add_dir(model_save_path)
wandb.log_artifact(artifact)

# Close the W&B run
wandb.finish()

In [109]:
!pip install nlpaug

In [111]:
!pip install nltk

In [114]:
import nltk
# download the wordnet corpus
nltk.download('wordnet')
# import wordnet
from nltk.corpus import wordnet

In [117]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from transformers import ElectraModel, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
import wandb
import numpy as np
import nlpaug.augmenter.word as naw

# Data augmentation: synonym replacement backed by WordNet
# NOTE(review): no language is passed here while the corpus is Korean —
# confirm that this WordNet source actually produces Korean synonyms.
augmenter = naw.SynonymAug(aug_src='wordnet')

# Dataset wrapper with optional on-the-fly text augmentation
class ViolenceDataset(Dataset):
    """Serve tokenized conversations and their target vectors from a DataFrame.

    Column 0 of ``dataframe`` is read as the text and column 1 as the integer
    label; the label is turned into a target vector through the notebook-level
    ``label_map``. When ``augment`` is true, each text is passed through the
    notebook-level ``augmenter`` before tokenization.

    Args:
        dataframe: Frame with the text in column 0 and the label in column 1.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_length: Length every sample is padded or truncated to.
        augment: Whether to augment the text each time a sample is fetched.
    """

    def __init__(self, dataframe, tokenizer, max_length, augment=False):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx, 0]
        labels = self.dataframe.iloc[idx, 1]
        labels = label_map[labels]

        if self.augment:
            augmented = augmenter.augment(text)
            # Newer nlpaug releases return a list of augmented strings instead
            # of a single string. The tokenizer below needs one plain string,
            # so unwrap the first result and fall back to the original text
            # when nothing was produced.
            if isinstance(augmented, (list, tuple)):
                augmented = augmented[0] if augmented else text
            text = augmented

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Flatten the encoder tensors to 1-D so a DataLoader can stack them.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(labels, dtype=torch.float)
        }

# Build the datasets and data loaders; only the training set is augmented
max_length = 128
train_dataset = ViolenceDataset(train_df, tokenizer, max_length, augment=True)
val_dataset = ViolenceDataset(val_df, tokenizer, max_length)

# Only the training loader shuffles; the validation loader keeps row order.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)

# Classifier model with dropout in front of the linear head
class ElectraForMultiLabelClassification(nn.Module):
    """Pretrained ELECTRA encoder, dropout (p=0.3) and one linear layer.

    The head reads the hidden state at sequence position 0 and emits
    ``num_labels`` raw logits (no activation is applied here).
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.electra = ElectraModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.electra.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        encoder_out = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        first_token = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token))

model = ElectraForMultiLabelClassification("monologg/koelectra-base-v3-discriminator", num_labels=5)
# Binary cross-entropy on the raw logits, one independent term per class.
criterion = nn.BCEWithLogitsLoss()
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are pinned to the old transformers defaults so the
# optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Learning-rate scheduler: linear schedule without warmup over the whole run
total_steps = len(train_dataloader) * 10  # 10 is the total number of epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Register the model with W&B so it can be logged during training
wandb.watch(model, log="all")

# Training loop: up to 10 epochs with early stopping on the validation F1
model.train()
best_val_f1 = 0.0
early_stopping_counter = 0
for epoch in range(10):
    # Per-epoch buffers for the training F1 score
    all_preds = []
    all_labels = []
    for batch in train_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the learning-rate schedule once per batch

        # Turn logits into per-class probabilities for the metric
        preds = torch.sigmoid(outputs).detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()

        all_preds.append(preds)
        all_labels.append(labels)

        # Loss is logged and printed once per batch
        wandb.log({"epoch": epoch + 1, "train_loss": loss.item()})
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

    # Compute the F1 score for this epoch (each class thresholded at 0.5, micro-averaged)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    f1 = f1_score(all_labels, (all_preds > 0.5).astype(int), average='micro')

    wandb.log({"epoch": epoch + 1, "train_f1_score": f1})
    print(f"Epoch {epoch+1}, Train F1 Score: {f1}")

    # Validation loop
    model.eval()
    val_preds = []
    val_labels = []
    val_losses = []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_losses.append(loss.item())

            preds = torch.sigmoid(outputs).detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()

            val_preds.append(preds)
            val_labels.append(labels)

    val_preds = np.concatenate(val_preds, axis=0)
    val_labels = np.concatenate(val_labels, axis=0)
    val_f1 = f1_score(val_labels, (val_preds > 0.5).astype(int), average='micro')
    # Mean of the per-batch validation losses
    val_loss = np.mean(val_losses)

    wandb.log({"epoch": epoch + 1, "val_f1_score": val_f1, "val_loss": val_loss})
    print(f"Epoch {epoch+1}, Validation F1 Score: {val_f1}, Validation Loss: {val_loss}")

    # Early-stopping check
    # NOTE(review): only the best score is tracked; the weights of the best
    # epoch are not kept, so the model left in memory is the last epoch's.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        early_stopping_counter = 0  # reset the counter on improvement
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= 3:  # stop after 3 epochs without improvement
        print("Early stopping due to no improvement in validation F1 score.")
        # NOTE(review): breaking here skips model.train() below, so the model
        # stays in eval mode after an early stop.
        break

    # Back to training mode for the next epoch
    model.train()

## Predict

일반대화 예시

```json
{
	"id": {
		"text": "이거 들어봐 와 이 노래 진짜 좋다 그치 요즘 이 것만 들어 진짜 너무 좋다 내가 요즘 듣는 것도 들어봐 음 난 좀 별론데 좋을 줄 알았는데 아쉽네 내 취향은 아닌 듯 배고프다 밥이나 먹으러 가자 그래"
	}
}
```

In [119]:
import json
# Reload the test set. The file is opened as UTF-8 explicitly so that reading
# the Korean text does not depend on the platform's default encoding.
with open('./data/test.json', encoding='utf-8') as f:
    test_data = json.load(f)

# The JSON maps each id to its record, so transpose to get one row per
# conversation, then replace the id index with a plain 0..n-1 index.
test_data = pd.DataFrame(test_data).T
test_data.reset_index(drop=True, inplace=True)

In [201]:
test_data

In [202]:
# Inference helper
def predict(text):
    """Return per-class sigmoid probabilities for one conversation.

    Uses the notebook-level ``model`` and ``tokenizer``. The text is padded or
    truncated to 128 tokens, the same length used for training.

    Args:
        text: Conversation string to classify.

    Returns:
        The sigmoid of the model's logits. If anything goes wrong the text
        and the error are printed and a ``(1, 5)`` tensor of zeros is
        returned instead, so a single bad row does not abort a whole
        ``DataFrame.apply`` run.
    """
    try:
        model.eval()
        with torch.no_grad():
            encoding = tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            input_ids = encoding['input_ids']
            attention_mask = encoding['attention_mask']
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.sigmoid(outputs)
        return probs
    except Exception as exc:
        # Deliberately best-effort, but limited to Exception so that
        # KeyboardInterrupt / SystemExit still stop a long prediction run,
        # and the reason for the failure is no longer hidden.
        print(text)
        print(f"predict failed: {exc!r}")
        return torch.zeros([1, 5])

In [203]:
from tqdm import tqdm, trange

tqdm.pandas()

In [123]:
test_data['cleaned'] = test_data.text.apply(lambda x: clean(x))

In [204]:
test_data['pred'] = test_data['cleaned'].progress_apply(lambda x: predict(x))

In [125]:
n = pd.read_csv('pred/n.csv')

In [128]:
n['class'].value_counts()

In [205]:
test_data['pred_label'] = test_data['pred'].apply(lambda x: np.argmax(x).item())

In [211]:
test_data['pred_label'].value_counts()

In [215]:
sample = test_data.pred[0]

In [218]:
sample.max()

In [244]:
test_data['pred']

In [219]:
test_data['max_val'] = test_data.pred.apply(lambda x: x.max().item())

In [221]:
test_data['max_val'].hist()

In [223]:
test_data['max_val'].describe()

In [246]:
test_data.loc[test_data['max_val'] < 0.5].text.tolist()

In [229]:
test_data.loc[test_data['max_val'] < 0.6,'pred_label']

In [144]:
# Difference between the earlier 'label' column and the classes in n.
# NOTE(review): test_data is reloaded from test.json in this section and no
# 'label' column is added afterwards, so this raises KeyError when the
# notebook is run top to bottom.
test_data['d'] = (test_data['label'] - n['class'])

In [146]:
test_data['class'] = n['class']

In [236]:
# Force the prediction to class 4 for every row whose 'class' value is 4,
# overriding whatever the model predicted for that row.
test_data.loc[test_data['class'] == 4,'pred_label'] = 4

In [208]:
new = pd.read_csv('./data/new_submission.csv')

In [234]:
new['class'] = test_data['pred_label']

In [237]:
new.to_csv('new_pred2.csv', index=False)

In [238]:
test_data