# DLTHON

## DKTC (Dataset of Korean Threatening Conversations)

- 텍스트 다중분류 프로젝트

## 데이터셋 정보

train.csv

    1. idx = 인덱스
    2. class = 0~4
        class 0; 협박 대화
        class 1; 갈취 대화
        class 2; 직장 내 괴롭힘 대화
        class 3; 기타 괴롭힘 대화
        class 4; 일반 대화 (train.csv에는 없음 — TO-DO-LIST 참고, 직접 구축 필요)
    3. conversation = \n으로 구분된 멀티턴 텍스트 대화

test.json

    1. t_### = 인덱스
    2. text = 대화

submission.csv

    1. file_name = 인덱스
    2. class = 예측값

## 평가기준 
> - 데이터 EDA와 데이터 전처리가 적절하게 이뤄졌는가?
> - Task에 알맞게 적절한 모델을 찾아보고 선정했는가?
> - 성능향상을 위해 논리적으로 접근했는가?
> - 결과 도출을 위해 여러가지 시도를 진행했는가?
> - 도출된 결론에 충분한 설득력이 있는가?
> - 적절한 metric을 설정하고 그 사용 근거 및 결과를 분석하였는가?
> - 발표가 매끄럽게 진행되었고 발표시간을 준수하였는가? (발표 10분-15분)

## TO-DO-LIST
- 일반 대화 데이터셋을 만들어야 함 (800-1000개 정도)
- ppt 제작
- 평가지표 : f1-score

## EDA


In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import os
import pandas as pd

# Load the training split; the path is relative to the working directory.
train_data_path = "./data/train.csv"
train_data = pd.read_csv(filepath_or_buffer=train_data_path)

# Preview the first rows (rendered as the cell output in a notebook).
train_data.head()

In [4]:
# Distinct values of the 'class' column, in order of first appearance.
cate = train_data['class'].drop_duplicates().tolist()

In [5]:
cate

In [6]:
def _conversations_of(label):
    """Return the rows of ``train_data`` whose 'class' column equals *label*."""
    return train_data[train_data['class'] == label]


# One DataFrame per class label, for side-by-side inspection.
intimidation = _conversations_of('협박 대화')
extortion = _conversations_of('갈취 대화')
harassment_workplace = _conversations_of('직장 내 괴롭힘 대화')
harassment_others = _conversations_of('기타 괴롭힘 대화')

In [7]:
intimidation

In [8]:
extortion

In [9]:
harassment_workplace

In [10]:
harassment_others

일반대화 예시

```json
{
	"id": {
		"text": "이거 들어봐 와 이 노래 진짜 좋다 그치 요즘 이 것만 들어 진짜 너무 좋다 내가 요즘 듣는 것도 들어봐 음 난 좀 별론데 좋을 줄 알았는데 아쉽네 내 취향은 아닌 듯 배고프다 밥이나 먹으러 가자 그래"
	}
}
```

In [12]:
import json
from os.path import join

# os.path.join() with a single argument returns it unchanged, so the literal
# path is used directly.
path = './data/test.json'

# Decode as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform locale, which can garble or reject the Korean text on systems whose
# default is not UTF-8.
with open(path, encoding='utf-8') as f:
    test_data = json.load(f)

# The JSON maps each id to a record, so the ids start out as columns;
# transposing yields one row per id.
test_data = pd.DataFrame(test_data).T
# Display only: the result is not assigned, so test_data keeps its id index.
test_data.reset_index(drop=True)

## Modeling

#### BERT



#### train set 과 validation set 분리

In [56]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split into training and validation data with a fixed seed.
# NOTE(review): the outputs are bound to the same names as the inputs, so
# re-running this cell splits the already-reduced training data again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=777,
    stratify=train_labels,
)


#### MODEL : klue bert 

#### Tokenizer

In [59]:
# Checkpoint identifier passed to from_pretrained() for both the tokenizer
# and the classification model.
MODEL_PATH = "klue/bert-base"

In [60]:
from transformers import BertTokenizerFast

# Load the fast tokenizer that belongs to the pretrained checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)

# Tokenize both splits with identical settings. Each result exposes keys such
# as 'input_ids', 'token_type_ids' and 'attention_mask'.
_encode_kwargs = {"truncation": True, "padding": True}
train_encodings = tokenizer(train_texts, **_encode_kwargs)
val_encodings = tokenizer(val_texts, **_encode_kwargs)

In [61]:
print(dict(val_encodings).keys())

In [62]:
import tensorflow as tf

def _as_tf_dataset(encodings, labels):
    """Pair tokenizer output with its labels as a ``tf.data.Dataset``."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


# training set
train_dataset = _as_tf_dataset(train_encodings, train_labels)

# validation set
val_dataset = _as_tf_dataset(val_encodings, val_labels)

## Tuning

In [63]:
from transformers import TFBertForSequenceClassification

# One output class per entry of label_encode (not defined in the visible
# cells; it must exist before this cell runs).
num_labels = len(label_encode)
print(num_labels)

# TODO: try from_pt=False, or omit the argument entirely.
# from_pt – (optional) boolean, default False: Load the model weights from a
# PyTorch state_dict save file (see docstring of
# pretrained_model_name_or_path argument).
model = TFBertForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=num_labels,
    from_pt=True,
)

# Compile with Adam and the model's own loss function; track accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    loss=model.compute_loss,
    metrics=['accuracy'],
)

In [64]:
model.compute_loss

###  1. callbacks

In [65]:

from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint

# Stop training once validation accuracy has failed to improve by at least
# 0.001 for 2 consecutive epochs.
callback_earlystopping = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001, # the threshold that triggers the termination (acc should at least improve 0.001)
    patience=2)

# Cut the learning rate to 10% when validation loss plateaus.
# NOTE(review): patience=10 here versus patience=2 for early stopping means
# training will usually be stopped long before this callback can fire —
# confirm whether a smaller patience was intended.
callback_learningrate_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=10,
    verbose=0,
    mode='auto',
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
)

# Keep only the checkpoint with the best validation accuracy seen so far.
callback_modelcheckpoint = ModelCheckpoint(
    filepath = "BERT_BestModel.keras",
    monitor="val_accuracy",
    save_best_only=True,
)

callback_list = [callback_earlystopping, callback_learningrate_scheduler, callback_modelcheckpoint]

# The batch size is set on the datasets via .batch(); fit() must not also be
# given batch_size for tf.data input (it is ignored by older Keras and
# rejected by newer versions). The validation set is not shuffled because the
# evaluation order does not affect the metrics.
model.fit(
    train_dataset.shuffle(1000).batch(8),
    epochs=50,
    validation_data=val_dataset.batch(16),
    callbacks=callback_list,
)


#### model, tokenizer 저장

In [71]:
MODEL_NAME = 'fine-tuned-klue-bert-base'
# Change the base directory to your preferred location.
MODEL_SAVE_PATH = os.path.join("/aiffel/aiffel/workplace/20240624/model/", MODEL_NAME)

# Create the target directory on first use and report which case applied.
if not os.path.exists(MODEL_SAVE_PATH):
    os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
    print(f"{MODEL_SAVE_PATH} -- Folder create complete \n")
else:
    print(f"{MODEL_SAVE_PATH} -- Folder already exists \n")

# Write the fine-tuned model and the tokenizer into the same directory so both
# can be reloaded from MODEL_SAVE_PATH.
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

In [72]:
from transformers import TextClassificationPipeline

# Reload the fine-tuned tokenizer and model from MODEL_SAVE_PATH.
loaded_tokenizer = BertTokenizerFast.from_pretrained(MODEL_SAVE_PATH)
loaded_model = TFBertForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)

# Wrap both in a TensorFlow text-classification pipeline that reports a score
# for every label instead of only the top one.
_pipeline_options = {
    "tokenizer": loaded_tokenizer,
    "model": loaded_model,
    "framework": 'tf',
    "return_all_scores": True,
}
text_classifier = TextClassificationPipeline(**_pipeline_options)

In [73]:
# NOTE(review): the list is passed as *data*, so this creates a single
# column (0) holding the two strings as rows, not an empty frame with
# 'file_name'/'class' columns. These two rows are placeholders, not
# predictions, and must not end up in the submission.
test_df = pd.DataFrame(data=['file_name', 'class'])
test_df.head()

### inference

In [75]:
import json

# os.path.join() with a single argument returns it unchanged, so the literal
# path is used directly.
path = '/aiffel/aiffel/dktc/data/test.json'

# Decode as UTF-8 explicitly: without `encoding`, open() uses the platform
# locale, which can garble or reject the Korean text on non-UTF-8 systems.
with open(path, encoding='utf-8') as f:
    test_data = json.load(f)
    
# test_json



### predict : tqdm 진행률(progress) 표시

In [77]:
from tqdm.auto import tqdm
# Map every test id to the integer id of its highest-scoring class.
answer_dict = {}
for file_name, text in tqdm(test_data.items()):
    # The pipeline returns one result list per input; [0] selects the list of
    # per-label {'label', 'score'} entries for this single conversation.
    # NOTE(review): training tokenized with truncation=True, but no truncation
    # is requested here — confirm that long conversations cannot exceed the
    # model's maximum input length.
    preds_list = text_classifier(text['text'])[0]
    # max() picks the top-scoring entry in one pass; a full sort is not needed.
    top_pred = max(preds_list, key=lambda pred: pred['score'])
    # The label is expected to end in '_<int>' (e.g. 'LABEL_3'); keep the int.
    best_label = int(top_pred['label'].split('_')[-1])
    answer_dict[file_name] = best_label
          


In [78]:
answer_dict

In [79]:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; build all prediction rows at once and concatenate a single time.
# Rows already present in test_df are kept ahead of the new ones, exactly as
# repeated append(..., ignore_index=True) did.
_predictions = pd.DataFrame({
    'file_name': list(answer_dict.keys()),
    'class': list(answer_dict.values()),
})
test_df = pd.concat([test_df, _predictions], ignore_index=True)

test_df


In [80]:

## Save
# Keep only real predictions: rows without a file_name are placeholders left
# over from how test_df was initialised. Filtering on the column does not
# depend on exactly how many such rows exist, unlike a positional slice.
test_df = test_df.dropna(subset=['file_name'])
# .copy() gives an independent frame so the dtype assignment below does not
# write into a view of the original (avoids SettingWithCopyWarning).
test_df = test_df[['class', 'file_name']].copy()
test_df['class'] = test_df['class'].astype('int32')
test_df = test_df.set_index('file_name')

# file_name is the index, so it is written as the first CSV column with its
# name as the header; to_csv's `index` parameter is a boolean flag.
test_df.to_csv('FIN_BERT_L_0598.csv', index=True)




In [82]:
test_df