# Data-Centric NLP 대회: 주제 분류 프로젝트

-----

## Load Libraries

In [32]:
import os
import random
from tqdm import tqdm
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

import torch.nn.functional as F

## Set Hyperparameters

In [33]:
# Single seed shared by every random number generator used in this notebook.
SEED = 456

# Seed Python's `random`, NumPy, and PyTorch (CPU, current CUDA device, all CUDA devices)
# in the same order as before so results are repeatable across runs.
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

In [34]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

In [35]:
# All paths are resolved relative to the directory the notebook is run from.
BASE_DIR = os.getcwd()
# Input CSVs and training outputs live in sibling directories of BASE_DIR.
DATA_DIR, OUTPUT_DIR = (
    os.path.join(BASE_DIR, relative) for relative in ('../data', '../output')
)

## Load Tokenizer and Model

In [36]:
# Checkpoint name shared by the classifier and its tokenizer.
model_name = 'klue/bert-base'

# Load the checkpoint with a sequence-classification head for 7 labels,
# then move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=7,
)
model = model.to(DEVICE)

tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file config.json from cache at /opt/ml/.cache/huggingface/hub/models--klue--bert-base/snapshots/77c8b3d707df785034b4e50f2da5d37be5f0f546/config.json
Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tra

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initial

-----

## Define Dataset

In [37]:
# Read the labelled training data.
train_csv_path = os.path.join(DATA_DIR, 'train.csv')
data = pd.read_csv(train_csv_path)

# Hold out 20% for validation, stratified on `target` so both splits keep
# the same label proportions; seeded for a repeatable split.
dataset_train, dataset_valid = train_test_split(
    data,
    test_size=0.2,
    random_state=SEED,
    stratify=data['target'],
)

In [38]:
# Print a summary of the training split (index range, columns, non-null counts, dtypes).
dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5600 entries, 4355 to 2590
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      5600 non-null   object
 1   text    5600 non-null   object
 2   target  5600 non-null   int64 
 3   url     5600 non-null   object
 4   date    5600 non-null   object
dtypes: int64(1), object(4)
memory usage: 262.5+ KB


_____

## Data Augmentation (AEDA, 띄어쓰기 제거)

1. AEDA

In [39]:
def AEDA(dataset_train, punc_ratio=0.3, sample_divisor=7):
    """Build AEDA-augmented copies of a random subset of the training rows.

    AEDA inserts random punctuation marks in front of randomly chosen
    space-separated words of a sentence.  ``len(dataset_train) // sample_divisor``
    rows are drawn without replacement; for each one, between 1 and
    ``max(1, int(punc_ratio * n_words))`` punctuation marks are inserted.

    Rows are selected by *position* (``iloc``), not by index label.  The frame
    produced by ``train_test_split`` keeps the shuffled, non-contiguous labels
    of the original data, so a label-based lookup with positions would miss
    many rows.

    Row selection uses the ``random`` module; punctuation count, placement and
    choice use ``np.random``.  Seed both for reproducible output.

    Args:
        dataset_train: DataFrame with at least ``text`` and ``target`` columns.
            It is not modified.
        punc_ratio: Upper bound on inserted punctuation marks, as a fraction
            of the number of words in the sentence.
        sample_divisor: One in ``sample_divisor`` rows is augmented.

    Returns:
        A new DataFrame with the same columns as ``dataset_train`` holding only
        the augmented rows (``text`` replaced, every other column copied),
        indexed by the labels of the source rows.  ``target`` is cast to int.
        The frame is empty when no row is sampled.

    Raises:
        ValueError: If ``sample_divisor`` is smaller than 1.
    """
    PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']

    if sample_divisor < 1:
        raise ValueError("sample_divisor must be >= 1")

    df = dataset_train
    num_samples = len(df)
    sample_positions = random.sample(range(num_samples), num_samples // sample_divisor)

    # Collect rows in plain lists and build the frame once at the end;
    # growing a DataFrame cell by cell is slow and upcasts dtypes.
    records = []
    source_labels = []

    for pos in sample_positions:
        row = df.iloc[pos]
        sentence = row['text']
        if not isinstance(sentence, str):
            # Missing / non-text entries cannot be augmented; skip them explicitly
            # instead of hiding every possible error behind a broad except.
            continue

        words = sentence.split(' ')
        q = np.random.randint(1, max(2, int(punc_ratio * len(words) + 1)))
        q = min(q, len(words))
        # replace=False guarantees q distinct insertion points, so exactly q marks are added.
        qs = set(np.random.choice(len(words), size=q, replace=False).tolist())

        new_line = []
        for j, word in enumerate(words):
            if j in qs:
                new_line.append(PUNCTUATIONS[np.random.randint(0, len(PUNCTUATIONS))])
            new_line.append(word)

        augmented = row.to_dict()
        augmented['text'] = ' '.join(new_line)
        records.append(augmented)
        source_labels.append(df.index[pos])

    dataset_train_aug = pd.DataFrame(records, columns=df.columns, index=source_labels)
    dataset_train_aug['target'] = dataset_train_aug['target'].astype(int)

    return dataset_train_aug

2. 띄어쓰기 제거

In [42]:
def delete_spacing(dataset_train, sample_divisor=7):
    """Build noisy copies of a random subset of rows with all whitespace removed.

    ``len(dataset_train) // sample_divisor`` rows are drawn without replacement
    with the ``random`` module, and every whitespace character is stripped
    from their ``text``.

    Rows are selected by *position* (``iloc``), not by index label, so the
    function works on frames whose index is shuffled or non-contiguous
    (e.g. the output of ``train_test_split``).

    Args:
        dataset_train: DataFrame with at least ``text`` and ``target`` columns.
            It is not modified.
        sample_divisor: One in ``sample_divisor`` rows is augmented.

    Returns:
        A new DataFrame with the same columns as ``dataset_train`` holding only
        the augmented rows (``text`` replaced, every other column copied),
        indexed by the labels of the source rows.  ``target`` is cast to int.
        The frame is empty when no row is sampled.

    Raises:
        ValueError: If ``sample_divisor`` is smaller than 1.
    """
    if sample_divisor < 1:
        raise ValueError("sample_divisor must be >= 1")

    df = dataset_train
    num_samples = len(df)
    sample_positions = random.sample(range(num_samples), num_samples // sample_divisor)

    # Collect rows in plain lists and build the frame once at the end.
    records = []
    source_labels = []

    for pos in sample_positions:
        row = df.iloc[pos]
        sentence = row['text']
        if not isinstance(sentence, str):
            # Missing / non-text entries cannot be processed; skip them explicitly
            # rather than swallowing every exception with a bare except.
            continue

        augmented = row.to_dict()
        # str.split() with no argument splits on any whitespace run, so the join removes all of it.
        augmented['text'] = ''.join(sentence.split())
        records.append(augmented)
        source_labels.append(df.index[pos])

    dataset_train_aug = pd.DataFrame(records, columns=df.columns, index=source_labels)
    dataset_train_aug['target'] = dataset_train_aug['target'].astype(int)

    # The frame must be returned: without it the caller receives None and
    # pd.concat silently ignores the augmentation.
    return dataset_train_aug

- 원본 학습 데이터와 증강 데이터(AEDA, 띄어쓰기 제거)를 concat

In [40]:
# Append the AEDA and spacing-removal (noise) augmentations to the original training rows,
# then drop rows whose text duplicates an earlier row.
augmented_frames = [
    dataset_train,
    AEDA(dataset_train),
    delete_spacing(dataset_train),
]
dataset_train = pd.concat(augmented_frames, axis=0, ignore_index=True)
dataset_train = dataset_train.drop_duplicates(subset=['text'])


________________________

In [43]:
class BERTDataset(Dataset):
    """Torch dataset that tokenizes every text once, up front.

    Args:
        data: Mapping-like object (e.g. a DataFrame) whose ``'text'`` and
            ``'target'`` entries are iterables of equal length.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=True, return_tensors='pt')``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors carrying a leading batch dimension of 1.
        padding: Forwarded to the tokenizer.  The default ``'max_length'``
            keeps the previous behaviour (every sample padded to the
            tokenizer's maximum length).  Pass ``False`` to keep samples at
            their natural length and let a padding collator such as
            ``DataCollatorWithPadding`` pad each batch.
        max_length: Optional truncation / padding length forwarded to the
            tokenizer.  When ``None`` (default) the argument is not passed, so
            the tokenizer's own default applies.
    """

    def __init__(self, data, tokenizer, padding='max_length', max_length=None):
        input_texts = data['text']
        targets = data['target']
        self.inputs = []
        self.labels = []

        tokenizer_kwargs = {
            'padding': padding,
            'truncation': True,
            'return_tensors': 'pt',
        }
        if max_length is not None:
            tokenizer_kwargs['max_length'] = max_length

        for text, label in zip(input_texts, targets):
            self.inputs.append(tokenizer(text, **tokenizer_kwargs))
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        """Return the idx-th sample as a dict of tensors without the batch dimension."""
        encoded = self.inputs[idx]
        return {
            # The tokenizer returned tensors with a leading batch axis of 1; drop it.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0)
        }

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.labels)

In [44]:
# Tokenize the (augmented) training split and the validation split with the same tokenizer.
data_train, data_valid = (
    BERTDataset(frame, tokenizer) for frame in (dataset_train, dataset_valid)
)

In [45]:
# Collator handed to the Trainer below; it batches samples using this tokenizer's padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

-------------

## Define Metric

In [46]:
# Load the F1 metric once; compute_metrics below reuses this object on every evaluation.
f1 = evaluate.load('f1')

def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of class scores per sample and ``labels`` the reference ids.

    Returns:
        The result of ``f1.compute`` with ``average='macro'``.
    """
    scores, references = eval_pred
    # Highest-scoring class per sample becomes the predicted label.
    predicted_labels = np.argmax(scores, axis=1)
    return f1.compute(
        predictions=predicted_labels,
        references=references,
        average='macro',
    )

## Train Model

In [47]:
### for wandb setting
#os.environ['WANDB_DISABLED'] = 'true'

In [48]:
training_args = TrainingArguments(
    # Where checkpoints go and how many are kept.
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    save_strategy='steps',
    save_steps=100,
    save_total_limit=2,
    # Run modes.
    do_train=True,
    do_eval=True,
    do_predict=True,
    # Log and evaluate every 100 steps, in lockstep with checkpoint saving.
    logging_strategy='steps',
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,
    # Optimizer settings and learning-rate schedule.
    learning_rate=2e-5,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    # Batch sizes and training length.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    # Reload the checkpoint with the highest eval F1 once training ends.
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    seed=SEED,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [49]:
# Wire the model, data, collator and metric into a Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [50]:
# Run fine-tuning with the arguments configured above; the returned TrainOutput is shown as the cell output.
trainer.train()

***** Running training *****
  Num examples = 7146
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 448
  Number of trainable parameters = 110622727
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1
100,1.0831,0.541617,0.844667
200,0.6277,0.508494,0.847509
300,0.4898,0.484132,0.853942
400,0.4277,0.482199,0.858893


***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /data/ephemeral/code/../output/checkpoint-100
Configuration saved in /data/ephemeral/code/../output/checkpoint-100/config.json
Model weights saved in /data/ephemeral/code/../output/checkpoint-100/pytorch_model.bin
Deleting older checkpoint [/data/ephemeral/code/../output/checkpoint-800] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /data/ephemeral/code/../output/checkpoint-200
Configuration saved in /data/ephemeral/code/../output/checkpoint-200/config.json
Model weights saved in /data/ephemeral/code/../output/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [/data/ephemeral/code/../output/checkpoint-900] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 32
Saving model checkpoint to /data/ephemeral/code/../output/checkpoint-300
Configuration saved in /da

TrainOutput(global_step=448, training_loss=0.6316027556146894, metrics={'train_runtime': 472.6063, 'train_samples_per_second': 30.241, 'train_steps_per_second': 0.948, 'total_flos': 3760552018022400.0, 'train_loss': 0.6316027556146894, 'epoch': 2.0})

--------------------
## Evaluate Model

In [51]:
# Read the test set that predictions will be generated for.
test_csv_path = os.path.join(DATA_DIR, 'test.csv')
dataset_test = pd.read_csv(test_csv_path)

In [52]:
# Switch to inference mode (disables dropout) before predicting.
model.eval()
preds = []
# iterrows() has no length, so pass `total` explicitly to give tqdm a real progress bar and ETA.
for _, sample in tqdm(dataset_test.iterrows(), total=len(dataset_test)):
    # truncation=True keeps over-long texts within the model's maximum input length
    # (the loaded config reports max_position_embeddings = 512) instead of failing on them.
    inputs = tokenizer(sample['text'], truncation=True, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax is monotonic, so the argmax over raw logits selects the same class.
    preds.extend(torch.argmax(logits, dim=1).cpu().tolist())

7743it [01:06, 114.11it/s]

In [None]:
# Attach the predicted labels to the test frame and write it out as a CSV next to the notebook.
submission_path = os.path.join(BASE_DIR, 'output.csv')
dataset_test['target'] = preds
dataset_test.to_csv(submission_path, index=False)

In [None]:
# Show the first rows of the test frame, now including the predicted `target` column.
dataset_test.head()