In [2]:
# Import necessary libraries
import os
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
# quoting=2 is csv.QUOTE_NONNUMERIC: only quoted fields are kept as text,
# every unquoted field is parsed as a number.
df = pd.read_csv("../../data/job/rallit_text.csv", quoting=2)
print(df.head())

# Map job titles to integer class ids and store them under 'labels' for
# compatibility with Hugging Face.
job_map = {
    "PM": 0, "Sales": 1, "데브옵스 엔지니어": 2, "데이터 분석가": 3,
    "데이터 엔지니어": 4, "백엔드 개발자": 5, "풀스택 개발자": 6, "프론트엔드 개발자": 7
}
df['labels'] = df['job'].map(job_map)

# A job title that is missing from job_map maps to NaN, which silently turns
# the whole 'labels' column into float64. Drop the incomplete rows, then cast
# back to an integer dtype so the targets are always valid class indices.
df.dropna(inplace=True)
df['labels'] = df['labels'].astype('int64')

# Clean text data
# Matches any single character that is NOT a Hangul syllable, an ASCII letter
# or an ASCII digit. Compiled once so every row reuses the same pattern object.
_NON_ALNUM_RE = re.compile('[^가-힣a-zA-Z0-9]')


def clean_text(text):
    """Return *text* with every non-alphanumeric character turned into a space.

    Hangul syllables (가-힣), ASCII letters and ASCII digits are kept as they
    are. Everything else (punctuation, whitespace, line breaks, underscores,
    ...) is replaced one-for-one by a single space, so consecutive separators
    are not collapsed.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string.
    """
    # Extend with further cleaning rules here if they are needed.
    return _NON_ALNUM_RE.sub(' ', text)

# Normalise every document with clean_text before the data is split.
df['text'] = df['text'].apply(clean_text)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Load the pretrained tokenizer and a sequence-classification model from the
# same checkpoint, with one output per entry in job_map.
model_checkpoint = "klue/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(job_map),
)

# Prepare the datasets
def tokenize_data(example):
    """Tokenize the 'text' field and carry the 'labels' field along.

    Every sequence is truncated or padded to exactly 128 tokens.

    Args:
        example: A mapping with 'text' and 'labels' entries. It is used with
            ``Dataset.map(..., batched=True)`` in this file, so each entry
            holds a batch of values.

    Returns:
        The tokenizer output with an added 'labels' entry.
    """
    encoded = tokenizer(
        example['text'],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    encoded['labels'] = example['labels']
    return encoded

# Wrap the pandas splits as Hugging Face datasets and tokenize them in batches.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

train_dataset = train_dataset.map(tokenize_data, batched=True)
test_dataset = test_dataset.map(tokenize_data, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='models/kor_base',  # directory the checkpoints are saved to
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # NOTE(review): the recorded run has 720 optimizer steps in total, so a
    # 500-step warmup covers most of training - confirm this is intended.
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # One checkpoint per epoch of a large model fills the disk quickly; keep
    # only the best and the most recent one.
    save_total_limit=2,
    # Eval loss starts rising again well before the last epoch, so restore the
    # checkpoint with the lowest eval loss instead of keeping the final weights.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Define a metrics function for evaluation
def compute_metrics(pred):
    """Compute accuracy and macro/weighted/micro F1 for one evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (the true class ids)
            and ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        A dict with the keys "accuracy", "f1_macro", "f1_weighted" and
        "f1_micro".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    for key, averaging in (
        ("f1_macro", "macro"),
        ("f1_weighted", "weighted"),
        ("f1_micro", "micro"),
    ):
        # The helper returns (precision, recall, f-score, support); only the
        # F-score is reported.
        _, _, f_score, _ = precision_recall_fscore_support(
            y_true, y_pred, average=averaging
        )
        metrics[key] = f_score
    return metrics

# Initialize the Trainer
# Wire the model, arguments, datasets and metric function into one Trainer.
# The held-out test split doubles as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model, then run one more evaluation pass on the held-out
# split. The evaluate() call is left as a bare final expression so that a
# notebook cell displays the metrics it returns.
trainer.train()
trainer.evaluate()

         job                                               text
0    백엔드 개발자  기술 스택: Python, Java, C, Django, Spring Boot, F...
1  프론트엔드 개발자  기술 스택: JavaScript, TypeScript, React, Python, ...
2    백엔드 개발자  기술 스택: Java\r\nSpring Boot\r\nJPA\r\nTypeScrip...
3    백엔드 개발자  기술 스택: Java\r\nKotlin\r\nJavaScript\r\nTypeScr...
4    풀스택 개발자  기술 스택: JavaScript, Docker, Elasticsearch, Vue....


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Map:   0%|          | 0/141 [00:00<?, ? examples/s]

  0%|          | 0/720 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.4532681703567505, 'eval_accuracy': 0.4397163120567376, 'eval_f1_macro': 0.07635467980295567, 'eval_f1_weighted': 0.2685951856898299, 'eval_f1_micro': 0.4397163120567376, 'eval_runtime': 2.3772, 'eval_samples_per_second': 59.314, 'eval_steps_per_second': 1.262, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.0507887601852417, 'eval_accuracy': 0.723404255319149, 'eval_f1_macro': 0.20230594758064516, 'eval_f1_weighted': 0.6387911805078929, 'eval_f1_micro': 0.723404255319149, 'eval_runtime': 2.3911, 'eval_samples_per_second': 58.969, 'eval_steps_per_second': 1.255, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.8900315165519714, 'eval_accuracy': 0.75177304964539, 'eval_f1_macro': 0.21155989771162548, 'eval_f1_weighted': 0.6638792666077936, 'eval_f1_micro': 0.75177304964539, 'eval_runtime': 2.3364, 'eval_samples_per_second': 60.35, 'eval_steps_per_second': 1.284, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.835749089717865, 'eval_accuracy': 0.7588652482269503, 'eval_f1_macro': 0.34233853394927216, 'eval_f1_weighted': 0.7191114754159164, 'eval_f1_micro': 0.7588652482269503, 'eval_runtime': 2.3422, 'eval_samples_per_second': 60.2, 'eval_steps_per_second': 1.281, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.8113102316856384, 'eval_accuracy': 0.7943262411347518, 'eval_f1_macro': 0.36745032545392253, 'eval_f1_weighted': 0.7254179635015392, 'eval_f1_micro': 0.7943262411347518, 'eval_runtime': 2.205, 'eval_samples_per_second': 63.945, 'eval_steps_per_second': 1.361, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.8153453469276428, 'eval_accuracy': 0.7446808510638298, 'eval_f1_macro': 0.3374521072796935, 'eval_f1_weighted': 0.7027335032247469, 'eval_f1_micro': 0.7446808510638298, 'eval_runtime': 2.236, 'eval_samples_per_second': 63.059, 'eval_steps_per_second': 1.342, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.8190867304801941, 'eval_accuracy': 0.75177304964539, 'eval_f1_macro': 0.3272873509062224, 'eval_f1_weighted': 0.731749774621102, 'eval_f1_micro': 0.75177304964539, 'eval_runtime': 2.2444, 'eval_samples_per_second': 62.823, 'eval_steps_per_second': 1.337, 'epoch': 7.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.8044435977935791, 'eval_accuracy': 0.7943262411347518, 'eval_f1_macro': 0.4559577522559475, 'eval_f1_weighted': 0.7677642734455957, 'eval_f1_micro': 0.7943262411347518, 'eval_runtime': 2.163, 'eval_samples_per_second': 65.187, 'eval_steps_per_second': 1.387, 'epoch': 8.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.0600979328155518, 'eval_accuracy': 0.6595744680851063, 'eval_f1_macro': 0.34609614802913774, 'eval_f1_weighted': 0.6836107974954213, 'eval_f1_micro': 0.6595744680851063, 'eval_runtime': 2.4103, 'eval_samples_per_second': 58.499, 'eval_steps_per_second': 1.245, 'epoch': 9.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.9134504199028015, 'eval_accuracy': 0.7943262411347518, 'eval_f1_macro': 0.5073488645857067, 'eval_f1_weighted': 0.7709794042380828, 'eval_f1_micro': 0.7943262411347518, 'eval_runtime': 2.3387, 'eval_samples_per_second': 60.289, 'eval_steps_per_second': 1.283, 'epoch': 10.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.0947916507720947, 'eval_accuracy': 0.7588652482269503, 'eval_f1_macro': 0.5606467494486256, 'eval_f1_weighted': 0.7524790114611601, 'eval_f1_micro': 0.7588652482269503, 'eval_runtime': 2.3474, 'eval_samples_per_second': 60.067, 'eval_steps_per_second': 1.278, 'epoch': 11.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.0631508827209473, 'eval_accuracy': 0.7801418439716312, 'eval_f1_macro': 0.4114584731481067, 'eval_f1_weighted': 0.7624811874760232, 'eval_f1_micro': 0.7801418439716312, 'eval_runtime': 2.3413, 'eval_samples_per_second': 60.223, 'eval_steps_per_second': 1.281, 'epoch': 12.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.287196159362793, 'eval_accuracy': 0.75177304964539, 'eval_f1_macro': 0.48362806287563564, 'eval_f1_weighted': 0.7499581943911628, 'eval_f1_micro': 0.75177304964539, 'eval_runtime': 2.3229, 'eval_samples_per_second': 60.699, 'eval_steps_per_second': 1.291, 'epoch': 13.0}
{'loss': 0.5654, 'learning_rate': 5e-05, 'epoch': 13.89}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.4910178184509277, 'eval_accuracy': 0.6879432624113475, 'eval_f1_macro': 0.4713073219589442, 'eval_f1_weighted': 0.7021723848713496, 'eval_f1_micro': 0.6879432624113475, 'eval_runtime': 2.3379, 'eval_samples_per_second': 60.31, 'eval_steps_per_second': 1.283, 'epoch': 14.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.516188144683838, 'eval_accuracy': 0.723404255319149, 'eval_f1_macro': 0.508532932088656, 'eval_f1_weighted': 0.6972449823466612, 'eval_f1_micro': 0.723404255319149, 'eval_runtime': 2.3146, 'eval_samples_per_second': 60.918, 'eval_steps_per_second': 1.296, 'epoch': 15.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.6791754961013794, 'eval_accuracy': 0.7375886524822695, 'eval_f1_macro': 0.47870314127039226, 'eval_f1_weighted': 0.7347898065338818, 'eval_f1_micro': 0.7375886524822695, 'eval_runtime': 2.3316, 'eval_samples_per_second': 60.472, 'eval_steps_per_second': 1.287, 'epoch': 16.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.4186511039733887, 'eval_accuracy': 0.7872340425531915, 'eval_f1_macro': 0.48169204568222795, 'eval_f1_weighted': 0.766201858380546, 'eval_f1_micro': 0.7872340425531915, 'eval_runtime': 2.5301, 'eval_samples_per_second': 55.729, 'eval_steps_per_second': 1.186, 'epoch': 17.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.53592050075531, 'eval_accuracy': 0.8085106382978723, 'eval_f1_macro': 0.4093992732521872, 'eval_f1_weighted': 0.7794597605568535, 'eval_f1_micro': 0.8085106382978723, 'eval_runtime': 2.5201, 'eval_samples_per_second': 55.95, 'eval_steps_per_second': 1.19, 'epoch': 18.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.4733200073242188, 'eval_accuracy': 0.7801418439716312, 'eval_f1_macro': 0.4334761385470219, 'eval_f1_weighted': 0.7658615435779639, 'eval_f1_micro': 0.7801418439716312, 'eval_runtime': 2.4546, 'eval_samples_per_second': 57.443, 'eval_steps_per_second': 1.222, 'epoch': 19.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.4708234071731567, 'eval_accuracy': 0.7801418439716312, 'eval_f1_macro': 0.4778756957328386, 'eval_f1_weighted': 0.7589941972920696, 'eval_f1_micro': 0.7801418439716312, 'eval_runtime': 2.4562, 'eval_samples_per_second': 57.405, 'eval_steps_per_second': 1.221, 'epoch': 20.0}
{'train_runtime': 674.5137, 'train_samples_per_second': 16.723, 'train_steps_per_second': 1.067, 'train_loss': 0.41724361843532987, 'epoch': 20.0}


  0%|          | 0/3 [00:00<?, ?it/s]