In [67]:
import pandas as pd

df = pd.read_csv("../../data/rallit_text.csv", quoting=2); df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   job     705 non-null    object
 1   text    705 non-null    object
dtypes: object(2)
memory usage: 11.1+ KB


Unnamed: 0,job,text
0,백엔드 개발자,"기술 스택: Python, Java, C, Django, Spring Boot, F..."
1,프론트엔드 개발자,"기술 스택: JavaScript, TypeScript, React, Python, ..."
2,백엔드 개발자,기술 스택: Java\r\nSpring Boot\r\nJPA\r\nTypeScrip...
3,백엔드 개발자,기술 스택: Java\r\nKotlin\r\nJavaScript\r\nTypeScr...
4,풀스택 개발자,"기술 스택: JavaScript, Docker, Elasticsearch, Vue...."


In [68]:
# 직업 레이블을 숫자로 매핑
job_map = {
    "PM": 0, "Sales": 1, "데브옵스 엔지니어": 2, "데이터 분석가": 3,
    "데이터 엔지니어": 4, "백엔드 개발자": 5, "풀스택 개발자": 6, "프론트엔드 개발자": 7
}
df['job'] = df['job'].map(job_map)

In [69]:
df.dropna(inplace = True)
df.head()

Unnamed: 0,job,text
0,5,"기술 스택: Python, Java, C, Django, Spring Boot, F..."
1,7,"기술 스택: JavaScript, TypeScript, React, Python, ..."
2,5,기술 스택: Java\r\nSpring Boot\r\nJPA\r\nTypeScrip...
3,5,기술 스택: Java\r\nKotlin\r\nJavaScript\r\nTypeScr...
4,6,"기술 스택: JavaScript, Docker, Elasticsearch, Vue...."


In [70]:
import re
import nltk
from nltk.corpus import stopwords

# 정규 표현식을 사용하여 클리닝과 특수 문자 매핑
def clean_text(text):
    # 이메일 주소 제거
    text = re.sub(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', '', text)
    # URL 제거
    text = re.sub(r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', text)
    # 한글 자음, 모음 제거
    text = re.sub(r'([ㄱ-ㅎㅏ-ㅣ]+)', '', text)
    # HTML 태그 제거
    text = re.sub(r'<[^>]*>', '', text)
    # 모든 특수 문자 및 구두점 제거
    text = re.sub(r'[\s./?!,;:\'"+=\-_\[\]{}()*&^%$#@<>`~]', ' ', text)
    return text

# 한국어 불용어 제거
def remove_kor_stopwords(text, stopwords):
    words = text.split()
    cleaned_words = [word for word in words if word not in stopwords]
    return ' '.join(cleaned_words)

# 영어 불용어 제거
def remove_eng_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(cleaned_words)

# 불용어 파일 로드
def load_stopwords(file_path):
    with open(file_path, 'r', encoding="utf-8") as file:
        stopwords = [line.strip() for line in file]
    return stopwords

# 파일 경로 설정 및 불용어 로드
stopwords_file = '../stopword.txt'
korean_stopwords = load_stopwords(stopwords_file)

# DataFrame에 클리닝 함수 적용
df['text'] = df['text'].apply(clean_text)  # 공통 클리닝
df['text'] = df['text'].apply(lambda x: remove_kor_stopwords(x, korean_stopwords))
df['text'] = df['text'].apply(remove_eng_stopwords)

# 결과 확인
df.head()

Unnamed: 0,job,text
0,5,기술 스택 Python Java C Django Spring Boot FastAPI...
1,7,기술 스택 JavaScript TypeScript React Python IPFS ...
2,5,기술 스택 Java Spring Boot JPA TypeScript AWS Spri...
3,5,기술 스택 Java Kotlin JavaScript TypeScript Node j...
4,6,기술 스택 JavaScript Docker Elasticsearch Vue js R...


In [71]:
import random
import re


########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################
def random_deletion(words, p):
	if len(words) == 1:
		return words

	new_words = []
	for word in words:
		r = random.uniform(0, 1)
		if r > p:
			new_words.append(word)

	if len(new_words) == 0:
		rand_int = random.randint(0, len(words)-1)
		return [words[rand_int]]

	return new_words

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################
def random_swap(words, n):
	new_words = words.copy()
	for _ in range(n):
		new_words = swap_word(new_words)

	return new_words

def swap_word(new_words):
	random_idx_1 = random.randint(0, len(new_words)-1)
	random_idx_2 = random_idx_1
	counter = 0

	while random_idx_2 == random_idx_1:
		random_idx_2 = random.randint(0, len(new_words)-1)
		counter += 1
		if counter > 3:
			return new_words

	new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
	return new_words




def EDA(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
	words = sentence.split(' ')
	words = [word for word in words if word != ""]
	num_words = len(words)

	augmented_sentences = []
	num_new_per_technique = int(num_aug/4) + 1

	n_rs = max(1, int(alpha_rs*num_words))

	# rs
	for _ in range(num_new_per_technique):
		a_words = random_swap(words, n_rs)
		augmented_sentences.append(" ".join(a_words))

	# rd
	for _ in range(num_new_per_technique):
		a_words = random_deletion(words, p_rd)
		augmented_sentences.append(" ".join(a_words))

	augmented_sentences = [sentence for sentence in augmented_sentences]
	random.shuffle(augmented_sentences)

	if num_aug >= 1:
		augmented_sentences = augmented_sentences[:num_aug]
	else:
		keep_prob = num_aug / len(augmented_sentences)
		augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

	augmented_sentences.append(sentence)

	return augmented_sentences

In [72]:
# 라벨 필터링
selected_labels = [0, 1, 2, 3, 4]
filtered_df = df[df['job'].isin(selected_labels)]

In [73]:
# 데이터 증강
augmented_texts = []
augmented_labels = []

for _, row in filtered_df.iterrows():
    original_text = row['text']
    augmented_text = EDA(original_text, p_rd=0.25, alpha_rs=0.1, num_aug=20)
    augmented_texts.extend(augmented_text)
    augmented_labels.extend([row['job']] * len(augmented_text))


# 증강된 데이터를 기존 데이터프레임에 추가
augmented_df = pd.DataFrame({'text': augmented_texts, 'job': augmented_labels})
df_augmented = pd.concat([df, augmented_df]).reset_index(drop=True)

In [74]:
augmented_df.head()

Unnamed: 0,text,job
0,기술 스택 JavaScript TypeScript Google Sheets Noti...,0
1,개발에 스택 JavaScript TypeScript Google 작성 B2B Rea...,0
2,기술 스택 JavaScript TypeScript 배포 Sheets Notion R...,0
3,기술 스택 JavaScript TypeScript Google Sheets Noti...,0
4,기술 스택 JavaScript TypeScript Google Sheets Noti...,0


In [75]:
# 'job' 열의 각 분류 및 그 빈도수 출력
print(augmented_df['job'].value_counts())

job
0    260
3    156
2    104
1     78
4     78
Name: count, dtype: int64


In [76]:
print(df_augmented['job'].value_counts())

job
5    325
0    280
7    254
3    168
2    112
1     84
4     84
6     74
Name: count, dtype: int64


In [77]:
MODEL_NAME = "klue/roberta-large" # "klue/bert-base", "klue/bert-large", "klue/roberta-base"
batch_size = 64
learning_rate = 5e-5
num_epochs = 1

In [78]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Assuming df_augmented is already loaded as a pandas DataFrame
# Splitting data into train and test using sklearn
train_df, test_df = train_test_split(df_augmented, test_size=0.2, random_state=42)

# Further split train data into train and validation sets
train_df, valid_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Convert pandas DataFrames into Hugging Face datasets.Dataset objects
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

# Combine into a DatasetDict for easier management and use with Hugging Face transformers
dataset = DatasetDict({
    'train': train_dataset,
    'valid': valid_dataset,
    'test': test_dataset
})

# Display the dataset structure
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['job', 'text', '__index_level_0__'],
        num_rows: 993
    })
    valid: Dataset({
        features: ['job', 'text', '__index_level_0__'],
        num_rows: 111
    })
    test: Dataset({
        features: ['job', 'text', '__index_level_0__'],
        num_rows: 277
    })
})


In [79]:
import os
import tqdm
import pandas as pd
import torch
import datasets

from glob import glob
from tqdm import tqdm

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(tokenizer.tokenize(train['text'][0]))

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (1379 > 512). Running this sequence through the model will result in indexing errors


['기술', '스', '##택', 'P', '##y', '##th', '##on', 'J', '##av', '##a', 'C', 'D', '##j', '##ang', '##o', 'Sp', '##ring', 'Bo', '##ot', 'F', '##ast', '##AP', '##I', 'V', '##ue', 'j', '##s', 'Mon', '##g', '##o', '##D', '##B', 'Mar', '##ia', '##D', '##B', 'My', '##S', '##Q', '##L', 'G', '##it', '##H', '##ub', 'p', '##ost', '##man', 'Li', '##n', '##ux', 'K', '##af', '##k', '##a', '경력', 'SL', '##BM', '연구실', '학부', '##연구', '##생', '|', 'SL', '##BM', '연구실', '|', '재직', '중', '2022', '10', '재직', '중', '1', '##년', '7', '##개', '##월', '강원', '##대', '##학교', 'SL', '##BM', '연구실', '학부', '연구', '##생', '소속', '##으로', '담당', '교수', '##님', '##께', '##서', '진행', '##하', '##시', '##는', '연구', '##과', '##제', '##에', '참여', '##하여', '실험', '설계', 'DB', '구축', '실험', 'Fr', '##am', '##work', '구축', '실험', '관리', '데이터', '관리', '등', '##을', '전반', '##적으로', '담당', '##하고', '있', '##습', '##니다', '현재', '참여', '##하고', '있', '##는', '연구', '##과', '##제', '##는', '웨어', '##러', '##블', '기기', '##의', '디지털', '신호', '##들', 'e', 'g', 'PP', '##G', 'He', '##art', '##R', '

Map:   0%|          | 0/993 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

In [80]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['job', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 993
    })
    valid: Dataset({
        features: ['job', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 111
    })
    test: Dataset({
        features: ['job', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 277
    })
})

In [82]:
tokenized_datasets = tokenized_datasets.remove_columns(["text", "__index_level_0__"])
tokenized_datasets['train'] = tokenized_datasets['train'].rename_column("job", "labels")
tokenized_datasets['valid'] = tokenized_datasets['valid'].rename_column("job", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [83]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
valid_dataloader = DataLoader(tokenized_datasets["valid"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["test"], shuffle=False, batch_size=batch_size, collate_fn=data_collator)

In [84]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([64]),
 'input_ids': torch.Size([64, 512]),
 'token_type_ids': torch.Size([64, 512]),
 'attention_mask': torch.Size([64, 512])}

In [85]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=8) # 편의상 6으로 설정

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
from transformers import get_scheduler, AdamW
optimizer = AdamW(model.parameters(), lr=learning_rate)

num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

16


In [87]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [88]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    model.save_pretrained(f"./result/{MODEL_NAME}/{epoch}")
    tokenizer.save_pretrained(f"./result/{MODEL_NAME}/{epoch}")

  0%|          | 0/16 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 

In [None]:
from torchmetrics import Accuracy
accuracy = Accuracy()

prediction_list_valid = []
target_list_valid = []

model.eval()
for batch in valid_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).cpu()
    targets = batch['labels'].cpu()

    prediction_list_valid.extend(predictions)
    target_list_valid.extend(targets)
    #print(accuracy(predictions, targets)) # 매 batch 마다의 Accuracy

print(f'valid acc: {accuracy(torch.IntTensor(prediction_list_valid), torch.IntTensor(target_list_valid)).cpu().tolist():.4f}')

In [None]:
prediction_list = []
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    prediction_list.extend(predictions.cpu().tolist())

In [None]:
import os
import re
import warnings

 
import torch
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from transformers import (
    Trainer,
    TrainingArguments,
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split 
 
warnings.filterwarnings("ignore")

In [None]:
MODEL_NAME = "klue/roberta-large" # "klue/bert-base", "klue/bert-large", "klue/roberta-base"
batch_size = 64
learning_rate = 5e-5
num_epochs = 1