In [1]:
import pandas as pd

df = pd.read_csv("../data/rallit_text.csv", quoting=2); df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   job     705 non-null    object
 1   text    705 non-null    object
dtypes: object(2)
memory usage: 11.1+ KB


Unnamed: 0,job,text
0,백엔드 개발자,"기술 스택: Python, Java, C, Django, Spring Boot, F..."
1,프론트엔드 개발자,"기술 스택: JavaScript, TypeScript, React, Python, ..."
2,백엔드 개발자,기술 스택: Java\r\nSpring Boot\r\nJPA\r\nTypeScrip...
3,백엔드 개발자,기술 스택: Java\r\nKotlin\r\nJavaScript\r\nTypeScr...
4,풀스택 개발자,"기술 스택: JavaScript, Docker, Elasticsearch, Vue...."


In [2]:
# 직업 레이블을 숫자로 매핑
job_map = {
    "PM": 0, "Sales": 1, "데브옵스 엔지니어": 2, "데이터 분석가": 3,
    "데이터 엔지니어": 4, "백엔드 개발자": 5, "풀스택 개발자": 6, "프론트엔드 개발자": 7
}
df['job'] = df['job'].map(job_map)

In [3]:
df.dropna(inplace = True)
df.head()

Unnamed: 0,job,text
0,5,"기술 스택: Python, Java, C, Django, Spring Boot, F..."
1,7,"기술 스택: JavaScript, TypeScript, React, Python, ..."
2,5,기술 스택: Java\r\nSpring Boot\r\nJPA\r\nTypeScrip...
3,5,기술 스택: Java\r\nKotlin\r\nJavaScript\r\nTypeScr...
4,6,"기술 스택: JavaScript, Docker, Elasticsearch, Vue...."


In [4]:
import re
import nltk
from nltk.corpus import stopwords

# 정규 표현식을 사용하여 클리닝과 특수 문자 매핑
def clean_text(text):
    # 이메일 주소 제거
    text = re.sub(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', '', text)
    # URL 제거
    text = re.sub(r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', text)
    # 한글 자음, 모음 제거
    text = re.sub(r'([ㄱ-ㅎㅏ-ㅣ]+)', '', text)
    # HTML 태그 제거
    text = re.sub(r'<[^>]*>', '', text)
    # 모든 특수 문자 및 구두점 제거
    text = re.sub(r'[\s./?!,;:\'"+=\-_\[\]{}()*&^%$#@<>`~]', ' ', text)
    return text

# 한국어 불용어 제거
def remove_kor_stopwords(text, stopwords):
    words = text.split()
    cleaned_words = [word for word in words if word not in stopwords]
    return ' '.join(cleaned_words)

# 영어 불용어 제거
def remove_eng_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(cleaned_words)

# 불용어 파일 로드
def load_stopwords(file_path):
    with open(file_path, 'r', encoding="utf-8") as file:
        stopwords = [line.strip() for line in file]
    return stopwords

# 파일 경로 설정 및 불용어 로드
stopwords_file = 'stopword.txt'
korean_stopwords = load_stopwords(stopwords_file)

# DataFrame에 클리닝 함수 적용
df['text'] = df['text'].apply(clean_text)  # 공통 클리닝
df['text'] = df['text'].apply(lambda x: remove_kor_stopwords(x, korean_stopwords))
df['text'] = df['text'].apply(remove_eng_stopwords)

# 결과 확인
df.head()

Unnamed: 0,job,text
0,5,기술 스택 Python Java C Django Spring Boot FastAPI...
1,7,기술 스택 JavaScript TypeScript React Python IPFS ...
2,5,기술 스택 Java Spring Boot JPA TypeScript AWS Spri...
3,5,기술 스택 Java Kotlin JavaScript TypeScript Node j...
4,6,기술 스택 JavaScript Docker Elasticsearch Vue js R...


In [5]:
import random
import re


########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################
def random_deletion(words, p):
	if len(words) == 1:
		return words

	new_words = []
	for word in words:
		r = random.uniform(0, 1)
		if r > p:
			new_words.append(word)

	if len(new_words) == 0:
		rand_int = random.randint(0, len(words)-1)
		return [words[rand_int]]

	return new_words

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################
def random_swap(words, n):
	new_words = words.copy()
	for _ in range(n):
		new_words = swap_word(new_words)

	return new_words

def swap_word(new_words):
	random_idx_1 = random.randint(0, len(new_words)-1)
	random_idx_2 = random_idx_1
	counter = 0

	while random_idx_2 == random_idx_1:
		random_idx_2 = random.randint(0, len(new_words)-1)
		counter += 1
		if counter > 3:
			return new_words

	new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
	return new_words




def EDA(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
	words = sentence.split(' ')
	words = [word for word in words if word != ""]
	num_words = len(words)

	augmented_sentences = []
	num_new_per_technique = int(num_aug/4) + 1

	n_rs = max(1, int(alpha_rs*num_words))

	# rs
	for _ in range(num_new_per_technique):
		a_words = random_swap(words, n_rs)
		augmented_sentences.append(" ".join(a_words))

	# rd
	for _ in range(num_new_per_technique):
		a_words = random_deletion(words, p_rd)
		augmented_sentences.append(" ".join(a_words))

	augmented_sentences = [sentence for sentence in augmented_sentences]
	random.shuffle(augmented_sentences)

	if num_aug >= 1:
		augmented_sentences = augmented_sentences[:num_aug]
	else:
		keep_prob = num_aug / len(augmented_sentences)
		augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

	augmented_sentences.append(sentence)

	return augmented_sentences

In [6]:
# 라벨 필터링
selected_labels = [0, 1, 2, 3, 4]
filtered_df = df[df['job'].isin(selected_labels)]

In [7]:
# 데이터 증강
augmented_texts = []
augmented_labels = []

for _, row in filtered_df.iterrows():
    original_text = row['text']
    augmented_text = EDA(original_text, p_rd=0.25, alpha_rs=0.1, num_aug=20)
    augmented_texts.extend(augmented_text)
    augmented_labels.extend([row['job']] * len(augmented_text))


# 증강된 데이터를 기존 데이터프레임에 추가
augmented_df = pd.DataFrame({'text': augmented_texts, 'job': augmented_labels})
df_augmented = pd.concat([df, augmented_df]).reset_index(drop=True)

In [8]:
augmented_df.head()

Unnamed: 0,text,job
0,스택 Sheets Notion Zapier 경력 코드스테이츠 진단 평가 역량 프로덕...,0
1,스택 JavaScript Sheets Notion React Zapier 경력 주식...,0
2,기술 TypeScript Google Sheets Notion React 주식회사 ...,0
3,기술 스택 프로세스상 TypeScript Google Sheets Notion Re...,0
4,기술 지랩스 JavaScript TypeScript Google Sheets Not...,0


In [9]:
# 'job' 열의 각 분류 및 그 빈도수 출력
print(augmented_df['job'].value_counts())

job
0    260
3    156
2    104
1     78
4     78
Name: count, dtype: int64


In [10]:
print(df_augmented['job'].value_counts())

job
5    325
0    280
7    254
3    168
2    112
1     84
4     84
6     74
Name: count, dtype: int64


In [11]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df_augmented, test_size=0.2, random_state=42)

train_texts = train_df['text'].astype(str).tolist() # 문자열 데이터로 명시 후 리스트 화
train_labels = train_df['job'].tolist()
test_texts = test_df['text'].astype(str).tolist()
test_labels = test_df['job'].tolist()

In [12]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_texts = train_df['text'].astype(str).tolist() # 문자열 데이터로 명시 후 리스트 화
train_labels = train_df['job'].tolist()
test_texts = test_df['text'].astype(str).tolist()
test_labels = test_df['job'].tolist()

In [13]:
from transformers import BertTokenizer, BertForSequenceClassification

model_name = 'monologg/kobert'
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
import torch
from torch.utils.data import DataLoader, Dataset

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)
batch_size = 16  # 배치 사이즈는 직접 지정해야 합니다.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=8)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # GPU 사용이 가능한 경우 설정

model.to(device) # GPU 사용이 가능한 경우

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [17]:
import torch

# GPU 사용 가능 -> True, GPU 사용 불가 -> False
print(torch.cuda.is_available())

True


In [18]:
# GPU 사용 가능 -> 가장 빠른 번호 GPU, GPU 사용 불가 -> CPU 자동 지정 예시
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [19]:
# GPU 이름 체크(cuda:0에 연결된 그래픽 카드 기준)
print(torch.cuda.get_device_name(device = 0)) # 'NVIDIA TITAN X (Pascal)'

# 사용 가능 GPU 개수 체크
print(torch.cuda.device_count()) # 3

NVIDIA GeForce RTX 3060
1


In [20]:
from tqdm.auto import tqdm # 반복문이 얼마나 진행되었는지 알 수 있도록 프로그레스바를 표시합니다.

num_epochs = 10
learning_rate = 2e-5 #2e-5는 0.00002
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train() # 훈련 모드 지정
    total_loss = 0

    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss:.4f}")

100%|██████████| 36/36 [00:43<00:00,  1.22s/it]


Epoch 1/10 - Average Loss: 1.5252


100%|██████████| 36/36 [00:41<00:00,  1.16s/it]


Epoch 2/10 - Average Loss: 1.2832


100%|██████████| 36/36 [00:41<00:00,  1.17s/it]


Epoch 3/10 - Average Loss: 1.2572


100%|██████████| 36/36 [00:41<00:00,  1.15s/it]


Epoch 4/10 - Average Loss: 1.2425


100%|██████████| 36/36 [00:44<00:00,  1.22s/it]


Epoch 5/10 - Average Loss: 1.2279


100%|██████████| 36/36 [00:44<00:00,  1.23s/it]


Epoch 6/10 - Average Loss: 1.2539


100%|██████████| 36/36 [00:43<00:00,  1.21s/it]


Epoch 7/10 - Average Loss: 1.2295


100%|██████████| 36/36 [00:41<00:00,  1.14s/it]


Epoch 8/10 - Average Loss: 1.1982


100%|██████████| 36/36 [00:42<00:00,  1.17s/it]


Epoch 9/10 - Average Loss: 1.1700


100%|██████████| 36/36 [00:40<00:00,  1.14s/it]

Epoch 10/10 - Average Loss: 1.1460





In [23]:
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted_labels = torch.max(outputs.logits, dim=1)

        correct_predictions += torch.sum(predicted_labels == labels).item()
        total_predictions += labels.size(0)

accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.3972


In [22]:
input_text = '학과: 컴퓨터공학과, 기술 스택: python, java, sql, 프로젝트: python을 이용해서 학교 웹사이트 제작, 수행 역할: ci/cd 구현을 통한 배포'
input_encoding = tokenizer.encode_plus(
    input_text,
    truncation=True,
    padding=True,
    return_tensors='pt'
)

input_ids = input_encoding['input_ids'].to(device)
attention_mask = input_encoding['attention_mask'].to(device)

model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    _, predicted_labels = torch.max(outputs.logits, dim=1)
predicted_labels = predicted_labels.item()

print(predicted_labels)

5
