In [4]:
#필요 패키지 설치
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transform

In [5]:
from google.colab import files
uploaded = files.upload() #파일 업로드 기능 수행

Saving sampled_ntis_chat.csv to sampled_ntis_chat.csv


In [6]:
# pandas: 데이터 처리를 위한 라이브러리
import pandas as pd

# 데이터셋 불러오기
data = pd.read_csv('sampled_ntis_chat.csv')  # 'your_data.csv'를 실제 파일 경로로 변경하세요.

print(set(data.label))
data

{'Statistics', 'QnA', 'Search', 'IntroduceServices'}


Unnamed: 0,question,label
0,R&D 정보 요약은 어디서 확인하는지 궁금합니다,IntroduceServices
1,분류 기반 특허 분석 서비스는 어디서 보나요,IntroduceServices
2,활용가이드 서비스가 궁금해,IntroduceServices
3,국가R&D 기관평가 서비스 어떻게 사용하는지,IntroduceServices
4,국가R&D통합공고 서비스는 어디서 보는지 알려줘,IntroduceServices
...,...,...
6049,농업회사법인토마토연구소(주)의 최근 5년동안 과제 연구비 총액이 보고싶어요,Statistics
6050,2012 2020년 이화여자대학교 의료원에서 출원한 특허 개수 알려줘,Statistics
6051,2011년도에 (주)네이쳐패브릭에서 수행한 과제의 연구비 총액 보여줄 수 있어,Statistics
6052,국립기상과학원에서 수행한 사업화의 건수가 궁금해,Statistics


In [7]:
# scikit-learn: 머신러닝을 위한 라이브러리

from sklearn.model_selection import train_test_split
# 데이터셋 분할: 80%는 훈련 데이터, 20%는 테스트 데이터
X_train, X_test, y_train, y_test = train_test_split(data['question'], data['label'], test_size=0.2, random_state=777, stratify=data['label'])
print(X_train.shape)
print(X_test.shape)

(4843,)
(1211,)


In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(data['label'])
y_train = le.transform(y_train)
y_test = le.transform(y_test)

le.classes_

array(['IntroduceServices', 'QnA', 'Search', 'Statistics'], dtype=object)

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score


# 벡터화: 텍스트 데이터를 숫자 벡터로 변환
# TF-IDF 벡터화: 단어 빈도와 역문서 빈도를 사용하여 벡터화
vectorizer = TfidfVectorizer(max_features=1000)  # max_features는 조정 가능
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 회귀 모델 훈련
model = DecisionTreeClassifier()
model.fit(X_train_tfidf, y_train)

# 예측 및 모델 평가
y_pred = model.predict(X_test_tfidf)
acc = accuracy_score(y_test, y_pred, sample_weight=None)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {acc}')
print(f'F1: {f1}')

Accuracy: 0.9810074318744839
F1: 0.9809134387484894


In [10]:
from sklearn.ensemble import RandomForestClassifier
regr = RandomForestClassifier(random_state=0)
regr.fit(X_train_tfidf, y_train)

y_pred = regr.predict(X_test_tfidf)
acc = accuracy_score(y_test, y_pred, sample_weight=None)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {acc}')
print(f'F1: {f1}')

Accuracy: 0.985962014863749
F1: 0.9858360131766463


In [11]:
from sentence_transformers import SentenceTransformer
import numpy as np


# 사전 훈련된 Sentence Transformer 모델 로드
model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS').to('cuda')

X_train_emb = model.encode(np.array(X_train)).tolist()
X_test_emb = model.encode(np.array(X_test)).tolist()
# 회귀 모델 훈련
model = DecisionTreeClassifier()
model.fit(X_train_emb, y_train)


# 예측 및 모델 평가
y_pred = model.predict(X_test_emb)


acc = accuracy_score(y_test, y_pred, sample_weight=None)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {acc}')
print(f'F1: {f1}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/336k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/967k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Accuracy: 0.8364987613542527
F1: 0.8371548904317964
