1. 단일문장 분류

In [None]:
import pandas as pd
import torch
from transformers import pipeline, BertTokenizer, BertForSequenceClassification

In [None]:
# 1. Create the summarization pipeline
## Pin the checkpoint and revision explicitly instead of relying on the task
## default: an unpinned pipeline may silently switch models when the library
## default changes. These are the values the unpinned call resolved to.
summarizer = pipeline(
    'summarization',
    model='sshleifer/distilbart-cnn-12-6',
    revision='a4f8f3e',
)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



In [None]:
# 2. Load the model and tokenizer used for sentiment analysis
## The checkpoint is a multilingual, uncased BERT fine-tuned for sentiment
## (not plain 'bert-base-uncased'); it classifies text into 5 sentiment levels.
sentiment_model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')



In [None]:
# 3. Document to analyse (long text)
document = """
The immune system is a complex network of cells and proteins that defends the body against infection. The immune system
keeps a record of every germ it has ever defeated so it can recognize and destroy the microbe quickly if it enters
the body again. Abnormalities of the immune system can lead to allergic diseases, immunodeficiencies, and autoimmune
disorders.
"""

In [None]:
# 4. Summarize the document
## max_length / min_length bound the summary length in tokens (not characters);
## do_sample=False turns sampling off so a single summary is produced.
## truncation=True clips inputs longer than the model's maximum input size
## instead of failing on very long documents.
summary = summarizer(
    document,
    max_length=130,
    min_length=30,
    do_sample=False,
    truncation=True,
)[0]['summary_text']
print("문서 요약:", summary)

Your max_length is set to 130, but your input_length is only 81. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


문서 요약:  The immune system is a complex network of cells and proteins that defends the body against infection . Abnormalities of the immune system can lead to allergic diseases, immunodeficiencies, and autoimmune disorders .


In [None]:
# 5. Run sentiment analysis on the summary
## Tokenize the summarized text and feed it to the sentiment model.
inputs = tokenizer(summary, return_tensors="pt")

## Inference only: disable autograd so no computation graph is built
## (the logits then carry no grad_fn and less memory is used).
with torch.no_grad():
    outputs = sentiment_model(**inputs)

print(inputs)
print(outputs)

{'input_ids': tensor([[  101, 10103, 81998, 10472, 10127,   143, 15798, 12990, 10108, 22596,
         10110, 37145, 10203, 43461, 10107, 10103, 12788, 11423, 47835,   119,
         11088, 80593, 19471, 10108, 10103, 81998, 10472, 10743, 13868, 10114,
         17721, 44609, 38096,   117, 10205, 45560, 20528, 61463, 68722, 10165,
           117, 10110, 14929, 11947, 73106, 52769,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
SequenceClassifierOutput(loss=None, logits=tensor([[-1.7297, -0.7768,  0.6325,  1.2274,  0.4610]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [None]:
# 6. Interpret the sentiment-analysis output
## The model returns raw scores (logits), one per class, not probabilities;
## the class with the highest logit is taken as the prediction.
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1)
print(logits)
print(predicted_class)

tensor([[-1.7297, -0.7768,  0.6325,  1.2274,  0.4610]],
       grad_fn=<AddmmBackward0>)
tensor([3])


In [None]:
# 7. Print the sentiment classification result
## predicted_class is a 0-based class index (0..4), so the lookup table is keyed 0..4:
## 0: very negative, 1: negative, 2: neutral, 3: positive, 4: very positive
## NOTE(review): presumably index i corresponds to an (i+1)-star rating — confirm on the model card.
sentiment_labels = {0: "매우 부정적", 1: "부정적", 2: "중립적", 3: "긍정적", 4: "매우 긍정적"}
print("감정 분석 결과:", sentiment_labels[predicted_class.item()])

감정 분석 결과: 긍정적


In [None]:
# 1. Build a small labelled sample of spam and legitimate e-mails
## Each entry pairs the message text with its label (1: spam, 0: legitimate).
_samples = [
    ("Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize!", 1),
    ("Hi, I hope you're doing well. Let's schedule a meeting next week.", 0),
    ("Get paid to work from home! Limited time offer!", 1),
    ("Dear customer, your invoice is attached. Please review it.", 0),
    ("This is not a scam! You have a chance to win a free iPhone!", 1),
    ("Don't forget our appointment tomorrow at 10 AM.", 0),
    ("Claim your free trial now! Act fast, this offer won't last.", 1),
    ("Thank you for your purchase! Your order will be shipped soon.", 0),
]

# Column-oriented view of the same samples, in the original order
data = {
    "email": [text for text, _ in _samples],
    "label": [flag for _, flag in _samples],
}

# Create the DataFrame
df = pd.DataFrame(data)

df

Unnamed: 0,email,label
0,"Congratulations! You've won a $1,000 Walmart g...",1
1,"Hi, I hope you're doing well. Let's schedule a...",0
2,Get paid to work from home! Limited time offer!,1
3,"Dear customer, your invoice is attached. Pleas...",0
4,This is not a scam! You have a chance to win a...,1
5,Don't forget our appointment tomorrow at 10 AM.,0
6,"Claim your free trial now! Act fast, this offe...",1
7,Thank you for your purchase! Your order will b...,0


In [None]:
# 2. Load the pre-trained BERT model and tokenizer
## NOTE: 'bert-base-uncased' ships without a trained classification head, so the
## 2-label classifier layer created here is freshly initialized; its predictions
## are not meaningful until the model is fine-tuned on labelled spam data.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 3. Classify every e-mail as spam / not spam
## All e-mails are encoded as one padded batch and classified in a single
## forward pass instead of one model call per message; padding=True only has
## an effect when several texts are encoded together.
results = []
emails = df['email'].tolist()

if emails:  # an empty frame simply yields no predictions
    # Convert the e-mails into BERT inputs (padded to the longest message)
    inputs = tokenizer(emails, return_tensors="pt", padding=True, truncation=True)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        outputs = model(**inputs)

    # Pick the class with the highest logit for each e-mail
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1)

    # One predicted label per e-mail, in DataFrame row order
    results = predicted_class.tolist()

In [None]:
# 4. Add the predictions to the DataFrame as a new column and print it
df['predicted_label'] = results
print(df)

                                               email  label  predicted_label
0  Congratulations! You've won a $1,000 Walmart g...      1                0
1  Hi, I hope you're doing well. Let's schedule a...      0                0
2    Get paid to work from home! Limited time offer!      1                0
3  Dear customer, your invoice is attached. Pleas...      0                0
4  This is not a scam! You have a chance to win a...      1                0
5    Don't forget our appointment tomorrow at 10 AM.      0                0
6  Claim your free trial now! Act fast, this offe...      1                0
7  Thank you for your purchase! Your order will b...      0                0


In [None]:
# Show only the rows whose predicted label matches the true label
df.loc[df['label'].eq(df['predicted_label'])]

Unnamed: 0,email,label,predicted_label
1,"Hi, I hope you're doing well. Let's schedule a...",0,0
3,"Dear customer, your invoice is attached. Pleas...",0,0
5,Don't forget our appointment tomorrow at 10 AM.,0,0
7,Thank you for your purchase! Your order will b...,0,0


2. 두 문장의 관계 분류(자연어추론_NLI)

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# 1. Load the pre-trained BERT model and tokenizer
## Uses a 'bert-base-uncased' checkpoint that has been fine-tuned for natural language inference (NLI)
tokenizer = BertTokenizer.from_pretrained('textattack/bert-base-uncased-snli')
model = BertForSequenceClassification.from_pretrained('textattack/bert-base-uncased-snli')




In [None]:
# 2. Enter the two sentences
## The first sentence is the premise, the second the hypothesis; the model judges
## whether the hypothesis is true / neutral / false given the premise
premise = "The weather is nice today."
hypothesis = "It is sunny and pleasant outside."

In [None]:
# 3. Convert the sentence pair into BERT inputs (BERT separates the two sentences with [SEP])
## The tokenizer encodes both sentences as one combined input.
## padding aligns lengths within a batch; truncation limits the length.
## This tokenizer has no predefined maximum length, so truncation=True alone
## is ignored; max_length=512 (the BERT-base position limit) makes it effective.
inputs = tokenizer(
    premise,
    hypothesis,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# 4. Feed the inputs to the model to predict the sentence relationship
## The model outputs logits: one score for each of the three relationships.
## Inference only: autograd is disabled so no computation graph is built.
with torch.no_grad():
    outputs = model(**inputs)
print(inputs)

{'input_ids': tensor([[  101,  1996,  4633,  2003,  3835,  2651,  1012,   102,  2009,  2003,
         11559,  1998,  8242,  2648,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
# 5. Select the class with the highest logit
## logits holds three scores, one per NLI class (entailment / neutral / contradiction).
## Which index stands for which class is decided by the label mapping used for this
## checkpoint, not by the order in which the classes are listed in this comment.
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1)
print(logits)
print(predicted_class)

tensor([[-5.1923,  2.5811,  1.3691]], grad_fn=<AddmmBackward0>)
tensor([1])


In [None]:
# 6. Map class indices to relationship labels (0: Contradiction, 1: Neutral, 2: Entailment)
## NOTE(review): the index order is checkpoint-specific. TextAttack's SNLI checkpoints are
## reported to use 0: contradiction, 1: entailment, 2: neutral — confirm against the
## model's config / model card before trusting this mapping.
relationship_labels = {0: "Contradiction (거짓)", 1: "Neutral (중립)", 2: "Entailment (참)"}

In [None]:
# 7. Report the predicted relationship
## Look the label up first, then format it into the output line.
predicted_relationship = relationship_labels[predicted_class.item()]
print(f"두 문장 사이의 관계: {predicted_relationship}")

두 문장 사이의 관계: Neutral (중립)


3. 문장 생성(기계 번역, Seq2Seq)

In [None]:
from transformers import MarianMTModel, MarianTokenizer

In [None]:
# 1. Load the translation model and tokenizer
## 'Helsinki-NLP/opus-mt-en-ko' is not a valid model identifier on the Hugging Face
## Hub (loading it raises OSError), so an existing English -> Korean Marian
## checkpoint is used instead.
## NOTE(review): confirm this checkpoint's translation quality suits the use case.
model_name = 'Helsinki-NLP/opus-mt-tc-big-en-ko'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

OSError: Helsinki-NLP/opus-mt-en-ko is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
# 2. English sentence to translate
english_sentence = "The weather is nice today."

In [None]:
# 3. Tokenize the input sentence and convert it to tensors
## Tokenization and encoding prepare the text for the model
inputs = tokenizer(english_sentence, return_tensors="pt", padding=True, truncation=True)

In [None]:
# 4. Feed the inputs to the model to generate the translation
## Calls the model's generate() method (not a plain forward pass) to produce the output token ids
with torch.no_grad():  # no gradient computation
    translated_tokens = model.generate(**inputs)

In [None]:
# 5. Decode the generated token ids back into text
## Decode the generated batch and keep its first sequence.
translated_sentence = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

# 6. Show the source sentence and its translation
print("영어 문장:", english_sentence)
print("번역된 한국어 문장:", translated_sentence)

4. 질의응답

In [None]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

In [None]:
# 1. Download the BERT model and tokenizer
## Pre-trained model and its matching tokenizer (already fine-tuned on the SQuAD dataset //
## it searches the paragraph for the answer to the question)
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# 2. Enter the question and the paragraph
## The answer to the question will be looked up inside the paragraph
question = "What is the immune system?"
paragraph = ("The immune system is a system of many biological structures and processes within an organism that "
             "protects against disease. To function properly, an immune system must detect a wide variety of agents, "
             "known as pathogens, from viruses to parasitic worms, and distinguish them from the organism's own healthy tissue.")

In [None]:
# 3. Wrap the question in [CLS] ... [SEP] and terminate the paragraph with [SEP]
## [CLS]: start of the input, [SEP]: separator between question and paragraph.
## The guards keep this cell idempotent: re-running it in the notebook must not
## stack a second set of special tokens onto strings that are already marked.
if not question.startswith('[CLS]'):
    question = '[CLS] ' + question + ' [SEP]'
if not paragraph.endswith('[SEP]'):
    paragraph = paragraph + ' [SEP]'

In [None]:
# 4. Tokenize question and paragraph separately, then join them
## Both texts are split into tokens, concatenated into a single sequence
## (question first), and that sequence is converted to numeric ids.
question_tokens, paragraph_tokens = map(tokenizer.tokenize, (question, paragraph))

tokens = [*question_tokens, *paragraph_tokens]
input_ids = tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# 5. Build the segment ids: 0 for every question token, 1 for every paragraph token
segment_ids = [0] * len(question_tokens) + [1] * len(paragraph_tokens)

In [None]:
# 6. Convert to tensors (so they can be fed to the model)
## The extra list level adds the batch dimension (a batch of one sequence).
input_ids = torch.tensor([input_ids])
segment_ids = torch.tensor([segment_ids])

## The model returns scores for the start and end position of the answer.
## Inference only: autograd is disabled so no computation graph is built.
with torch.no_grad():
    scores = model(input_ids, token_type_ids=segment_ids)

In [None]:
# 7. Extract the start and end index of the answer
## The answer span is predicted by taking the position with the highest
## start score and the position with the highest end score.
start_index = scores.start_logits.argmax()
end_index = scores.end_logits.argmax()

In [None]:
# 8. Join the tokens between the start and end index and print the answer
## convert_tokens_to_string merges WordPiece continuations ('##...') back into
## whole words; a plain space-join would leave such fragments split apart.
answer_tokens = tokens[start_index:end_index + 1]
print(tokenizer.convert_tokens_to_string(answer_tokens))

a system of many biological structures and processes within an organism that protects against disease
