In [41]:
from IPython.display import display, HTML
display(HTML("""
<style>
div.container{width:85% !important;}
div.cell.code_cell.rendered{width:100%;}
div.input_prompt{padding:0px;}
div.CodeMirror {font-family:Consolas; font-size:12pt;}
div.output {font-size:12pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:12pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:12pt;padding:5px;}
table.dataframe{font-size:12px;}
</style>
"""))

**<font size="6" color="red">ch1. 허깅페이스</font>**

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

# <span style="color:red">ch.01 허깅페이스</span>
- Transformers라이브러리 내 pipeline()함수
- INference API : key를 사용
# 1. 텍스트 기반 감정분석(긍정/부정)

In [3]:
from transformers import pipeline
classifier = pipeline(task="text-classification",
                     model="distilbert-base-uncased-finetuned-sst-2-english")
classifier("I've been waiting for a HuggingFace course my whole life.")

[{'label': 'POSITIVE', 'score': 0.9598050713539124}]

In [4]:
classifier("이 영화는 정말 최고였어요. 감독적이고 연기가 대단해요")

[{'label': 'POSITIVE', 'score': 0.8857762217521667}]

In [5]:
result = classifier(["I've been waiting for a HuggingFace course my whole life.",
           "I hate this so much!"])
[r.get('label')  for r in result]

['POSITIVE', 'NEGATIVE']

In [6]:
classifier = pipeline(task="sentiment-analysis",
                     model="distilbert-base-uncased-finetuned-sst-2-english")
classifier(["I've been waiting for a HuggingFace course my whole life.",
           "I hate this so much!"])

[{'label': 'POSITIVE', 'score': 0.9598050713539124},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

# 2. 제로-샷 분류(zero-shot-classification)
- 비지도학습 

In [13]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                     "facebook/bart-large-mnli")
classifier("I have a problem with my iphone that needs to be resolved asap!!",
          candidate_labels=["phone", "urgent", "tablet", "computer"])

{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!',
 'labels': ['urgent', 'phone', 'computer', 'tablet'],
 'scores': [0.5049759745597839,
  0.4800756573677063,
  0.012633566744625568,
  0.0023149014450609684]}

In [14]:
# 제시된 문장이 어떤 분류에 속할지 
classifier(
    "This is a course about the transformers library.",
    candidate_labels=["education", "business", "politics"]
)

{'sequence': 'This is a course about the transformers library.',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.9053573608398438, 0.07259701192378998, 0.02204558067023754]}

# 3. text 생성

In [15]:
generator = pipeline(task="text-generation",
                    model="gpt2") # 허깅페이스에는 gpt2까지
generator("In this course. We will teach you how to",
         pad_token_id=generator.tokenizer.eos_token_id)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

[{'generated_text': 'In this course. We will teach you how to easily generate a small set of code using Lua. We will provide examples of a more sophisticated but efficient form of Lua that requires a little more time than the first two.\n\n\nPlease share your Lua'}]

In [16]:
generator("이 과정은 다음과 같은 방법을 알려드려요. ",
         pad_token_id=generator.tokenizer.eos_token_id)

Input length of input_ids is 50, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


[{'generated_text': '이 과정은 다음과 같은 방법을 알려드려요. �'}]

# 4. 마스크 채우기

In [43]:
unmasker = pipeline("fill-mask", "distilroberta-base")
unmasker("I'm going to pospital and meet a <mask>")

[{'score': 0.06455978751182556,
  'token': 1816,
  'token_str': ' girl',
  'sequence': "I'm going to pospital and meet a girl"},
 {'score': 0.03595840185880661,
  'token': 1441,
  'token_str': ' friend',
  'sequence': "I'm going to pospital and meet a friend"},
 {'score': 0.02911880798637867,
  'token': 2173,
  'token_str': ' guy',
  'sequence': "I'm going to pospital and meet a guy"},
 {'score': 0.025361763313412666,
  'token': 6429,
  'token_str': ' lady',
  'sequence': "I'm going to pospital and meet a lady"},
 {'score': 0.021882314234972,
  'token': 36289,
  'token_str': ' prostitute',
  'sequence': "I'm going to pospital and meet a prostitute"}]

In [44]:
unmasker("Hello, I'm a <mask> model",
        top_k=2) #top_k를 안 주면 5개

[{'score': 0.036119475960731506,
  'token': 265,
  'token_str': ' business',
  'sequence': "Hello, I'm a business model"},
 {'score': 0.02683805488049984,
  'token': 18150,
  'token_str': ' freelance',
  'sequence': "Hello, I'm a freelance model"}]

In [45]:
# google-bert/bert-base-uncased사용을 위해 key 발부
from transformers import pipeline
unmasker = pipeline(task="fill-mask",
                   model="google-bert/bert-base-uncased")
unmasker("Hello, I'm a [MASK] model", top_k=2)

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.06705840677022934,
  'token': 4827,
  'token_str': 'fashion',
  'sequence': "hello, i'm a fashion model"},
 {'score': 0.058972690254449844,
  'token': 2047,
  'token_str': 'new',
  'sequence': "hello, i'm a new model"}]

In [29]:
import os
from dotenv import load_dotenv
load_dotenv()
# print(os.environ['HF_TOKEN'])

True

In [47]:
from huggingface_hub import InferenceClient
client = InferenceClient(provider="hf-inference",
                         api_key=os.environ['HF_TOKEN'])
result = client.fill_mask(
    "Hello, I'm a [MASK] model",
    model="google-bert/bert-base-uncased",
    top_k=2
)

ImportError: cannot import name 'InferenceClient' from 'huggingface_hub' (C:\Users\Admin\anaconda3\lib\site-packages\huggingface_hub\__init__.py)

In [48]:
[r.sequence for r in result]

AttributeError: 'dict' object has no attribute 'sequence'

In [49]:
# 다국어지원 모델도 한글 지원 만족스럽지 않을 수 있음
unmasker = pipeline("fill-mask",
                   model="bert-base-multilingual-cased") 

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
unmasker("안녕하세요? 나는 [MASK] 모델입니다", top_k=3)

# 5. 개체명 인식(NER : Naned Entity REcognition)

In [36]:
ner = pipeline(task="ner")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

In [40]:
ner("My name is Sylvain and I work at Hugging Face in Brookly")

[{'entity': 'I-PER',
  'score': 0.99940455,
  'index': 4,
  'word': 'S',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': 0.9983193,
  'index': 5,
  'word': '##yl',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': 0.9966254,
  'index': 6,
  'word': '##va',
  'start': 14,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.9993112,
  'index': 7,
  'word': '##in',
  'start': 16,
  'end': 18},
 {'entity': 'I-ORG',
  'score': 0.9817053,
  'index': 12,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity': 'I-ORG',
  'score': 0.96778005,
  'index': 13,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity': 'I-ORG',
  'score': 0.9814528,
  'index': 14,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity': 'I-LOC',
  'score': 0.94097567,
  'index': 16,
  'word': 'Brook',
  'start': 49,
  'end': 54},
 {'entity': 'I-LOC',
  'score': 0.8209802,
  'index': 17,
  'word': '##ly',
  'start': 54,
  'end': 56}]

# 6. 질의응답

In [50]:
from transformers import pipeline
question_answer = pipeline("question-answering",
                          "distilbert-base-cased-distilled-squad")

Downloading:   0%|          | 0.00/261M [00:00<?, ?B/s]

In [51]:
question_answer(
    question="Where do I work?",
    context="My name is Tom and I work at Facebook in Brooklyn"
    
)

{'score': 0.5976612567901611, 'start': 29, 'end': 37, 'answer': 'Facebook'}

In [52]:

context="My name is Tom and I work at Facebook in Brooklyn"
context[29:37]

'Facebook'

# 7. 문서요약
- 현재 torch 2.6이상 추천

In [37]:
import torch
torch.__version__

'1.12.1'

In [38]:
summarizer = pipeline(task="summarization",
                     model="facebook/bart-large-cnn")

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

ValueError: Could not load model facebook/bart-large-cnn with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForSeq2SeqLM'>, <class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>).

# 8. 번역

In [53]:
# 한 -> 영
ko2en = pipeline("translation",
                 model="Helsinki-NLP/opus-mt-ko-en")

# 영 -> 한(?)
en2ko = pipeline("translation",
                 model="Helsinki-NLP/opus-mt-tc-big-en-ko")

ValueError: This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer.

In [54]:
# 테스트 문장
ko_sentence = "이 문장을 영어로 번역해 주세요"
en_sentence = "I enjoy learning about AI."