##### 讀取模型

In [6]:
# 檢查 CUDA 版本和 PyTorch 版本
import torch
cuda_version = torch.version.cuda
print("cuda_version :", cuda_version)
print("torch.__version__ :", torch.__version__)

# 設置設備，0 表示第一個 GPU，-1 表示 CPU
device = 0 if torch.cuda.is_available() else -1
print("device :", device)

cuda_version : 11.8
torch.__version__ : 2.4.0+cu118
device : 0


In [None]:
# 載入 tokenizer 與模型
from transformers import LlamaForCausalLM, LlamaTokenizer

model_or_path_name = "C:/Users/PARALELL/Desktop/code/DistilBert BERT/0. models/Llama-2-7b-hf"
model = LlamaForCausalLM.from_pretrained(
    model_or_path_name,
    device_map = {"": "cuda"}, # {"": "cuda:0"}
    torch_dtype=torch.float16,
    use_cache=True
)
tokenizer = LlamaTokenizer.from_pretrained(model_or_path_name)

# 啟用 xFormers 注意力計算（前提是模型和環境支援）
model = model.to("cuda")
model.config.use_cache = True
model.config.use_xformers = True


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
# 定義生成文字的函式
def generate_text(prompt, max_length=1024, temperature=0.7, top_p=0.9):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length = max_length,
            temperature = temperature,
            top_p = top_p,
            do_sample = True,
            eos_token_id = tokenizer.eos_token_id
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
def create_polarity_reversed_sentence(original_sentence):
    """
    要求模型將正向句子翻轉為負向，或負向翻轉為正向。
    Prompt 可指示模型：給定此句子，請以同樣句子結構，但將其情緒極性反轉。
    """
    prompt = (
        "The following sentence is a movie review. Please rewrite the sentence so that its sentiment polarity is reversed. "
        "For example, if it's positive, rewrite it as negative, and if it's negative, rewrite it as positive.\n\n"
        f"Original: {original_sentence}\nRewritten:"
    )
    return generate_text(prompt)

In [None]:
def create_double_negation_sentence(original_sentence):
    """
    要求模型將句子改寫為包含雙重否定的句子。
    Prompt：請將此句子改寫為一個語意保持相同(正向或負向不變)，但包含雙重否定的英文句子。
    """
    prompt = (
        "Rewrite the following movie review sentence to include double negation, without changing its overall sentiment. "
        "Double negation means using two negatives that logically cancel out, but keep the original sentiment context.\n\n"
        f"Original: {original_sentence}\nRewritten:"
    )
    return generate_text(prompt)

In [None]:
def create_noisy_sentence(original_sentence):
    """
    要求模型在句子中加入噪音，如拼字錯誤、口語化字詞、網路用語 (lol, btw)、隨機插入無意義字詞。
    Prompt：請將原句改寫為含有拼寫錯誤與口語化字詞的英文句子，但保持原本的情緒傾向。
    """
    prompt = (
        "Rewrite the following movie review sentence by adding noise such as misspellings, slang, internet abbreviations (like 'lol', 'btw'), "
        "and random filler words, while maintaining its original sentiment.\n\n"
        f"Original: {original_sentence}\nNoisy Version:"
    )
    return generate_text(prompt)

In [4]:
def generate_text(prompt, max_new_tokens=2048):
    """
    使用 Llama 模型生成文本，僅輸出重寫後的句子。

    Args:
        prompt (str): 輸入的提示語。
        max_new_tokens (int): 生成的最大新 token 數量。

    Returns:
        str: 重寫後的句子。
    """
    # Tokenize 輸入
    inputs = tokenizer(prompt, return_tensors='pt').to('cuda')

    # 定義生成參數
    generation_config = {
        "max_new_tokens": max_new_tokens,
        "temperature": 0.7,
        "top_p": 0.95,
        "repetition_penalty": 1.2,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.eos_token_id,  # 避免 pad token 問題
        "no_repeat_ngram_size": 3,  # 避免重複 n-gram
    }

    # 生成回應
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            **generation_config
        )

    # 解碼回應
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # 提取重寫後的句子
    if "Rewritten:" in decoded_output:
        rewritten = decoded_output.split("Rewritten:")[-1].strip()
    else:
        # 若模型未按照預期生成，返回整個回應
        rewritten = decoded_output.strip()

    return rewritten

def create_polarity_reversed_sentence(original_sentence):
    """
    將正向句子翻轉為負向，或負向句子翻轉為正向。
    要求模型僅輸出重寫後的句子，不要重複原始內容。
    """
    prompt = (
        "Below is an original movie review sentence and its sentiment-reversed version.\n\n"
        "Original: I absolutely loved this movie! The acting was superb and the storyline was captivating.\n"
        "Rewritten: I absolutely hated this movie! The acting was terrible and the storyline was boring.\n\n"
        "Original: {original_sentence}\nRewritten:"
    )
    return generate_text(prompt)


def create_double_negation_sentence(original_sentence):
    """
    將句子改寫為包含雙重否定的句子，同時保持原本的正負向情感不變。
    要求模型僅輸出重寫後的句子，不要重複原始內容。
    """
    prompt = (
        "You are a helpful assistant. Rewrite the given movie review sentence to include double negation "
        "while keeping the same sentiment. Output only the rewritten sentence without including the original sentence."
        f"\n\nOriginal: {original_sentence}\nRewritten:"
    )
    return generate_text(prompt)

def create_noisy_sentence(original_sentence):
    """
    將句子改寫為含有拼字錯誤、口語化字詞、網路用語和雜訊的版本，並保持原本情緒。
    要求模型僅輸出重寫後的句子，不要重複原始內容。
    """
    prompt = (
        "You are a helpful assistant. Rewrite the given movie review sentence by adding misspellings, slang, "
        "internet abbreviations (e.g., 'lol', 'btw'), and random filler words while maintaining its original sentiment. "
        "Output only the rewritten sentence without including the original sentence."
        f"\n\nOriginal: {original_sentence}\nRewritten:"
    )
    return generate_text(prompt)


##### 載入資料集

In [None]:
# 載入IMDB資料集
from datasets import load_dataset
# imdb = load_dataset("imdb",split="train[:2000]+train[-2000:]")
# imdb.shape
imdb = load_dataset("imdb")

In [6]:
imdb

NameError: name 'imdb' is not defined

In [5]:
# 取出部分句子作為示範（這裡使用測試集前5筆正負各2筆為例）
test_data = imdb["test"]
sample_reviews = []
pos_count = 0
neg_count = 0

for example in test_data:
    if ((pos_count < 2) and (example["label"] == 1)):
        sample_reviews.append((example["text"], "positive"))
        pos_count += 1
    if ((neg_count < 2) and (example["label"] == 0)):
        sample_reviews.append((example["text"], "negtive"))
        neg_count += 1
    if ((pos_count == 2) and (neg_count == 2)):
        break


NameError: name 'imdb' is not defined

In [None]:
for L in sample_reviews:
    print(L)

In [17]:
import re
# 清理文本函數
def clean_text(text):
    # 去除HTML標籤
    text = re.sub(r'<[^>]+>', '', text)
    return text

In [2]:
# 產生各種類型的挑戰句子
challenging_samples = []

index = 1
for review, sentiment in sample_reviews:
    # 為了簡化，只取前一句或前幾個字元
    # 實務中可根據需要，先對review做適當的分句或摘取 
    print(f"index : {index} ")  
    original_sentence = clean_text(review)
    # if (len(original_sentence) > 200):
    #     original_sentence = original_sentence[:200] # ??
    
    reversed_sentence = create_polarity_reversed_sentence(original_sentence)
    double_neg_sentence = create_double_negation_sentence(original_sentence)
    noisy_sentence = create_noisy_sentence(original_sentence)

    challenging_samples.append({
        "original":original_sentence,
        "sentiment":sentiment,
        "polarity_reversed":reversed_sentence,
        "double_negation":double_neg_sentence,
        "noisy":noisy_sentence,
    })
    index += 1

NameError: name 'sample_reviews' is not defined

In [35]:
for k1 in range(len(challenging_samples)):
    print(f"第{k1+1}個")
    oneDict = challenging_samples[k1]
    print(oneDict)
    break

第1個
{'original': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rub

In [None]:
{
"polarity_reversed" : 
'''I hate sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say "Gene Roddenberry's Earth..." otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.''',

"double_negation": 
'''I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say "Gene Roddenberry's Earth..." otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.''',

"noisy" : 
'''I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say "Gene Roddenberry's Earth..." otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.'''
}

In [12]:
for k1 in range(len(challenging_samples)):
    print(f"第{k1+1}個")
    oneDict = challenging_samples[k1]
    #print(oneDict)
    for key,value in oneDict.items():
        print("key :",key)
        print("value :",value)
    print("="*100)

第1個
key : original
value : I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's ru

In [22]:
def create_challenging_test_data(original_text):
    # 將 original_text 中的雙引號轉譯
    safe_text = original_text.replace('"', '\\"')
    # 使用 \n 代表換行
    safe_text = safe_text.replace('\n', '\\n')

    prompt = f"""You are a helpful assistant. Given the following movie review text, produce a JSON object with:
        - "original_text": the original text (exactly as given)
        - "polarity_reversed": the text with reversed sentiment
        - "double_negation": the text including double negation while keeping the same sentiment
        - "noisy": the text with added misspellings, slang, internet abbreviations, and filler words while keeping the same sentiment

        Do not include any explanations or additional commentary. 
        Do not include the original instructions in your final output.
        Only output the JSON object.

        Original text:
        "{safe_text}"
        """
        #     Output:
        # {{
        # "original_text": "{safe_text}",
        # "polarity_reversed": 
        # """.strip()

    return generate_text(prompt)


In [None]:
# 定義生成文字的函式
def generate_text(prompt, max_length=1024, temperature=0.7, top_p=0.9):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length = max_length,
            temperature = temperature,
            top_p = top_p,
            do_sample = True,
            eos_token_id = tokenizer.eos_token_id
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# 產生各種類型的挑戰句子
challenging_samples = []

index = 1
for review, sentiment in sample_reviews:
    # 為了簡化，只取前一句或前幾個字元
    # 實務中可根據需要，先對review做適當的分句或摘取 
    print(f"index : {index} ")  
    original_sentence = clean_text(review)
    
    # if (len(original_sentence) > 200):
    #     original_sentence = original_sentence[:200] # ??
    
    reversed_sentence = create_polarity_reversed_sentence(original_sentence)
    double_neg_sentence = create_double_negation_sentence(original_sentence)
    noisy_sentence = create_noisy_sentence(original_sentence)

    challenging_samples.append({
        "original":original_sentence,
        "sentiment":sentiment,
        "polarity_reversed":reversed_sentence,
        "double_negation":double_neg_sentence,
        "noisy":noisy_sentence,
    })

    # result = create_challenging_test_data(original_sentence)
    # print(result)

    index += 1
    break

index : 1 


You are a helpful assistant. Given the following movie review text, produce a JSON object with:
        - "original_text": the original text (exactly as given)
        - "polarity_reversed": the text with reversed sentiment
        - "double_negation": the text including double negation while keeping the same sentiment
        - "noisy": the text with added misspellings, slang, internet abbreviations, and filler words while keeping the same sentiment

        Do not include any explanations or additional commentary. 
        Do not include the original instructions in your final output.
        Only output the JSON object.

        Original text:
        "I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background,

In [26]:
import ast

# 定義生成文字的函式
def generate_text(prompt, max_length=1024, temperature=0.7, top_p=0.9):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length = max_length,
            temperature = temperature,
            top_p = top_p,
            do_sample = True,
            eos_token_id = tokenizer.eos_token_id
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

def create_challenging_test_data(original_text):
    # 將 original_text 中的雙引號與換行符號轉譯，以免影響字典字串結構
    # 對 Python 字典來說不必一定要轉譯雙引號成 \\"，只需確保最後生成的格式為有效 Python 字典表示法
    safe_text = original_text.replace("'", "\\'").replace('\n', '\\n')

    prompt = f"""
You are a helpful assistant.
Given the following movie review text, produce a Python dictionary (no explanations) with:
- "original_text": the original text (exactly as given)
- "polarity_reversed": the text with reversed sentiment
- "double_negation": the text with double negation (same sentiment)
- "noisy": the text with added noise (same sentiment)

Only output the dictionary, do not include the original instructions.

Original text:
'{safe_text}'

Output a Python dictionary in this exact format (no extra text):

{{
  "original_text": "...",
  "polarity_reversed": "...",
  "double_negation": "...",
  "noisy": "..."
}}
"""
    # 將 prompt 結尾結構顯示出期望的字典格式，留給模型填入內容。
    return generate_text(prompt)


# 產生各種類型的挑戰句子
challenging_samples = []

index = 1
for review, sentiment in sample_reviews:
    print(f"index : {index}")
    original_sentence = clean_text(review)

    # 呼叫 create_challenging_test_data 取得模型生成的 Python 字典字串
    result = create_challenging_test_data(original_sentence).strip()
    
    # 使用 ast.literal_eval 將字串轉換為 Python 字典
    try:
        result_dict = ast.literal_eval(result)
        # 確認 dict 中有所需的四個欄位
        if all(k in result_dict for k in ["original_text", "polarity_reversed", "double_negation", "noisy"]):
            challenging_samples.append({
                "original_text": result_dict["original_text"],
                "polarity_reversed": result_dict["polarity_reversed"],
                "double_negation": result_dict["double_negation"],
                "noisy": result_dict["noisy"]
            })
            print(challenging_samples[-1])
        else:
            print("The returned dictionary does not contain the required keys.")
    except (ValueError, SyntaxError):
        print("The model did not return a valid Python dictionary:")
        print(result)

    index += 1
    break


index : 1
The model did not return a valid Python dictionary:
You are a helpful assistant.
Given the following movie review text, produce a Python dictionary (no explanations) with:
- "original_text": the original text (exactly as given)
- "polarity_reversed": the text with reversed sentiment
- "double_negation": the text with double negation (same sentiment)
- "noisy": the text with added noise (same sentiment)

Only output the dictionary, do not include the original instructions.

Original text:
'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good