##### 讀取模型

In [1]:
# 檢查 CUDA 版本和 PyTorch 版本
import torch
cuda_version = torch.version.cuda
print("cuda_version :", cuda_version)
print("torch.__version__ :", torch.__version__)

# 設置設備，0 表示第一個 GPU，-1 表示 CPU
device = 0 if torch.cuda.is_available() else -1
print("device :", device)

cuda_version : 11.8
torch.__version__ : 2.4.0+cu118
device : 0


In [2]:
# 載入 tokenizer 與模型
from transformers import LlamaForCausalLM, LlamaTokenizer

model_or_path_name = "C:/Users/PARALELL/Desktop/code/DistilBert BERT/0. models/Llama-2-7b-hf"
model = LlamaForCausalLM.from_pretrained(
    model_or_path_name,
    device_map = {"": "cuda"}, # {"": "cuda:0"}
    torch_dtype=torch.float16,
    use_cache=True
)
tokenizer = LlamaTokenizer.from_pretrained(model_or_path_name)

# 啟用 xFormers 注意力計算（前提是模型和環境支援）
model = model.to("cuda")
model.config.use_cache = True
model.config.use_xformers = True


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

##### V3

In [None]:
# V3
def generate_text(prompt, max_new_tokens=2048):
    """
    使用 Llama 模型生成文本，僅輸出重寫後的句子或摘要。

    Args:
        prompt (str): 輸入的提示語。
        max_new_tokens (int): 生成的最大新 token 數量。

    Returns:
        str: 模型生成的文本。
    """
    # Tokenize 輸入
    inputs = tokenizer(prompt, return_tensors='pt').to('cuda')

    # 定義生成參數
    generation_config = {
        "max_new_tokens": max_new_tokens,
        "temperature": 0.5,  # 調低溫度以提高一致性
        "top_p": 0.95,
        "repetition_penalty": 1.2,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.eos_token_id,  # 避免 pad token 問題
        "no_repeat_ngram_size": 3,  # 避免重複 n-gram
    }

    # 生成回應
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            **generation_config
        )

    # 解碼回應
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return decoded_output,decoded_output.strip()

def summarize_review(original_sentence):
    """
    摘要影評的重點。

    Args:
        original_sentence (str): 原始的影評句子。

    Returns:
        str: 摘要後的句子。
    """
    # prompt = (
    #     f"Original: {original_sentence}\n"
    #     "Summary:"
    # )
    # 构建提示
    prompt = (
        "Please provide a one-sentence summary for the following movie review. "
        "Output the result in JSON format with two fields: 'Original' and 'Summary'. "
        "Do not include any additional information.\n\n"
        """Original: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say "Gene Roddenberry's Earth..." otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again..\n"""
        """Summary: This science fiction series has bad special effects and stiff performances..\n\n"""
        f"Original: {original_sentence}\n"
        "Summary:"
    )
    summary = generate_text(prompt, max_new_tokens=256)
    return summary

# def create_polarity_reversed_sentence(original_sentence):
#     """
#     將正向句子翻轉為負向，或負向句子翻轉為正向。
#     要求模型僅輸出重寫後的句子，不要重複原始內容。
#     """
#     prompt = (
#         "Below is an original movie review sentence and its sentiment-reversed version.\n\n"
#         "Original: I absolutely loved this movie! The acting was superb and the storyline was captivating.\n"
#         "Rewritten: I absolutely hated this movie! The acting was terrible and the storyline was boring.\n\n"
#         "Original: {original_sentence}\nRewritten:"
#     )
#     prompt = prompt.format(original_sentence=original_sentence)
#     return generate_text(prompt)


# def create_double_negation_sentence(original_sentence):
#     """
#     將句子改寫為包含雙重否定的句子，同時保持原本的正負向情感不變。
#     要求模型僅輸出重寫後的句子，不要重複原始內容。
#     """
#     prompt = (
#         "You are a helpful assistant. Rewrite the given movie review sentence to include double negation "
#         "while keeping the same sentiment. Output only the rewritten sentence without including the original sentence."
#         f"\n\nOriginal: {original_sentence}\nRewritten:"
#     )
#     return generate_text(prompt)

# def create_noisy_sentence(original_sentence):
#     """
#     將句子改寫為含有拼字錯誤、口語化字詞、網路用語和雜訊的版本，並保持原本情緒。
#     要求模型僅輸出重寫後的句子，不要重複原始內容。
#     """
#     prompt = (
#         "You are a helpful assistant. Rewrite the given movie review sentence by adding misspellings, slang, "
#         "internet abbreviations (e.g., 'lol', 'btw'), and random filler words while maintaining its original sentiment. "
#         "Output only the rewritten sentence without including the original sentence."
#         f"\n\nOriginal: {original_sentence}\nRewritten:"
#     )
#     return generate_text(prompt)



In [None]:
# V3
def create_polarity_reversed_sentence(summary_sentence):
    """
    將摘要後的句子情感極性反轉。

    Args:
        summary_sentence (str): 摘要後的句子。

    Returns:
        str: 反轉情感極性的句子。
    """
    prompt = (
        "Below is an original movie review summary and its sentiment-reversed version.\n\n"
        "Original: I loved this movie for its superb acting and captivating storyline.\n"
        "Rewritten: I hated this movie for its terrible acting and boring storyline.\n\n"
        f"Original: {summary_sentence}\nRewritten:"
    )
    return generate_text(prompt)

def create_double_negation_sentence(summary_sentence):
    """
    將摘要後的句子改寫為包含雙重否定的句子，保持情感不變。

    Args:
        summary_sentence (str): 摘要後的句子。

    Returns:
        str: 包含雙重否定的句子。
    """
    prompt = (
        "Below is an original movie review summary and its double negation version.\n\n"
        "Original: I loved this movie for its superb acting and captivating storyline.\n"
        "Rewritten: I don't not love this movie for its superb acting and captivating storyline.\n\n"
        f"Original: {summary_sentence}\nRewritten:"
    )
    return generate_text(prompt)

def create_noisy_sentence(summary_sentence):
    """
    將摘要後的句子改寫為含有拼字錯誤、口語化字詞、網路用語和雜訊的版本，保持情緒不變。

    Args:
        summary_sentence (str): 摘要後的句子。

    Returns:
        str: 含有雜訊的句子。
    """
    prompt = (
        "Below is an original movie review summary and its noisy version with misspellings, slang, internet abbreviations, and filler words.\n\n"
        "Original: I loved this movie for its superb acting and captivating storyline.\n"
        "Rewritten: I luvd this mvie 4 its supbr acting & captiv8ing storyline lol.\n\n"
        f"Original: {summary_sentence}\nRewritten:"
    )
    return generate_text(prompt)

def process_movie_review(original_sentence):
    """
    讀取原始影評，生成摘要，然後創建三個重寫版本。

    Args:
        original_sentence (str): 原始的影評句子。

    Returns:
        dict: 包含原始影評、摘要、以及三個重寫版本的字典。
    """
    # 生成摘要
    summary = summarize_review(original_sentence)
    print("Summary:", summary)

    # 創建重寫版本
    polarity_reversed = create_polarity_reversed_sentence(summary)
    double_negation = create_double_negation_sentence(summary)
    noisy = create_noisy_sentence(summary)

    return {
        'original': original_sentence,
        'summary': summary,
        'polarity_reversed': polarity_reversed,
        'double_negation': double_negation,
        'noisy': noisy
    }



##### 載入資料集

In [1]:
# 載入IMDB資料集
from datasets import load_dataset
# imdb = load_dataset("imdb",split="train[:2000]+train[-2000:]")
# imdb.shape
imdb = load_dataset("imdb")
imdb

In [2]:
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
# 取出部分句子作為示範（這裡使用測試集前5筆正負各2筆為例）
test_data = imdb["test"]
sample_reviews = []
pos_count = 0
neg_count = 0

for example in test_data:
    if ((pos_count < 2) and (example["label"] == 1)):
        sample_reviews.append((example["text"], "positive"))
        pos_count += 1
    if ((neg_count < 2) and (example["label"] == 0)):
        sample_reviews.append((example["text"], "negtive"))
        neg_count += 1
    if ((pos_count == 2) and (neg_count == 2)):
        break


In [None]:
import re
# 清理文本函數
def clean_text(text):
    # 去除HTML標籤
    text = re.sub(r'<[^>]+>', '', text)
    return text

In [None]:
index = 1

for review, sentiment in sample_reviews:
    print(f"index : {index} ")  
    # 為了簡化，只取前一句或前幾個字元
    # 實務中可根據需要，先對review做適當的分句或摘取 
    original_sentence = clean_text(review)
    #print("original_sentence :",review)
    raw_output,summarize_result = summarize_review(original_sentence)
    print("summarize_result :",summarize_result)
    print("-"*200)
    # 查找最后一个“Summary:”的位置
    summary_start = summarize_result.rfind("Summary:") + len("Summary:")
    summary = summarize_result[summary_start:].strip()
    # 打印结果
    print(f"Original: {original_sentence}\nSummary: {summary}")
    print("="*200)
    index += 1


In [None]:
# 產生各種類型的挑戰句子
challenging_samples = []

index = 1
for review, sentiment in sample_reviews:
    # 為了簡化，只取前一句或前幾個字元
    # 實務中可根據需要，先對review做適當的分句或摘取 
    print(f"index : {index} ")  
    original_sentence = clean_text(review)
    # if (len(original_sentence) > 200):
    #     original_sentence = original_sentence[:200] # ??
    
    # 創建重寫版本
    polarity_reversed = create_polarity_reversed_sentence(summary)
    double_negation = create_double_negation_sentence(summary)
    noisy = create_noisy_sentence(summary)

    challenging_samples.append({
        "original":original_sentence,
        "sentiment":sentiment,
        "polarity_reversed":reversed_sentence,
        "double_negation":double_neg_sentence,
        "noisy":noisy_sentence,
    })
    index += 1
    # result = process_movie_review(original_sentence)
    # print(result)
    break

In [None]:
for key,value in result.items():
    print("key :",key)
    print("value :",value)

In [None]:
for k1 in range(len(challenging_samples)):
    print(f"第{k1+1}個")
    oneDict = challenging_samples[k1]
    #print(oneDict)
    for key,value in oneDict.items():
        print("key :",key)
        print("value :",value)
    print("="*100)

##### V4

In [None]:
# 檢查 CUDA 版本和 PyTorch 版本
import torch
cuda_version = torch.version.cuda
print("cuda_version :", cuda_version)
print("torch.__version__ :", torch.__version__)

# 設置設備，0 表示第一個 GPU，-1 表示 CPU
device = 0 if torch.cuda.is_available() else -1
print("device :", device)

# =================================================================================================================
# V4
import json

def generate_text(original_sentence):
    # 构建提示
    prompt = (
        "Please provide summary for the following movie review. "
        # "Output the result in JSON format with two fields: 'Original' and 'Summary'. "
        "Do not include any additional information.\n\n"
        """Original: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say "Gene Roddenberry's Earth..." otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again..\n"""
        """Summary: This science fiction series has bad special effects and stiff performances..\n\n"""
        f"Original: {original_sentence}\n"
        "Summary:"
    )

    # 编码提示
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # 移动到与模型相同的设备

    # 生成输出
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,          # 控制生成摘要的长度
        # do_sample=False,             # 使用贪婪搜索，确保一致性
        eos_token_id=tokenizer.eos_token_id,  # 确保在结束标记处停止
        pad_token_id=tokenizer.eos_token_id   # 避免填充问题
    )

    # 解码生成的文本
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("generated_text :\n",generated_text)
    
    # 查找最后一个“Summary:”的位置
    summary_start = generated_text.rfind("Summary:") + len("Summary:")
    summary = summarize_result[summary_start:].strip()
    print("summary :\n",summary)

    # # 提取 JSON 部分
    # json_start = generated_text.find("{")
    # json_end = generated_text.rfind("}") + 1
    # json_str = generated_text[json_start:json_end]

    # # 解析 JSON
    # try:
    #     summary_data = json.loads(json_str)
    #     original = summary_data.get("Original", "")
    #     summary = summary_data.get("Summary", "")
    #     print(f"Original: {original}\nSummary: {summary}")

    # except json.JSONDecodeError:
    #     print("Failed to parse JSON.")        

cuda_version : 11.8
torch.__version__ : 2.4.0+cu118
device : 0


In [23]:
# =================================================================================================================
# 載入IMDB資料集
from datasets import load_dataset
# imdb = load_dataset("imdb",split="train[:2000]+train[-2000:]")
# imdb.shape
imdb = load_dataset("imdb")

# ===============================================================================================================
# 取出部分句子作為示範（這裡使用測試集前5筆正負各2筆為例）
test_data = imdb["test"]
sample_reviews = []
pos_count = 0
neg_count = 0

# ===============================================================================================================
for example in test_data:
    if ((pos_count < 2) and (example["label"] == 1)):
        sample_reviews.append((example["text"], "positive"))
        pos_count += 1
    if ((neg_count < 2) and (example["label"] == 0)):
        sample_reviews.append((example["text"], "negtive"))
        neg_count += 1
    if ((pos_count == 2) and (neg_count == 2)):
        break


In [28]:
for runtime in range(1,11):
    print(runtime)
    print("="*200)
    original_sentence = sample_reviews[1]
    #print("original_sentence :\n",original_sentence)    
    generate_text(original_sentence)
    break
    

1
generated_text : Please provide summary for the following movie review. Do not include any additional information.

Original: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actio

NameError: name 'summarize_result' is not defined

In [None]:
# ===============================================================================================================
import re
# 清理文本函數
def clean_text(text):
    # 去除HTML標籤
    text = re.sub(r'<[^>]+>', '', text)
    return text

# ===============================================================================================================
# 產生各種類型的挑戰句子
challenging_samples = []

index = 1
for review, sentiment in sample_reviews:
    # 為了簡化，只取前一句或前幾個字元
    # 實務中可根據需要，先對review做適當的分句或摘取 
    print(f"index : {index} ")  
    #original_sentence = clean_text(review)
    
    generate_text(review)
    print("="*200)
    # print("original :",original)
    # print("summary :",summary)
    index += 1
    break
    
    # 創建重寫版本
    # polarity_reversed = create_polarity_reversed_sentence(summary)
    # double_negation = create_double_negation_sentence(summary)
    # noisy = create_noisy_sentence(summary)

    # challenging_samples.append({
    #     "original":original_sentence,
    #     "sentiment":sentiment,
    #     "polarity_reversed":reversed_sentence,
    #     "double_negation":double_neg_sentence,
    #     "noisy":noisy_sentence,
    # })
    # index += 1
    # # result = process_movie_review(original_sentence)
    # # print(result)
    # break

##### V4-2

In [17]:
# 檢查 CUDA 版本和 PyTorch 版本
import torch
cuda_version = torch.version.cuda
print("cuda_version :", cuda_version)
print("torch.__version__ :", torch.__version__)

# 設置設備，0 表示第一個 GPU，-1 表示 CPU
device = 0 if torch.cuda.is_available() else -1
print("device :", device)

# =================================================================================================================

def generate_text(original_sentence):
    # 提供摘要示例的 Prompt
    prompt = (
        "Please summarize the following movie review in one concise sentence.\n"
        """Original: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say "Gene Roddenberry's Earth..." otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again..\n"""
        """Summary: This science fiction series has bad special effects and stiff performances..\n\n"""
        f"Original: {original_sentence}\n"
        "Summary:"
    )
    # 编码提示
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # 移动到与模型相同的设备

    # 生成输出
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,          # 控制生成摘要的长度
        # do_sample=False,             # 使用贪婪搜索，确保一致性
        eos_token_id=tokenizer.eos_token_id,  # 确保在结束标记处停止
        pad_token_id=tokenizer.eos_token_id   # 避免填充问题
    )

    # 解码生成的文本
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(generated_text)      

cuda_version : 11.8
torch.__version__ : 2.4.0+cu118
device : 0


In [6]:
# =================================================================================================================
# 載入IMDB資料集
from datasets import load_dataset
# imdb = load_dataset("imdb",split="train[:2000]+train[-2000:]")
# imdb.shape
imdb = load_dataset("imdb")

# ===============================================================================================================
# 取出部分句子作為示範（這裡使用測試集前5筆正負各2筆為例）
test_data = imdb["test"]
sample_reviews = []
pos_count = 0
neg_count = 0

# ===============================================================================================================
for example in test_data:
    if ((pos_count < 2) and (example["label"] == 1)):
        sample_reviews.append((example["text"], "positive"))
        pos_count += 1
    if ((neg_count < 2) and (example["label"] == 0)):
        sample_reviews.append((example["text"], "negtive"))
        neg_count += 1
    if ((pos_count == 2) and (neg_count == 2)):
        break



In [19]:
original_sentence = sample_reviews[1]
print("original_sentence :\n",original_sentence)
print("="*200)
generate_text(original_sentence)

original_sentence :
 ("Worth the entertainment value of a rental, especially if you like action movies. This one features the usual car chases, fights with the great Van Damme kick style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. All of this is entertaining and competently handled but there is nothing that really blows you away if you've seen your share before.<br /><br />The plot is made interesting by the inclusion of a rabbit, which is clever but hardly profound. Many of the characters are heavily stereotyped -- the angry veterans, the terrified illegal aliens, the crooked cops, the indifferent feds, the bitchy tough lady station head, the crooked politician, the fat federale who looks like he was typecast as the Mexican in a Hollywood movie from the 1940s. All passably acted but again nothing special.<br /><br />I thought the main villains were pretty well done and fairly well acted. By the end of the movie you certainly knew who the good guys

##### V4-3

In [3]:
def generate_text(prompt, max_new_tokens=2048):
    """
    使用 Llama 模型生成文本，僅輸出重寫後的句子或摘要。

    Args:
        prompt (str): 輸入的提示語。
        max_new_tokens (int): 生成的最大新 token 數量。

    Returns:
        str: 模型生成的文本。
    """
    # Tokenize 輸入
    inputs = tokenizer(prompt, return_tensors='pt').to('cuda')

    # 定義生成參數
    generation_config = {
        "max_new_tokens": max_new_tokens,
        "temperature": 0.5,  # 調低溫度以提高一致性
        "top_p": 0.95,
        "repetition_penalty": 1.2,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.eos_token_id,  # 避免 pad token 問題
        "no_repeat_ngram_size": 3,  # 避免重複 n-gram
    }

    # 生成回應
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            **generation_config
        )

    # 解碼回應
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return decoded_output,decoded_output.strip()

def summarize_review(original_sentence):
    """
    摘要影評的重點。

    Args:
        original_sentence (str): 原始的影評句子。

    Returns:
        str: 摘要後的句子。
    """
    # prompt = (
    #     f"Original: {original_sentence}\n"
    #     "Summary:"
    # )
    # 构建提示
    prompt = (
        "Please provide a one-sentence summary for the following movie review. "
        "Output the result in JSON format with two fields: 'Original' and 'Summary'. "
        "Do not include any additional information.\n\n"
        """Original: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say "Gene Roddenberry's Earth..." otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again..\n"""
        """Summary: This science fiction series has bad special effects and stiff performances..\n\n"""
        f"Original: {original_sentence}\n"
        "Summary:"
    )
    summary = generate_text(prompt, max_new_tokens=256)
    return summary

In [5]:
# =================================================================================================================
# 載入IMDB資料集
from datasets import load_dataset
# imdb = load_dataset("imdb",split="train[:2000]+train[-2000:]")
# imdb.shape
imdb = load_dataset("imdb")

# ===============================================================================================================
# 取出部分句子作為示範（這裡使用測試集前5筆正負各2筆為例）
test_data = imdb["test"]
sample_reviews = []
pos_count = 0
neg_count = 0

# ===============================================================================================================
for example in test_data:
    if ((pos_count < 2) and (example["label"] == 1)):
        sample_reviews.append((example["text"], "positive"))
        pos_count += 1
    if ((neg_count < 2) and (example["label"] == 0)):
        sample_reviews.append((example["text"], "negtive"))
        neg_count += 1
    if ((pos_count == 2) and (neg_count == 2)):
        break

In [6]:
print(sample_reviews[1])

("Worth the entertainment value of a rental, especially if you like action movies. This one features the usual car chases, fights with the great Van Damme kick style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. All of this is entertaining and competently handled but there is nothing that really blows you away if you've seen your share before.<br /><br />The plot is made interesting by the inclusion of a rabbit, which is clever but hardly profound. Many of the characters are heavily stereotyped -- the angry veterans, the terrified illegal aliens, the crooked cops, the indifferent feds, the bitchy tough lady station head, the crooked politician, the fat federale who looks like he was typecast as the Mexican in a Hollywood movie from the 1940s. All passably acted but again nothing special.<br /><br />I thought the main villains were pretty well done and fairly well acted. By the end of the movie you certainly knew who the good guys were and weren't. Th

In [None]:
original_sentence = sample_reviews[1]

raw_output,summarize_result = summarize_review(original_sentence)
print("summarize_result :",summarize_result)
# 查找最后一个“Summary:”的位置
summary_start = summarize_result.rfind("Summary:") + len("Summary:")
summary = summarize_result[summary_start:].strip()
# 打印结果
print(f"Original: {original_sentence}\nSummary: {summary}")