In [1]:
import torch
import subprocess

def detailed_diagnostic():
    print("=== PyTorch GPU诊断 ===")
    
    # 1. PyTorch基本信息
    print(f"PyTorch版本: {torch.__version__}")
    print(f"CUDA是否可用: {torch.cuda.is_available()}")
    print(f"CUDA版本: {getattr(torch.version, 'cuda', 'None')}")
    
    # 2. 构建信息
    print(f"\n=== 构建信息 ===")
    print(f"使用CUDA构建: {torch.backends.cuda.is_built()}")
    print(f"cuDNN可用: {torch.backends.cudnn.is_available()}")
    print(f"cuDNN版本: {torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'N/A'}")
    
    # 3. 检查是否是CPU版本的PyTorch
    print(f"\n=== 包详细信息 ===")
    try:
        import pip
        packages = pip.get_installed_distributions()
        torch_pkg = [p for p in packages if 'torch' in p.key][0]
        print(f"Torch包名称: {torch_pkg}")
        print(f"Torch包位置: {torch_pkg.location}")
    except:
        pass
    
    # 4. 尝试直接与CUDA运行时交互
    print(f"\n=== CUDA运行时测试 ===")
    if hasattr(torch.cuda, 'is_available') and torch.cuda.is_available():
        print(f"GPU数量: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    else:
        print("PyTorch报告CUDA不可用")
        
    # 5. 检查系统环境
    print(f"\n=== 系统检查 ===")
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, shell=True)
        if result.returncode == 0:
            print("nvidia-smi执行成功")
            # 提取关键信息
            lines = result.stdout.split('\n')
            for i, line in enumerate(lines):
                if i < 8:  # 只显示前8行关键信息
                    print(f"  {line}")
        else:
            print("nvidia-smi执行失败")
    except Exception as e:
        print(f"nvidia-smi错误: {e}")

if __name__ == "__main__":
    detailed_diagnostic()

=== PyTorch GPU诊断 ===
PyTorch版本: 2.8.0+cu129
CUDA是否可用: True
CUDA版本: 12.9

=== 构建信息 ===
使用CUDA构建: True
cuDNN可用: True
cuDNN版本: 91002

=== 包详细信息 ===

=== CUDA运行时测试 ===
GPU数量: 1
GPU 0: NVIDIA GeForce GTX 1650

=== 系统检查 ===
nvidia-smi执行成功
  Sun Dec  7 17:55:54 2025       
  +-----------------------------------------------------------------------------------------+
  | NVIDIA-SMI 581.57                 Driver Version: 581.57         CUDA Version: 13.0     |
  +-----------------------------------------+------------------------+----------------------+
  | GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
  | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
  |                                         |                        |               MIG M. |


In [9]:
!wget https://www.csie.ntu.edu.tw/~b10902031/gsm8k_train.jsonl # original dataset for fine-tuning
!wget https://www.csie.ntu.edu.tw/~b10902031/gsm8k_train_self-instruct.jsonl # part of fine-tuning dataset refined by llama-3.2-1b-instruct
!wget https://www.csie.ntu.edu.tw/~b10902031/gsm8k_test_public.jsonl # gsm8k public test dataset
!wget https://www.csie.ntu.edu.tw/~b10902031/gsm8k_test_private.jsonl # gsm8k private test dataset
!wget https://www.csie.ntu.edu.tw/~b10902031/ailuminate_test.csv # ailuminate test dataset (public + private)

'wget' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���
'wget' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���
'wget' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���
'wget' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���
'wget' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���


In [None]:
!pip install -U datasets trl bitsandbytes transformers accelerate peft

In [2]:
from huggingface_hub import login
import os
from dotenv import load_dotenv

load_dotenv()
hf_token = os.getenv('HF_Token')
login(token=hf_token)

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
from transformers import (
    AutoModelForCausalLM, # imports the model for causal language modeling
    AutoTokenizer, # imports the tokenizer for the model
    BitsAndBytesConfig, # imports the configuration for using bitsandbytes
    pipeline # imports the pipeline for text generation
)
from peft import (
    LoraConfig, # imports the configuration for LoRA
    get_peft_model, # imports the function to get the PEFT model
    PeftModel # imports the PEFT model
)
import os
import json
import torch
os.environ["CUDA_VISIBLE_DEVICES"] = '0' # Sets the CUDA device to use
device = torch.device('cuda:0') # Creates a CUDA device object
from datasets import Dataset # Imports the Dataset class from the datasets library
from trl import SFTConfig, SFTTrainer # Imports the SFTConfig and SFTTrainer classes from the trl library
import random
random.seed(42) # Sets the random seed for reproducibility
from tqdm import tqdm # Imports the tqdm library for progress bars
import csv

## LLM Fine-tuning

In [4]:
sft_model_name = 'meta-llama/Llama-3.2-1B-Instruct' # Specifies the name of the pre-trained model to use
sft_bnb_config = BitsAndBytesConfig( # Configuration for using bitsandbytes
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
sft_model = AutoModelForCausalLM.from_pretrained( # Loads the pre-trained model
    pretrained_model_name_or_path=sft_model_name,
    quantization_config=sft_bnb_config,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
sft_tokenizer = AutoTokenizer.from_pretrained( # Loads the tokenizer for the model
    pretrained_model_name_or_path=sft_model_name,
)
sft_tokenizer.model_max_length = 10000
sft_tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # Adds a special token for padding
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    # TODO: Adds dropout
    lora_dropout=0.00,  # lora_dropout = 0 equals no dropout
    bias='none',
    task_type='CAUSAL_LM',
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

peft_model = get_peft_model(sft_model, peft_config).to(dtype=torch.bfloat16)

`torch_dtype` is deprecated! Use `dtype` instead!


In [19]:
def load_jsonlines(file_name: str):
    with open(file_name, 'r', encoding='utf-8') as f:  # 指定编码
        return [json.loads(line) for line in f]

def nshot_chats(nshot_data: list, n: int, question: str, answer: any, mode: str) -> dict: # Function to create n-shot chats
    if mode not in ['train', 'test']:
        raise AssertionError('Undefined Mode!!!')

    chats = []
    # TODO: Use fixed few-shot examples
    for qna in random.sample(nshot_data, n): # Samples n examples from the n-shot data
        chats.append(
            {
                'role': 'user',
                'content': f'Q: {qna["question"]}' # Creates a user message with the question
            }
        )
        chats.append(
            {
                'role': 'assistant',
                'content': f'A: {qna["answer"]}' # Creates an assistant message with the answer
            }
        )

    chats.append(
        {
            'role': 'user',
            'content': f'Q: {question} Let\'s think step by step. At the end, you MUST write the answer as an integer after \'####\'.' # Creates a user message with the question and instructions
        }
    )
    if mode == 'train':
        chats.append(
            {
                'role': 'assistant',
                'content': f'A: {answer}' # Creates an assistant message with the answer
            }
        )

    return chats # Returns the list of chats

In [7]:
gsm8k_train = load_jsonlines('HW8Dataset\gsm8k_train.jsonl') # You can use refined gsm8k_train_self-instruct.jsonl for fine-tuning

formatted_gsm8k = []
TRAIN_N_SHOT = 1 # TODO: Give model more examples
for qna in gsm8k_train: # Iterates over the GSM8K training data
    chats = nshot_chats(nshot_data=gsm8k_train, n=TRAIN_N_SHOT, question=qna['question'], answer=qna['answer'], mode='train') # Creates n-shot chats for the current example
    train_sample = sft_tokenizer.apply_chat_template(chats, tokenize=False) # Applies the chat template to the chats
    train_sample = train_sample[train_sample.index("<|eot_id|>") + len("<|eot_id|>"):] # Remove Cutting Knowledge Date in prompt template
    formatted_gsm8k.append( # Appends the formatted example to the list
        {
            'text': train_sample # Adds the text of the example
        }
    )


formatted_gsm8k = Dataset.from_list(formatted_gsm8k) # Creates a dataset from the list of formatted examples

  gsm8k_train = load_jsonlines('HW8Dataset\gsm8k_train.jsonl') # You can use refined gsm8k_train_self-instruct.jsonl for fine-tuning


In [9]:
gsm8k_train[0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

In [8]:
formatted_gsm8k[0]

{'text': "<|start_header_id|>user<|end_header_id|>\n\nQ: For every 12 cans you recycle, you receive $0.50, and for every 5 kilograms of newspapers, you receive $1.50. If your family collected 144 cans and 20 kilograms of newspapers, how much money would you receive?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nA: There are 144/12 = <<144/12=12>>12 sets of 12 cans that the family collected.\nSo, the family would receive $0.50 x 12 = $<<0.50*12=6>>6 for the cans.\nThere are 20/5 = <<20/5=4>>4 sets of 5 kilograms of newspapers that the family collected.\nSo, the family would receive $1.50 x 4 = $<<1.50*4=6>>6 for the newspapers.\nTherefore, the family would receive a total of $6 + $6 = $<<6+6=12>>12.\n#### 12<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQ: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? Let's think step by step. At the end, you MUST write the an

In [10]:
### Please do not modify this block ###
# Keep the longest 1/3 of `formatted_gsm8k` by letter count
PORTION = 1/3  # change this if needed

def _letters(s):
    s = "" if s is None else (s if isinstance(s, str) else str(s))
    return sum(1 for ch in s if ch.isalpha())

# Choose fields: prefer 'text' if present, else fall back to ('question','answer')
cols = getattr(formatted_gsm8k, "column_names", None) or []
FIELDS = ("text",) if "text" in cols else ("question", "answer")

n = len(formatted_gsm8k)
k = max(1, int(round(n * PORTION)))

# Compute lengths and take top-k indices
lengths = []
for i in range(n):
    ex = formatted_gsm8k[i]  # dict-like
    lengths.append(sum(_letters(ex.get(f, "")) for f in FIELDS))

top_idx = sorted(range(n), key=lambda i: lengths[i], reverse=False)[:k] #modified to shortest 1/3
formatted_gsm8k = formatted_gsm8k.select(top_idx)

print(f"formatted_gsm8k filtered: kept {k}/{n} longest examples using fields={FIELDS}.")

formatted_gsm8k filtered: kept 2491/7473 longest examples using fields=('text',).


In [None]:
# trainer
training_arguments = SFTConfig( # Configuration for the SFT trainer
    seed=1126,
    data_seed=1126,
    output_dir=f"sft",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    num_train_epochs=3,
    logging_strategy="steps",
    logging_steps=0.1,
    save_strategy="steps",
    save_steps=0.1,
    lr_scheduler_type='cosine',
    learning_rate=1e-4, 

    warmup_ratio=0.03,
    weight_decay=0.01,

    bf16=True,
    group_by_length=True,
    dataset_text_field='text',
    report_to='none',
)
trainer = SFTTrainer( # Creates the SFT trainer
    model=peft_model,
    train_dataset=formatted_gsm8k,
    peft_config=peft_config,
    processing_class=sft_tokenizer,
    args=training_arguments,
)
trainer.train() # Starts the training process

Adding EOS to train dataset: 100%|██████████| 2491/2491 [00:00<00:00, 12417.31 examples/s]
Tokenizing train dataset: 100%|██████████| 2491/2491 [00:01<00:00, 1268.68 examples/s]
Truncating train dataset: 100%|██████████| 2491/2491 [00:00<00:00, 235072.03 examples/s]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128256}.
  return fn(*args, **kwargs)


Step,Training Loss
63,0.99
126,0.8873
189,0.8502
252,0.8368
315,0.8301
378,0.8234
441,0.8055
504,0.794
567,0.7858


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=623, training_loss=0.8426518631593757, metrics={'train_runtime': 8203.826, 'train_samples_per_second': 0.304, 'train_steps_per_second': 0.076, 'total_flos': 4257033117769728.0, 'train_loss': 0.8426518631593757, 'entropy': 0.822012939527965, 'num_tokens': 724886.0, 'mean_token_accuracy': 0.7986090659026073, 'epoch': 1.0})

## LLM Inference

In [None]:
generator = pipeline( # Creates a text generation pipeline
    'text-generation',
    model=sft_model,
    tokenizer=sft_tokenizer,
    pad_token_id=sft_tokenizer.eos_token_id,
    max_new_tokens=256, # TODO: Increase max_new_tokens for longer output
    # TODO: Use greedy decoding strategy
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
adapter_path = 'sft/checkpoint-567' # TODO: Evaluate different checkpoints (check the actuall checkpoint step from "檔案")
pipeline.model = PeftModel.from_pretrained( # Loads the adapter checkpoint
    sft_model,
    adapter_path,
    torch_dtype=torch.bfloat16, ##Added for A100/L4
)
pipeline.model.to(dtype=torch.bfloat16, device="cuda")

In [13]:
def get_response(chats: list): # Function to get the response from the model
    gen_text = generator(chats)[0]  # First return sequence
    return gen_text['generated_text'][-1]['content'] # Returns the content of the last generated text

def extract_ans_from_response(answer: str): # Function to extract the answer from the response
    answer = answer.split('####')[-1].strip() # Splits the answer by '####' and takes the last part

    for remove_char in [',', '$', '%', 'g']: # Removes unwanted characters from the answer
        answer = answer.replace(remove_char, '')

    return answer # Returns the extracted answer

In [20]:
gsm8k_predictions = []
TEST_N_SHOT = 1 # TODO: give model more examples

gsm8k_test_public = load_jsonlines('HW8Dataset\gsm8k_test_public.jsonl') # Loads the GSM8K public test data
gsm8k_test_public = gsm8k_test_public[0:100] # We use only 100 of the original 13
gsm8k_total = len(gsm8k_test_public) # Gets the total number of examples in the public test data
gsm8k_progress_bar = tqdm(total=gsm8k_total, desc='GSM8K Public Test Data Evaluation', postfix='Current Accuracy = 0.000') # Creates a progress bar for the public test data evaluation

correct = 0

for i, qna in enumerate(gsm8k_test_public): # Iterates over the public test data

    messages = nshot_chats(nshot_data=gsm8k_train, n=TEST_N_SHOT, question=qna['question'], answer=None, mode='test') # Creates n-shot chats for the current example
    response = get_response(messages) # Gets the response from the model

    pred_ans = extract_ans_from_response(response) # Extracts the predicted answer from the response
    true_ans = extract_ans_from_response(qna["answer"]) # Extracts the true answer from the example
    if pred_ans == true_ans: # Checks if the predicted answer is correct
        correct += 1 # Increments the correct count if the prediction is correct
    gsm8k_predictions.append(pred_ans) # Appends the predicted answer to the list of predictions

    gsm8k_progress_bar.set_postfix_str(f'Current Accuracy = {correct/(i+1):.3f}') # Updates the progress bar with the current accuracy
    gsm8k_progress_bar.update() # Updates the progress bar

gsm8k_progress_bar.close() # Closes the progress bar

print(f'GSM8K Public Test Data Evaluation Complete, Total Accuracy: {correct/gsm8k_total:.3f}') # Prints the total accuracy on the public test data

gsm8k_test_private = load_jsonlines('HW8Dataset\gsm8k_test_private.jsonl') # Loads the GSM8K private test data
gsm8k_test_private = gsm8k_test_private[0:100]
gsm8k_total = len(gsm8k_test_private) # Gets the total number of examples in the private test data
gsm8k_progress_bar = tqdm(total=gsm8k_total, desc='GSM8K Private Test Data Inference') # Creates a progress bar for the private test data evaluation

for i, qna in enumerate(gsm8k_test_private): # Iterates over the private test data

    messages = nshot_chats(nshot_data=gsm8k_train, n=TEST_N_SHOT, question=qna['question'], answer=None, mode='test') # Creates n-shot chats for the current example
    response = get_response(messages) # Gets the response from the model

    pred_ans = extract_ans_from_response(response) # Extracts the predicted answer from the response
    gsm8k_predictions.append(pred_ans) # Appends the predicted answer to the list of predictions

    gsm8k_progress_bar.update() # Updates the progress bar

gsm8k_progress_bar.close() # Closes the progress bar

print(f'GSM8K Private Test Data Inference Complete') # Prints a message indicating that the private test data evaluation is complete

  gsm8k_test_public = load_jsonlines('HW8Dataset\gsm8k_test_public.jsonl') # Loads the GSM8K public test data
  gsm8k_test_private = load_jsonlines('HW8Dataset\gsm8k_test_private.jsonl') # Loads the GSM8K private test data
GSM8K Public Test Data Evaluation:  10%|█         | 10/100 [01:09<10:35,  7.06s/it, Current Accuracy = 0.400]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
GSM8K Public Test Data Evaluation: 100%|██████████| 100/100 [16:32<00:00,  9.93s/it, Current Accuracy = 0.350]


GSM8K Public Test Data Evaluation Complete, Total Accuracy: 0.350


GSM8K Private Test Data Inference: 100%|██████████| 100/100 [16:14<00:00,  9.74s/it]

GSM8K Private Test Data Inference Complete





In [24]:
def load_csv(file_name: str):
    csvfile = open(file_name,'r', encoding='utf-8')
    rows = csv.DictReader(csvfile)
    questions = []
    for row in rows:
        questions.append(row['prompt_text'])
    return questions

# def load_jsonlines(file_name: str):
#     with open(file_name, 'r', encoding='utf-8') as f:  # 指定编码
#         return [json.loads(line) for line in f]


In [None]:
ailuminate_predictions = []

ailuminate_test = load_csv('HW8Dataset\\ailuminate_test.csv') # Loads the AILuminate test data
ailuminate_public = ailuminate_test[0:40]
ailuminate_private = ailuminate_test[120:160]
ailuminate_test = ailuminate_public + ailuminate_private
ailuminate_total = len(ailuminate_test) # Gets the total number of examples in the AILuminate test data
ailuminate_progress_bar = tqdm(total=ailuminate_total, desc='AILuminate Test Data Evaluation') # Creates a progress bar for the AILuminate test data evaluation

for i, question in enumerate(ailuminate_test): # Iterates over the AILuminate test data

    message = [
        {
            'role': 'user',
            'content': question
        }
    ]
    response = get_response(message) # Gets the response from the model
    ailuminate_predictions.append(response) # Appends the response to the list of predictions

    ailuminate_progress_bar.update() # Updates the progress bar
ailuminate_progress_bar.close() # Closes the progress bar

print(f'AIluminate Test Data Evaluation Complete')

AILuminate Test Data Evaluation:  50%|█████     | 40/80 [11:35<12:07, 18.18s/it]