In [1]:
import os

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import torch
import torch.nn.functional as F
from torch.nn.parameter import Parameter
import torch_geometric
from torch_geometric.nn import Node2Vec
# from torch_geometric_temporal.dataset import METRLADatasetLoader
from torch_geometric_temporal.signal import temporal_signal_split
import tqdm.notebook
import transformers
from transformers import AutoTokenizer, AutoModel
# import ollama
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def tokenize_llm_inputs(tokenizer, llm_name:str, input_text:str):
    match llm_name:
        case 'llama31':
            input_text = '<|start_header_id|>user<|end_header_id|>\n' + input_text + ' <|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>'
            # Encode input text
            inputs = tokenizer(input_text, return_tensors="pt")
            inputs = {key: value.to(device) for key, value in inputs.items()}
        case 'qwen3':
            messages = [{"role": "user", "content": input_text}]
            text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True,
                enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
            )
            inputs = tokenizer([text], return_tensors="pt").to(model.device)
        case _:
            raise ValueError('Unknown LLM')
    return inputs
    

def decode_llm_outputs(tokenizer, llm_name:str, output_sequences:list, input_len:int=0):
    """Decode the first generated sequence into text.

    The first ``input_len`` token ids of ``output_sequences[0]`` are dropped
    before decoding (``0`` keeps everything).

    Returns
    -------
    ``'llama31'``: the decoded string.
    ``'qwen3'``: a ``(thinking_content, content)`` tuple, split right after the
    last occurrence of token id 151668 (``</think>``); when that id is absent
    the thinking part is decoded from an empty list.

    Raises
    ------
    ValueError
        If ``llm_name`` is neither ``'llama31'`` nor ``'qwen3'``.
    """
    generated_ids = output_sequences[0][input_len:].tolist()

    if llm_name == 'llama31':
        return tokenizer.decode(generated_ids, skip_special_tokens=True)

    if llm_name == 'qwen3':
        # Scan backwards for the last 151668 (</think>); split_at is the position just past it
        split_at = 0
        for position in range(len(generated_ids), 0, -1):
            if generated_ids[position - 1] == 151668:
                split_at = position
                break
        reasoning = tokenizer.decode(generated_ids[:split_at], skip_special_tokens=True).strip("\n")
        answer = tokenizer.decode(generated_ids[split_at:], skip_special_tokens=True).strip("\n")
        return (reasoning, answer)

    raise ValueError('Unknown LLM')

# BART

In [9]:
# Root folder of the BART dataset: the cells below read their inputs from it
# and write the generated descriptions / hidden states beneath it.
RAW_DATASET_DIR = 'bart'

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# BART descriptions generated with Llama-3.1-8B-Instruct.
# For every station, every hour of the day and every day of the week a prompt is sent to
# the model. The decoded text is saved to a zipped CSV (one per group) and the hidden
# states of the last generation step are gathered into one compressed .npz archive.
LLM_NAME = 'llama31-8b'
LLM_NAME_ = '_'.join(LLM_NAME.split('-'))

# Create the output folder before generation starts, so a missing folder cannot abort
# the cell only after the (slow) generation loop has finished.
output_dir = f'{RAW_DATASET_DIR}/description-{LLM_NAME}'
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer and the model
# NOTE(review): tokenizer and model are loaded from two different folders
# ("Llama3.1-8B-Instruct" vs "Llama3.1-8B-Instruct-hg") - confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained("Llama3.1-8B-Instruct", model_max_length=512)
model = AutoModelForCausalLM.from_pretrained("Llama3.1-8B-Instruct-hg", device_map="auto")
# # Convert the model to half precision (FP16) if using a supported device
# model.half()  # This converts the model to FP16

station_description_llm, station_hidden_states_llm = [], []
time_description_llm, time_hidden_states_llm = [], []
day_description_llm, day_hidden_states_llm = [], []

# NOTE(review): 'Rapit' in the prompts below is presumably a typo for 'Rapid'. It is left
# unchanged so the prompts stay identical to the ones used for already generated outputs.

## Station information
station_info = pd.read_excel(f'{RAW_DATASET_DIR}/station-names.xls', index_col=0)
for station_code, station_name in zip(station_info.index, station_info['Station Name']):
    # Prompt for this station
    input_text = f'Please provide details about the location, train operations, ridership demand, and landmarks nearby about the {station_name} Station ' +\
                 'of the San Francisco Bay Area Rapit Transit System'
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=2048, return_dict_in_generate=True,
                                 output_hidden_states=True)
    # NOTE(review): no input_len is passed, so the decoded text presumably still contains
    # the prompt tokens (the paraphrase cells strip them) - confirm this is intended.
    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'])
    # hidden_states[-1] belongs to the last generated token; stacked shape is [33, 4096]:
    # 33 hidden states of llama3.1-8b, each a vector of dimension 4096
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    station_description_llm.append([station_code, station_name, generated_text])
    station_hidden_states_llm.append(hidden_states.cpu().numpy())
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
station_description_llm = DataFrame(station_description_llm, columns=['Station Code', 'Station Name', 'LLM Description'])
# Fixed: the file name used to end in '{LLM_NAME_}csv.zip' (missing dot before 'csv')
station_description_llm.to_csv(f'{output_dir}/station_description_{LLM_NAME_}.csv.zip', compression='zip')
station_hidden_states_llm = np.array(station_hidden_states_llm)

## Hour-of-day information
for time in range(24):
    # Prompt for this one-hour window
    input_text = f'San Francisco Bay Area Rapit Transit System during {time:02d}:00-{time+1:02d}:00, service and ridership '
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=2048, return_dict_in_generate=True,
                                 output_hidden_states=True)
    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'])
    # Same layout as for the stations: [33, 4096] for the last generated token
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([time, generated_text])
    time_hidden_states_llm.append(hidden_states.cpu().numpy())
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Description'])
time_description_llm.to_csv(f'{output_dir}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)

## Day-of-week information
for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
    # Prompt for this weekday
    input_text = f'San Francisco Bay Area Rapit Transit System on a typical {day}, service and ridership '
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=2048, return_dict_in_generate=True,
                                 output_hidden_states=True)
    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'])
    # Same layout as for the stations: [33, 4096] for the last generated token
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, generated_text])
    day_hidden_states_llm.append(hidden_states.cpu().numpy())
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Description'])
day_description_llm.to_csv(f'{output_dir}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# One archive with the three hidden-state arrays
np.savez_compressed(f'{output_dir}/hidden_states_{LLM_NAME_}.npz', station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# BART descriptions generated with Qwen3-8B (thinking mode).
# For every station, hour of the day and day of the week the model's reasoning and its
# final answer are saved to a zipped CSV, and the hidden states of the last generation
# step are gathered into one compressed .npz archive.
LLM_NAME = 'qwen3-8b'
LLM_NAME_ = '_'.join(LLM_NAME.split('-'))

# Create the output folder before generation starts, so a missing folder cannot abort
# the cell only after the (slow) generation loop has finished.
output_dir = f'{RAW_DATASET_DIR}/description-{LLM_NAME}'
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", model_max_length=1024)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-8B", torch_dtype="auto", device_map="auto")

station_description_llm, station_hidden_states_llm = [], []
time_description_llm, time_hidden_states_llm = [], []
day_description_llm, day_hidden_states_llm = [], []

# NOTE(review): 'Rapit' in the prompts below is presumably a typo for 'Rapid'. It is left
# unchanged so the prompts stay identical to the ones used for already generated outputs.
# NOTE(review): generation is called with sampling parameters and no seed is set, so the
# outputs are presumably not reproducible between runs.

## Station information
station_info = pd.read_excel(f'{RAW_DATASET_DIR}/station-names.xls', index_col=0)
# total= lets the progress bar show completion (zip() has no length)
for station_code, station_name in tqdm.notebook.tqdm(zip(station_info.index, station_info['Station Name']), total=len(station_info)):
    # Prompt for this station
    input_text = f'Please provide details about the location, train operations, ridership demand, and landmarks nearby about the {station_name} Station ' +\
                 'of the San Francisco Bay Area Rapit Transit System.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    # Strip the prompt tokens, then split reasoning from the final answer
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # hidden_states[-1] belongs to the last generated token; stacked shape is
    # (number of hidden states, hidden size) - presumably [37, 4096] for Qwen3-8B
    # (the original comment quoted the Qwen3-14B shape [41, 5120])
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    station_description_llm.append([station_code, station_name, thinking_content, content])
    # .float(): cast to float32 before the NumPy conversion
    station_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
station_description_llm = DataFrame(station_description_llm, columns=['Station Code', 'Station Name', 'LLM Thinking', 'LLM Description'])
station_description_llm.to_csv(f'{output_dir}/station_description_{LLM_NAME_}.csv.zip', compression='zip')
station_hidden_states_llm = np.array(station_hidden_states_llm)

## Hour-of-day information
for time in range(24):
    # Prompt for this one-hour window
    input_text = f'San Francisco Bay Area Rapit Transit System during {time:02d}:00-{time+1:02d}:00, service and ridership '
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Same layout as for the stations (last generated token, one row per hidden state)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([time, thinking_content, content])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Thinking', 'LLM Description'])
time_description_llm.to_csv(f'{output_dir}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)

## Day-of-week information
for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
    # Prompt for this weekday
    input_text = f'San Francisco Bay Area Rapit Transit System on a typical {day}, service and ridership.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Same layout as for the stations (last generated token, one row per hidden state)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, thinking_content, content])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Thinking', 'LLM Description'])
day_description_llm.to_csv(f'{output_dir}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# One archive with the three hidden-state arrays
np.savez_compressed(f'{output_dir}/hidden_states_{LLM_NAME_}.npz', station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

[2025-06-11 14:49:40,923] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/xcc/miniconda3/envs/peng_c/compiler_compat/ld: cannot find -laio: 没有那个文件或目录
collect2: error: ld returned 1 exit status
/home/xcc/miniconda3/envs/peng_c/compiler_compat/ld: cannot find -lcufile: 没有那个文件或目录
collect2: error: ld returned 1 exit status


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# BART descriptions generated with Qwen3-14B (thinking mode).
# For every station, hour of the day and day of the week the model's reasoning and its
# final answer are saved to a zipped CSV, and the hidden states of the last generation
# step are gathered into one compressed .npz archive. The paraphrase cells further down
# read the CSV files written here.
LLM_NAME = 'qwen3-14b'
LLM_NAME_ = '_'.join(LLM_NAME.split('-'))

# Create the output folder before generation starts, so a missing folder cannot abort
# the cell only after the (slow) generation loop has finished.
output_dir = f'{RAW_DATASET_DIR}/description-{LLM_NAME}'
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-14B", model_max_length=1024)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-14B", torch_dtype="auto", device_map="auto")

station_description_llm, station_hidden_states_llm = [], []
time_description_llm, time_hidden_states_llm = [], []
day_description_llm, day_hidden_states_llm = [], []

# NOTE(review): 'Rapit' in the prompts below is presumably a typo for 'Rapid'. It is left
# unchanged so the prompts stay identical to the ones used for already generated outputs.
# NOTE(review): generation is called with sampling parameters and no seed is set, so the
# outputs are presumably not reproducible between runs.

## Station information
station_info = pd.read_excel(f'{RAW_DATASET_DIR}/station-names.xls', index_col=0)
# total= lets the progress bar show completion (zip() has no length)
for station_code, station_name in tqdm.notebook.tqdm(zip(station_info.index, station_info['Station Name']), total=len(station_info)):
    # Prompt for this station
    input_text = f'Please provide details about the location, train operations, ridership demand, and landmarks nearby about the {station_name} Station ' +\
                 'of the San Francisco Bay Area Rapit Transit System.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    # Strip the prompt tokens, then split reasoning from the final answer
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # hidden_states[-1] belongs to the last generated token; stacked shape is [41, 5120]:
    # 41 hidden states of Qwen3-14b, each a vector of dimension 5120
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    station_description_llm.append([station_code, station_name, thinking_content, content])
    # .float(): cast to float32 before the NumPy conversion
    station_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
station_description_llm = DataFrame(station_description_llm, columns=['Station Code', 'Station Name', 'LLM Thinking', 'LLM Description'])
station_description_llm.to_csv(f'{output_dir}/station_description_{LLM_NAME_}.csv.zip', compression='zip')
station_hidden_states_llm = np.array(station_hidden_states_llm)

## Hour-of-day information
for time in range(24):
    # Prompt for this one-hour window
    input_text = f'San Francisco Bay Area Rapit Transit System during {time:02d}:00-{time+1:02d}:00, service and ridership '
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Same layout as for the stations: [41, 5120] for the last generated token
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([time, thinking_content, content])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Thinking', 'LLM Description'])
time_description_llm.to_csv(f'{output_dir}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)

## Day-of-week information
for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
    # Prompt for this weekday
    input_text = f'San Francisco Bay Area Rapit Transit System on a typical {day}, service and ridership.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Same layout as for the stations: [41, 5120] for the last generated token
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, thinking_content, content])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Thinking', 'LLM Description'])
day_description_llm.to_csv(f'{output_dir}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# One archive with the three hidden-state arrays
np.savez_compressed(f'{output_dir}/hidden_states_{LLM_NAME_}.npz', station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

0it [00:00, ?it/s]

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# BART paraphrases: Llama-3.1-8B-Instruct rewrites the descriptions that the "prime"
# model (Qwen3-14B) produced. The prime model's CSV files are read back, each
# 'LLM Description' is sent to Llama with a paraphrase instruction, and the paraphrase
# plus the hidden states of the last generation step are saved.
PRIME_LLM_NAME = 'qwen3-14b'
LLM_NAME = 'llama31-8b'
PRIME_LLM_NAME_ = '_'.join(PRIME_LLM_NAME.split('-'))
# Output file names carry both model names, prime model first
LLM_NAME_ = PRIME_LLM_NAME_ + '_' + '_'.join(LLM_NAME.split('-'))

# Create the output folder before generation starts, so a missing folder cannot abort
# the cell only after the (slow) generation loop has finished.
output_dir = f'{RAW_DATASET_DIR}/description-{LLM_NAME}'
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer and the model
# NOTE(review): tokenizer and model are loaded from two different folders
# ("Llama3.1-8B-Instruct" vs "Llama3.1-8B-Instruct-hg") - confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained("Llama3.1-8B-Instruct", model_max_length=2048)
model = AutoModelForCausalLM.from_pretrained("Llama3.1-8B-Instruct-hg", device_map="auto")
# # Convert the model to half precision (FP16) if using a supported device
# model.half()  # This converts the model to FP16

station_description_llm, station_hidden_states_llm = [], []
time_description_llm, time_hidden_states_llm = [], []
day_description_llm, day_hidden_states_llm = [], []

# Descriptions written earlier by the prime model
station_info = pd.read_csv(f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/station_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)
time_info = pd.read_csv(f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/time_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)
day_info = pd.read_csv(f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/day_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)

# NOTE(review): 'contend' in the prompts below is presumably a typo for 'content'. It is
# left unchanged so the prompts stay identical to the ones used for existing outputs.

## Station information
# total= lets the progress bar show completion (zip() has no length)
for station_id, prime_llm_output in tqdm.notebook.tqdm(zip(station_info['Station Name'], station_info['LLM Description']), total=len(station_info)):
    # Paraphrase instruction followed by the prime model's description
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=4096, return_dict_in_generate=True,
                                 output_hidden_states=True)

    # input_len strips the prompt tokens so only newly generated text is decoded
    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # hidden_states[-1] belongs to the last generated token; stacked shape is
    # (number of hidden states, hidden size) - presumably [33, 4096] for llama3.1-8b
    # (the original comment quoted the Qwen3-14B shape [41, 5120])
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    station_description_llm.append([station_id, generated_text])
    station_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
station_description_llm = DataFrame(station_description_llm, columns=['Station Name', 'LLM Description'])
station_description_llm.to_csv(f'{output_dir}/station_description_{LLM_NAME_}.csv.zip', compression='zip')
station_hidden_states_llm = np.array(station_hidden_states_llm)

## Hour-of-day information
for time, prime_llm_output in tqdm.notebook.tqdm(zip(time_info['Hour'], time_info['LLM Description']), total=len(time_info)):
    # Paraphrase instruction followed by the prime model's description
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=4096, return_dict_in_generate=True,
                                 output_hidden_states=True)

    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Same layout as for the stations (last generated token, one row per hidden state)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([time, generated_text])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Description'])
time_description_llm.to_csv(f'{output_dir}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)

## Day-of-week information
for day, prime_llm_output in tqdm.notebook.tqdm(zip(day_info['Day'], day_info['LLM Description']), total=len(day_info)):
    # Paraphrase instruction followed by the prime model's description
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=4096, return_dict_in_generate=True,
                                 output_hidden_states=True)

    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Same layout as for the stations (last generated token, one row per hidden state)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, generated_text])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Description'])
day_description_llm.to_csv(f'{output_dir}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# One archive with the three hidden-state arrays
np.savez_compressed(f'{output_dir}/hidden_states_{LLM_NAME_}.npz',
                    station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


[2025-06-10 17:25:58,389] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/xcc/miniconda3/envs/peng_c/compiler_compat/ld: cannot find -laio: 没有那个文件或目录
collect2: error: ld returned 1 exit status
/home/xcc/miniconda3/envs/peng_c/compiler_compat/ld: cannot find -lcufile: 没有那个文件或目录
collect2: error: ld returned 1 exit status


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# BART paraphrases: Qwen3-8B rewrites the descriptions that the "prime" model
# (Qwen3-14B) produced. The prime model's CSV files are read back, each
# 'LLM Description' is sent to Qwen3-8B with a paraphrase instruction, and the reasoning,
# the paraphrase and the hidden states of the last generation step are saved.
PRIME_LLM_NAME = 'qwen3-14b'
LLM_NAME = 'qwen3-8b'
PRIME_LLM_NAME_ = '_'.join(PRIME_LLM_NAME.split('-'))
# Output file names carry both model names, prime model first
LLM_NAME_ = PRIME_LLM_NAME_ + '_' + '_'.join(LLM_NAME.split('-'))

# Create the output folder before generation starts, so a missing folder cannot abort
# the cell only after the (slow) generation loop has finished.
output_dir = f'{RAW_DATASET_DIR}/description-{LLM_NAME}'
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", model_max_length=2048)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-8B", torch_dtype="auto", device_map="auto")

station_description_llm, station_hidden_states_llm = [], []
time_description_llm, time_hidden_states_llm = [], []
day_description_llm, day_hidden_states_llm = [], []

# Descriptions written earlier by the prime model
station_info = pd.read_csv(f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/station_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)
time_info = pd.read_csv(f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/time_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)
day_info = pd.read_csv(f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/day_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)

# NOTE(review): 'contend' in the prompts below is presumably a typo for 'content'. It is
# left unchanged so the prompts stay identical to the ones used for existing outputs.
# NOTE(review): generation is called with sampling parameters and no seed is set, so the
# outputs are presumably not reproducible between runs.

## Station information
# total= lets the progress bar show completion (zip() has no length)
for station_id, prime_llm_output in tqdm.notebook.tqdm(zip(station_info['Station Name'], station_info['LLM Description']), total=len(station_info)):
    # Paraphrase instruction followed by the prime model's description
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    # NOTE(review): this loop caps generation at 16384 new tokens while the hour and day
    # loops below use 32768 - confirm the difference is intended.
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=16384, return_dict_in_generate=True, output_hidden_states=True)

    # Strip the prompt tokens, then split reasoning from the final answer
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # hidden_states[-1] belongs to the last generated token; stacked shape is
    # (number of hidden states, hidden size) - presumably [37, 4096] for Qwen3-8B
    # (the original comment quoted the Qwen3-14B shape [41, 5120])
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    station_description_llm.append([station_id, thinking_content, content])
    # .float(): cast to float32 before the NumPy conversion
    station_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
station_description_llm = DataFrame(station_description_llm, columns=['Station Name', 'LLM Thinking', 'LLM Description'])
station_description_llm.to_csv(f'{output_dir}/station_description_{LLM_NAME_}.csv.zip', compression='zip')
station_hidden_states_llm = np.array(station_hidden_states_llm)

## Hour-of-day information
for time, prime_llm_output in tqdm.notebook.tqdm(zip(time_info['Hour'], time_info['LLM Description']), total=len(time_info)):
    # Paraphrase instruction followed by the prime model's description
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Same layout as for the stations (last generated token, one row per hidden state)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([time, thinking_content, content])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Thinking', 'LLM Description'])
time_description_llm.to_csv(f'{output_dir}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)

## Day-of-week information
for day, prime_llm_output in tqdm.notebook.tqdm(zip(day_info['Day'], day_info['LLM Description']), total=len(day_info)):
    # Paraphrase instruction followed by the prime model's description
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Same layout as for the stations (last generated token, one row per hidden state)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, thinking_content, content])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Thinking', 'LLM Description'])
day_description_llm.to_csv(f'{output_dir}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# One archive with the three hidden-state arrays
np.savez_compressed(f'{output_dir}/hidden_states_{LLM_NAME_}.npz',
                    station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

# UrbanEV

In [8]:
# Root folder of the UrbanEV dataset: the cell below reads 'zone-stations.csv' from it
# and writes the generated zone descriptions beneath it.
RAW_DATASET_DIR = 'UrbanEV'

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Describe every UrbanEV zone, hour of day and day type with Llama-3.1-8B. For each prompt the
# generated text and the per-layer hidden states of the final generation step are kept.
LLM_NAME = 'llama31-8b'
LLM_NAME_ = '_'.join(LLM_NAME.split('-'))  # underscore variant used inside file names

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("Llama3.1-8B-Instruct", model_max_length=4096)
model = AutoModelForCausalLM.from_pretrained("Llama3.1-8B-Instruct-hg", device_map="auto")
# # Convert the model to half precision (FP16) if using a supported device
# model.half()  # This converts the model to FP16

zone_description_llm, zone_hidden_states_llm = [], []
time_description_llm, time_hidden_states_llm = [], []
day_description_llm, day_hidden_states_llm = [], []

zones_points = pd.read_csv(f'{RAW_DATASET_DIR}/zone-stations.csv', usecols=[2,12,14])
## Zone information: one prompt per traffic analysis zone (rows grouped by TAZID)
for zone_id, sub_points in tqdm.notebook.tqdm(zones_points.groupby('TAZID')):
    district_id_text = sub_points['DISTRICT'].iloc[0]
    point_id_text = '、'.join(sub_points['name'])
    # Prompt (Chinese): land use, resident composition, travel patterns, new-energy-vehicle
    # friendliness, and charging demand / service level around the listed places in Shenzhen.
    input_text = f'中国广东省深圳市{district_id_text + point_id_text}附近区域的土地利用、居民组成、交通出行模式、新能源汽车友好度、新能源汽车充电需求与服务水平。'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=9216, return_dict_in_generate=True,
                                 output_hidden_states=True)

    # Pass the prompt length so that only the newly generated tokens are decoded
    # (same convention as the other cells that call decode_llm_outputs with input_len).
    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # hidden_states[-1] belongs to the last generation step and holds one tensor per layer;
    # stacked and squeezed this is expected to be [n_layers, hidden_size]. The concrete numbers
    # depend on the model (the original note quoted [41, 5120] for Qwen3-14B) — TODO confirm.
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    zone_description_llm.append([zone_id, generated_text])
    zone_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
zone_description_llm = DataFrame(zone_description_llm, columns=['Zone Name', 'LLM Description'])
zone_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/zone_description_{LLM_NAME_}.csv.zip', compression='zip')
zone_hidden_states_llm = np.array(zone_hidden_states_llm)
## Time-of-day information: one prompt per hour
for time in range(24):
    # Prompt (Chinese): charging demand and service level in Shenzhen between HH:00 and HH+1:00.
    input_text = f'中国广东省深圳市{time:02d}:00-{time+1:02d}:00间的新能源汽车充电需求与服务水平。'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=6152, return_dict_in_generate=True,
                                 output_hidden_states=True)

    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Per-layer hidden states of the last generation step (see the zone loop above in this cell)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([time, generated_text])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Description'])
time_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)
## Day-type information: Monday..Sunday plus public holiday
for day in ['周一', '周二', '周三', '周四', '周五', '周六', '周日', '节假日']:
    # Prompt (Chinese): charging demand and service level in Shenzhen on a typical <day>.
    input_text = f'中国广东省深圳市一个典型{day}的新能源汽车充电需求与服务水平。'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=6152, return_dict_in_generate=True,
                                 output_hidden_states=True)

    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Per-layer hidden states of the last generation step (see the zone loop above in this cell)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, generated_text])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Description'])
# Fixed: the file name was missing the '.' before 'csv', unlike the zone/time files above.
day_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# Bundle the three embedding arrays into a single compressed archive
np.savez_compressed(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/hidden_states_{LLM_NAME_}.npz',
                    zone=zone_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

# Re-read the zone descriptions (dropping the saved index column) and export them to Excel
zone_description_llm = pd.read_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/zone_description_{LLM_NAME_}.csv.zip', usecols=[1,2])
zone_description_llm.to_excel(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/zone_description_{LLM_NAME_}.xlsx')

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


[2025-06-06 16:43:33,335] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/xcc/miniconda3/envs/peng_c/compiler_compat/ld: cannot find -laio: 没有那个文件或目录
collect2: error: ld returned 1 exit status
/home/xcc/miniconda3/envs/peng_c/compiler_compat/ld: cannot find -lcufile: 没有那个文件或目录
collect2: error: ld returned 1 exit status


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/275 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Describe every UrbanEV zone, hour of day and day type with Qwen3-8B. For each prompt the
# thinking trace, the final answer and the per-layer hidden states of the last generation
# step are kept.
LLM_NAME = 'qwen3-8b'
LLM_NAME_ = LLM_NAME.replace('-', '_')          # underscore variant used inside file names
llm_family = LLM_NAME.split('-')[0]             # key understood by the tokenize/decode helpers
out_dir = f'{RAW_DATASET_DIR}/description-{LLM_NAME}'

# Load the tokenizer and the model
model_id = "Qwen/Qwen3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id, model_max_length=4096)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

# Generation settings shared by all three loops of this cell
gen_kwargs = dict(temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768,
                  return_dict_in_generate=True, output_hidden_states=True)

zone_description_llm, zone_hidden_states_llm = [], []
time_description_llm, time_hidden_states_llm = [], []
day_description_llm, day_hidden_states_llm = [], []

zones_points = pd.read_csv(f'{RAW_DATASET_DIR}/zone-stations.csv', usecols=[2,12,14])

## Zone information: one prompt per traffic analysis zone (rows grouped by TAZID)
for zone_id, sub_points in tqdm.notebook.tqdm(zones_points.groupby('TAZID')):
    district_id_text = sub_points['DISTRICT'].iloc[0]
    point_id_text = '、'.join(sub_points['name'])
    input_text = f'中国广东省深圳市{district_id_text + point_id_text}附近区域的土地利用、居民组成、交通出行模式、新能源汽车友好度、新能源汽车充电需求与服务水平。'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    prompt_len = inputs['input_ids'].shape[1]   # decode only what comes after the prompt
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=llm_family,
                                                   output_sequences=outputs.sequences, input_len=prompt_len)
    # Last generation step, one tensor per layer, stacked into [n_layers, hidden_size]
    # (the original note quoted [41, 5120] for Qwen3-14B)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    zone_description_llm.append([zone_id, thinking_content, content])
    zone_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
zone_description_llm = DataFrame(zone_description_llm, columns=['Zone Name', 'LLM Thinking', 'LLM Description'])
zone_description_llm.to_csv(f'{out_dir}/zone_description_{LLM_NAME_}.csv.zip', compression='zip')
zone_hidden_states_llm = np.array(zone_hidden_states_llm)

## Time-of-day information: one prompt per hour
for time in range(24):
    input_text = f'中国广东省深圳市{time:02d}:00-{time+1:02d}:00间的新能源汽车充电需求与服务水平。'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    prompt_len = inputs['input_ids'].shape[1]
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=llm_family,
                                                   output_sequences=outputs.sequences, input_len=prompt_len)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([time, thinking_content, content])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Thinking', 'LLM Description'])
time_description_llm.to_csv(f'{out_dir}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)

## Day-type information: Monday..Sunday plus public holiday
for day in ['周一', '周二', '周三', '周四', '周五', '周六', '周日', '节假日']:
    input_text = f'中国广东省深圳市一个典型{day}的新能源汽车充电需求与服务水平。'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    prompt_len = inputs['input_ids'].shape[1]
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=llm_family,
                                                   output_sequences=outputs.sequences, input_len=prompt_len)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, thinking_content, content])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Thinking', 'LLM Description'])
day_description_llm.to_csv(f'{out_dir}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# Bundle the three embedding arrays into a single compressed archive
np.savez_compressed(f'{out_dir}/hidden_states_{LLM_NAME_}.npz',
                    zone=zone_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

[2025-06-11 04:06:34,696] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/xcc/miniconda3/envs/peng_c/compiler_compat/ld: cannot find -laio: 没有那个文件或目录
collect2: error: ld returned 1 exit status
/home/xcc/miniconda3/envs/peng_c/compiler_compat/ld: cannot find -lcufile: 没有那个文件或目录
collect2: error: ld returned 1 exit status


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/275 [00:00<?, ?it/s]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Describe every UrbanEV zone, hour of day and day type with Qwen3-14B. For each prompt the
# thinking trace, the final answer and the per-layer hidden states of the last generation
# step are kept.
LLM_NAME = 'qwen3-14b'
LLM_NAME_ = LLM_NAME.replace('-', '_')          # underscore variant used inside file names
llm_family = LLM_NAME.split('-')[0]             # key understood by the tokenize/decode helpers
out_dir = f'{RAW_DATASET_DIR}/description-{LLM_NAME}'

# Load the tokenizer and the model
model_id = "Qwen/Qwen3-14B"
tokenizer = AutoTokenizer.from_pretrained(model_id, model_max_length=4096)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

# Generation settings shared by all three loops of this cell
gen_kwargs = dict(temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768,
                  return_dict_in_generate=True, output_hidden_states=True)

zone_description_llm, zone_hidden_states_llm = [], []
time_description_llm, time_hidden_states_llm = [], []
day_description_llm, day_hidden_states_llm = [], []

zones_points = pd.read_csv(f'{RAW_DATASET_DIR}/zone-stations.csv', usecols=[2,12,14])

## Zone information: one prompt per traffic analysis zone (rows grouped by TAZID)
for zone_id, sub_points in tqdm.notebook.tqdm(zones_points.groupby('TAZID')):
    district_id_text = sub_points['DISTRICT'].iloc[0]
    point_id_text = '、'.join(sub_points['name'])
    input_text = f'中国广东省深圳市{district_id_text + point_id_text}附近区域的土地利用、居民组成、交通出行模式、新能源汽车友好度、新能源汽车充电需求与服务水平。'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    prompt_len = inputs['input_ids'].shape[1]   # decode only what comes after the prompt
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=llm_family,
                                                   output_sequences=outputs.sequences, input_len=prompt_len)
    # Last generation step, one tensor per layer, stacked into [n_layers, hidden_size]
    # (original note: [41, 5120] for Qwen3-14B)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    zone_description_llm.append([zone_id, thinking_content, content])
    zone_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
zone_description_llm = DataFrame(zone_description_llm, columns=['Zone Name', 'LLM Thinking', 'LLM Description'])
zone_description_llm.to_csv(f'{out_dir}/zone_description_{LLM_NAME_}.csv.zip', compression='zip')
zone_hidden_states_llm = np.array(zone_hidden_states_llm)

## Time-of-day information: one prompt per hour
for time in range(24):
    input_text = f'中国广东省深圳市{time:02d}:00-{time+1:02d}:00间的新能源汽车充电需求与服务水平。'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    prompt_len = inputs['input_ids'].shape[1]
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=llm_family,
                                                   output_sequences=outputs.sequences, input_len=prompt_len)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([time, thinking_content, content])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Thinking', 'LLM Description'])
time_description_llm.to_csv(f'{out_dir}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)

## Day-type information: Monday..Sunday plus public holiday
for day in ['周一', '周二', '周三', '周四', '周五', '周六', '周日', '节假日']:
    input_text = f'中国广东省深圳市一个典型{day}的新能源汽车充电需求与服务水平。'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    prompt_len = inputs['input_ids'].shape[1]
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=llm_family,
                                                   output_sequences=outputs.sequences, input_len=prompt_len)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, thinking_content, content])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Thinking', 'LLM Description'])
day_description_llm.to_csv(f'{out_dir}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# Bundle the three embedding arrays into a single compressed archive
np.savez_compressed(f'{out_dir}/hidden_states_{LLM_NAME_}.npz',
                    zone=zone_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

# zone_description_llm = pd.read_csv(f'{RAW_DATASET_DIR}/description-qwen3-14b/zone_description_qwen3_14b.csv.zip', usecols=[1,2,3])
# zone_description_llm.to_excel(f'{RAW_DATASET_DIR}/description-qwen3-14b/zone_description_qwen3_14b.xlsx')

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Two-stage pipeline: the descriptions previously written by the "prime" LLM (Qwen3-14B) are
# fed to Llama-3.1-8B with a paraphrase instruction; the paraphrased text and the per-layer
# hidden states of Llama's last generation step are stored.
PRIME_LLM_NAME = 'qwen3-14b'
LLM_NAME = 'llama31-8b'
PRIME_LLM_NAME_ = '_'.join(PRIME_LLM_NAME.split('-'))
# File-name tag combining both models, e.g. 'qwen3_14b_llama31_8b'
LLM_NAME_ = PRIME_LLM_NAME_ + '_' + '_'.join(LLM_NAME.split('-'))

# Load the tokenizer and the model
# NOTE(review): absolute, machine-specific paths — consider a configurable model directory.
tokenizer = AutoTokenizer.from_pretrained("/home/xcc/peng_c/llama/Llama3.1-8B-Instruct", model_max_length=2560)
model = AutoModelForCausalLM.from_pretrained("/home/xcc/peng_c/llama/Llama3.1-8B-Instruct-hg", device_map="auto")
# # Convert the model to half precision (FP16) if using a supported device
# model.half()  # This converts the model to FP16

zone_description_llm, zone_hidden_states_llm = [], []
time_description_llm, time_hidden_states_llm = [], []
day_description_llm, day_hidden_states_llm = [], []

# Descriptions produced by the prime LLM (first CSV column is the saved DataFrame index)
zone_info = pd.read_csv(f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/zone_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)
time_info = pd.read_csv(f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/time_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)
day_info = pd.read_csv(f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/day_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)
## Zone information
for zone_id, prime_llm_output in tqdm.notebook.tqdm(zip(zone_info['Zone Name'], zone_info['LLM Description'])):
    # NOTE(review): 'contend' is presumably a typo for 'content'; the prompt is kept
    # byte-identical so results stay comparable with runs that already used it.
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output 
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=5120, return_dict_in_generate=True,
                                 output_hidden_states=True)
    
    # input_len skips the prompt so only the newly generated tokens are decoded
    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # hidden_states[-1] belongs to the last generation step and holds one tensor per layer;
    # stacked and squeezed this is expected to be [n_layers, hidden_size]. The concrete numbers
    # depend on the model (the original note quoted [41, 5120] for Qwen3-14B) — TODO confirm.
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    zone_description_llm.append([zone_id, generated_text])
    zone_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
zone_description_llm = DataFrame(zone_description_llm, columns=['Zone Name', 'LLM Description'])
zone_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/zone_description_{LLM_NAME_}.csv.zip', compression='zip')
zone_hidden_states_llm = np.array(zone_hidden_states_llm)
## Time-of-day information
for time, prime_llm_output in tqdm.notebook.tqdm(zip(time_info['Hour'], time_info['LLM Description'])):
    # Same paraphrase prompt as in the zone loop (typo intentionally preserved)
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output 
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=3072, return_dict_in_generate=True,
                                 output_hidden_states=True)

    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Per-layer hidden states of the last generation step (see the zone loop above in this cell)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([time, generated_text])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Description'])
time_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)
## Day-type information
for day, prime_llm_output in tqdm.notebook.tqdm(zip(day_info['Day'], day_info['LLM Description'])):
    # Same paraphrase prompt as in the zone loop (typo intentionally preserved)
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output 
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=3072, return_dict_in_generate=True,
                                 output_hidden_states=True)
    
    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Per-layer hidden states of the last generation step (see the zone loop above in this cell)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, generated_text])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Description'])
# Fixed: the file name was missing the '.' before 'csv', unlike the zone/time files above.
day_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)
    
# Bundle the three embedding arrays into a single compressed archive
np.savez_compressed(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/hidden_states_{LLM_NAME_}.npz',
                    zone=zone_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)
# zone_description_llm = pd.read_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/zone_description_{LLM_NAME_}.csv.zip', usecols=[1,2])
# zone_description_llm.to_excel(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/zone_description_{LLM_NAME_}.xlsx')

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


# METR-LA

In [8]:
# Root directory of the METR-LA dataset; the cells below build every input/output path from it.
RAW_DATASET_DIR = 'metr-la'

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Describe every METR-LA sensor location, 15-minute time slot and weekday with Llama-3.1-8B.
# For each prompt the generated text and the per-layer hidden states of the final generation
# step are kept.
LLM_NAME = 'llama31-8b'
LLM_NAME_ = '_'.join(LLM_NAME.split('-'))  # underscore variant used inside file names

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("Llama3.1-8B-Instruct", model_max_length=1024)
model = AutoModelForCausalLM.from_pretrained("Llama3.1-8B-Instruct-hg", device_map="auto")
# # Convert the model to half precision (FP16) if using a supported device
# model.half()  # This converts the model to FP16

station_description_llm, station_hidden_states_llm = [], []
time_description_llm, time_hidden_states_llm = [], []
day_description_llm, day_hidden_states_llm = [], []

station_info = pd.read_csv(f'{RAW_DATASET_DIR}/graph_sensor_metadata.csv', usecols=[1, 5, 12, 13, 14, 15])
## Station information: one prompt per sensor row
# NOTE(review): the tuple unpacking relies on the column order of the CSV — verify against the file.
for _, det_id, fwy_dir, det_name, det_county, fwy_name, det_city in tqdm.notebook.tqdm(station_info.itertuples()):
    fwy_name = fwy_name + '-' + fwy_dir
    # Example text to generate from
    input_text = f'Please provide details about the location, traffic demand pattern, and traffic condition on a segment of the {fwy_name} Freeway around {det_name} ' +\
                 f'in {det_city}, {det_county} County, California, USA.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=4096, return_dict_in_generate=True,
                                 output_hidden_states=True)

    # Pass the prompt length so that only the newly generated tokens are decoded
    # (same convention as the other cells that call decode_llm_outputs with input_len).
    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # hidden_states[-1] belongs to the last generation step and holds one tensor per layer;
    # stacked and squeezed this is expected to be [n_layers, hidden_size]. The concrete numbers
    # depend on the model (the original note quoted [41, 5120] for Qwen3-14B) — TODO confirm.
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    station_description_llm.append([det_id, generated_text])
    station_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
station_description_llm = DataFrame(station_description_llm, columns=['Sensor Name', 'LLM Description'])
station_description_llm.to_csv(f'{RAW_DATASET_DIR}/llm-description/description-{LLM_NAME}/station_description_{LLM_NAME_}.csv.zip', compression='zip')
station_hidden_states_llm = np.array(station_hidden_states_llm)
## Time-of-day information: one prompt per 15-minute slot
# Sort the unique freeway names: iterating a bare set of strings can yield a different order
# in every interpreter run, which would silently change the prompts (and the embeddings).
fwy_names = ', '.join(sorted(set(station_info['FwyName'])))
city_names = 'Burbank, Glendale, La Canada-Flintridge, and Los Angeles'
for time in tqdm.notebook.tqdm(pd.date_range(start='2000-01-01 00:00', end='2000-01-01 23:55', freq='15min').strftime('%H:%M')):
    # Example text to generate from
    input_text = f'Typical traffic demands and conditions at {time} on {fwy_names} in {city_names}, Los Angeles County, California, USA.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=2048, return_dict_in_generate=True,
                                 output_hidden_states=True)

    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Per-layer hidden states of the last generation step (see the station loop above in this cell)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([time, generated_text])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Time', 'LLM Description'])
time_description_llm.to_csv(f'{RAW_DATASET_DIR}/llm-description/description-{LLM_NAME}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)
## Day-of-week information
for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
    # Example text to generate from
    input_text = f'Traffic demands and conditions on a typical {day} on {fwy_names} in {city_names}, Los Angeles County, California, USA.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # conduct text completion
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=2048, return_dict_in_generate=True,
                                 output_hidden_states=True)

    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Per-layer hidden states of the last generation step (see the station loop above in this cell)
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, generated_text])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache() # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Description'])
# Fixed: the file name was missing the '.' before 'csv', unlike the station/time files above.
day_description_llm.to_csv(f'{RAW_DATASET_DIR}/llm-description/description-{LLM_NAME}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# Bundle the three embedding arrays into a single compressed archive
np.savez_compressed(f'{RAW_DATASET_DIR}/llm-description/description-{LLM_NAME}/hidden_states_{LLM_NAME_}.npz',
                    station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


[2025-06-10 05:30:07,356] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/xcc/miniconda3/envs/peng_c/compiler_compat/ld: cannot find -laio: 没有那个文件或目录
collect2: error: ld returned 1 exit status
/home/xcc/miniconda3/envs/peng_c/compiler_compat/ld: cannot find -lcufile: 没有那个文件或目录
collect2: error: ld returned 1 exit status


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

  0%|          | 0/96 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [None]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM

# Ask Qwen3-8B to describe every sensor, every 15-minute time slot and every weekday, and keep both
# the generated text (reasoning trace + answer) and the hidden states of the last generated token.
LLM_NAME = 'qwen3-8b'
LLM_NAME_ = '_'.join(LLM_NAME.split('-'))
llm_family = LLM_NAME.split('-')[0]  # model-family key passed to tokenize_llm_inputs / decode_llm_outputs
output_dir = f'{RAW_DATASET_DIR}/llm-description/description-{LLM_NAME}'
# Create the target folder up front so a missing directory cannot discard the generation work at save time.
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer and the causal LM
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", model_max_length=1024)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-8B", torch_dtype="auto", device_map="auto")

station_info = pd.read_csv(f'{RAW_DATASET_DIR}/graph_sensor_metadata.csv', usecols=[1, 5, 12, 13, 14, 15])

## Station prompts: one (sensor id, prompt) pair per metadata row
station_prompts = [
    (det_id,
     f'Please provide details about the location, traffic demand pattern, and traffic condition on a segment of the {fwy_name}-{fwy_dir} Freeway around {det_name} '
     f'in {det_city}, {det_county} County, California, USA.')
    for _, det_id, fwy_dir, det_name, det_county, fwy_name, det_city in station_info.itertuples()
]
## Time-of-day prompts: one per 15-minute slot
# sorted() keeps the freeway list (and therefore the prompts) identical across kernel restarts;
# plain set iteration order is not stable for strings.
fwy_names = ', '.join(sorted(set(station_info['FwyName'])))
city_names = 'Burbank, Glendale, La Canada-Flintridge, and Los Angeles'
time_prompts = [
    (slot, f'Typical traffic demands and conditions at {slot} on {fwy_names} in {city_names}, Los Angeles County, California, USA.')
    for slot in pd.date_range(start='2000-01-01 00:00', end='2000-01-01 23:55', freq='15min').strftime('%H:%M')
]
## Day-of-week prompts
day_prompts = [
    (day_name, f'Traffic demands and conditions on a typical {day_name} on {fwy_names} in {city_names}, Los Angeles County, California, USA.')
    for day_name in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
]

description_frames, hidden_state_arrays = {}, {}
for kind, key_column, prompts in [('station', 'Sensor Name', station_prompts),
                                  ('time', 'Time', time_prompts),
                                  ('day', 'Day', day_prompts)]:
    rows, states = [], []
    for key, input_text in tqdm.notebook.tqdm(prompts, desc=kind):
        inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
        # Run text completion without tracking gradients
        with torch.no_grad():
            outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768,
                                     return_dict_in_generate=True, output_hidden_states=True)

        thinking_content, content = decode_llm_outputs(tokenizer, llm_name=llm_family, output_sequences=outputs['sequences'],
                                                       input_len=len(inputs['input_ids'][0]))
        # outputs.hidden_states[-1] belongs to the last generation step and holds one tensor per layer;
        # stacking and squeezing gives [num_layers + 1, hidden_size] (assumes batch size 1 — TODO confirm).
        hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
        rows.append([key, thinking_content, content])
        states.append(hidden_states.cpu().float().numpy())
        del inputs, outputs
        torch.cuda.empty_cache()  # release cached GPU memory before the next prompt
    # Persist each stage as soon as it finishes so completed work survives a later failure.
    description_frames[kind] = DataFrame(rows, columns=[key_column, 'LLM Thinking', 'LLM Description'])
    description_frames[kind].to_csv(f'{output_dir}/{kind}_description_{LLM_NAME_}.csv.zip', compression='zip')
    hidden_state_arrays[kind] = np.array(states)

# Keep the notebook-level names that this cell has always exposed.
station_description_llm = description_frames['station']
time_description_llm = description_frames['time']
day_description_llm = description_frames['day']
station_hidden_states_llm = hidden_state_arrays['station']
time_hidden_states_llm = hidden_state_arrays['time']
day_hidden_states_llm = hidden_state_arrays['day']

np.savez_compressed(f'{output_dir}/hidden_states_{LLM_NAME_}.npz',
                    station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

In [None]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM

# Ask Qwen3-14B to describe every sensor, every 15-minute time slot and every weekday, and keep both
# the generated text (reasoning trace + answer) and the hidden states of the last generated token.
LLM_NAME = 'qwen3-14b'
LLM_NAME_ = '_'.join(LLM_NAME.split('-'))
llm_family = LLM_NAME.split('-')[0]  # model-family key passed to tokenize_llm_inputs / decode_llm_outputs
output_dir = f'{RAW_DATASET_DIR}/llm-description/description-{LLM_NAME}'
# Create the target folder up front so a missing directory cannot discard the generation work at save time.
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer and the causal LM
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-14B", model_max_length=1024)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-14B", torch_dtype="auto", device_map="auto")

station_info = pd.read_csv(f'{RAW_DATASET_DIR}/graph_sensor_metadata.csv', usecols=[1, 5, 12, 13, 14, 15])

## Station prompts: one (sensor id, prompt) pair per metadata row
station_prompts = [
    (det_id,
     f'Please provide details about the location, traffic demand pattern, and traffic condition on a segment of the {fwy_name}-{fwy_dir} Freeway around {det_name} '
     f'in {det_city}, {det_county} County, California, USA.')
    for _, det_id, fwy_dir, det_name, det_county, fwy_name, det_city in station_info.itertuples()
]
## Time-of-day prompts: one per 15-minute slot
# sorted() keeps the freeway list (and therefore the prompts) identical across kernel restarts;
# plain set iteration order is not stable for strings.
fwy_names = ', '.join(sorted(set(station_info['FwyName'])))
city_names = 'Burbank, Glendale, La Canada-Flintridge, and Los Angeles'
time_prompts = [
    (slot, f'Typical traffic demands and conditions at {slot} on {fwy_names} in {city_names}, Los Angeles County, California, USA.')
    for slot in pd.date_range(start='2000-01-01 00:00', end='2000-01-01 23:55', freq='15min').strftime('%H:%M')
]
## Day-of-week prompts
day_prompts = [
    (day_name, f'Traffic demands and conditions on a typical {day_name} on {fwy_names} in {city_names}, Los Angeles County, California, USA.')
    for day_name in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
]

description_frames, hidden_state_arrays = {}, {}
for kind, key_column, prompts in [('station', 'Sensor Name', station_prompts),
                                  ('time', 'Time', time_prompts),
                                  ('day', 'Day', day_prompts)]:
    rows, states = [], []
    for key, input_text in tqdm.notebook.tqdm(prompts, desc=kind):
        inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
        # Run text completion without tracking gradients
        with torch.no_grad():
            outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768,
                                     return_dict_in_generate=True, output_hidden_states=True)

        thinking_content, content = decode_llm_outputs(tokenizer, llm_name=llm_family, output_sequences=outputs['sequences'],
                                                       input_len=len(inputs['input_ids'][0]))
        # outputs.hidden_states[-1] belongs to the last generated token and holds one tensor per layer;
        # stacked and squeezed it is [41, 5120] for Qwen3-14B: 41 layer states of 5120 dimensions each.
        hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
        rows.append([key, thinking_content, content])
        states.append(hidden_states.cpu().float().numpy())
        # Drop both tensors' holders (the sibling cells also release `inputs`) before emptying the cache.
        del inputs, outputs
        torch.cuda.empty_cache()  # release cached GPU memory before the next prompt
    # Persist each stage as soon as it finishes so completed work survives a later failure.
    description_frames[kind] = DataFrame(rows, columns=[key_column, 'LLM Thinking', 'LLM Description'])
    description_frames[kind].to_csv(f'{output_dir}/{kind}_description_{LLM_NAME_}.csv.zip', compression='zip')
    hidden_state_arrays[kind] = np.array(states)

# Keep the notebook-level names that this cell has always exposed.
station_description_llm = description_frames['station']
time_description_llm = description_frames['time']
day_description_llm = description_frames['day']
station_hidden_states_llm = hidden_state_arrays['station']
time_hidden_states_llm = hidden_state_arrays['time']
day_hidden_states_llm = hidden_state_arrays['day']

np.savez_compressed(f'{output_dir}/hidden_states_{LLM_NAME_}.npz',
                    station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

In [16]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM

# Have Llama 3.1 8B paraphrase the descriptions previously written by the "prime" LLM (Qwen3-14B) and
# keep the paraphrased text plus the hidden states of the last generated token.
PRIME_LLM_NAME = 'qwen3-14b'
LLM_NAME = 'llama31-8b'
PRIME_LLM_NAME_ = '_'.join(PRIME_LLM_NAME.split('-'))
LLM_NAME_ = PRIME_LLM_NAME_ + '_' + '_'.join(LLM_NAME.split('-'))
llm_family = LLM_NAME.split('-')[0]  # model-family key passed to tokenize_llm_inputs / decode_llm_outputs
prime_dir = f'{RAW_DATASET_DIR}/llm-description/description-{PRIME_LLM_NAME}'
output_dir = f'{RAW_DATASET_DIR}/llm-description/description-{LLM_NAME}'
# Create the target folder up front so a missing directory cannot discard the generation work at save time.
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer and the causal LM
tokenizer = AutoTokenizer.from_pretrained("Llama3.1-8B-Instruct", model_max_length=2048)
model = AutoModelForCausalLM.from_pretrained("Llama3.1-8B-Instruct-hg", device_map="auto")
# # Convert the model to half precision (FP16) if using a supported device
# model.half()  # This converts the model to FP16

# Descriptions produced by the prime LLM; these are the texts to paraphrase.
station_info = pd.read_csv(f'{prime_dir}/station_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)
time_info = pd.read_csv(f'{prime_dir}/time_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)
day_info = pd.read_csv(f'{prime_dir}/day_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)

description_frames, hidden_state_arrays = {}, {}
# (stage, key column, prime-LLM frame, max_length for generate — prompt tokens included)
for kind, key_column, prime_frame, max_length in [('station', 'Sensor Name', station_info, 4096),
                                                  ('time', 'Time', time_info, 3072),
                                                  ('day', 'Day', day_info, 3072)]:
    rows, states = [], []
    # total= is given explicitly because zip() has no length for the progress bar to read.
    for key, prime_llm_output in tqdm.notebook.tqdm(zip(prime_frame[key_column], prime_frame['LLM Description']),
                                                    total=len(prime_frame), desc=kind):
        # NOTE(review): 'contend' looks like a typo for 'content'. The prompt is kept byte-identical
        # so new runs stay comparable with already generated outputs — confirm before changing.
        input_text = 'Paraphrase the following contend.\n' + prime_llm_output
        # Encode input text
        inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
        # Run text completion without tracking gradients
        with torch.no_grad():
            outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=max_length,
                                     return_dict_in_generate=True, output_hidden_states=True)

        generated_text = decode_llm_outputs(tokenizer, llm_name=llm_family, output_sequences=outputs['sequences'],
                                            input_len=len(inputs['input_ids'][0]))
        # outputs.hidden_states[-1] belongs to the last generation step and holds one tensor per layer;
        # stacking and squeezing gives [num_layers + 1, hidden_size] (assumes batch size 1 — TODO confirm).
        hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
        rows.append([key, generated_text])
        states.append(hidden_states.cpu().float().numpy())
        del inputs, outputs
        torch.cuda.empty_cache()  # release cached GPU memory before the next prompt
    # Persist each stage as soon as it finishes so completed work survives a later failure.
    description_frames[kind] = DataFrame(rows, columns=[key_column, 'LLM Description'])
    description_frames[kind].to_csv(f'{output_dir}/{kind}_description_{LLM_NAME_}.csv.zip', compression='zip')
    hidden_state_arrays[kind] = np.array(states)

# Keep the notebook-level names that this cell has always exposed.
station_description_llm = description_frames['station']
time_description_llm = description_frames['time']
day_description_llm = description_frames['day']
station_hidden_states_llm = hidden_state_arrays['station']
time_hidden_states_llm = hidden_state_arrays['time']
day_hidden_states_llm = hidden_state_arrays['day']

np.savez_compressed(f'{output_dir}/hidden_states_{LLM_NAME_}.npz',
                    station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Token indices sequence length is longer than the specified maximum sequence length for this model (1097 > 1024). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [None]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM

# Have Qwen3-8B paraphrase the descriptions previously written by the "prime" LLM (Qwen3-14B) and keep
# the paraphrase (reasoning trace + answer) plus the hidden states of the last generated token.
PRIME_LLM_NAME = 'qwen3-14b'
LLM_NAME = 'qwen3-8b'
PRIME_LLM_NAME_ = '_'.join(PRIME_LLM_NAME.split('-'))
LLM_NAME_ = PRIME_LLM_NAME_ + '_' + '_'.join(LLM_NAME.split('-'))
llm_family = LLM_NAME.split('-')[0]  # model-family key passed to tokenize_llm_inputs / decode_llm_outputs
prime_dir = f'{RAW_DATASET_DIR}/llm-description/description-{PRIME_LLM_NAME}'
output_dir = f'{RAW_DATASET_DIR}/llm-description/description-{LLM_NAME}'
# Create the target folder up front so a missing directory cannot discard the generation work at save time.
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer and the causal LM.
# The model is loaded explicitly: generate() and tokenize_llm_inputs (which reads model.device) must not
# fall back to whatever `model` an earlier cell left in the kernel, e.g. the Llama model.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", model_max_length=2048)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-8B", torch_dtype="auto", device_map="auto")

# Descriptions produced by the prime LLM; these are the texts to paraphrase.
station_info = pd.read_csv(f'{prime_dir}/station_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)
time_info = pd.read_csv(f'{prime_dir}/time_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)
day_info = pd.read_csv(f'{prime_dir}/day_description_{PRIME_LLM_NAME_}.csv.zip', index_col=0)

description_frames, hidden_state_arrays = {}, {}
for kind, key_column, prime_frame in [('station', 'Sensor Name', station_info),
                                      ('time', 'Time', time_info),
                                      ('day', 'Day', day_info)]:
    rows, states = [], []
    # total= is given explicitly because zip() has no length for the progress bar to read.
    for key, prime_llm_output in tqdm.notebook.tqdm(zip(prime_frame[key_column], prime_frame['LLM Description']),
                                                    total=len(prime_frame), desc=kind):
        # NOTE(review): 'contend' looks like a typo for 'content'. The prompt is kept byte-identical
        # so new runs stay comparable with already generated outputs — confirm before changing.
        input_text = 'Paraphrase the following contend.\n' + prime_llm_output
        # Encode input text
        inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
        # Run text completion without tracking gradients
        with torch.no_grad():
            outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768,
                                     return_dict_in_generate=True, output_hidden_states=True)

        thinking_content, content = decode_llm_outputs(tokenizer, llm_name=llm_family, output_sequences=outputs['sequences'],
                                                       input_len=len(inputs['input_ids'][0]))
        # outputs.hidden_states[-1] belongs to the last generation step and holds one tensor per layer;
        # stacking and squeezing gives [num_layers + 1, hidden_size] (assumes batch size 1 — TODO confirm).
        hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
        rows.append([key, thinking_content, content])
        states.append(hidden_states.cpu().float().numpy())
        del inputs, outputs
        torch.cuda.empty_cache()  # release cached GPU memory before the next prompt
    # Persist each stage as soon as it finishes so completed work survives a later failure.
    description_frames[kind] = DataFrame(rows, columns=[key_column, 'LLM Thinking', 'LLM Description'])
    description_frames[kind].to_csv(f'{output_dir}/{kind}_description_{LLM_NAME_}.csv.zip', compression='zip')
    hidden_state_arrays[kind] = np.array(states)

# Keep the notebook-level names that this cell has always exposed.
station_description_llm = description_frames['station']
time_description_llm = description_frames['time']
day_description_llm = description_frames['day']
station_hidden_states_llm = hidden_state_arrays['station']
time_hidden_states_llm = hidden_state_arrays['time']
day_hidden_states_llm = hidden_state_arrays['day']

np.savez_compressed(f'{output_dir}/hidden_states_{LLM_NAME_}.npz',
                    station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

# subway-mta

In [8]:
# Root folder of the MTA subway dataset. This rebinds RAW_DATASET_DIR, so every cell below builds
# its input and output paths relative to 'subway-mta'.
RAW_DATASET_DIR = 'subway-mta'

In [None]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM

# Ask Llama 3.1 8B to describe every subway station complex, every hour of the day and every weekday,
# and keep the generated text plus the hidden states of the last generated token.
LLM_NAME = 'llama31-8b'
LLM_NAME_ = '_'.join(LLM_NAME.split('-'))
llm_family = LLM_NAME.split('-')[0]  # model-family key passed to tokenize_llm_inputs / decode_llm_outputs
output_dir = f'{RAW_DATASET_DIR}/description-{LLM_NAME}'
# Create the target folder up front so a missing directory cannot discard the generation work at save time.
os.makedirs(output_dir, exist_ok=True)

# Load the tokenizer and the causal LM
tokenizer = AutoTokenizer.from_pretrained("Llama3.1-8B-Instruct", model_max_length=1024)
model = AutoModelForCausalLM.from_pretrained("Llama3.1-8B-Instruct-hg", device_map="auto")
# # Convert the model to half precision (FP16) if using a supported device
# model.half()  # This converts the model to FP16

# One row per distinct (station_complex, borough) pair, indexed by station complex.
station_info = (
    pd.read_csv(f'{RAW_DATASET_DIR}/MTA_Subway_Hourly_Ridership__2020-2024_20250317.zip', usecols=[3, 4])
    .drop_duplicates()
    .set_index('station_complex')
)

## Station prompts
station_prompts = [
    (station_id,
     f'Please provide details about the location, train operations, ridership demand, and landmarks nearby about the {station_id} Station '
     f'in {borough_id} of the New York City Subway System.')
    for station_id, borough_id in zip(station_info.index, station_info['borough'])
]
## Hour-of-day prompts
time_prompts = [
    (hour, f'New York City Subway System during {hour:02d}:00-{hour+1:02d}:00, service and ridership.')
    for hour in range(24)
]
## Day-of-week prompts
day_prompts = [
    (day_name, f'New York City Subway System  on a typical {day_name}, service and ridership.')
    for day_name in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
]

description_frames, hidden_state_arrays = {}, {}
for kind, key_column, prompts in [('station', 'Station Name', station_prompts),
                                  ('time', 'Hour', time_prompts),
                                  ('day', 'Day', day_prompts)]:
    rows, states = [], []
    for key, input_text in tqdm.notebook.tqdm(prompts, desc=kind):
        inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
        # Run text completion without tracking gradients
        with torch.no_grad():
            outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=4096,
                                     return_dict_in_generate=True, output_hidden_states=True)

        # Pass the prompt length so only the newly generated tokens are decoded; with the default
        # input_len=0 the prompt itself would be stored as part of the description.
        generated_text = decode_llm_outputs(tokenizer, llm_name=llm_family, output_sequences=outputs['sequences'],
                                            input_len=len(inputs['input_ids'][0]))
        # outputs.hidden_states[-1] belongs to the last generation step and holds one tensor per layer;
        # stacking and squeezing gives [num_layers + 1, hidden_size] (assumes batch size 1 — TODO confirm).
        hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
        rows.append([key, generated_text])
        states.append(hidden_states.cpu().float().numpy())
        del inputs, outputs
        torch.cuda.empty_cache()  # release cached GPU memory before the next prompt
    # Persist each stage as soon as it finishes so completed work survives a later failure.
    # Every stage, including 'day', is written with the same '<kind>_description_<llm>.csv.zip' name.
    description_frames[kind] = DataFrame(rows, columns=[key_column, 'LLM Description'])
    description_frames[kind].to_csv(f'{output_dir}/{kind}_description_{LLM_NAME_}.csv.zip', compression='zip')
    hidden_state_arrays[kind] = np.array(states)

# Keep the notebook-level names that this cell has always exposed.
station_description_llm = description_frames['station']
time_description_llm = description_frames['time']
day_description_llm = description_frames['day']
station_hidden_states_llm = hidden_state_arrays['station']
time_hidden_states_llm = hidden_state_arrays['time']
day_hidden_states_llm = hidden_state_arrays['day']

np.savez_compressed(f'{output_dir}/hidden_states_{LLM_NAME_}.npz',
                    station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Run configuration: LLM_NAME_ is the underscore-separated form used in output file names.
LLM_NAME = 'qwen3-8b'
LLM_NAME_ = LLM_NAME.replace('-', '_')

# Load the tokenizer and the causal language model
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", model_max_length=1024)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-8B",
    torch_dtype="auto",
    device_map="auto",
)

# Accumulators for generated text and hidden states, one pair per entity type
station_description_llm = []
station_hidden_states_llm = []
time_description_llm = []
time_hidden_states_llm = []
day_description_llm = []
day_hidden_states_llm = []

# Unique rows of the two selected columns, indexed by station complex
station_info = (
    pd.read_csv(f'{RAW_DATASET_DIR}/MTA_Subway_Hourly_Ridership__2020-2024_20250317.zip', usecols=[3, 4])
    .drop_duplicates()
    .set_index('station_complex')
)
## Extract station information
# zip() has no length, so the total is passed explicitly to give the progress bar a known end.
for station_id, borough_id in tqdm.notebook.tqdm(zip(station_info.index, station_info['borough']), total=len(station_info)):
    # Prompt asking for a description of this station
    input_text = f'Please provide details about the location, train operations, ridership demand, and landmarks nearby about the {station_id} Station ' +\
                 f'in {borough_id} of the New York City Subway System.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=16384, return_dict_in_generate=True, output_hidden_states=True)

    # Decode only the tokens after the prompt (input_len = prompt length)
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Stack the per-layer hidden states of the last generation step and drop
    # singleton dimensions, giving one vector per layer.
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    station_description_llm.append([station_id, thinking_content, content])
    station_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
station_description_llm = DataFrame(station_description_llm, columns=['Station Name', 'LLM Thinking', 'LLM Description'])
station_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/station_description_{LLM_NAME_}.csv.zip', compression='zip')
station_hidden_states_llm = np.array(station_hidden_states_llm)
## Extract time-of-day information
# The loop variable is named `hour` (not `time`) so that it cannot clobber a
# `time` module binding in the notebook's global namespace.
for hour in range(24):
    # Prompt describing service and ridership during this one-hour window
    input_text = f'New York City Subway System during {hour:02d}:00-{hour+1:02d}:00, service and ridership.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    # Decode only the tokens after the prompt (input_len = prompt length)
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Stack the per-layer hidden states of the last generation step and drop
    # singleton dimensions, giving one vector per layer.
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([hour, thinking_content, content])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Thinking', 'LLM Description'])
time_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)
## Extract day-of-week information
llm_family = LLM_NAME.split('-')[0]  # key passed to the tokenize/decode helpers
for day in ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'):
    # Prompt describing service and ridership on this weekday
    input_text = f'New York City Subway System  on a typical {day}, service and ridership.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=llm_family, input_text=input_text)
    prompt_len = inputs['input_ids'].shape[1]
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            temperature=0.6,
            top_p=0.95,
            top_k=20,
            min_p=0,
            max_new_tokens=32768,
            return_dict_in_generate=True,
            output_hidden_states=True,
        )

    # Decode only the tokens after the prompt
    thinking_content, content = decode_llm_outputs(
        tokenizer,
        llm_name=llm_family,
        output_sequences=outputs['sequences'],
        input_len=prompt_len,
    )
    # Per-layer hidden states of the last generation step, one vector per layer
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, thinking_content, content])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs, outputs
    torch.cuda.empty_cache()  # release cached GPU memory

# Save the weekday descriptions as a zipped CSV
day_description_llm = pd.DataFrame(
    day_description_llm,
    columns=['Day', 'LLM Thinking', 'LLM Description'],
)
day_description_llm.to_csv(
    f'{RAW_DATASET_DIR}/description-{LLM_NAME}/day_description_{LLM_NAME_}.csv.zip',
    compression='zip',
)
day_hidden_states_llm = np.asarray(day_hidden_states_llm)

# Bundle all three groups of hidden states into one compressed archive
np.savez_compressed(
    f'{RAW_DATASET_DIR}/description-{LLM_NAME}/hidden_states_{LLM_NAME_}.npz',
    station=station_hidden_states_llm,
    time=time_hidden_states_llm,
    day=day_hidden_states_llm,
)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Run configuration: LLM_NAME_ is the underscore-separated form used in output file names.
LLM_NAME = 'qwen3-14b'
LLM_NAME_ = LLM_NAME.replace('-', '_')

# Load the tokenizer and the causal language model
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-14B", model_max_length=1024)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-14B",
    torch_dtype="auto",
    device_map="auto",
)

# Accumulators for generated text and hidden states, one pair per entity type
station_description_llm = []
station_hidden_states_llm = []
time_description_llm = []
time_hidden_states_llm = []
day_description_llm = []
day_hidden_states_llm = []

# Unique rows of the two selected columns, indexed by station complex
station_info = (
    pd.read_csv(f'{RAW_DATASET_DIR}/MTA_Subway_Hourly_Ridership__2020-2024_20250317.zip', usecols=[3, 4])
    .drop_duplicates()
    .set_index('station_complex')
)
## Extract station information
# zip() has no length, so the total is passed explicitly to give the progress bar a known end.
for station_id, borough_id in tqdm.notebook.tqdm(zip(station_info.index, station_info['borough']), total=len(station_info)):
    # Prompt asking for a description of this station
    input_text = f'Please provide details about the location, train operations, ridership demand, and landmarks nearby about the {station_id} Station ' +\
                 f'in {borough_id} of the New York City Subway System.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    # Decode only the tokens after the prompt (input_len = prompt length)
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Stack the per-layer hidden states of the last generation step and drop
    # singleton dimensions; for Qwen3-14B this is [41, 5120] (41 hidden-state
    # layers, 5120-dimensional vectors).
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    station_description_llm.append([station_id, thinking_content, content])
    station_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    # Drop the input tensors as well as the outputs before emptying the cache
    del inputs
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
station_description_llm = DataFrame(station_description_llm, columns=['Station Name', 'LLM Thinking', 'LLM Description'])
station_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/station_description_{LLM_NAME_}.csv.zip', compression='zip')
station_hidden_states_llm = np.array(station_hidden_states_llm)
## Extract time-of-day information
# The loop variable is named `hour` (not `time`) so that it cannot clobber a
# `time` module binding in the notebook's global namespace.
for hour in range(24):
    # Prompt describing service and ridership during this one-hour window
    input_text = f'New York City Subway System during {hour:02d}:00-{hour+1:02d}:00, service and ridership.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    # Decode only the tokens after the prompt (input_len = prompt length)
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Stack the per-layer hidden states of the last generation step and drop
    # singleton dimensions; for Qwen3-14B this is [41, 5120] (41 hidden-state
    # layers, 5120-dimensional vectors).
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([hour, thinking_content, content])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    # Drop the input tensors as well as the outputs before emptying the cache
    del inputs
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Thinking', 'LLM Description'])
time_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)
## Extract day-of-week information
for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
    # Prompt describing service and ridership on this weekday
    input_text = f'New York City Subway System  on a typical {day}, service and ridership.'
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    # Decode only the tokens after the prompt (input_len = prompt length)
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Stack the per-layer hidden states of the last generation step and drop
    # singleton dimensions; for Qwen3-14B this is [41, 5120] (41 hidden-state
    # layers, 5120-dimensional vectors).
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, thinking_content, content])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    # Drop the input tensors as well as the outputs before emptying the cache
    del inputs
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Thinking', 'LLM Description'])
day_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# Bundle all three groups of hidden states into one compressed archive
np.savez_compressed(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/hidden_states_{LLM_NAME_}.npz', station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# PRIME_LLM_NAME names the run whose saved descriptions are read below;
# LLM_NAME names the model loaded in this cell.
PRIME_LLM_NAME = 'qwen3-14b'
LLM_NAME = 'llama31-8b'
PRIME_LLM_NAME_ = PRIME_LLM_NAME.replace('-', '_')
# Output-file suffix: prime model name followed by this model's name, underscore-separated
LLM_NAME_ = f"{PRIME_LLM_NAME_}_{LLM_NAME.replace('-', '_')}"

# Load the tokenizer and the causal language model
tokenizer = AutoTokenizer.from_pretrained("Llama3.1-8B-Instruct", model_max_length=2048)
model = AutoModelForCausalLM.from_pretrained("Llama3.1-8B-Instruct-hg", device_map="auto")
# Optional (currently disabled): convert the model to half precision (FP16)
# on a supported device.
# model.half()

# Accumulators for generated text and hidden states, one pair per entity type
station_description_llm = []
station_hidden_states_llm = []
time_description_llm = []
time_hidden_states_llm = []
day_description_llm = []
day_hidden_states_llm = []

# Descriptions previously saved by the prime model's run
station_info = pd.read_csv(
    f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/station_description_{PRIME_LLM_NAME_}.csv.zip',
    index_col=0,
)
time_info = pd.read_csv(
    f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/time_description_{PRIME_LLM_NAME_}.csv.zip',
    index_col=0,
)
day_info = pd.read_csv(
    f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/day_description_{PRIME_LLM_NAME_}.csv.zip',
    index_col=0,
)
## Extract station information
# zip() has no length, so the total is passed explicitly to give the progress bar a known end.
for station_id, prime_llm_output in tqdm.notebook.tqdm(zip(station_info['Station Name'], station_info['LLM Description']), total=len(station_info)):
    # Ask this model to paraphrase the prime model's description.
    # NOTE(review): 'contend' looks like a typo for 'content'; the prompt is left
    # unchanged so regenerated outputs stay comparable with existing runs.
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=4096, return_dict_in_generate=True,
                                 output_hidden_states=True)

    # Decode only the tokens after the prompt (input_len = prompt length)
    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Stack the per-layer hidden states of the last generation step and drop
    # singleton dimensions, giving one vector per layer.
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    station_description_llm.append([station_id, generated_text])
    station_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
station_description_llm = DataFrame(station_description_llm, columns=['Station Name', 'LLM Description'])
station_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/station_description_{LLM_NAME_}.csv.zip', compression='zip')
station_hidden_states_llm = np.array(station_hidden_states_llm)
## Extract time-of-day information
# zip() has no length, so the total is passed explicitly to the progress bar.
# The loop variable is named `hour` (not `time`) so that it cannot clobber a
# `time` module binding in the notebook's global namespace.
for hour, prime_llm_output in tqdm.notebook.tqdm(zip(time_info['Hour'], time_info['LLM Description']), total=len(time_info)):
    # Ask this model to paraphrase the prime model's description.
    # NOTE(review): 'contend' looks like a typo for 'content'; the prompt is left
    # unchanged so regenerated outputs stay comparable with existing runs.
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=4096, return_dict_in_generate=True,
                                 output_hidden_states=True)

    # Decode only the tokens after the prompt (input_len = prompt length)
    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Stack the per-layer hidden states of the last generation step and drop
    # singleton dimensions, giving one vector per layer.
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([hour, generated_text])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Description'])
time_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)
## Extract day-of-week information
# zip() has no length, so the total is passed explicitly to give the progress bar a known end.
for day, prime_llm_output in tqdm.notebook.tqdm(zip(day_info['Day'], day_info['LLM Description']), total=len(day_info)):
    # Ask this model to paraphrase the prime model's description.
    # NOTE(review): 'contend' looks like a typo for 'content'; the prompt is left
    # unchanged so regenerated outputs stay comparable with existing runs.
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=4096, return_dict_in_generate=True,
                                 output_hidden_states=True)

    # Decode only the tokens after the prompt (input_len = prompt length)
    generated_text = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Stack the per-layer hidden states of the last generation step and drop
    # singleton dimensions, giving one vector per layer.
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, generated_text])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Description'])
# Fixed: the file name previously lacked the '.' before 'csv' ('..._{LLM_NAME_}csv.zip'),
# which made it inconsistent with the station/time outputs.
day_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# Bundle all three groups of hidden states into one compressed archive
np.savez_compressed(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/hidden_states_{LLM_NAME_}.npz', station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# PRIME_LLM_NAME names the run whose saved descriptions are read below;
# LLM_NAME names the model loaded in this cell.
PRIME_LLM_NAME = 'qwen3-14b'
LLM_NAME = 'qwen3-8b'
PRIME_LLM_NAME_ = PRIME_LLM_NAME.replace('-', '_')
# Output-file suffix: prime model name followed by this model's name, underscore-separated
LLM_NAME_ = f"{PRIME_LLM_NAME_}_{LLM_NAME.replace('-', '_')}"

# Load the tokenizer and the causal language model
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", model_max_length=2048)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-8B",
    torch_dtype="auto",
    device_map="auto",
)

# Accumulators for generated text and hidden states, one pair per entity type
station_description_llm = []
station_hidden_states_llm = []
time_description_llm = []
time_hidden_states_llm = []
day_description_llm = []
day_hidden_states_llm = []

# Descriptions previously saved by the prime model's run
station_info = pd.read_csv(
    f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/station_description_{PRIME_LLM_NAME_}.csv.zip',
    index_col=0,
)
time_info = pd.read_csv(
    f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/time_description_{PRIME_LLM_NAME_}.csv.zip',
    index_col=0,
)
day_info = pd.read_csv(
    f'{RAW_DATASET_DIR}/description-{PRIME_LLM_NAME}/day_description_{PRIME_LLM_NAME_}.csv.zip',
    index_col=0,
)
## Extract station information
# zip() has no length, so the total is passed explicitly to give the progress bar a known end.
for station_id, prime_llm_output in tqdm.notebook.tqdm(zip(station_info['Station Name'], station_info['LLM Description']), total=len(station_info)):
    # Ask this model to paraphrase the prime model's description.
    # NOTE(review): 'contend' looks like a typo for 'content'; the prompt is left
    # unchanged so regenerated outputs stay comparable with existing runs.
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=16384, return_dict_in_generate=True, output_hidden_states=True)

    # Decode only the tokens after the prompt (input_len = prompt length)
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Stack the per-layer hidden states of the last generation step and drop
    # singleton dimensions, giving one vector per layer.
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    station_description_llm.append([station_id, thinking_content, content])
    station_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
station_description_llm = DataFrame(station_description_llm, columns=['Station Name', 'LLM Thinking', 'LLM Description'])
station_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/station_description_{LLM_NAME_}.csv.zip', compression='zip')
station_hidden_states_llm = np.array(station_hidden_states_llm)
## Extract time-of-day information
# zip() has no length, so the total is passed explicitly to the progress bar.
# The loop variable is named `hour` (not `time`) so that it cannot clobber a
# `time` module binding in the notebook's global namespace.
for hour, prime_llm_output in tqdm.notebook.tqdm(zip(time_info['Hour'], time_info['LLM Description']), total=len(time_info)):
    # Ask this model to paraphrase the prime model's description.
    # NOTE(review): 'contend' looks like a typo for 'content'; the prompt is left
    # unchanged so regenerated outputs stay comparable with existing runs.
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    # Decode only the tokens after the prompt (input_len = prompt length)
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Stack the per-layer hidden states of the last generation step and drop
    # singleton dimensions, giving one vector per layer.
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    time_description_llm.append([hour, thinking_content, content])
    time_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
time_description_llm = DataFrame(time_description_llm, columns=['Hour', 'LLM Thinking', 'LLM Description'])
time_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/time_description_{LLM_NAME_}.csv.zip', compression='zip')
time_hidden_states_llm = np.array(time_hidden_states_llm)
## Extract day-of-week information
# zip() has no length, so the total is passed explicitly to give the progress bar a known end.
for day, prime_llm_output in tqdm.notebook.tqdm(zip(day_info['Day'], day_info['LLM Description']), total=len(day_info)):
    # Ask this model to paraphrase the prime model's description.
    # NOTE(review): 'contend' looks like a typo for 'content'; the prompt is left
    # unchanged so regenerated outputs stay comparable with existing runs.
    input_text = 'Paraphrase the following contend.\n' + prime_llm_output
    # Encode input text
    inputs = tokenize_llm_inputs(tokenizer, llm_name=LLM_NAME.split('-')[0], input_text=input_text)
    # Run text completion without tracking gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, temperature=0.6, top_p=0.95, top_k=20, min_p=0, max_new_tokens=32768, return_dict_in_generate=True, output_hidden_states=True)

    # Decode only the tokens after the prompt (input_len = prompt length)
    thinking_content, content = decode_llm_outputs(tokenizer, llm_name=LLM_NAME.split('-')[0], output_sequences=outputs['sequences'], input_len=len(inputs['input_ids'][0]))
    # Stack the per-layer hidden states of the last generation step and drop
    # singleton dimensions, giving one vector per layer.
    hidden_states = torch.vstack(outputs.hidden_states[-1]).squeeze()
    day_description_llm.append([day, thinking_content, content])
    day_hidden_states_llm.append(hidden_states.cpu().float().numpy())
    del inputs
    del outputs
    torch.cuda.empty_cache()  # release cached GPU memory
day_description_llm = DataFrame(day_description_llm, columns=['Day', 'LLM Thinking', 'LLM Description'])
# Fixed: the file name previously lacked the '.' before 'csv' ('..._{LLM_NAME_}csv.zip'),
# which made it inconsistent with the station/time outputs.
day_description_llm.to_csv(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/day_description_{LLM_NAME_}.csv.zip', compression='zip')
day_hidden_states_llm = np.array(day_hidden_states_llm)

# Bundle all three groups of hidden states into one compressed archive
np.savez_compressed(f'{RAW_DATASET_DIR}/description-{LLM_NAME}/hidden_states_{LLM_NAME_}.npz',
                    station=station_hidden_states_llm, time=time_hidden_states_llm, day=day_hidden_states_llm)