In [1]:
from huggingface_hub import login, HfApi
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.models.llama import LlamaConfig
import os
import time

# Hugging Face access token.
# Read from the HUGGINGFACE_TOKEN environment variable; when it is unset this is
# None and every from_pretrained call below receives token=None.
token = os.environ.get("HUGGINGFACE_TOKEN")

# Hub id of the model to load
base_model_name = "meta-llama/Llama-3.1-70B-Instruct"

# Build the cache path under /raid from the current user name
user_dir = os.environ.get("USER")
save_directory = f"/raid/{user_dir}/{base_model_name.replace('/', '_')}"

# Create the cache directory
os.makedirs(save_directory, exist_ok=True)
print(f"Model will be saved to: {save_directory}")

# 4-bit quantization settings
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # run the computation in bfloat16
)

# Model config (rope_scaling workaround).
# The token is passed here too, so the config is fetched with the same
# credentials as the tokenizer and the model weights.
config = LlamaConfig.from_pretrained(base_model_name, cache_dir=save_directory, token=token)
# Only patch rope_scaling when it is actually set; subscripting None would raise.
if getattr(config, "rope_scaling", None) is not None:
    config.rope_scaling["original_max_position_embeddings"] = 8191  # adjusted value kept from the original workaround

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name, cache_dir=save_directory, token=token)

# Fall back to the EOS token id when no pad token id is configured
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Load the non-quantized model (float16 weights)
model_base = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    config=config,
    device_map="auto",
    torch_dtype=torch.float16,
    cache_dir=save_directory,
    token=token,
)

# Load the 4-bit quantized model
model_quantized = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    config=config,
    device_map="auto",
    quantization_config=bnb_config,
    cache_dir=save_directory,
    token=token,
)

print("Model and tokenizer loaded successfully!")
print(f"All files are saved in: {save_directory}")


Model will be saved to: /raid/sasaki/meta-llama_Llama-3.1-70B-Instruct


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Model and tokenizer loaded successfully!
All files are saved in: /raid/sasaki/meta-llama_Llama-3.1-70B-Instruct


In [2]:
# Helper that reports which device a model's first parameter lives on
def print_model_device_info(model, model_name):
    """Print the name and device of the first entry of ``model.named_parameters()``.

    Only the first parameter is inspected. Nothing is printed when the model
    yields no parameters.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs whose parameter has a ``device``.
        model_name: Label placed at the start of the printed line.
    """
    first_entry = next(iter(model.named_parameters()), None)
    if first_entry is None:
        return
    param_name, param = first_entry
    print(f"{model_name} - Parameter: {param_name}, Device: {param.device}")

# Report the device of the first parameter of each loaded model.
print_model_device_info(model_base, "Base Model")
print_model_device_info(model_quantized, "Quantized Model")

Base Model - Parameter: model.embed_tokens.weight, Device: cuda:0
Quantized Model - Parameter: model.embed_tokens.weight, Device: cuda:1


In [3]:
def calculate_task_difficulty(input_text: str) -> float:
    """
    Estimate the difficulty of a task from its input text.

    Provisional metric: the number of tokens that the notebook-level
    ``tokenizer`` produces for ``input_text``.
    """
    return len(tokenizer.tokenize(input_text))

In [4]:
def select_model(difficulty: float, threshold: float = 50) -> tuple:
    """
    Choose which model to use for a given difficulty.

    Args:
        difficulty: Difficulty score of the task.
        threshold: Cut-off; scores strictly greater than it select the
            non-quantized model.

    Returns:
        A ``(model, tokenizer)`` pair: ``(model_base, tokenizer)`` when
        ``difficulty > threshold``, otherwise ``(model_quantized, tokenizer)``.
    """
    if difficulty > threshold:
        return model_base, tokenizer
    return model_quantized, tokenizer


In [5]:
def generate_output(model, tokenizer, input_text: str, max_new_tokens: int = 1024) -> str:
    """
    Generate text for ``input_text`` with the given model and tokenizer.

    Args:
        model: Model exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Tokenizer used to encode the prompt and decode the result;
            its ``pad_token_id`` is forwarded to ``generate``.
        input_text: Prompt text.
        max_new_tokens: Value forwarded to ``generate`` as ``max_new_tokens``.
            Defaults to 1024, the value that was previously hard-coded.

    Returns:
        The first generated sequence (``outputs[0]``), decoded with
        ``skip_special_tokens=True``.
    """
    # Tokenize and move the encoded inputs to the model's device
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Drop token_type_ids when the tokenizer produced them so they are not
    # forwarded to generate()
    inputs.pop("token_type_ids", None)

    # Generate
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,  # reuse the pad token id configured on the tokenizer
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [6]:
import time
# Main processing
def process_input(input_text: str, difficulty_threshold: float = 50) -> str:
    """
    Run the whole pipeline for one prompt and print a short report.

    Steps: measure the difficulty, select a model based on it, generate the
    output, then print which model was used, the difficulty and the elapsed
    wall-clock time (difficulty measurement and model selection included).

    Args:
        input_text: Prompt text.
        difficulty_threshold: Threshold forwarded to ``select_model``.

    Returns:
        The generated text.
    """
    # perf_counter is the clock intended for measuring short durations;
    # unlike time.time() it cannot go backwards when the system clock changes.
    start_time = time.perf_counter()

    # 1. Measure the task difficulty
    difficulty = calculate_task_difficulty(input_text)

    # 2. Select the appropriate model
    model, selected_tokenizer = select_model(difficulty, difficulty_threshold)

    # 3. Generate the output
    output = generate_output(model, selected_tokenizer, input_text)

    elapsed = time.perf_counter() - start_time

    print("=" * 50)
    # Identity check: we want to know whether this is the very same model object
    print(f"Using model: {'Base model' if model is model_base else 'Quantized'}")
    print(f"Task Difficulty: {difficulty}")
    print(f"Processing Time: {elapsed:.2f} seconds")

    return output

In [7]:
# Test: run the full pipeline on a single prompt with the default threshold
input_text = "Please explain Linear Algebra"
result = process_input(input_text)
print("Generated Output:", result)


Using model: Quantized
Task Difficulty: 4
Processing Time: 62.54 seconds
Generated Output: Please explain Linear Algebra
Linear algebra is a branch of mathematics that deals with the study of linear equations, vector spaces, linear transformations, and linear operators. It is a fundamental subject that has numerous applications in various fields, including physics, engineering, computer science, and data analysis. Here's a comprehensive overview of linear algebra:
**Key Concepts:**

1. **Vector Spaces:** A vector space is a set of vectors that can be added and scaled (multiplied by a scalar). Vector spaces can be finite-dimensional (e.g., 2D, 3D) or infinite-dimensional.
2. **Linear Independence:** A set of vectors is said to be linearly independent if none of the vectors can be expressed as a linear combination of the others.
3. **Span:** The span of a set of vectors is the set of all linear combinations of those vectors.
4. **Basis:** A basis is a set of linearly independent vectors 

In [8]:
# Short prompts fed to process_input when comparing the two models
short_inputs = [
    "Hello, how is the weather today?",
    "Can you recommend some popular spots in Tokyo?",
    "What are your thoughts on the future of technology?"
]

In [10]:
# Run the short prompts with a threshold of 0: select_model picks the base
# model whenever difficulty > threshold, so any non-zero token count selects it.
print("Using Base Model:")
for input_text in short_inputs:
    result = process_input(input_text, difficulty_threshold=0)

# Run the SAME short prompts with a threshold of 50: in the recorded run their
# difficulties (8-10) stay below it, so the quantized model is selected.
# NOTE(review): `result` is overwritten on every iteration and never read here;
# only the report printed by process_input is kept.
print("Using Quantized Model:")
for input_text in short_inputs:
    result = process_input(input_text, difficulty_threshold=50)

Using Base Model:
Using model: Base model
Task Difficulty: 8
Processing Time: 12.04 seconds
Using model: Base model
Task Difficulty: 9
Processing Time: 75.18 seconds
Using model: Base model
Task Difficulty: 10
Processing Time: 45.70 seconds
Using Quantized Model:
Using model: Quantized
Task Difficulty: 8
Processing Time: 4.32 seconds
Using model: Quantized
Task Difficulty: 9
Processing Time: 38.58 seconds
Using model: Quantized
Task Difficulty: 10
Processing Time: 50.08 seconds


In [10]:
!nvidia-smi

Mon Nov 25 04:27:25 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A30                     Off |   00000000:B5:00.0 Off |                    0 |
| N/A   32C    P0             31W /  165W |   18001MiB /  24576MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
import pandas as pd

# Names of the two result files to compare
file1 = "glue_comparison_results_meta-llama_Llama-3.1-70B-Instruct_mrpc_20241129_105418.csv"
file2 = "10ex_glue_comparison_results.csv"

# Load both CSV files
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

# Helper that computes the summary statistics
def calculate_summary(df):
    """Summarise accuracy and run time for the base and quantized models.

    Args:
        df: Frame with the columns ``base_model_correct``,
            ``quantized_model_correct``, ``base_model_time`` and
            ``quantized_model_time``.

    Returns:
        dict with, in this order: the two accuracies (column mean times 100,
        i.e. a percentage), the two mean run times and the two run-time
        standard deviations (seconds, per the original notebook comments).
    """
    summary = {}

    # Accuracy (%): mean of the "correct" column scaled by 100
    for key, column in (
        ("base_model_accuracy", "base_model_correct"),
        ("quantized_model_accuracy", "quantized_model_correct"),
    ):
        summary[key] = df[column].mean() * 100

    # Mean run time
    for key, column in (
        ("base_model_avg_time", "base_model_time"),
        ("quantized_model_avg_time", "quantized_model_time"),
    ):
        summary[key] = df[column].mean()

    # Standard deviation of the run time
    for key, column in (
        ("base_model_time_std", "base_model_time"),
        ("quantized_model_time_std", "quantized_model_time"),
    ):
        summary[key] = df[column].std()

    return summary

# Compute the summary statistics for each file
summary1 = calculate_summary(df1)
summary2 = calculate_summary(df2)

# Per-metric difference (File1 minus File2)
comparison = {
    key: summary1[key] - summary2[key]
    for key in summary1
}

# Turn each summary into a one-row DataFrame labelled by its index
summary_df1 = pd.DataFrame([summary1], index=["File1"])
summary_df2 = pd.DataFrame([summary2], index=["File2"])
comparison_df = pd.DataFrame([comparison], index=["Difference"])

# Stack the three rows and save them to CSV (nothing is displayed here)
output_filename = "glue_comparison_summary.csv"
result_df = pd.concat([summary_df1, summary_df2, comparison_df])
result_df.to_csv(output_filename)

print("Comparison complete. Results saved to:", output_filename)


Comparison complete. Results saved to: glue_comparison_summary.csv
