## 步骤1: GPU检查和Drive挂载

In [3]:
# 检查GPU
!nvidia-smi

import torch
print(f"CUDA可用: {torch.cuda.is_available()}")
print(f"GPU型号: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

Thu Dec 11 10:48:41 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Report interpreter and library versions so they can be compared with the
# Colab / VS Code environment .yaml
import sys

import diffusers
import torch
import transformers

# (label, value) pairs, printed in this order as "label: value"
version_report = (
    ("Python", sys.version),
    ("PyTorch", torch.__version__),
    ("Transformers", transformers.__version__),
    ("Diffusers", diffusers.__version__),
    ("CUDA", torch.cuda.is_available()),
    ("GPU数量", torch.cuda.device_count()),
)
for label, value in version_report:
    print(f"{label}: {value}")

torch: 2.9.0+cu126 cuda: True device count: 1


## 步骤2: 上传并解压代码

**重要**: 
1. 将 `tess-diffusion-colab.zip` 上传到 Google Drive 的 `MyDrive` 目录 (此文件使用跨平台脚本打包, 不含反斜杠路径)
2. 先确认 Google Drive 已挂载到 `/content/drive`, 再运行下面的代码自动解压
3. 所有路径已适配 Linux/Colab 环境; 如果你上传的是旧的 Windows 压缩包, 下方提供修复脚本

**不要使用 GitHub clone** - 代码可能未包含最新修复

In [None]:
# Extract the project archive from Drive (tess-diffusion-colab.zip preferred)
import itertools
import os
import pathlib
import shutil
import zipfile


def repair_backslash_names(extract_root):
    """Rebuild real directories from flat ``dir\\file`` names.

    Archives built on Windows may use ``\\`` as the path separator. When such
    an archive is extracted on Linux, each entry becomes a single file whose
    *name* contains backslashes. This walks ``extract_root``, and for every
    such file:

    * a name ending in ``\\`` is a directory placeholder: the directory is
      created and the placeholder file is deleted (renaming it would leave a
      *file* where later entries need a directory);
    * a name whose target would fall outside ``extract_root`` (``..``
      segments) is deleted instead of being moved;
    * any other name is moved to the nested path obtained by splitting on
      ``\\``.

    Args:
        extract_root: Directory the archive was extracted into.

    Returns:
        ``(moved, removed)``: number of files relocated and number of
        placeholder/unsafe files deleted.
    """
    root_path = pathlib.Path(extract_root)
    root_abs = os.path.abspath(extract_root)

    # Snapshot first: the tree is modified while the entries are processed.
    flat_entries = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(extract_root)
        for name in names
        if '\\' in name
    ]

    moved = 0
    removed = 0
    for full_path in flat_entries:
        rel = os.path.relpath(full_path, extract_root)
        parts = [part for part in rel.split('\\') if part]
        target = root_path.joinpath(*parts)
        target_abs = os.path.abspath(target)

        if os.path.commonpath([root_abs, target_abs]) != root_abs:
            # Never relocate a file outside the extraction directory.
            print(f"⚠️ 跳过越界路径并删除: {rel}")
            os.remove(full_path)
            removed += 1
        elif rel.endswith('\\'):
            # Directory placeholder: make the directory, drop the empty file.
            target.mkdir(parents=True, exist_ok=True)
            os.remove(full_path)
            removed += 1
        else:
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(full_path, str(target))
            moved += 1
    return moved, removed


zip_path = '/content/drive/MyDrive/tess-diffusion-colab.zip'  # new cross-platform archive
fallback_old_zip = '/content/drive/MyDrive/tess-diffusion-final.zip'  # old Windows archive (may contain backslashes)
extract_to = '/content/tess-diffusion'

# Start from a clean directory if a previous extraction exists
if os.path.exists(extract_to):
    print(f"⚠️ 发现旧目录 {extract_to}, 正在删除...")
    shutil.rmtree(extract_to)

use_zip = None
if os.path.exists(zip_path):
    use_zip = zip_path
    print(f"✅ 使用新zip文件: {zip_path}")
elif os.path.exists(fallback_old_zip):
    use_zip = fallback_old_zip
    print(f"⚠️ 新zip缺失,使用旧文件: {fallback_old_zip}")
else:
    print("❌ 未找到 tess-diffusion-colab.zip 或 tess-diffusion-final.zip, 请上传后重试")
    if not os.path.isdir('/content/drive/MyDrive'):
        # The most common cause of a "missing" archive is an unmounted Drive.
        print("ℹ️ 未检测到 /content/drive/MyDrive, 请先挂载 Google Drive (drive.mount('/content/drive')) 后重试")

if use_zip:
    size_mb = os.path.getsize(use_zip) / (1024*1024)
    print(f"文件大小: {size_mb:.2f} MB")
    print(f"开始解压到: {extract_to}")
    with zipfile.ZipFile(use_zip, 'r') as zf:
        zf.extractall(extract_to)
    print("✓ 解压完成")

    # Plain-Python equivalent of `%cd`: later cells use relative paths
    os.chdir(extract_to)
    print(os.getcwd())

    # Repair names that still contain '\\' (archives packed on Windows)
    moved, removed = repair_backslash_names(extract_to)
    if moved or removed:
        print(f"⚠️ 检测到 {moved + removed} 个含反斜杠文件名, 执行修复...")
        print("✓ 反斜杠文件已修复为真实目录结构")
        if removed:
            print(f"✓ 清理残留伪文件 {removed} 个")
    else:
        print("✅ 没有发现含反斜杠的文件名, 结构正常")

    # Check the key directories (relative to the project root entered above)
    print("\n目录验证:")
    for d in ['scripts', 'sdlm', 'configs']:
        print(f"{d}: {'存在' if os.path.isdir(d) else '缺失'}")

    # Show the first 20 files as a sanity check
    print("\n列出前20个文件以确认:")
    walked_files = (
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk('.')
        for name in names
    )
    for path in itertools.islice(walked_files, 20):
        print(path)

    print("\n✅ 项目准备完成, 可继续下一步")

## 步骤3: 安装依赖 (~5分钟)

## 步骤2.5: 验证目录结构 (重要!)

确保解压成功并在正确的目录

In [None]:
import os

# 检查当前目录和关键文件
print("=" * 60)
print("目录验证")
print("=" * 60)
print(f"当前目录: {os.getcwd()}")
print()

# 检查关键文件
check_files = [
    'tess_train1_oneline.txt',
    'tess_valid1_oneline.txt',
    'run_mlm.py',
    'extend_tokenizer_vocab.py',
    'configs/tess_gpu_oneline_sc.json'
]

all_exists = True
for file in check_files:
    exists = os.path.exists(file)
    status = "✓" if exists else "✗"
    print(f"{status} {file}")
    if not exists:
        all_exists = False

print("=" * 60)

if all_exists:
    print("✓ 所有关键文件都存在,可以继续!")
else:
    print("✗ 缺少文件,请检查:")
    print("  1. 是否成功运行步骤2解压代码?")
    print("  2. 是否在正确的目录 /content/tess-diffusion ?")
    print("\n尝试切换目录:")
    if os.path.exists('/content/tess-diffusion'):
        %cd /content/tess-diffusion
        print("已切换到 /content/tess-diffusion,请重新运行此单元格验证!")
    else:
        print("未找到 /content/tess-diffusion,请返回步骤2!")

In [None]:
# Install PyTorch and the project dependencies at pinned versions
# NOTE(review): these pins (torch 1.12.0+cu113) differ from the torch 2.9.0+cu126
# shown in the runtime output earlier in this notebook — confirm the pinned wheels
# are installable on the runtime's Python, and presumably the runtime must be
# restarted if torch was already imported before this cell runs.
!pip install -q torch==1.12.0+cu113 torchvision==0.13.0+cu113 torchaudio==0.12.0 \
    --extra-index-url https://download.pytorch.org/whl/cu113

!pip install -q transformers==4.25.1 diffusers==0.7.2 datasets==2.14.6 \
    accelerate==0.12.0 tensorboard scipy scikit-learn nltk \
    sacrebleu evaluate bert_score

print("✓ 依赖安装完成")

In [None]:
# Confirm which library versions are importable after installation
import diffusers
import torch
import transformers

# (label, value) pairs, printed in this order as "label: value"
installed = (
    ("PyTorch", torch.__version__),
    ("Transformers", transformers.__version__),
    ("Diffusers", diffusers.__version__),
    ("CUDA", torch.cuda.is_available()),
)
for label, value in installed:
    print(f"{label}: {value}")

## 步骤4: 扩展Tokenizer词汇表 (~3分钟)

In [None]:
# Inspect the data files: preview the first 3 training lines, then count the
# lines of every tess_*.txt file in the current directory
!head -n 3 tess_train1_oneline.txt
!wc -l tess_*.txt

In [None]:
# Extend the tokenizer vocabulary — key step; later cells load the tokenizer
# from the extended_tokenizer directory
!python extend_tokenizer_vocab.py \
    --train_file tess_train1_oneline.txt \
    --base_model roberta-base \
    --output_dir extended_tokenizer

# Show the extension statistics (presumably written by the script above — verify)
!cat extended_tokenizer/vocab_extension_stats.json

In [None]:
# Check that the extended tokenizer keeps each sample entity as a single token
import json
from transformers import AutoTokenizer

# Load the tokenizer saved in the extended_tokenizer directory
tokenizer = AutoTokenizer.from_pretrained('extended_tokenizer')

# Sample entities to probe
test_entities = ['South_Korea', 'North_Korea', 'Cleric_(Iraq)', 'Government_Official_(Turkey)']

rule = "=" * 60
print(rule)
print("Tokenizer扩展验证")
print(rule)
print(f"词汇表大小: {len(tokenizer)}")

# An entity passes only when tokenization returns exactly [entity]
split_entities = []
for entity in test_entities:
    tokens = tokenizer.tokenize(entity)
    if list(tokens) == [entity]:
        mark = "✓"
    else:
        mark = "✗"
        split_entities.append(entity)
    print(f"{mark} '{entity}' → {tokens}")
all_single_token = not split_entities

print(rule)
if not all_single_token:
    print("✗ 部分实体被分词,扩展可能有问题")
else:
    print("✓ 所有测试实体都是单个token,扩展成功!")
print(rule)

## 步骤5: 配置训练参数

In [None]:
import json
import os

# 确保在正确的目录
if not os.path.exists('configs/tess_gpu_oneline_sc.json'):
    print("⚠️ 当前目录不正确,尝试切换到tess-diffusion...")
    if os.path.exists('/content/tess-diffusion'):
        %cd /content/tess-diffusion
        print("✓ 已切换到 /content/tess-diffusion")
    else:
        print("✗ 未找到tess-diffusion目录,请先运行步骤2!")
        raise FileNotFoundError("请先运行步骤2解压代码")

# 验证目录
print(f"当前目录: {os.getcwd()}")
print(f"configs/tess_gpu_oneline_sc.json 存在: {os.path.exists('configs/tess_gpu_oneline_sc.json')}")

# 读取已配置好的GPU配置(已包含save_total_limit: 5)
with open('configs/tess_gpu_oneline_sc.json', 'r') as f:
    config = json.load(f)

# 修改为Colab适配配置
config.update({
    'tokenizer_name': 'extended_tokenizer',
    'output_dir': '/content/drive/MyDrive/tess_outputs',
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'num_train_epochs': 3,  # 快速验证改为1
    'fp16': True,
    'time_save_interval_seconds': 1800,
    'gdrive_backup_dir': '/content/drive/MyDrive/tess_backups',
    'backup_keep_last': 2,
    # save_total_limit: 5 已在tess_gpu_oneline_sc.json中配置
})

# 保存配置
with open('configs/tess_colab.json', 'w') as f:
    json.dump(config, f, indent=2)

print("\n✓ 配置已更新")
print(json.dumps(config, indent=2))

## 步骤6: 训练模型

### 选项A: 快速验证 (1 epoch, ~2小时)

In [None]:
# Quick run: train for 1 epoch (save_total_limit 5 is passed explicitly below)
# NOTE(review): this run writes to tess_outputs_quick, while the TensorBoard and
# checkpoint-lookup cells later in this notebook read tess_outputs — adjust
# those paths when monitoring or evaluating a quick run.
!python run_mlm.py \
    --model_name_or_path roberta-base \
    --tokenizer_name extended_tokenizer \
    --train_file tess_train1_oneline.txt \
    --validation_file tess_valid1_oneline.txt \
    --output_dir /content/drive/MyDrive/tess_outputs_quick \
    --line_by_line True \
    --max_seq_length 256 \
    --pad_to_max_length True \
    --per_device_train_batch_size 8 \
    --num_train_epochs 1 \
    --save_steps 500 \
    --save_total_limit 5 \
    --eval_steps 500 \
    --logging_steps 50 \
    --fp16 True \
    --simplex_value 5 \
    --num_diffusion_steps 500 \
    --self_condition logits_addition \
    --self_condition_zeros_after_softmax True \
    --overwrite_output_dir True

### 选项B: 完整训练 (3 epochs, ~6-7小时)

In [None]:
# Full run: train with configs/tess_colab.json (the config cell sets
# num_train_epochs to 3)
!python run_mlm.py configs/tess_colab.json

## 步骤7: 监控训练 (在训练时运行)

In [None]:
# Start TensorBoard, pointed at the training output directory on Drive
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/tess_outputs

In [None]:
# 查看最新checkpoints
!ls -lht /content/drive/MyDrive/tess_outputs/checkpoint-* | head -5

## 步骤8: 评测模型

### 快速评测 (200 queries, ~5分钟)

In [None]:
# Locate the most recent checkpoint-<step> entry in the training output directory
import os
import re


def find_latest_checkpoint(output_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` entry.

    Entries whose suffix is not purely numeric (for example
    ``checkpoint-tmp``) are ignored instead of raising ``ValueError``.

    Args:
        output_dir: Directory to search for ``checkpoint-<step>`` entries.

    Returns:
        ``f"{output_dir}/checkpoint-<step>"`` for the largest step, or
        ``None`` when ``output_dir`` is missing or holds no such entry.
    """
    if not os.path.isdir(output_dir):
        return None
    name_pattern = re.compile(r'checkpoint-(\d+)')
    by_step = {}
    for name in os.listdir(output_dir):
        matched = name_pattern.fullmatch(name)
        if matched:
            by_step[int(matched.group(1))] = name
    if not by_step:
        return None
    return f"{output_dir}/{by_step[max(by_step)]}"


# checkpoint_path is always defined: None makes a failed lookup explicit
# for the evaluation cells that interpolate it into their commands
checkpoint_path = find_latest_checkpoint('/content/drive/MyDrive/tess_outputs')
if checkpoint_path:
    print(f"使用checkpoint: {checkpoint_path}")
else:
    print("未找到checkpoint")

In [None]:
# Quick evaluation: tail mode with the --quick flag, on the checkpoint chosen above
!python run_optimized_eval.py \
    --checkpoint {checkpoint_path} \
    --mode tail \
    --quick

### Grid Search 最优参数 (~20分钟)

In [None]:
# Grid search for the best tess_t_eval, using 500 queries
!python run_optimized_eval.py \
    --checkpoint {checkpoint_path} \
    --grid_search \
    --num_queries 500

### 完整评测 (2000 queries, ~40分钟)

In [None]:
# Tail prediction (given h, r predict t); results go to eval_tail_results.json
!python run_optimized_eval.py \
    --checkpoint {checkpoint_path} \
    --mode tail \
    --num_queries 2000 \
    --tess_t_eval 60 \
    --neg_k 128 \
    --output eval_tail_results.json

In [None]:
# Head prediction (given r, t predict h); results go to eval_head_results.json
!python run_optimized_eval.py \
    --checkpoint {checkpoint_path} \
    --mode head \
    --num_queries 2000 \
    --tess_t_eval 60 \
    --neg_k 128 \
    --output eval_head_results.json

## 步骤9: 查看结果

In [None]:
import json

# Load both evaluation result files before printing anything
with open('eval_tail_results.json', 'r') as f:
    tail_results = json.load(f)
with open('eval_head_results.json', 'r') as f:
    head_results = json.load(f)

banner = "=" * 60
print(banner)
print("最终评测结果")
print(banner)

# Same four metrics, in the same order, for each prediction direction
metric_names = ('MRR', 'Hits@1', 'Hits@3', 'Hits@10')
sections = (
    ("\nTail预测 (给定h,r预测t):", tail_results),
    ("\nHead预测 (给定r,t预测h):", head_results),
)
for heading, scores in sections:
    print(heading)
    for metric in metric_names:
        print(f"  {metric}: {scores[metric]:.4f}")

print("\n" + banner)

# Relative change of tail MRR against the pre-fix baseline, in percent
baseline_tail_mrr = 0.167
tail_mrr = tail_results['MRR']
improvement = (tail_mrr - baseline_tail_mrr) / baseline_tail_mrr * 100
print(f"\nTail MRR提升: {improvement:.1f}%")
print(f"修复前: {baseline_tail_mrr:.4f}")
print(f"修复后: {tail_mrr:.4f}")

## 步骤10: 保存结果到Drive

In [None]:
# Create the results directory on Drive
!mkdir -p /content/drive/MyDrive/tess_final_results

# Copy the evaluation result files
!cp eval_*.json /content/drive/MyDrive/tess_final_results/

# Copy the checkpoint referenced by checkpoint_path, stored as best_checkpoint
# NOTE(review): checkpoint_path is whatever an earlier cell selected — confirm
# it is the checkpoint that should be kept as "best"
!cp -r {checkpoint_path} /content/drive/MyDrive/tess_final_results/best_checkpoint

print("✓ 结果已保存到 /content/drive/MyDrive/tess_final_results/")

---

## 常见问题

### 1. 训练中断怎么办?
从checkpoint恢复:
```python
!python run_mlm.py \
    --resume_from_checkpoint /content/drive/MyDrive/tess_outputs/checkpoint-3000 \
    configs/tess_colab.json
```

### 2. 内存不足?
减小batch size:
```python
config['per_device_train_batch_size'] = 4
config['gradient_accumulation_steps'] = 2
```

### 3. 评测结果仍然低?
- 运行grid search找最优tess_t_eval
- 检查实体是否被正确tokenize
- 确认训练loss < 2.0

### 4. Checkpoint管理
系统会自动保留最新的5个checkpoint (`save_total_limit: 5`):
- 每500步保存一个checkpoint
- 旧的checkpoint会被自动删除
- Google Drive备份保留最新2个

---

## 预期性能

修复后预期:
- **Tail MRR**: 35-45% (原16.7%)
- **Tail Hits@10**: 55-65% (原34.7%)
- **训练时间**: 6-7小时 (3 epochs)
- **Checkpoint数量**: 最多5个 (自动管理)