# とりあえず練習用で山登り法のNotebook

### パッケージ

In [1]:
ex_num = "000"

In [2]:
import os
# 環境によって変更
os.chdir('/home/jovyan/work/notebook')

In [3]:
import numpy as np
import pandas as pd
import os
import time

from tqdm.notebook import tqdm

import random
from typing import List, Callable
import itertools, math

from utils import PerplexityCalculator

### データ読み込み

In [4]:
df = pd.read_csv("../input/sample_submission.csv")

In [5]:
sub_df = df.copy()
sub_df["best_value"] = 0

In [6]:
df

Unnamed: 0,id,text
0,0,advent chimney elf family fireplace gingerbrea...
1,1,advent chimney elf family fireplace gingerbrea...
2,2,yuletide decorations gifts cheer holiday carol...
3,3,yuletide decorations gifts cheer holiday carol...
4,4,hohoho candle poinsettia snowglobe peppermint ...
5,5,advent chimney elf family fireplace gingerbrea...


### 関数

In [7]:
# 山登り法の実装
def hill_climbing(
    scorer,
    initial_sequence: List[str],
    max_iterations: int = 1000,
    BATCH_SIZE: int = 32
) -> List[str]:
    
    """山登り法で最適な順序を探索する。"""
    current_sequence = initial_sequence
    current_value = scorer.get_perplexity(" ".join(current_sequence), batch_size=1)
    
    best_sequence = current_sequence
    best_value = current_value
    
    # バッチ計算用のリスト
    perms = []
        
    for iteration in tqdm(range(max_iterations)):
        
        # listの要素をランダムに1つ入れ替える
        neighbor_sequence = best_sequence[:]
        i, j = random.sample(range(len(best_sequence)), 2)
        neighbor_sequence[i], neighbor_sequence[j] = neighbor_sequence[j], neighbor_sequence[i]
        
        # スペース区切りの文字列に変換
        perms.append(" ".join(neighbor_sequence))
        
        if len(perms) == BATCH_SIZE:
            
            # 評価値を計算
            neighbor_values = scorer.get_perplexity(perms, batch_size=BATCH_SIZE)
            
            # 改善があれば更新
            for value, sequence in zip(neighbor_values, perms):
                if value < best_value:
                    best_value = value
                    best_sequence = sequence.split(" ")
                    print( f"New best = {best_value} with '{best_sequence}'" )
    
            perms = []

    return best_sequence, best_value
        


In [8]:
def search_optimal_permutation(id: int, scorer, max_iterations: int = 1000, BATCH_SIZE=64):
    
    df = pd.read_csv("../input/sample_submission.csv")
    
    # 文字列を単語に分割
    words = df.loc[id,"text"].split()
    
    # 初期順序をシャッフル
    initial_sequence = words
    random.shuffle(initial_sequence)
    
    start = time.time()
    
    # 山登り法で最適な順序を探索
    best_sequence, best_value = hill_climbing(
        scorer=scorer,
        initial_sequence=initial_sequence,
        max_iterations=max_iterations,
        BATCH_SIZE=BATCH_SIZE
    )
    
    print(f"{id}th sample: {best_sequence}")
    print(f"Elapsed time: {time.time() - start:.2f} sec")
    print(f"Best value: {best_value}")
    
    return best_sequence, best_value


In [9]:
# LOAD GEMMA SCORER
scorer = PerplexityCalculator('../model/gemma-2-9b')

for i in [0,1,2,3,4,5]:
    
    print(f"Processing {i}th sample")
    best_sequence, best_value = search_optimal_permutation(
        id=4,
        scorer=scorer,
        max_iterations=2**9,
        BATCH_SIZE=16
    )
    
    sub_df.loc[i, "best_value"] = best_value
    sub_df.loc[i, "selected_text"] = " ".join(best_sequence)

scorer.clear_gpu_memory()

cuda


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Processing 0th sample


  0%|          | 0/512 [00:00<?, ?it/s]

New best = 1902.2282668101732 with '['doll', 'night', 'workshop', 'angel', 'it', 'to', 'with', 'candle', 'fireplace', 'have', 'season', 'bow', 'hope', 'as', 'candy', 'in', 'wish', 'star', 'eggnog', 'wonder', 'we', 'dream', 'joy', 'card', 'of', 'fruitcake', 'believe', 'the', 'peppermint', 'and', 'from', 'wreath', 'cookie', 'wrapping', 'hohoho', 'merry', 'milk', 'snowglobe', 'poinsettia', 'chocolate', 'paper', 'toy', 'game', 'kaggle', 'not', 'puzzle', 'you', 'peace', 'greeting', 'that']'
New best = 1822.223066866023 with '['doll', 'night', 'workshop', 'angel', 'it', 'to', 'with', 'candle', 'fireplace', 'have', 'season', 'wish', 'hope', 'as', 'candy', 'in', 'bow', 'star', 'eggnog', 'wonder', 'toy', 'dream', 'joy', 'card', 'of', 'fruitcake', 'believe', 'the', 'peppermint', 'and', 'from', 'wreath', 'cookie', 'wrapping', 'hohoho', 'merry', 'milk', 'snowglobe', 'poinsettia', 'chocolate', 'paper', 'we', 'game', 'kaggle', 'not', 'puzzle', 'you', 'peace', 'greeting', 'that']'
New best = 1759.273

  sub_df.loc[i, "best_value"] = best_value


  0%|          | 0/512 [00:00<?, ?it/s]

New best = 1652.6845586534514 with '['bow', 'of', 'candy', 'poinsettia', 'from', 'fireplace', 'you', 'greeting', 'and', 'that', 'hope', 'hohoho', 'wonder', 'with', 'paper', 'not', 'doll', 'wreath', 'eggnog', 'kaggle', 'night', 'fruitcake', 'the', 'game', 'star', 'peppermint', 'angel', 'snowglobe', 'chocolate', 'workshop', 'in', 'joy', 'dream', 'have', 'to', 'card', 'it', 'we', 'candle', 'wrapping', 'wish', 'milk', 'as', 'puzzle', 'toy', 'season', 'cookie', 'believe', 'peace', 'merry']'
New best = 1633.4302003356393 with '['bow', 'of', 'candy', 'poinsettia', 'from', 'fireplace', 'you', 'greeting', 'and', 'that', 'hope', 'hohoho', 'wonder', 'with', 'paper', 'not', 'doll', 'wreath', 'eggnog', 'kaggle', 'night', 'fruitcake', 'milk', 'game', 'star', 'peppermint', 'angel', 'snowglobe', 'chocolate', 'workshop', 'in', 'joy', 'dream', 'have', 'to', 'card', 'it', 'we', 'candle', 'wrapping', 'wish', 'the', 'as', 'puzzle', 'toy', 'season', 'cookie', 'believe', 'peace', 'merry']'
New best = 1608.10

  0%|          | 0/512 [00:00<?, ?it/s]

New best = 1343.6230325404526 with '['merry', 'game', 'workshop', 'from', 'wrapping', 'paper', 'kaggle', 'wreath', 'peace', 'cookie', 'snowglobe', 'candy', 'wish', 'wonder', 'it', 'that', 'bow', 'with', 'of', 'candle', 'doll', 'milk', 'puzzle', 'hope', 'we', 'to', 'poinsettia', 'and', 'card', 'chocolate', 'hohoho', 'fruitcake', 'star', 'toy', 'the', 'night', 'not', 'greeting', 'season', 'eggnog', 'as', 'dream', 'peppermint', 'believe', 'angel', 'in', 'have', 'joy', 'fireplace', 'you']'
New best = 1327.9693500653907 with '['merry', 'game', 'workshop', 'from', 'wrapping', 'paper', 'kaggle', 'wreath', 'peace', 'cookie', 'snowglobe', 'candy', 'wish', 'wonder', 'it', 'that', 'bow', 'chocolate', 'of', 'candle', 'doll', 'milk', 'puzzle', 'hope', 'we', 'to', 'poinsettia', 'and', 'card', 'with', 'hohoho', 'fruitcake', 'star', 'toy', 'the', 'night', 'not', 'greeting', 'season', 'eggnog', 'as', 'dream', 'peppermint', 'believe', 'angel', 'in', 'have', 'joy', 'fireplace', 'you']'
New best = 1287.11

KeyboardInterrupt: 

In [None]:
sub_df.loc[0, "best_value"] = best_value
sub_df.loc[0, "text"] = " ".join(best_sequence)

  sol_df.loc[0, "best_value"] = best_value


In [None]:
sub_df

Unnamed: 0,id,text,best_value
0,0,ornament reindeer mistletoe family advent scro...,1022.1745
1,1,advent chimney elf family fireplace gingerbrea...,0.0
2,2,yuletide decorations gifts cheer holiday carol...,0.0
3,3,yuletide decorations gifts cheer holiday carol...,0.0
4,4,hohoho candle poinsettia snowglobe peppermint ...,0.0
5,5,advent chimney elf family fireplace gingerbrea...,0.0


In [None]:
sub_df.to_csv(f"../submission{ex_num}.csv", index=False)