# とりあえず練習用で焼きなまし法のNotebook

### パッケージ

In [1]:
# Experiment number; it is embedded in the submission file name written near the end of the notebook.
ex_num = "000"

In [2]:
import os
# Change this to match your environment: every relative path used below is
# resolved from this working directory.
os.chdir('/home/jovyan/work/notebook')

# Directory that holds the model checkpoints loaded by the scorer.
# (A previous `path = "../../output"` assignment was overwritten immediately
# by this one and never read, so the dead store has been removed.)
path = "../model"

In [3]:
import numpy as np
import pandas as pd
import os
import time
import csv

from tqdm.notebook import tqdm

import random
from typing import List, Callable
import itertools, math
from typing import List, Tuple

from utils import PerplexityCalculator

### データ読み込み

In [4]:
# Load the sample submission (path is relative to the working directory set above).
df = pd.read_csv("../input/sample_submission.csv")

In [5]:
# Work on a copy so the loaded frame `df` stays untouched.
sub_df = df.copy()
# Initialise as float: the scores stored here later are floats, and assigning a
# float into an int64 column triggers pandas' incompatible-dtype FutureWarning.
sub_df["best_value"] = 0.0

In [6]:
# Display the loaded sample submission.
df

Unnamed: 0,id,text
0,0,advent chimney elf family fireplace gingerbrea...
1,1,advent chimney elf family fireplace gingerbrea...
2,2,yuletide decorations gifts cheer holiday carol...
3,3,yuletide decorations gifts cheer holiday carol...
4,4,hohoho candle poinsettia snowglobe peppermint ...
5,5,advent chimney elf family fireplace gingerbrea...


### 関数

In [7]:
def simulated_annealing(
    scorer,
    initial_sequence: List[str],
    max_iterations: int = 1000,
    BATCH_SIZE: int = 32,
    initial_temperature: float = 100.0,
    cooling_rate: float = 0.99
) -> Tuple[List[str], float]:
    """
    Search for a low-scoring ordering of words with batched simulated annealing.

    Each iteration proposes a neighbour of the current ordering by swapping two
    randomly chosen positions. Proposals are collected and scored together once
    ``BATCH_SIZE`` of them are pending; each scored proposal then goes through
    the usual acceptance test against the current value. The temperature is
    multiplied by ``cooling_rate`` once per full batch.

    Proposals still pending when the loop ends (``max_iterations`` not being a
    multiple of ``BATCH_SIZE``) are scored in one final, smaller batch so that
    no iteration is wasted.

    Args:
        scorer: Object exposing ``get_perplexity(text_or_texts, batch_size=...)``.
            It is called with one string (the result is used as a scalar score)
            and with a list of strings (the result is iterated alongside the
            candidates). Lower scores are treated as better.
        initial_sequence (List[str]): Starting ordering. It is not modified.
        max_iterations (int): Number of neighbour proposals to generate.
        BATCH_SIZE (int): Number of proposals scored per scorer call.
        initial_temperature (float): Starting temperature.
        cooling_rate (float): Multiplicative temperature decay per full batch.

    Returns:
        Tuple[List[str], float]: The best ordering seen and its score.
    """
    # Copy so the returned list never aliases the caller's list.
    current_sequence = list(initial_sequence)
    current_value = scorer.get_perplexity(" ".join(current_sequence), batch_size=1)

    best_sequence = list(current_sequence)
    best_value = current_value

    # With fewer than two items there is nothing to swap (random.sample would raise).
    if len(current_sequence) < 2:
        return best_sequence, best_value

    temperature = initial_temperature

    perms = []

    def _process_batch(candidates):
        """Score the pending candidates and apply acceptance / best-so-far updates."""
        nonlocal current_sequence, current_value, best_sequence, best_value

        # len(candidates) equals BATCH_SIZE for full batches and is smaller
        # only for the final flush.
        neighbor_values = scorer.get_perplexity(candidates, batch_size=len(candidates))

        for neighbor_value, perm in zip(neighbor_values, candidates):
            # Improvements are always accepted; worse candidates are accepted
            # with probability exp(-delta / T).
            delta = neighbor_value - current_value
            acceptance_probability = math.exp(-delta / temperature) if delta > 0 else 1.0

            if random.random() < acceptance_probability:
                current_sequence = perm.split(" ")
                current_value = neighbor_value

            # Track the best candidate independently of acceptance.
            if neighbor_value < best_value:
                best_sequence = perm.split(" ")
                best_value = neighbor_value
                print(f"New best = {best_value} with '{best_sequence}'")

    for iteration in tqdm(range(max_iterations)):
        # Generate a neighbour by swapping two random positions.
        neighbor_sequence = current_sequence[:]
        i, j = random.sample(range(len(current_sequence)), 2)
        neighbor_sequence[i], neighbor_sequence[j] = neighbor_sequence[j], neighbor_sequence[i]

        perms.append(" ".join(neighbor_sequence))

        if len(perms) == BATCH_SIZE:
            _process_batch(perms)

            # Reset the batch.
            perms = []

            # Cool down once per full batch.
            temperature *= cooling_rate

            # Stop once the temperature is negligibly small.
            if temperature < 1e-5:
                break

        # Report the temperature every 100 iterations.
        if iteration % 100 == 0:
            print(f"Iteration {iteration}, Temperature = {temperature}")

    # Flush proposals left over from an incomplete final batch; previously
    # they were generated but never scored.
    if perms:
        _process_batch(perms)

    return best_sequence, best_value

In [8]:
def search_optimal_permutation(
    id: int,
    scorer,
    max_iterations: int = 1000,
    BATCH_SIZE=64,
    initial_temperature: float = 100.0,
    cooling_rate: float = 0.99,
    input_path: str = "../input/sample_submission.csv",
):
    """
    Run simulated annealing on the words of one row of the input CSV.

    The row's ``text`` value is split on whitespace, shuffled to form the
    starting order, and handed to ``simulated_annealing``. The result, elapsed
    time and best score are printed.

    Args:
        id (int): Index label of the row to optimise (looked up with ``df.loc``).
        scorer: Scoring object forwarded unchanged to ``simulated_annealing``.
        max_iterations (int): Forwarded to ``simulated_annealing``.
        BATCH_SIZE: Forwarded to ``simulated_annealing``.
        initial_temperature (float): Forwarded to ``simulated_annealing``;
            defaults to the previously hard-coded 100.0.
        cooling_rate (float): Forwarded to ``simulated_annealing``;
            defaults to the previously hard-coded 0.99.
        input_path (str): CSV file to read; defaults to the previously
            hard-coded sample submission path.

    Returns:
        The ``(best_sequence, best_value)`` pair returned by ``simulated_annealing``.
    """
    df = pd.read_csv(input_path)

    # Split the text into words.
    words = df.loc[id, "text"].split()

    # Shuffle a copy to obtain the starting order.
    initial_sequence = list(words)
    random.shuffle(initial_sequence)

    start = time.time()

    # Search for the best ordering with simulated annealing.
    best_sequence, best_value = simulated_annealing(
        scorer,
        initial_sequence=initial_sequence,
        max_iterations=max_iterations,
        BATCH_SIZE=BATCH_SIZE,
        initial_temperature=initial_temperature,
        cooling_rate=cooling_rate
    )

    print(f"{id}th sample: {best_sequence}")
    print(f"Elapsed time: {time.time() - start:.2f} sec")
    print(f"Best value: {best_value}")

    return best_sequence, best_value


In [9]:
# Wrap string values in double quotes; leave every other value as it is.
def add_quotes_to_strings(value):
    """Return ``value`` surrounded by double quotes when it is a ``str``, otherwise ``value`` unchanged."""
    return '"{}"'.format(value) if isinstance(value, str) else value

In [10]:
# Load the Gemma scorer from the `gemma_2_9b` directory under `path`.
scorer = PerplexityCalculator(f'{path}/gemma_2_9b')

cuda


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [11]:
# scorer.clear_gpu_memory()

In [12]:
# Optimise rows 0-5 one by one and store each result in sub_df.
for i in [0,1,2,3,4,5]:
    
    print(f"Processing {i}th sample")
    # 2**9 = 512 proposals per sample, scored 16 at a time.
    best_sequence, best_value = search_optimal_permutation(
        id=i,
        scorer=scorer,
        max_iterations=2**9,
        BATCH_SIZE=16
    )
    
    print(best_sequence)
    # Record the best score and the corresponding word order for this row.
    sub_df.loc[i, "best_value"] = best_value
    sub_df.loc[i, "text"] = " ".join(best_sequence)



Processing 0th sample


  0%|          | 0/512 [00:00<?, ?it/s]

Iteration 0, Temperature = 100.0
New best = 4203.824562345984 with '['chimney', 'fireplace', 'advent', 'elf', 'ornament', 'reindeer', 'gingerbread', 'scrooge', 'mistletoe', 'family']'
New best = 2957.7599879429495 with '['chimney', 'reindeer', 'fireplace', 'elf', 'ornament', 'advent', 'gingerbread', 'scrooge', 'mistletoe', 'family']'
New best = 2122.088979078394 with '['ornament', 'reindeer', 'fireplace', 'elf', 'chimney', 'advent', 'gingerbread', 'scrooge', 'mistletoe', 'family']'
New best = 1718.5200270175208 with '['ornament', 'reindeer', 'family', 'elf', 'chimney', 'advent', 'gingerbread', 'scrooge', 'mistletoe', 'fireplace']'
New best = 1627.0620594881786 with '['ornament', 'reindeer', 'fireplace', 'chimney', 'elf', 'advent', 'mistletoe', 'scrooge', 'gingerbread', 'family']'
New best = 1481.4568637723241 with '['ornament', 'family', 'fireplace', 'chimney', 'elf', 'advent', 'gingerbread', 'scrooge', 'mistletoe', 'reindeer']'
Iteration 100, Temperature = 94.1480149401
New best = 146

  sub_df.loc[i, "best_value"] = best_value


  0%|          | 0/512 [00:00<?, ?it/s]

Iteration 0, Temperature = 100.0
New best = 4440.123631628586 with '['give', 'reindeer', 'sleep', 'drive', 'the', 'walk', 'jump', 'laugh', 'family', 'scrooge', 'night', 'mistletoe', 'bake', 'advent', 'chimney', 'elf', 'ornament', 'fireplace', 'and', 'gingerbread']'
New best = 4171.110140058703 with '['give', 'gingerbread', 'sleep', 'drive', 'the', 'walk', 'and', 'laugh', 'family', 'scrooge', 'night', 'mistletoe', 'bake', 'advent', 'chimney', 'elf', 'ornament', 'fireplace', 'jump', 'reindeer']'
New best = 3857.64626282737 with '['scrooge', 'gingerbread', 'sleep', 'drive', 'the', 'walk', 'jump', 'laugh', 'family', 'give', 'night', 'mistletoe', 'bake', 'advent', 'chimney', 'elf', 'ornament', 'fireplace', 'and', 'reindeer']'
New best = 3431.0616115058406 with '['scrooge', 'gingerbread', 'sleep', 'drive', 'the', 'walk', 'jump', 'laugh', 'family', 'give', 'bake', 'mistletoe', 'night', 'advent', 'chimney', 'elf', 'ornament', 'fireplace', 'and', 'reindeer']'
New best = 3325.498943407423 with '

  0%|          | 0/512 [00:00<?, ?it/s]

Iteration 0, Temperature = 100.0
New best = 4371.285895201858 with '['nice', 'ornament', 'holiday', 'cheer', 'yuletide', 'beard', 'nutcracker', 'grinch', 'polar', 'magi', 'carol', 'workshop', 'stocking', 'gifts', 'jingle', 'decorations', 'holly', 'sleigh', 'chimney', 'naughty']'
New best = 4138.650303425442 with '['nice', 'ornament', 'holiday', 'cheer', 'yuletide', 'beard', 'nutcracker', 'grinch', 'polar', 'magi', 'jingle', 'workshop', 'sleigh', 'gifts', 'carol', 'decorations', 'holly', 'stocking', 'chimney', 'naughty']'
New best = 1698.4986622608421 with '['magi', 'ornament', 'holiday', 'cheer', 'yuletide', 'beard', 'nutcracker', 'grinch', 'polar', 'nice', 'jingle', 'workshop', 'stocking', 'gifts', 'carol', 'decorations', 'holly', 'sleigh', 'chimney', 'naughty']'
New best = 1627.0620594881786 with '['magi', 'ornament', 'holiday', 'cheer', 'yuletide', 'beard', 'nutcracker', 'grinch', 'polar', 'nice', 'jingle', 'workshop', 'chimney', 'gifts', 'carol', 'decorations', 'holly', 'sleigh', '

  0%|          | 0/512 [00:00<?, ?it/s]

Iteration 0, Temperature = 100.0
New best = 6116.528646774525 with '['chimney', 'polar', 'magi', 'stocking', 'nutcracker', 'gifts', 'holiday', 'decorations', 'workshop', 'yuletide', 'is', 'sleigh', 'relax', 'eat', 'the', 'ornament', 'cheer', 'naughty', 'visit', 'and', 'holly', 'cheer', 'grinch', 'unwrap', 'carol', 'of', 'nice', 'beard', 'jingle', 'sing']'
New best = 5882.208114053411 with '['chimney', 'polar', 'magi', 'stocking', 'nutcracker', 'gifts', 'holiday', 'decorations', 'workshop', 'yuletide', 'is', 'sleigh', 'relax', 'eat', 'the', 'ornament', 'cheer', 'carol', 'visit', 'grinch', 'holly', 'cheer', 'and', 'unwrap', 'naughty', 'of', 'nice', 'beard', 'jingle', 'sing']'
New best = 5745.946909852821 with '['chimney', 'polar', 'magi', 'stocking', 'nutcracker', 'gifts', 'holiday', 'decorations', 'workshop', 'yuletide', 'is', 'sleigh', 'relax', 'eat', 'the', 'ornament', 'cheer', 'naughty', 'visit', 'grinch', 'holly', 'cheer', 'and', 'unwrap', 'carol', 'of', 'jingle', 'beard', 'nice', '

KeyboardInterrupt: 

In [None]:
# Display the submission frame after the optimisation loop.
sub_df

Unnamed: 0,id,text,best_value,selected_text
0,0,ornament reindeer elf mistletoe gingerbread fa...,960.244078,"""""reindeer mistletoe elf scrooge chimney firep..."
1,1,reindeer sleep drive the scrooge laugh gingerb...,654.829568,"""""reindeer mistletoe jump drive scrooge orname..."
2,2,jingle sleigh yuletide carol grinch naughty ni...,427.773457,"""""jingle grinch carol yuletide holly holiday c..."
3,3,grinch ornament decorations of holly yuletide ...,417.864096,"""""holiday is cheer yuletide holly jingle relax..."
4,4,from chocolate candy paper angel toy candle be...,534.456317,"""""wreath and hope peace night bow candle dream..."
5,5,is not eggnog unwrap hohoho the joy cookie puz...,441.352444,"""""sleigh you sing unwrap fruitcake stocking ho..."


In [None]:
# Apply add_quotes_to_strings to every cell: string cells get wrapped in double
# quotes, everything else is left unchanged.
# NOTE: this cell is not idempotent -- running it again adds another layer of quotes.
sub_df["text"] = sub_df["text"].astype(str)
# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
# fall back to applymap only on older pandas versions that lack DataFrame.map.
if hasattr(sub_df, "map"):
    sub_df = sub_df.map(add_quotes_to_strings)
else:
    sub_df = sub_df.applymap(add_quotes_to_strings)

  sub_df = sub_df.applymap(add_quotes_to_strings)


In [None]:
# Write only the id and text columns; the file name carries the experiment number.
# csv.QUOTE_NONE tells the writer not to add quotes of its own, so the text values
# are written with whatever quotes they already contain.
sub_df[["id", "text"]].to_csv(f"../../output/out/submission{ex_num}.csv", index=False, header=True, quoting=csv.QUOTE_NONE)

In [None]:
# Mean of the best_value column across all rows.
np.mean(sub_df["best_value"])

np.float64(572.7533266893)

In [None]:
# Inspect the text stored in the first row.
sub_df.iloc[0]["text"]

'"ornament reindeer elf mistletoe gingerbread family advent scrooge chimney fireplace"'

In [None]:
# Show the dtype of each column in sub_df.
sub_df.dtypes

id                 int64
text              object
best_value       float64
selected_text     object
dtype: object