### 先頭の単語だけ固定し、2番目の単語を総当りして探索

In [1]:
ex_num = "004"

In [2]:
import os
# 環境によって変更
# os.chdir('/home/jovyan/work/notebook')

path = "../../output"
# path = "../model"

In [3]:
import numpy as np
import pandas as pd
import os
import time
import csv

from tqdm.notebook import tqdm

import random
from typing import List, Callable
import itertools, math
from typing import List, Tuple

from utils import PerplexityCalculator

### データ読み込み

In [4]:
df = pd.read_csv("../input/sample_submission.csv")

In [5]:
sub_df = df.copy()
sub_df["best_value"] = 0

In [6]:
df

Unnamed: 0,id,text
0,0,advent chimney elf family fireplace gingerbrea...
1,1,advent chimney elf family fireplace gingerbrea...
2,2,yuletide decorations gifts cheer holiday carol...
3,3,yuletide decorations gifts cheer holiday carol...
4,4,hohoho candle poinsettia snowglobe peppermint ...
5,5,advent chimney elf family fireplace gingerbrea...


### 関数

In [7]:
def simulated_annealing(
    scorer,
    pattern_df,
    max_iterations: int = 1000,
    BATCH_SIZE: int = 32,
    initial_temperature: float = 100.0,
    cooling_rate: float = 0.99
) -> Tuple[List[str], float]:
    """
    焼きなまし法で最適な順序を探索する。

    Args:
        scorer: 評価関数を提供するオブジェクト。
        pattern_df: パターンのデータフレーム。
        max_iterations (int): 最大反復回数。
        BATCH_SIZE (int): バッチサイズ。
        initial_temperature (float): 初期温度。
        cooling_rate (float): 冷却率。

    Returns:
        solution_df: 最適な順序のデータフレーム。
    """
    
    solution_df = pattern_df.copy()
    solution_df["score"] = solution_df["score"].astype(float)
    
    words_len = len(solution_df.iloc[0]["text"].split())
    pattern_len = len(solution_df)
    
    temperature = initial_temperature
    
    perms = []
    best_score = 10000
    global_best_sequence = ""
    
    for iteration in tqdm(range(max_iterations)):
        # 1行ずつ処理する
        for idx in range(pattern_len):
            current_sequence = solution_df.iloc[idx]["text"].split()
            
            neighbor_sequence = current_sequence[:]
            
            # 最初と最後の単語を固定し、ランダムに交換する
            i, j = random.sample(range(1, words_len), 2)
            neighbor_sequence[i], neighbor_sequence[j] = neighbor_sequence[j], neighbor_sequence[i]
            
            perms.append(" ".join(neighbor_sequence))
        
        # 評価値をまとめて計算
        neighbor_values = scorer.get_perplexity(perms, batch_size=min(pattern_len, BATCH_SIZE))

        # print(f"neighbor_values = {neighbor_values}")
        # 焼きなましによる評価値更新
        for idx in range(pattern_len):
            perm = perms[idx]
            neighbor_value = neighbor_values[idx]
            current_value = solution_df.iloc[idx]["score"]
            # print(f"current_value = {current_value}, neighbor_value = {neighbor_value}")
            
            # 確率的に次の状態を受け入れるかを決定
            delta = neighbor_value - current_value
            acceptance_probability = math.exp(-delta / temperature) if delta > 0 else 1.0

            # ベスト解を更新
            if (random.random() < acceptance_probability) or (neighbor_value < current_value):
                current_sequence = perm.split(" ")
                current_value = neighbor_value
                
                solution_df.loc[solution_df.index[idx], "text"] = perm
                solution_df.loc[solution_df.index[idx], "score"] = current_value
                start_word = solution_df.iloc[idx]["start"]
                # print(f"start_word = {start_word}, end_word = {end_word}, New best = {current_value} with '{current_sequence}'")

            if current_value < best_score:
                best_score = current_value
                global_best_sequence = perm
                print(f"New best = {best_score} with '{perm}'")
                
        # バッチリセット
        perms = []

        # 温度を更新
        if iteration % 100 == 0:
            temperature *= cooling_rate
                    
        # 温度が非常に低くなったら終了
        if temperature < 1e-5:
            break

        # iteration100回ごとに現在の温度を表示
        if iteration % 100 == 0:
            print(f"Iteration {iteration}, Temperature = {temperature}")
            
    return solution_df, best_score, global_best_sequence
     

In [8]:
def search_optimal_permutation(id: int, scorer, max_iterations: int = 1000, BATCH_SIZE=64):
    
    df = pd.read_csv("../input/sub_20241218_01.csv")
    
    # 文字列を単語に分割
    words = df.loc[id,"text"].split()
    
    # データをタプルのリストとして準備
    pattern_data = []
    
    words_len = len(words)
    for i in range(1, words_len):
        new_words = words[:]
        if i != 1:
            new_words[i], new_words[1] = new_words[1], new_words[i]
        
        # new_words[1]意外の単語をシャッフル
        random.shuffle(new_words[1:])
        
        pattern_data.append((new_words[0], " ".join(new_words), 10000))
            
    pattern_df = pd.DataFrame(pattern_data, columns=["start", "text", "score"])        

    start = time.time()
    
    # SAで最適な順序を探索
    solutions_df, best_value, best_sequence = simulated_annealing(
        scorer,
        pattern_df,
        max_iterations=max_iterations,
        BATCH_SIZE=BATCH_SIZE,
        initial_temperature=50.0,
        cooling_rate=0.995
    )
    
    # # optimized_dfのscore列の値が最小の行を取得
    # best_sequence = solutions_df.loc[solutions_df["score"].idxmin(), "text"]
    # best_value = solutions_df["score"].min()
    
    print(f"{id}th sample: {best_sequence}")
    print(f"Elapsed time: {time.time() - start:.2f} sec")
    print(f"Best value: {best_value}")
    
    return best_sequence, best_value, solutions_df


In [9]:
# 文字列型にのみダブルクォーテーションを付ける関数
def add_quotes_to_strings(value):
    if isinstance(value, str):  # 文字列型の場合
        return f'"{value}"'
    return value  # それ以外はそのまま

In [10]:
# LOAD GEMMA SCORER
scorer = PerplexityCalculator(f'{path}/gemma_2_9b')

cuda


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [11]:
# scorer.clear_gpu_memory()

In [12]:
for i in [1, 2]:
    
    print(f"Processing {i}th sample")
    best_sequence, best_value, solutions_df = search_optimal_permutation(
        id=i,
        scorer=scorer,
        max_iterations=2**17,
        BATCH_SIZE=256
    )
    
    print(best_sequence)
    sub_df.loc[i, "best_value"] = best_value
    sub_df.loc[i, "text"] = best_sequence
    
    display(solutions_df)
    solutions_df.to_csv(f"{path}/out/solutions_{i}_{ex_num}.csv", index=False, header=True)



Processing 1th sample


  0%|          | 0/131072 [00:00<?, ?it/s]

New best = 716.3857293204885 with 'reindeer mistletoe elf gingerbread ornament advent night fireplace family chimney sleep drive walk jump bake laugh give and the scrooge'
Iteration 0, Temperature = 49.75
New best = 662.5484915670792 with 'reindeer mistletoe elf gingerbread ornament advent night scrooge chimney fireplace sleep drive walk give laugh jump bake and the family'
New best = 639.6604572352473 with 'reindeer mistletoe scrooge gingerbread ornament advent family elf chimney fireplace sleep drive walk give laugh jump bake and the night'
Iteration 100, Temperature = 49.50125
Iteration 200, Temperature = 49.25374375
New best = 634.6825800645618 with 'reindeer mistletoe elf scrooge gingerbread ornament advent fireplace chimney night drive walk sleep laugh jump bake give family and the'
New best = 577.8850653361966 with 'reindeer mistletoe elf scrooge gingerbread ornament advent fireplace chimney night drive walk sleep laugh jump bake the family and give'
Iteration 300, Temperature =

  sub_df.loc[i, "best_value"] = best_value


Unnamed: 0,start,text,score
0,reindeer,reindeer walk the night and drive sleep scroog...,547.130583
1,reindeer,reindeer sleep drive the night scrooge laugh m...,629.743441
2,reindeer,reindeer sleep walk the night and drive mistle...,439.631774
3,reindeer,reindeer jump the scrooge and family walk slee...,610.368272
4,reindeer,reindeer sleep walk the night and drive mistle...,448.302734
5,reindeer,reindeer and elf family night jump drive sleep...,511.977792
6,reindeer,reindeer mistletoe scrooge elf gingerbread orn...,566.707756
7,reindeer,reindeer the elf family scrooge mistletoe ging...,580.146844
8,reindeer,reindeer walk gingerbread bake scrooge drive c...,518.012825
9,reindeer,reindeer walk gingerbread bake elf sleep and d...,509.98178


Processing 2th sample


  0%|          | 0/131072 [00:00<?, ?it/s]

New best = 331.8514735526883 with 'jingle yuletide carol cheer grinch holiday holly naughty nice polar beard sleigh workshop chimney nutcracker decorations ornament gifts stocking magi'
Iteration 0, Temperature = 49.75
Iteration 100, Temperature = 49.50125
Iteration 200, Temperature = 49.25374375
Iteration 300, Temperature = 49.007475031249996
Iteration 400, Temperature = 48.762437656093745
Iteration 500, Temperature = 48.51862546781327
Iteration 600, Temperature = 48.27603234047421
Iteration 700, Temperature = 48.034652178771836
Iteration 800, Temperature = 47.794478917877974
Iteration 900, Temperature = 47.555506523288585
Iteration 1000, Temperature = 47.317728990672144
Iteration 1100, Temperature = 47.08114034571879
Iteration 1200, Temperature = 46.84573464399019
Iteration 1300, Temperature = 46.61150597077024
Iteration 1400, Temperature = 46.37844844091639
Iteration 1500, Temperature = 46.14655619871181
Iteration 1600, Temperature = 45.91582341771825
Iteration 1700, Temperature = 4

Unnamed: 0,start,text,score
0,jingle,jingle magi carol yuletide grinch holiday chee...,334.454217
1,jingle,jingle sleigh yuletide carol holly naughty nic...,373.110641
2,jingle,jingle yuletide carol cheer grinch holly stock...,346.421503
3,jingle,jingle yuletide carol cheer grinch holiday hol...,334.454217
4,jingle,jingle yuletide grinch gifts decorations holid...,364.467542
5,jingle,jingle magi carol yuletide grinch holiday chee...,334.454217
6,jingle,jingle magi carol yuletide grinch holiday chee...,378.986279
7,jingle,jingle yuletide carol cheer grinch holiday hol...,321.641477
8,jingle,jingle yuletide carol holiday cheer holly naug...,319.138443
9,jingle,jingle yuletide carol holiday cheer holly naug...,335.763234


In [13]:
sub_df

Unnamed: 0,id,text,best_value
0,0,advent chimney elf family fireplace gingerbrea...,0.0
1,1,reindeer sleep walk the night and drive mistle...,424.444498
2,2,jingle yuletide carol cheer grinch holiday hol...,316.654888
3,3,yuletide decorations gifts cheer holiday carol...,0.0
4,4,hohoho candle poinsettia snowglobe peppermint ...,0.0
5,5,advent chimney elf family fireplace gingerbrea...,0.0


In [14]:
# 各セルに関数を適用
sub_df["text"] = sub_df["text"].astype(str)
sub_df = sub_df.applymap(add_quotes_to_strings)

  sub_df = sub_df.applymap(add_quotes_to_strings)


In [15]:
sub_df[["id", "text"]].to_csv(f"{path}/out/submission_{ex_num}.csv", index=False, header=True, quoting=csv.QUOTE_NONE)
sub_df.to_csv(f"{path}/out/score_{ex_num}.csv", index=False, header=True, quoting=csv.QUOTE_NONE)

In [16]:
np.mean(sub_df["best_value"])

np.float64(123.51656433131616)