### SAのテンプレート

In [1]:
ex_num = "006"

In [2]:
import os
# 環境によって変更
os.chdir('/home/jovyan/work/notebook')

# path = "../../output"
path = "../model"

# 基準にするベースファイル
base_file_path = "../input/sub20241219.csv"

In [3]:
class CFG:
    path = "../model"
    base_file_path = "../input/sub20241219.csv"
    BATCH_SIZE = 32

In [4]:
import numpy as np
import pandas as pd
import os
import time
import csv

from tqdm.notebook import tqdm

import random
from typing import List, Callable
import itertools, math
from typing import List, Tuple

from utils import PerplexityCalculator

### データ読み込み

In [5]:
df = pd.read_csv(CFG.base_file_path)

In [6]:
sub_df = df.copy()
sub_df["best_value"] = 0

In [7]:
df

Unnamed: 0,id,text
0,0,reindeer mistletoe elf gingerbread family adve...
1,1,reindeer sleep walk the night and drive mistle...
2,2,jingle yuletide carol cheer grinch holiday hol...
3,3,sleigh the holly jingle unwrap gifts of magi r...
4,4,peppermint candy milk chocolate eggnog fruitca...
5,5,sleigh of naughty nice the of not the to and t...


### 関数

In [8]:
# 2単語組み合わせ全てから最低スコアとなるペアを探す
def find_initial_pair(
    CFG,
    scorer,
    words
):
    best_score = float('inf')
    best_pair = None
    for i in range(len(words)):
        perms = []
        for j in range(i+1, len(words)):
            pair = [words[i], words[j]]
            perms.append(" ".join(pair))
        
        scores = scorer.get_perplexity(perms, batch_size=CFG.BATCH_SIZE)
        for j, perm in enumerate(perms):
            if scores[j] < best_score:
                best_score = scores[j]
                best_pair = perm.split(" ")
                
                # print(f"best_score: {best_score}, best_pair: {best_pair}")
     
    return best_pair, best_score

In [16]:
# 文字列のファイルからコストを最小化する文字列を生成する
def greedy_add_initialize(
    CFG,
    scorer,
    initial_words: list
):
    
    # 全単語
    candidates = initial_words[:]

    print("candidates:", candidates)
    print("-----初期配置の組み合わせ計算開始-----")
    # 初期配置を求める
    initial, initial_score = find_initial_pair(
        CFG,
        scorer,
        candidates
    )
    print("-----初期配置の組み合わせ計算終了-----")
    for w in initial:
        candidates.remove(w)

    print("initial:", initial, initial_score)
    current_order = initial[:]

    print("-----貪欲法での挿入開始-----")
    # 貪欲に残りを挿入
    while candidates:
        best_score = float('inf')
        best_word = None
        best_pos = None

        # 残っている単語それぞれについて
        for w in candidates:
            # current_orderのあらゆる挿入位置を試す
            # 挿入位置は0からlen(current_order)まで
            
            perms = []
            for pos in range(len(current_order) + 1):
                new_order = current_order[:pos] + [w] + current_order[pos:]
                perms.append(" ".join(new_order))
                
            scores = scorer.get_perplexity(perms, batch_size=CFG.BATCH_SIZE)
                
            for pos in range(len(current_order) + 1):
                if scores[pos] < best_score:
                    best_score = scores[pos]
                    best_word = w
                    best_pos = pos
                    
                    # print("best_word:", best_word, best_score)
        
        # best_wordをbest_posに挿入
        current_order.insert(best_pos, best_word)
        candidates.remove(best_word)
        print("current_order:", current_order, best_score)
    print("-----貪欲法での挿入終了-----")
    
    best_score = scorer.get_perplexity([" ".join(current_order)], batch_size=1)[0]
    
    return current_order, best_score

In [17]:
def search_optimal_permutation(
    CFG,
    id: int, 
    scorer, 
    max_iterations: int = 1000, 
    BATCH_SIZE=64
):
    
    df = pd.read_csv(CFG.base_file_path)
    
    start = time.time()
    # 文字列を単語に分割
    words = df.loc[id,"text"].split()
    
    initial_sequence, initial_score = greedy_add_initialize(
        CFG,
        scorer=scorer,
        initial_words=words
    )
    
    print(f"{id}th sample: {initial_sequence}")
    print(f"Elapsed time: {time.time() - start:.2f} sec")
    print(f"initial_score: {initial_score}")
    
    
    best_sequence = initial_sequence
    best_score = initial_score
    
    return best_sequence, best_score


In [11]:
# 文字列型にのみダブルクォーテーションを付ける関数
def add_quotes_to_strings(value):
    if isinstance(value, str):  # 文字列型の場合
        return f'"{value}"'
    return value  # それ以外はそのまま

In [12]:
# LOAD GEMMA SCORER
scorer = PerplexityCalculator(f'{path}/gemma_2_9b')

cuda


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [13]:
# scorer.clear_gpu_memory()

In [22]:
for i in [0]:
    
    print(f"Processing {i}th sample")
    best_sequence, best_score = search_optimal_permutation(
        CFG,
        id=i,
        scorer=scorer,
        max_iterations=2**10,
    )
    
    print("Best sequence:", best_sequence)
    print("Best score:", best_score)
    sub_df.loc[i, "best_value"] = best_score
    sub_df.loc[i, "text"] = " ".join(best_sequence)



Processing 0th sample
candidates: ['reindeer', 'mistletoe', 'elf', 'gingerbread', 'family', 'advent', 'scrooge', 'chimney', 'fireplace', 'ornament']
-----初期配置の組み合わせ計算開始-----
-----初期配置の組み合わせ計算終了-----
initial: ['reindeer', 'scrooge'] 7732.016698600223
-----貪欲法での挿入開始-----
current_order: ['reindeer', 'mistletoe', 'scrooge'] 1665.646724298566
current_order: ['reindeer', 'mistletoe', 'scrooge', 'elf'] 1026.1751780534337
current_order: ['reindeer', 'mistletoe', 'scrooge', 'gingerbread', 'elf'] 874.3121824177148
current_order: ['reindeer', 'mistletoe', 'scrooge', 'gingerbread', 'elf', 'ornament'] 750.765798300995
current_order: ['reindeer', 'mistletoe', 'scrooge', 'gingerbread', 'chimney', 'elf', 'ornament'] 686.2554949467076
current_order: ['reindeer', 'mistletoe', 'scrooge', 'gingerbread', 'chimney', 'fireplace', 'elf', 'ornament'] 560.1054110313753
current_order: ['reindeer', 'mistletoe', 'scrooge', 'gingerbread', 'chimney', 'fireplace', 'elf', 'ornament', 'advent'] 547.1305831594982
curren

In [23]:
sub_df

Unnamed: 0,id,text,best_value
0,0,reindeer mistletoe scrooge gingerbread chimney...,553.579985
1,1,reindeer sleep walk the night and drive mistle...,0.0
2,2,jingle yuletide carol cheer grinch holiday hol...,0.0
3,3,sleigh the holly jingle unwrap gifts of magi r...,0.0
4,4,peppermint candy milk chocolate eggnog fruitca...,0.0
5,5,sleigh of naughty nice the of not the to and t...,0.0


In [24]:
# 各セルに関数を適用
sub_df["text"] = sub_df["text"].astype(str)
sub_df = sub_df.applymap(add_quotes_to_strings)

  sub_df = sub_df.applymap(add_quotes_to_strings)


In [25]:
sub_df[["id", "text"]].to_csv(f"{path}/out/submission_{ex_num}.csv", index=False, header=True, quoting=csv.QUOTE_NONE)
sub_df.to_csv(f"{path}/out/score_{ex_num}.csv", index=False, header=True, quoting=csv.QUOTE_NONE)

In [26]:
np.mean(sub_df["best_value"])

np.float64(92.26333088990248)