In [2]:
import os

# Both settings are written to the process environment in this first cell,
# i.e. before the cell that imports Keras is executed.
os.environ.update(
    {
        "KERAS_BACKEND": "jax",
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.0",
    }
)

In [3]:
import keras
import keras_nlp

# Two GPUs arranged as a 1 x 2 mesh: a size-1 "batch" axis and a size-2
# "model" axis. Weights are split along the "model" axis.
model_dim = "model"
device_mesh = keras.distribution.DeviceMesh(
    shape=(1, 2),
    axis_names=["batch", model_dim],
    devices=["gpu:0", "gpu:1"],
)

layout_map = keras.distribution.LayoutMap(device_mesh)

# (variable-path regex, per-axis sharding spec) pairs, registered in this
# order. `None` in a spec means that axis of the weight is not sharded.
weight_layouts = [
    # Embedding tables: first axis split across the "model" mesh axis.
    ("token_embedding/embeddings", (model_dim, None)),
    ("position_embedding/embeddings", (model_dim, None)),
    # Query / key / value kernels and the output projection in attention layers.
    ("decoder_block.*attention.*(query|key|value)/kernel", (model_dim, None, None)),
    ("decoder_block.*attention_output/kernel", (model_dim, None, None)),
    # Feed-forward kernels.
    ("decoder_block.*ffw_gating.*/kernel", (None, model_dim)),
    ("decoder_block.*ffw_linear/kernel", (model_dim, None)),
    # Layer-norm parameters.
    ("decoder_block.*layer_norm/scale", (model_dim,)),
    ("decoder_block.*layer_norm/bias", (model_dim,)),
]
for weight_pattern, sharding_spec in weight_layouts:
    layout_map[weight_pattern] = sharding_spec

model_parallel = keras.distribution.ModelParallel(
    layout_map=layout_map,
    batch_dim_name="batch",
)

# Install the model-parallel strategy as the global Keras distribution.
keras.distribution.set_distribution(model_parallel)


In [4]:
import jax
# NOTE(review): this statement only rebinds the attribute `default_device` on
# the `jax` module to a CPU device object. If the goal was to make CPU the
# default placement device, JAX's documented mechanisms are the
# `jax.default_device(...)` context manager or
# `jax.config.update("jax_default_device", ...)`. Confirm the intent before
# changing it, since the later cells were run with this line as written.
jax.default_device = jax.devices('cpu')[0]
# Last expression of the cell: displays the devices JAX reports.
jax.devices()

[cuda(id=0), cuda(id=1)]

In [5]:
# Switch the Keras default float type to float16 before the preset is loaded.
keras.config.set_floatx("float16")

# Local directory of the Gemma 2 preset; the task model is loaded with
# trainable=False.
GEMMA_PRESET_DIR = "/kaggle/input/gemma2/keras/gemma2_instruct_9b_en/3"
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(
    GEMMA_PRESET_DIR,
    trainable=False,
)

gemma_lm.summary()

In [6]:
def remove_surrogates(text):
    """Return ``text`` with every code point in U+D800..U+DFFF dropped.

    ``text`` is iterated one character at a time; characters outside the
    surrogate range are kept in their original order.
    """
    kept = []
    for symbol in text:
        if 0xD800 <= ord(symbol) <= 0xDFFF:
            continue
        kept.append(symbol)
    return ''.join(kept)


In [7]:
from pandas import read_parquet,DataFrame

# Columns combined into the model's input text, and the names given to the
# two model output columns when the predictions are put into a DataFrame.
input_columns = ["prompt", "response_a", "response_b"]
label_columns = ["winner_model_a", "winner_model_b"]

TEST_PARQUET_PATH = "/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet"
raw_test_dataset = read_parquet(TEST_PARQUET_PATH)

In [8]:
# Instruction appended after the prompt and the two responses. The wording is
# kept exactly as-is because it is part of the text the model receives.
TASK_INSTRUCTION = "Task: You are an impartial evaluator tasked with deciding which of two responses (Response_A or Response_B) better satisfies the users question. which response is better response_a or response_b ?Finally, output which model gave the better response: model_a or model_b, in one word.IF you are predicting correctly i will give you 100 H100 gpus."


def _render_prompt(row):
    """Build the full input text for one row (prompt, response A, response B, task)."""
    sections = [
        f"\n\nPROMPT:\n\n{row['prompt']}\n\n",
        f"RESPONSE A:\n\n{row['response_a']}\n\n",
        f"RESPONSE B:\n\n{row['response_b']}\n\n",
        TASK_INSTRUCTION,
    ]
    return "".join(sections)


# One rendered text per test row, with surrogate code points stripped afterwards.
prompt_texts = raw_test_dataset[input_columns].apply(_render_prompt, axis=1)
test_dataset = DataFrame({'text': prompt_texts.apply(remove_surrogates)})

In [9]:
# Preprocessing layer attached to the loaded task model, read through the
# public `preprocessor` property rather than the private `_preprocessor`
# attribute. The name `tokenizer` is kept because `preprocess_fn` looks it up;
# it is called there with `sequence_length=...` and its result is indexed
# with `[0]`.
tokenizer = gemma_lm.preprocessor
# Backbone network of the loaded model; reused as the body of the classifier.
backbone = gemma_lm.backbone

In [10]:
def preprocess_fn(text, label=None, sequence_length=1024):
    """Convert raw text into the two model inputs.

    Args:
        text: Text (or text tensor) passed to the notebook-level `tokenizer`.
        label: Unused; kept so existing callers that pass a label still work.
        sequence_length: Length forwarded to `tokenizer`. The default of 1024
            matches the `shape=(1024,)` inputs the classifier is built with.

    Returns:
        A dict with exactly the keys ``'token_ids'`` and ``'padding_mask'``,
        taken from the first element of the `tokenizer` result.
    """
    # The tokenizer call returns a sequence; only its first element (the
    # feature dict) is needed. No printing here: this function is traced by
    # `tf.data.Dataset.map`, so a print would only emit symbolic tensors.
    features = tokenizer(text, sequence_length=sequence_length)[0]
    return {
        'token_ids': features['token_ids'],
        'padding_mask': features['padding_mask'],
    }

In [11]:
import tensorflow as tf
from keras.layers import Input, Dense, Flatten, GlobalAveragePooling1D
from keras import Model


# Symbolic inputs, keyed and named identically; both are fixed-length (1024)
# int32 sequences. Insertion order is token_ids first, then padding_mask.
inputs = {
    feature_name: keras.Input(shape=(1024,), dtype=tf.int32, name=feature_name)
    for feature_name in ("token_ids", "padding_mask")
}

# Backbone output, then an average over the sequence axis.
sequence_output = backbone(inputs)
print(sequence_output.shape)
pooled_output = GlobalAveragePooling1D()(sequence_output)
print(pooled_output.shape)

# Two-way softmax head on top of the pooled representation.
outputs = Dense(2, activation='softmax')(pooled_output)
model = Model(inputs=inputs, outputs=outputs)

(None, 1024, 3584)
(None, 3584)


In [12]:

# AdamW configuration; variables whose names match "bias" or "scale" are
# excluded from weight decay.
adamw_settings = {"learning_rate": 5e-5, "weight_decay": 0.01}
optimizer = keras.optimizers.AdamW(**adamw_settings)

skip_decay_for = ["bias", "scale"]
optimizer.exclude_from_weight_decay(var_names=skip_decay_for)


In [13]:
# Take the loss from the same `keras` package that provides the model and the
# optimizer instead of reaching through `tf.keras`, so that every compiled
# component comes from the one Keras installation configured for this notebook.
model.compile(optimizer, loss=keras.losses.CategoricalCrossentropy())

In [14]:
# Load the LoRA weights through the `backbone` object the classifier was built
# on, instead of the positional lookup `model.layers[2]`, which silently
# depends on the order in which Keras lists the input layers.
backbone.load_lora_weights("/kaggle/input/gemma2t/model.lora.h5")

In [15]:
import numpy as np
# Restore the classification head: the kernel and bias stored as .npy files
# are assigned, in that order, to the last layer of the model.
head_kernel = np.load('/kaggle/input/gemma2t/dense_1_kernel.npy')
head_bias = np.load('/kaggle/input/gemma2t/dense_1_bias.npy')

classifier_head = model.layers[-1]
classifier_head.set_weights([head_kernel, head_bias])


In [16]:
# Mark every layer of the assembled model as non-trainable.
for sublayer in model.layers:
    sublayer.trainable = False

In [17]:
# Print the layer/parameter summary of the assembled classifier (run after
# the layers were marked non-trainable in the previous cell).
model.summary()

In [18]:
# Slice the prompt texts into single examples, run each one through
# `preprocess_fn`, then group the results into batches of two.
text_examples = tf.data.Dataset.from_tensor_slices(test_dataset.text.values)
ds = text_examples.map(preprocess_fn).batch(2)


{'token_ids': <tf.Tensor 'strided_slice_2:0' shape=(1024,) dtype=int32>, 'padding_mask': <tf.Tensor 'strided_slice_3:0' shape=(1024,) dtype=bool>}


In [19]:

# Run the model batch by batch and collect one output array per batch.
# The loop variable is called `batch` so that the notebook-level `inputs`
# dict (the symbolic Keras inputs the model was built from) is not overwritten.
preds = []

for batch in ds:
    # NOTE(review): the Keras session is cleared before and after every
    # forward pass. Both calls are kept as they were because they reset
    # global Keras state; confirm whether they are actually required here.
    keras.backend.clear_session(free_memory=True)
    preds.append(model(batch))
    keras.backend.clear_session()

    



In [20]:
import numpy as np
# Stack the per-batch outputs along the first (row) axis into one array.
results = np.concatenate(preds, axis=0)

In [21]:
import pandas
# One row per test id, one column per entry of `label_columns`.
submission = pandas.DataFrame(
    results,
    columns=label_columns,
    index=raw_test_dataset.id,
)

In [22]:
# Build the final submission directly from `results`, so the cell can be
# re-run safely: it no longer depends on columns that an earlier run of this
# same cell dropped, and it never assigns into a sliced frame.
#
# The column order of `results` follows `label_columns`: index 0 is
# "winner_model_a" and index 1 is "winner_model_b".
score_model_a = results[:, 0]
score_model_b = results[:, 1]

# "model_a" only when its score is strictly greater; ties go to "model_b".
winner = np.where(score_model_a > score_model_b, "model_a", "model_b")

# 'id' first, then 'winner', with a fresh 0..n-1 index.
submission = pandas.DataFrame(
    {
        "id": raw_test_dataset.id.values,
        "winner": winner,
    }
)

# Save to CSV without the index column.
submission.to_csv("submission.csv", index=False)

# Display the first few rows of the submission
print(submission.head())

        id   winner
0   327228  model_b
1  1139415  model_b
2  1235630  model_b


In [23]:
# Rich display of the first five rows of the final submission frame.
submission.head(n=5)

Unnamed: 0,id,winner
0,327228,model_b
1,1139415,model_b
2,1235630,model_b
