In [1]:
import glob
import json
import os
from typing import Optional, Tuple
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from IPython.display import display
from sklearn.utils import shuffle
from sklearn.model_selection import GroupKFold
from tqdm.notebook import tqdm

2024-06-22 16:39:46.038856: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-22 16:39:46.173624: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-22 16:39:46.178263: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-22 16:39:46.178284: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudar

In [2]:
DATA_PATH = "../raw_data/AI4Code"
BASE_MODEL = "../models/distilbert-base-uncased"
N_SPLITS = 5
SEQ_LEN = 128
RANDOM_STATE = 42

try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    STRATEGY = tf.distribute.experimental.TPUStrategy(TPU)
    BATCH_SIZE = 128 * STRATEGY.num_replicas_in_sync
except Exception:
    TPU = None
    STRATEGY = tf.distribute.get_strategy()
    BATCH_SIZE = 32
    LIMIT = 10_000

print("TensorFlow", tf.__version__)

if TPU is not None:
    print("Using TPU v3-8")
else:
    print("Using GPU/CPU")

print("Batch size:", BATCH_SIZE)

TensorFlow 2.11.0
Using GPU/CPU
Batch size: 32


In [3]:
def read_notebook(path: str) -> pd.DataFrame:
    with open(path) as file:
        df = pd.DataFrame(json.load(file))
    df["id"] = os.path.splitext(os.path.basename(path))[0]
    return df

def expand_order(row: Tuple[str, str]) -> pd.DataFrame:
    cell_ids = row[1].split(" ")
    df = pd.DataFrame(
        {
            "id": [row[0] for _ in range(len(cell_ids))],
            "cell_id": cell_ids,
            "rank": range(len(cell_ids)),
        }
    )
    df["pct_rank"] = df["rank"] / len(df)
    return df

def tokenize(source: pd.Series) -> Tuple[np.array, np.array]:
    tokenizer = transformers.AutoTokenizer.from_pretrained(BASE_MODEL, do_lower_case=True)

    input_ids = np.zeros((len(source), SEQ_LEN), dtype="int32")
    attention_mask = np.zeros((len(source), SEQ_LEN), dtype="int32")

    for i, x in enumerate(tqdm(source, total=len(source))):
        encoding = tokenizer.encode_plus(
            x,
            None,
            add_special_tokens=True,
            max_length=SEQ_LEN,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        input_ids[i] = encoding["input_ids"]
        attention_mask[i] = encoding["attention_mask"]

    return input_ids, attention_mask

def get_dataset(
    input_ids: np.array,
    attention_mask: np.array,
    labels: Optional[np.array] = None,
    ordered: bool = False,
    repeated: bool = False,
) -> tf.data.Dataset:
    if labels is not None:
        dataset = tf.data.Dataset.from_tensor_slices(
            ({"input_ids": input_ids, "attention_mask": attention_mask}, labels)
        )
    else:
        dataset = tf.data.Dataset.from_tensor_slices(
            {"input_ids": input_ids, "attention_mask": attention_mask}
        )
    if repeated:
        dataset = dataset.repeat()
    if not ordered:
        dataset = dataset.shuffle(1024)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

def get_model() -> tf.keras.Model:
    backbone = transformers.TFDistilBertModel.from_pretrained(BASE_MODEL)
    input_ids = tf.keras.layers.Input(
        shape=(SEQ_LEN,),
        dtype=tf.int32,
        name="input_ids",
    )
    attention_mask = tf.keras.layers.Input(
        shape=(SEQ_LEN,),
        dtype=tf.int32,
        name="attention_mask",
    )
    x = backbone(
        {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        },
    )
    outputs = tf.keras.layers.Dense(1, activation="linear", dtype="float32")(x[0][:, 0, :])

    model = tf.keras.Model(
        inputs=[input_ids, attention_mask],
        outputs=outputs,
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        loss=tf.keras.losses.MeanSquaredError(),
    )
    return model

In [4]:
paths = glob.glob(os.path.join(DATA_PATH, "train_data", "*.json"))
if LIMIT is not None:
    paths = paths[:LIMIT]

source_df = pd.concat([read_notebook(x) for x in tqdm(paths, total=len(paths))])

source_df = source_df[source_df["cell_type"] == "markdown"]
source_df = source_df.drop("cell_type", axis=1)
source_df = source_df.rename_axis("cell_id").reset_index()

order_df = pd.read_csv(os.path.join(DATA_PATH, "train_orders.csv"), index_col="id")
order_df = pd.concat(
    [expand_order(row) for row in tqdm(order_df.itertuples(), total=len(order_df))]
)

ancestors_df = pd.read_csv(
    os.path.join(DATA_PATH, "train_ancestors.csv"),
    usecols=["id", "ancestor_id"],
    index_col="id",
)

df = source_df.merge(order_df, on=["id", "cell_id"]).merge(ancestors_df, on="id")
df = df.dropna()

lang_df = pd.read_csv('../raw_data/AI4Code/all_languages.csv')
merged_df = df.merge(lang_df, on='id', how='left')
merged_df = merged_df[merged_df['score'] >= 0.75]
merged_df = merged_df[merged_df['language'] == 'en']
df = merged_df.drop(columns=['language', 'score'])

display(df)

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/139256 [00:00<?, ?it/s]

Unnamed: 0,cell_id,source,id,rank,pct_rank,ancestor_id
0,58900633,#### Explore airports\nThere are 268 unique ai...,2ac1be019bf73e,26,0.440678,b66b5e9a
1,950a8058,#### Airlines\nAfter looking into delay distri...,2ac1be019bf73e,21,0.355932,b66b5e9a
2,3d3eb7f6,Distribution of airlines is extremely right sk...,2ac1be019bf73e,25,0.423729,b66b5e9a
3,99385c1e,#### How did carriers performed over the years?,2ac1be019bf73e,52,0.881356,b66b5e9a
4,26c65ce6,An average departure delay and its std are slo...,2ac1be019bf73e,45,0.762712,b66b5e9a
...,...,...,...,...,...,...
156679,d5e9c516,# Optimization,d3c351143d72ef,36,0.418605,77eaf8c7
156680,47fd56dd,# Optimizer Analytics,d3c351143d72ef,28,0.325581,77eaf8c7
156681,a3bdddf7,# Define Constraints,d3c351143d72ef,10,0.116279,77eaf8c7
156682,67b26d0c,In validation RMS 99.4 and Adam 66.4 this crea...,d3c351143d72ef,46,0.534884,77eaf8c7


In [5]:
file_path = os.path.join(DATA_PATH, "distilbert_data.csv")
df.to_csv(file_path, index=False)

In [6]:
df = pd.read_csv('../raw_data/AI4Code/distilbert_data.csv')
df

Unnamed: 0,cell_id,source,id,rank,pct_rank,ancestor_id
0,58900633,#### Explore airports\nThere are 268 unique ai...,2ac1be019bf73e,26,0.440678,b66b5e9a
1,950a8058,#### Airlines\nAfter looking into delay distri...,2ac1be019bf73e,21,0.355932,b66b5e9a
2,3d3eb7f6,Distribution of airlines is extremely right sk...,2ac1be019bf73e,25,0.423729,b66b5e9a
3,99385c1e,#### How did carriers performed over the years?,2ac1be019bf73e,52,0.881356,b66b5e9a
4,26c65ce6,An average departure delay and its std are slo...,2ac1be019bf73e,45,0.762712,b66b5e9a
...,...,...,...,...,...,...
143005,d5e9c516,# Optimization,d3c351143d72ef,36,0.418605,77eaf8c7
143006,47fd56dd,# Optimizer Analytics,d3c351143d72ef,28,0.325581,77eaf8c7
143007,a3bdddf7,# Define Constraints,d3c351143d72ef,10,0.116279,77eaf8c7
143008,67b26d0c,In validation RMS 99.4 and Adam 66.4 this crea...,d3c351143d72ef,46,0.534884,77eaf8c7


In [5]:
from transformers import TFAutoModel, AutoTokenizer

model_name = "distilbert-base-uncased"

model = TFAutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.save_pretrained(BASE_MODEL)
tokenizer.save_pretrained(BASE_MODEL)

2024-06-22 15:32:25.133334: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2024-06-22 15:32:25.133725: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2024-06-22 15:32:25.133807: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (AYO-LP-BILH): /proc/driver/nvidia/version does not exist
2024-06-22 15:32:25.135428: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the PyTorch model were not used when initializi

('../models/distilbert-base-uncased/tokenizer_config.json',
 '../models/distilbert-base-uncased/special_tokens_map.json',
 '../models/distilbert-base-uncased/vocab.txt',
 '../models/distilbert-base-uncased/added_tokens.json',
 '../models/distilbert-base-uncased/tokenizer.json')

In [6]:
input_ids, attention_mask = tokenize(df["source"])

labels = df["pct_rank"].to_numpy()
groups = df["ancestor_id"].to_numpy()

print("input_ids:", input_ids.shape)
print("attention_mask:", attention_mask.shape)
print("labels:", labels.shape)
print("groups:", groups.shape)

  0%|          | 0/240 [00:00<?, ?it/s]

input_ids: (240, 128)
attention_mask: (240, 128)
labels: (240,)
groups: (240,)


In [13]:
input_ids, attention_mask, labels, groups = shuffle(
    input_ids, attention_mask, labels, groups, random_state=RANDOM_STATE
)
kfold = GroupKFold(n_splits=N_SPLITS)

for i, (train_index, val_index) in enumerate(kfold.split(input_ids, labels, groups=groups)):
    if TPU is not None:
        tf.tpu.experimental.initialize_tpu_system(TPU)

    with STRATEGY.scope():
        model = get_model()
        model.summary()

    train_dataset = get_dataset(
        input_ids=input_ids[train_index],
        attention_mask=attention_mask[train_index],
        labels=labels[train_index],
        repeated=True,
    )
    val_dataset = get_dataset(
        input_ids=input_ids[val_index],
        attention_mask=attention_mask[val_index],
        labels=labels[val_index],
        ordered=True,
    )

    model.fit(
        train_dataset,
        validation_data=val_dataset,
        steps_per_epoch=len(train_index) // BATCH_SIZE,
        epochs=1,
        verbose=2,
    )

    model.save_weights(f"model_{i}.h5")
    break

All model checkpoint layers were used when initializing TFDistilBertModel.

All the layers of TFDistilBertModel were initialized from the model checkpoint at ../models/distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 tf_distil_bert_model_2 (TFDist  TFBaseModelOutput(l  66362880   ['attention_mask[0][0]',         
 ilBertModel)                   ast_hidden_state=(N               'input_ids[0][0]']              
                                one, 128, 768),                                                   
                                 hidden_states=None                                         

In [21]:
model.__dict__

{'validation_data': None,
 'model': <keras.engine.functional.Functional at 0x7fa1e5445c00>,
 '_chief_worker_only': None,
 '_supports_tf_logs': False,
 'history': {'loss': [0.24577596783638], 'val_loss': [0.14320646226406097]},
 'params': {'verbose': 2, 'epochs': 1, 'steps': 5},
 'epoch': [0]}