In [1]:
import pickle
import numpy as np

from utils.lang_enum import languages

# Pickled DataFrame caches; both files live in the same cache directory.
_CACHE_DIR = "../datasets/cache"
DF_CACHE_PATH = _CACHE_DIR + "/all.pickle"  # loaded into `df` below
DF_PATHONLY_CACHE_PATH = _CACHE_DIR + "/path_only.pickle"  # loaded into `pdf` for the full-dataset run

# Process data

In [2]:
# Load the cached DataFrame. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load trusted cache files.
with open(DF_CACHE_PATH, "rb") as cache_file:
    df = pickle.load(cache_file)

In [None]:
# Per-language non-null counts for every column, ordered by the "code" count
# (smallest first), printed in full without pandas truncation.
counts_per_language = df.groupby("language_tag").count()
counts_sorted = counts_per_language.sort_values(by="code", ascending=True)
print(counts_sorted.to_string())

In [None]:
# Show one random row tagged TGLANG_LANGUAGE_OTHER: its tag, a separator, then its code.
other_rows = df[df.language_tag == "TGLANG_LANGUAGE_OTHER"]
picked = other_rows.sample()
tag_value = picked.language_tag.values[0]
code_value = picked.code.values[0]
print(tag_value)
print("-"*50)
print(code_value)

# Validate

In [None]:
import train as mt

# Checkpoint used for the validation cells below. Weights are loaded with
# strict=False, then the model is moved to the CPU and put in eval mode.
MODEL_RESUME = "./logs/2023-10-15_17-15-06_bs=512_epochs=1300/epoch=1114-step=781225.ckpt"
model = mt.LanguageClassifier.load_from_checkpoint(MODEL_RESUME, strict=False)
model = model.cpu()
model = model.eval()

In [2]:
import pickle
DF_CACHE_PATH = "../datasets/cache/all.pickle"
# Load the cached DataFrame; `with` guarantees the file handle is closed.
# NOTE: unpickling can execute arbitrary code -- only load trusted cache files.
with open(DF_CACHE_PATH, "rb") as cache_file:
    df = pickle.load(cache_file)

In [6]:
# List every distinct language tag present in the dataset.
unique_tags = df["language_tag"].unique()
print(unique_tags.tolist())

['TGLANG_LANGUAGE_OTHER', 'TGLANG_LANGUAGE_APACHE_GROOVY', 'TGLANG_LANGUAGE_1S_ENTERPRISE', 'TGLANG_LANGUAGE_ASSEMBLY', 'TGLANG_LANGUAGE_ABAP', 'TGLANG_LANGUAGE_ACTIONSCRIPT', 'TGLANG_LANGUAGE_ADA', 'TGLANG_LANGUAGE_APEX', 'TGLANG_LANGUAGE_APPLESCRIPT', 'TGLANG_LANGUAGE_BASIC', 'TGLANG_LANGUAGE_AWK', 'TGLANG_LANGUAGE_C', 'TGLANG_LANGUAGE_CPLUSPLUS', 'TGLANG_LANGUAGE_CMAKE', 'TGLANG_LANGUAGE_CLOJURE', 'TGLANG_LANGUAGE_COBOL', 'TGLANG_LANGUAGE_COFFESCRIPT', 'TGLANG_LANGUAGE_CRYSTAL', 'TGLANG_LANGUAGE_COMMON_LISP', 'TGLANG_LANGUAGE_D', 'TGLANG_LANGUAGE_DART', 'TGLANG_LANGUAGE_DELPHI', 'TGLANG_LANGUAGE_ELIXIR', 'TGLANG_LANGUAGE_ELM', 'TGLANG_LANGUAGE_ERLANG', 'TGLANG_LANGUAGE_FORTH', 'TGLANG_LANGUAGE_FORTRAN', 'TGLANG_LANGUAGE_GO', 'TGLANG_LANGUAGE_HACK', 'TGLANG_LANGUAGE_HASKELL', 'TGLANG_LANGUAGE_IDL', 'TGLANG_LANGUAGE_JAVA', 'TGLANG_LANGUAGE_JAVASCRIPT', 'TGLANG_LANGUAGE_JULIA', 'TGLANG_LANGUAGE_KOTLIN', 'TGLANG_LANGUAGE_LATEX', 'TGLANG_LANGUAGE_LISP', 'TGLANG_LANGUAGE_LOGO', 'TGLANG_LA

In [94]:
from typing import List
from utils import vocab

import random
import torch
from utils import helper, preprocess, lang_enum

# Pick a random language, then a random snippet of that language, and augment it.
lang = random.choice(df.language_tag.unique().tolist())
text = helper.augment(df[(df.language_tag == lang)].code.sample().iloc[0], lines_num_range=(5, 50))

# Encode, truncate to vocab.max_size and right-pad to exactly vocab.max_size ids.
# The explicit long dtype keeps the tensor integral even when the encoding is empty.
# NOTE(review): padding uses id 0, while the export cell writes
# VOCAB_PAD_ID = len(vocab.vocab_list) - 2 -- confirm which id the model was trained with.
et = preprocess.encode_text(text)
inputs = torch.tensor(et[:vocab.max_size], dtype=torch.long)
padding = torch.zeros(vocab.max_size - len(inputs), dtype=torch.long)
inputs = torch.cat([inputs, padding], dim=0).unsqueeze(0)

# Inference only: no_grad avoids building an autograd graph for the forward pass.
# The second argument is None here; the full-dataset loop passes a tensor in that
# position (presumably an attention mask -- TODO confirm).
with torch.no_grad():
    r = model(inputs, None).softmax(1)

print("et len:", len(et))
print("gt:", lang)
top = 5
t5 = r.topk(top)
# Print the `top` most probable classes with their probabilities.
for class_idx, class_prob in zip(t5.indices[0].tolist(), t5.values[0].tolist()):
    print(f"{lang_enum.languages[class_idx]}: {class_prob:.2f}")
print("-"*10 + "ORIGINAL TEXT" + "-"*10)
print(text)
print("-"*10 + "DECODED TOKENS" + "-"*10)
# Decode only the non-padding prefix of what was actually fed to the model.
print(preprocess.decode_text(inputs.tolist()[0][:len(et)]))

et len: 366
gt: TGLANG_LANGUAGE_C
TGLANG_LANGUAGE_C: 0.81
TGLANG_LANGUAGE_CPLUSPLUS: 0.01
TGLANG_LANGUAGE_BISON: 0.01
TGLANG_LANGUAGE_SCHEME: 0.00
TGLANG_LANGUAGE_VALA: 0.00
----------ORIGINAL TEXT----------
  ADD_CNT (write);
  START_TIMER (write);
  if (server->sfd < 0) {
    END_TIMER (write);
    return -1;
  }
  int r = 0;
  int first = 1;
  
  struct pollfd s;
  s.fd = server->sfd;
  s.events = POLLOUT;
  static struct iovec t[3];
  int ss, sf;
  if (server->out_bytes) {
    if (server->out_bytes != RPC_OUT_BUF_SIZE && server->out_rptr <= server->out_wptr) {
      ss = 1;
      t[1].iov_base = server->out_rptr;
      t[1].iov_len = server->out_wptr - server->out_rptr;
    } else {
      ss = 0;
      t[0].iov_base = server->out_rptr;
      t[0].iov_len = server->out_buf + RPC_OUT_BUF_SIZE - server->out_rptr;
      t[1].iov_base = server->out_buf;
      t[1].iov_len = server->out_wptr - server->out_buf;
    }
  } else {
    ss = 2;
  }
  if (buf && buf_len) {
    sf = 3;
    t[2].

# Run on the full dataset

In [None]:
from torch.utils.data import DataLoader, Dataset
import train as mt

# Checkpoint scored over the whole dataset; moved to the GPU and put in eval mode.
MODEL_RESUME = "./logs/2023-10-14_16-11-14_bs=512_epochs=1100/epoch=1086-step=766482.ckpt"
model = mt.LanguageClassifier.load_from_checkpoint(MODEL_RESUME, strict=False)
model = model.cuda().eval()

# Load the path-only cache; `with` guarantees the file handle is closed.
# NOTE: unpickling can execute arbitrary code -- only load trusted cache files.
with open(DF_PATHONLY_CACHE_PATH, "rb") as cache_file:
    pdf = pickle.load(cache_file)
dataset = mt.CodeDataset(pdf, aug=False)
# Despite its name, this loader is only iterated for inference below.
# shuffle=False keeps batches in dataset order.
train_loader = DataLoader(dataset, batch_size=128, shuffle=False, num_workers=32)

In [3]:
import tqdm.autonotebook as tqdm

import torch

# Predicted class index and its softmax probability for every sample, in loader order.
all_preds = []
all_probs = []

# Inference only: no_grad avoids keeping autograd state for each batch.
with torch.no_grad():
    for batch in tqdm.tqdm(train_loader):
        x, am, y = batch
        y_hat = model(x.cuda(), am.cuda())

        probs = y_hat.softmax(dim=1)
        pred_labels = probs.argmax(dim=1)
        # Probability of the predicted class for each row of the batch.
        pred_probs = probs.gather(1, pred_labels.unsqueeze(1)).squeeze(1)

        # One device-to-host transfer per batch instead of two .item() calls per sample.
        all_preds.extend(pred_labels.tolist())
        all_probs.extend(pred_probs.tolist())

  0%|          | 0/15698 [00:00<?, ?it/s]

100%|██████████| 15698/15698 [12:45<00:00, 20.51it/s]


In [5]:
# Attach the predicted class index and its probability to `df` as new columns.
# NOTE(review): the predictions come from the dataset built on `pdf`; this assumes
# `df` holds the same rows in the same order -- confirm.
for column_name, column_values in (("pred", all_preds), ("prob", all_probs)):
    df[column_name] = column_values

In [33]:
# Keep only rows whose top-class probability is below 0.2.
low_confidence_mask = df["prob"].lt(0.2)
tdf = df[low_confidence_mask]

In [34]:
# Number of low-confidence rows.
tdf.shape[0]

12984

In [67]:
from utils import lang_enum
# Set `lang` to a tag name to inspect only samples *predicted* as that language;
# leave it as None to draw from all low-confidence samples.
# lang = "TGLANG_LANGUAGE_SWIFT"
lang = None
if lang is not None:
    # No cell creates a `pred_lang` column, so derive the predicted tag name
    # from the integer `pred` column instead.
    predicted_names = tdf.pred.map(lambda idx: lang_enum.languages[idx])
    candidates = tdf[predicted_names == lang]
    if candidates.empty:
        raise ValueError(f"No samples in tdf are predicted as {lang}")
else:
    candidates = tdf
s = candidates.sample()

print("GT:", s.language_tag.values[0])
print("PRED:", lang_enum.languages[s.pred.values[0]], f"{s.prob.values[0]:.2f}")
print("-"*50)
print(s.code.values[0])

GT: TGLANG_LANGUAGE_GAMS
PRED: TGLANG_LANGUAGE_INI 0.18
--------------------------------------------------
p_ng 641.0000085356635


# Export

In [348]:
# GENERATE MODEL_META.HPP

from utils import vocab

def escape_string(s):
    """Escape ``s`` so it can be placed between double quotes in a C++ string literal.

    Backslash, double quote, newline, tab and carriage return get their usual
    two-character escapes. Any other ASCII control character (below 0x20, or
    0x7f) is written as a three-digit octal escape. Octal is used rather than
    ``\\x`` because a C++ hex escape has no length limit and would absorb
    following hex-digit characters. All other characters are passed through
    unchanged.

    Args:
        s: The raw string.

    Returns:
        The escaped string, without surrounding quotes.
    """
    named_escapes = {
        '\\': '\\\\',
        '"': '\\"',
        '\n': '\\n',
        '\t': '\\t',
        '\r': '\\r',
    }
    pieces = []
    for ch in s:
        if ch in named_escapes:
            pieces.append(named_escapes[ch])
        elif ord(ch) < 0x20 or ord(ch) == 0x7f:
            pieces.append(f"\\{ord(ch):03o}")
        else:
            pieces.append(ch)
    return "".join(pieces)

def generate_cpp_header(strings):
    """Build the text of the C++ header describing the model's vocabulary.

    The header contains scalar constants read from ``utils.vocab``, the
    ``det_int_t`` alias, ``vocab_list`` (the strings in order) and
    ``vocab_map`` (string -> index).

    Args:
        strings: Vocabulary entries in index order. Duplicates appear once in
            ``vocab_map``, mapped to their last index.

    Returns:
        The complete header as a single string.

    Notes:
        * VOCAB_UNK_ID / VOCAB_PAD_ID are computed from ``vocab.vocab_list``,
          not from ``strings``; the two only agree when the caller passes
          ``vocab.vocab_list``.
        * VOCAB_MAX_LEN is the longest entry measured with Python ``len``
          (code points). NOTE(review): this can differ from the byte length
          of the same string in C++ for non-ASCII entries -- confirm how the
          C++ side uses it.
    """
    # default=0 keeps an empty vocabulary from raising inside max().
    max_len = max((len(s) for s in strings), default=0)

    parts = [
        "#pragma once\n\n",
        # <cstdint> declares the fixed-width integer type used by det_int_t below.
        "#include <cstdint>\n",
        "#include <vector>\n",
        "#include <unordered_map>\n",
        "#include <string>\n\n",
        "const int MODEL_MAX_INPUT = " + str(vocab.max_size) + ";\n",
        "const float DETECTION_THRESHOLD = 0.2;\n",
        "const int MAX_LINE_LEN = " + str(vocab.max_line_len) + ";\n",
        "const std::vector<int> SPACES_RANGE = {" + f"{vocab.spaces_range[0]}, {vocab.spaces_range[1]}" + "};\n",
        "const int LETTERS_POSE = " + str(vocab.letters_pose) + ";\n",
        "const int VOCAB_NEW_LINE_ID = " + str(vocab.vocab_dict["\n"]) + ";\n",
        "const int VOCAB_UNK_ID = " + str(len(vocab.vocab_list) - 1) + ";\n",
        "const int VOCAB_PAD_ID = " + str(len(vocab.vocab_list) - 2) + ";\n",
        "const int VOCAB_MAX_LEN = " + str(max_len) + ";\n\n",
        "using det_int_t = std::int64_t;\n\n",
    ]

    # Index -> string table.
    parts.append("const std::vector<std::string> vocab_list = {\n")
    parts.extend('    "' + escape_string(s) + '",\n' for s in strings)
    parts.append("};\n\n")

    # String -> index table.
    vocab_map = {s: i for i, s in enumerate(strings)}
    parts.append("const std::unordered_map<std::string, int> vocab_map = {\n")
    parts.extend(
        '    {"' + escape_string(s) + '", ' + str(i) + '},\n'
        for s, i in vocab_map.items()
    )
    parts.append("};\n\n")

    # Join once instead of growing one string with repeated +=.
    return "".join(parts)

header = generate_cpp_header(vocab.vocab_list)
# Write as UTF-8 explicitly so the generated file does not depend on the
# platform's default encoding if the vocabulary contains non-ASCII entries.
with open("../lib/data/model/model_meta.hpp", 'w', encoding="utf-8") as f:
    f.write(header)

In [347]:
import train as mt

# Checkpoint to export: loaded with strict=False, moved to the CPU, set to eval mode.
MODEL_RESUME = "./logs/2023-10-15_17-15-06_bs=512_epochs=1300/epoch=1114-step=781225.ckpt"
model = mt.LanguageClassifier.load_from_checkpoint(MODEL_RESUME, strict=False).cpu().eval()

In [349]:
# SAVE MODEL TO HF

import os
# Save the wrapped `model.model` with save_pretrained into an "hfmodel"
# directory next to the checkpoint file.
checkpoint_dir = os.path.dirname(MODEL_RESUME)
save_path = os.path.join(checkpoint_dir, "hfmodel")
model.model.save_pretrained(save_path)
print(f"Saved to {save_path}")

Saved to ./logs/2023-10-15_17-15-06_bs=512_epochs=1300/hfmodel


In [None]:
# CONVERT TO TFLITE AND SAVE

import tensorflow as tf
from transformers import TFMobileBertForSequenceClassification

from utils import lang_enum

# Load the saved weights into the TensorFlow MobileBERT classifier (from_pt=True).
tfmodel = TFMobileBertForSequenceClassification.from_pretrained(save_path, from_pt=True)
class TGInferenceModelKeras(tf.keras.Model):
    """Keras wrapper that turns one sequence of token ids into (label, confidence).

    Args:
        tfmodel: The sequence-classification model to wrap; it is called with a
            batch of ids and must return an object with a ``logits`` attribute.
        max_len: Inputs longer than this are truncated to their first
            ``max_len`` ids. Defaults to 256. Shorter inputs are passed through
            as-is; no padding is done here.
    """

    def __init__(self, tfmodel, max_len=256):
        super().__init__()
        self.model = tfmodel
        self.max_len = max_len

    @tf.function(
        input_signature=[
            tf.TensorSpec(
                shape=[None],
                dtype=tf.int64,
                name="inputs",
            ),
        ]
    )
    def call(self, inputs):
        """Classify one 1-D int64 tensor of token ids.

        Returns:
            ``(label, conf)``: the argmax class index and the softmax
            probability of that class.
        """
        inputs = inputs[:self.max_len]
        # Add a leading batch axis of size 1 for the wrapped model.
        inputs = inputs[None]
        logits = self.model(inputs).logits[0]
        label = tf.argmax(logits, axis=-1)
        conf = tf.nn.softmax(logits)[label]
        return label, conf

kerasmodel = TGInferenceModelKeras(tfmodel)

# Smoke test: run the wrapper once on 512 zero ids and print the predicted
# label name and its confidence.
dummy_ids = tf.zeros(shape=512, dtype=tf.int64)
outputs = kerasmodel(dummy_ids)
label, conf = outputs
label_name = lang_enum.languages[label.numpy()]
print("Label: ", label_name, "Confidence: ", conf.numpy())

### TO TFLite
# Configure the converter: default optimizations, float16 as supported type,
# and both built-in TFLite ops and select TF ops allowed.
converter = tf.lite.TFLiteConverter.from_keras_model(kerasmodel)
converter.experimental_new_converter = True
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,
    tf.lite.OpsSet.SELECT_TF_OPS,
]
tflite_model = converter.convert()

# Write the model next to the checkpoint, named after the checkpoint file
# (everything before the first "." in its basename) with a ".tflite" suffix.
checkpoint_dir = os.path.dirname(MODEL_RESUME)
checkpoint_stem = os.path.basename(MODEL_RESUME).split(".")[0]
save_tf_lite_path = os.path.join(checkpoint_dir, checkpoint_stem + ".tflite")
with open(save_tf_lite_path, "wb") as tflite_file:
    tflite_file.write(tflite_model)

print("Converting finished. Saved to: ", os.path.abspath(save_tf_lite_path))

In [351]:
import shutil

# Copy the converted model into the library's data directory. Done in Python
# rather than via `!cp` so the path is never re-parsed by a shell (spaces or
# special characters in it are safe) and the cell works without a `cp` binary.
shutil.copy(save_tf_lite_path, "../lib/data/model/model.tflite");

In [223]:
import tflite_runtime.interpreter as tflite

# Open the converted model with the TFLite runtime and display its signatures.
interpreter = tflite.Interpreter(model_path=save_tf_lite_path)
signature_list = interpreter.get_signature_list()
signature_list

{'serving_default': {'inputs': ['inputs'],
  'outputs': ['output_1', 'output_2']}}