In [None]:
import pandas as pd
import numpy as np

import re

import glob, json, os

import torch 

from scipy.spatial.distance import euclidean, cosine

from transformers import BertConfig, DebertaTokenizer, DebertaModel, RobertaTokenizer, RobertaModel, MPNetTokenizer, MPNetModel, XLMTokenizer, XLMModel, BertTokenizer, BertModel

### Load Train, Test & Pretrained Models

In [None]:
def clean_text(txt):
    """Normalise a value into lowercase alphanumeric words separated by single spaces.

    The input is converted with ``str()`` and lower-cased, every run of
    characters outside ``A-Z``, ``a-z`` and ``0-9`` is replaced by one space,
    and leading/trailing whitespace is stripped from the result.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

In [None]:
# Read the labelled training table shipped with the competition data.
train_csv = "../input/coleridgeinitiative-show-us-the-data/train.csv"
df = pd.read_csv(train_csv)

# Report (rows, columns), then let the notebook render the first rows.
print(df.shape)
df.head()

In [None]:
# Pool the three label columns, normalise every entry with clean_text and keep
# the distinct results (np.unique returns them sorted).
label_columns = [df.dataset_title, df.dataset_label, df.cleaned_label]
cleaned_labels = [clean_text(raw_label) for raw_label in pd.concat(label_columns)]
vals = np.unique(cleaned_labels)
print(len(vals))
vals[:10]

In [None]:
# One row per unique cleaned label, alongside its whitespace-token count.
token_counts = [len(label.split()) for label in vals]
df_vals = pd.DataFrame({"values": vals, "ntokens": token_counts})
print(df_vals.shape)
df_vals.head()

In [None]:
# Summary statistics of the label table built above (the per-label token counts).
df_vals.describe()

In [None]:
# Use the GPU if one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _load_encoder(tokenizer_cls, model_cls, tokenizer_dir, model_dir):
    """Load a tokenizer and its model from local directories; the model is moved to ``device``."""
    tokenizer = tokenizer_cls.from_pretrained(tokenizer_dir)
    model = model_cls.from_pretrained(model_dir).to(device)
    return tokenizer, model


# deberta
tokenizer_d, model_d = _load_encoder(
    DebertaTokenizer, DebertaModel,
    "../input/deberta2/v2/deberta-base-tok",
    "../input/deberta2/v2/deberta-base-model",
)

# bert base max
tokenizer_bbmax, model_bbmax = _load_encoder(
    BertTokenizer, BertModel,
    "../input/embedders/bert-base-nli-max-tokens-tok",
    "../input/embedders/bert-base-nli-max-tokens-model",
)

# bert base mean
tokenizer_bbmean, model_bbmean = _load_encoder(
    BertTokenizer, BertModel,
    "../input/embedders/bert-base-nli-mean-tokens-tok",
    "../input/embedders/bert-base-nli-mean-tokens-model",
)

# paraphrase mini
tokenizer_pmini, model_pmini = _load_encoder(
    BertTokenizer, BertModel,
    "../input/embedders/paraphrase-MiniLM-tok",
    "../input/embedders/paraphrase-MiniLM-model",
)

# paraphrase distilroberta
tokenizer_pd, model_pd = _load_encoder(
    RobertaTokenizer, RobertaModel,
    "../input/embedders/paraphrase-distilroberta-tok",
    "../input/embedders/paraphrase-distilroberta-model",
)

# paraphrase mpnet
tokenizer_pm, model_pm = _load_encoder(
    MPNetTokenizer, MPNetModel,
    "../input/pmpnet/paraphrase-mpnet-tok",
    "../input/pmpnet/paraphrase-mpnet-model",
)

# roberta
tokenizer_r, model_r = _load_encoder(
    RobertaTokenizer, RobertaModel,
    "../input/embedders/roberta-base-tok",
    "../input/embedders/roberta-base-model",
)

# Print every model configuration, in load order, with a single print call.
print(
    model_d.config,
    model_bbmax.config,
    model_bbmean.config,
    model_pmini.config,
    model_pd.config,
    model_pm.config,
    model_r.config,
)

In [None]:
# Show the word-embedding layer of every loaded model, in load order, on one line.
loaded_models = (model_d, model_bbmax, model_bbmean, model_pmini, model_pd, model_pm, model_r)
print(*[loaded.embeddings.word_embeddings for loaded in loaded_models])

In [None]:
# Show the device reported by every loaded model, in load order, on one line.
print(*[
    loaded.device
    for loaded in (model_d, model_bbmax, model_bbmean, model_pmini, model_pd, model_pm, model_r)
])

In [None]:
def load(json_type: str = "test", verbose: bool = True) -> pd.DataFrame:
    """Load every publication JSON of one split into a cleaned-text DataFrame.

    Parameters
    ----------
    json_type : str
        Name of the sub-directory of the competition input folder to read
        (the notebook calls this with ``"test"``).  The default used to be
        the builtin type ``str``, which can never name a directory, so it is
        now a usable split name.
    verbose : bool
        When true (the default), print each file name as it is read.

    Returns
    -------
    pandas.DataFrame
        One row per ``.json`` file with two columns: ``Id`` (the file name
        without its ``.json`` extension) and ``text`` (all ``text`` entries
        of the file joined with single spaces, then passed through
        ``clean_text``).
    """
    base_path = f"../input/coleridgeinitiative-show-us-the-data/{json_type}"

    # List the directory once so ids and texts come from the same listing,
    # and ignore anything pd.read_json could not parse as a publication.
    file_names = [name for name in os.listdir(base_path) if name.endswith(".json")]

    ids = []
    texts = []
    for file_name in file_names:
        if verbose:
            print(file_name)
        sections = pd.read_json(os.path.join(base_path, file_name))
        # str.strip(".json") removes any of the characters '.', 'j', 's',
        # 'o', 'n' from BOTH ends of the name; splitext drops only the extension.
        ids.append(os.path.splitext(file_name)[0])
        texts.append(clean_text(" ".join(sections["text"])))

    return pd.DataFrame({"Id": ids, "text": texts})

In [None]:
# Load the publications of the "test" split, then show the table's shape and first rows.
df_test = load("test")
print(df_test.shape)
df_test.head()

In [None]:
# Inspect the cleaned text stored under index label 0.
df_test.text[0]

### Build, for each test document, a dictionary of sliding token windows whose length matches each training label

In [None]:
# NOTE: `time` is imported here (not in the top import cell) because later
# timing cells rely on this import as well.
import time

start = time.time()

# empty[doc_index][label] -> list of token windows taken from that document,
# each window exactly as long (in whitespace tokens) as the label itself.
empty = {}

# A label's window length is its own token count; compute it once per label.
label_lengths = {v: len(v.split()) for v in vals}
distinct_lengths = set(label_lengths.values())

for l, t in enumerate(df_test.text):
    splitted = np.array(t.split())

    # The windows depend only on the length n, not on the label text, so they
    # are built once per distinct n instead of once per label.
    windows_by_n = {}
    for n in distinct_lengths:
        # Stop at the last FULL window: iterating over every start position
        # would append n-1 truncated windows at the end of the document,
        # which are shorter than the label they are compared against.
        windows_by_n[n] = [splitted[i:n + i] for i in range(len(splitted) - n + 1)]

    # Every label gets its own list object (same insertion order as `vals`),
    # so changing one label's list cannot affect another label's.
    empty[l] = {v: list(windows_by_n[label_lengths[v]]) for v in vals}

end = time.time()
print(end - start)

In [None]:
# Number of documents and container type, then the labels recorded for document 1.
print(len(empty), type(empty), "\n")
labels_of_doc_1 = empty[1].keys()
print(len(labels_of_doc_1), labels_of_doc_1)

In [None]:
# Sanity check: documents 0 and 2 are keyed by the same set of labels.
empty[0].keys() == empty[2].keys()

In [None]:
# Sanity check: the keys of document 1 are the labels in `vals`, in the same order.
list(vals) == list(empty[1].keys())

In [None]:
# Join each window of document 2 built for one label back into a string, then
# show the rows that reproduce that label exactly.
target_label = "trends in international mathematics and science study"
joined_windows = [" ".join(window) for window in empty[2][target_label]]
df_v = pd.DataFrame(joined_windows, columns=["val"])
print(df_v.shape)
df_v[df_v.val == target_label]

## Models Test

In [None]:
#deberta: tokenizer_d, model_d

#bert-base-max: tokenizer_bbmax, model_bbmax

#bert-base-mean: tokenizer_bbmean, model_bbmean

#paraphrase-mini: tokenizer_pmini, model_pmini

#paraphrase-distilroberta: tokenizer_pd, model_pd

#paraphrase-mpnet: tokenizer_pm, model_pm

#roberta: tokenizer_r, model_r

### paraphrase-mini: tokenizer_pmini, model_pmini

In [None]:
# Encode the reference label as ONE string, i.e. exactly the way the candidate
# windows are encoded further down.  Passing a list of words instead makes
# encode_plus treat them as already-tokenized tokens (per its documented `text`
# parameter), so the query would skip the sub-word splitting that the
# candidates go through and the two sides would not be comparable.
query_text = "trends in international mathematics and science study"
encoded_v = tokenizer_pmini.encode_plus(query_text, padding=True, truncation=True, add_special_tokens=False, return_attention_mask=True, return_tensors="pt")
# Inference only: no gradients are needed for the embedding comparison.
with torch.no_grad():
    output_v = model_pmini(**encoded_v.to(device))

In [None]:
# Encode every candidate window of document 2 (those built for the reference
# label) on its own, run each through the model on `device` (GPU if available),
# and time the whole pass.
s = time.time()

encoded_ = []
for window in empty[2]["trends in international mathematics and science study"]:
    window_text = " ".join(window)
    encoded_.append(tokenizer_pmini.encode_plus(window_text, padding=False, truncation=True, add_special_tokens=False, return_attention_mask=True, return_tensors="pt"))

output_ = []
# Inference only: gradients are not tracked.
with torch.no_grad():
    for encoded_window in encoded_:
        output_.append(model_pmini(**encoded_window.to(device)))

e = time.time()
print(e-s)

In [None]:
# Preview the first two windows of document 2 for the reference label as byte strings.
first_two_windows = empty[2]["trends in international mathematics and science study"][:2]
[" ".join(window).encode() for window in first_two_windows]

In [None]:
# Shape of the query output, shape of the second candidate output, and the
# number of candidate windows for the reference label.
reference_windows = empty[2]["trends in international mathematics and science study"]
(
    output_v[0].cpu().detach().numpy().shape,
    output_[1][0].cpu().detach().numpy().shape,
    len(reference_windows),
)

In [None]:
# Largest cosine distance at which a candidate window is reported as a match.
MAX_COSINE_DISTANCE = 0.05

candidate_windows = empty[2]["trends in international mathematics and science study"]

# Average the query's first model output over axis 1 (assumed to be the token
# axis of a (1, tokens, hidden) array — TODO confirm with the shape cell), then
# drop the leading size-1 axis: scipy's cosine() is specified for 1-D vectors
# and newer SciPy versions reject 2-D input.  Computed once, outside the loop,
# because it does not depend on the candidate.
query_vec = np.average(output_v[0].cpu().detach().numpy(), axis=1).squeeze(axis=0)

for i in range(len(output_)):
    candidate_vec = np.average(output_[i][0].cpu().detach().numpy(), axis=1).squeeze(axis=0)
    d = cosine(query_vec, candidate_vec)
    if d < MAX_COSINE_DISTANCE:
        print(d, candidate_windows[i])