In [32]:
import torch
import numpy as np
import pickle

In [33]:
import re
from setuptools.namespaces import flatten
from urllib.parse import urlparse, unquote_plus

In [34]:
# Directory holding the fine-tuned model artifacts read by later cells
# (the pickled model and 'mlb.pickle'). Keep the trailing slash: file paths
# are built from it by plain string concatenation.
MODEL_PATH = "models_trained/bertm_5epochs_dropout/"
# Tokenizer name passed to transformers' AutoTokenizer.from_pretrained.
TOKENIZER_NAME = "bert-base-multilingual-uncased"

In [35]:
# Load the pickled model object onto the CPU; the loaded object is used
# directly as the model by later cells (model.eval(), model(...)).
# NOTE(review): torch.load unpickles arbitrary objects -- only load trusted
# files. Unpickling also needs the model's class to be importable and
# compatible with the version that was saved.
# MODEL_PATH already ends with "/", so no extra separator is added here
# (same convention as the 'mlb.pickle' load).
model = torch.load(MODEL_PATH + "model_finetuned.h5", map_location='cpu')

In [36]:
# Restore the pickled `mlb` object stored alongside the model.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
mlb_file = MODEL_PATH + 'mlb.pickle'
with open(mlb_file, 'rb') as pickled_mlb:
    mlb = pickle.load(pickled_mlb)



In [37]:
def preprocess_url(url):
    """Turn a URL into a space-separated string of word-like tokens.

    Steps:
      1. Percent-decode the URL (``+`` becomes a space), then parse it.
      2. Concatenate netloc, path, params and query (scheme and fragment
         are not used).
      3. Split on any of ``- _ % : , / . + =`` or a space.
      4. Split each piece in front of an upper-case letter, unless that
         letter is preceded by another upper-case letter or a non-word
         character (camelCase splitting; all-caps words stay whole).
      5. Drop tokens that contain a digit, tokens of length <= 2, and the
         tokens ``www``, ``html``, ``com``, ``net``, ``org``.

    Args:
        url (str): The URL to preprocess.

    Returns:
        str: The remaining tokens joined by single spaces ('' if none remain).
    """
    # Decoding happens before parsing, so percent-escaped characters are
    # already literal when the URL is split into components.
    parsed = urlparse(unquote_plus(url))

    # NOTE(review): the components are concatenated without a separator, so
    # the last path word and the first query word fuse into a single token
    # ("/search" + "q=1" -> "searchq"). Left unchanged on purpose: the
    # fine-tuned model was presumably trained on text produced this way --
    # confirm before altering the tokenization.
    url_text = ''.join([parsed.netloc, parsed.path, parsed.params, parsed.query])

    # Raw string with the same character set as before; the previous non-raw
    # literal relied on the invalid escape sequences "\." and "\+".
    words = re.split(r'[-_%:,/.+= ]', url_text)

    # camelCase split, flattened with a nested comprehension (no dependency on
    # a setuptools-internal helper). Splitting can yield empty strings; they
    # are removed by the length filter below.
    pieces = [
        piece
        for word in words
        for piece in re.split(r'(?<![A-Z\W])(?=[A-Z])', word)
    ]

    stopwords = {'www', 'html', 'com', 'net', 'org'}
    kept = [
        piece
        for piece in pieces
        if len(piece) > 2
        and not any(ch.isdigit() for ch in piece)
        and piece not in stopwords
    ]
    return ' '.join(kept)

In [38]:
# urls_cleaned = [preprocess_url(url) for url in URLs]
# urls_cleaned

In [39]:
# BertCustomModel()

In [40]:
# import torch
# from utils import *


from transformers import AutoTokenizer

# Sample URLs used to smoke-test the fine-tuned classifier.
URLs = [
    "https://www.logifac.fr/residence/la-residence-gondoles-choisy-le-roi/",
    "https://www.researchgate.net/publication/352563832_MULTILABEL_OVER-SAMPLING_AND_UNDER-SAMPLING_WITH_CLASS_ALIGNMENT_FOR_IMBALANCED_MULTILABEL_TEXT_CLASSIFICATION",
    "https://www.cdiscount.com/bricolage/electricite/batterie-plomb-6v-4ah-ova51023e-pour-toplux/f-16614-ova2009927775303.html",
    "https://www.lequipe.fr/Tennis/TennisFicheJoueur1500000000003017.html"
]

urls_cleaned = [preprocess_url(url) for url in URLs]

# `map_location` is a torch.load argument, not a tokenizer option, so it is
# no longer passed to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Every cleaned URL is padded/truncated to exactly 40 token ids and returned
# as PyTorch tensors together with its attention mask.
inputs = tokenizer(
    urls_cleaned,
    truncation=True,
    add_special_tokens=True,
    max_length=40,
    return_token_type_ids=False,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt",
)

model.eval()
# Inference only: disable autograd so no graph is built for the forward pass.
# NOTE(review): the recorded run of this cell failed with
# "'BertCustomModel' object has no attribute 'concat_hidden_states'";
# presumably the pickled model predates that attribute in the current class
# definition -- confirm and re-save the model (or load a state_dict instead).
with torch.no_grad():
    out = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])
    # Element-wise sigmoid of the model output, converted to a NumPy array.
    pred_probs = torch.sigmoid(out).numpy()

AttributeError: 'BertCustomModel' object has no attribute 'concat_hidden_states'

In [None]:
# Threshold the predicted probabilities at 0.5 to obtain a 0/1 indicator array.
is_positive = pred_probs > 0.5
pred_bools = np.where(is_positive, 1, 0)
pred_bools

In [12]:
# Map each 0/1 indicator row back to its labels via `mlb` and print the result.
predicted_labels = mlb.inverse_transform(pred_bools)
print(predicted_labels)

[(), (), (), ('1077', '294')]


In [18]:
import os

# Collect the saved scores of every trained model found under `path`,
# keyed by the model's directory name.
path = 'models_trained/'
models_scoring = {}
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# resulting table order stable between runs.
for model_dir in sorted(os.listdir(path)):
    scoring_file = os.path.join(path, model_dir, 'model_scoring.pickle')
    # Skip stray entries (e.g. .DS_Store, .ipynb_checkpoints) and runs that
    # have no saved scoring file instead of crashing on open().
    if not os.path.isfile(scoring_file):
        continue
    with open(scoring_file, 'rb') as handle:
        # NOTE(review): element 1 is presumably the final-metrics dict
        # (element 0 looks like the per-epoch history) -- confirm against the
        # code that writes model_scoring.pickle.
        models_scoring[model_dir] = pickle.load(handle)[1]

In [19]:
# Materialize the model names as a real list: dict.keys() only returns a live
# view that cannot be indexed and changes when the dict changes.
list_models = list(models_scoring)

In [21]:
import pandas as pd

# Build the frame with one column per model, then transpose it for display
# so that each model becomes a row and each metric a column.
scores_frame = pd.DataFrame.from_dict(models_scoring)
scores_frame.transpose()

Unnamed: 0,Accuracy,Hamming loss,AUC,F1 score macro,F1 score micro,F1 score weighted
bertm_10epochs_dropout_concat,0.201455,0.008464,0.766455,0.570676,0.645182,0.631545
bertm_5epochs_dropout,0.073055,0.009498,0.611556,0.244681,0.51026,0.406705
bertm_5epochs_dropout_concat,0.187065,0.008338,0.741633,0.528397,0.638434,0.608079
bertm_5epochs_dropout_freezing_concat,0.025142,0.01139,0.537301,0.093941,0.268745,0.197125
bertm_5epochs_nodropout,0.077008,0.00946,0.61398,0.25053,0.515028,0.411921
camembert_5epochs_nodropout,0.047755,0.010287,0.566295,0.134595,0.438133,0.308513


In [49]:
# Load the saved scoring object of the run stored in models_trained/camembert/.
# NOTE(review): this rebinds MODEL_PATH, which earlier cells use to locate the
# model and 'mlb.pickle'; re-running them afterwards picks up this directory.
MODEL_PATH = "models_trained/camembert/"
camembert_scoring_file = MODEL_PATH + 'model_scoring.pickle'
with open(camembert_scoring_file, 'rb') as scoring_fh:
    camembert_scoring = pickle.load(scoring_fh)

In [50]:
# Load the saved scoring object of the run stored in models_trained/bertm/.
# NOTE(review): this rebinds MODEL_PATH, which earlier cells use to locate the
# model and 'mlb.pickle'; re-running them afterwards picks up this directory.
MODEL_PATH = "models_trained/bertm/"
bertm_scoring_file = MODEL_PATH + 'model_scoring.pickle'
with open(bertm_scoring_file, 'rb') as scoring_fh:
    bertm_scoring = pickle.load(scoring_fh)

In [53]:
# Display the scoring object loaded from models_trained/bertm/model_scoring.pickle.
bertm_scoring

[[{'epoch': 1,
   'Training Loss': 7.141457376373552e-05,
   'Valid. Loss': 0.06411134102120158,
   'Accuracy': 0.0,
   'Hamming loss': 0.012787648049970042,
   'AUC': 0.5,
   'F1 score macro': 0.0,
   'F1 score micro': 0.0,
   'F1 score weighted': 0.0},
  {'epoch': 2,
   'Training Loss': 3.888661810616196e-05,
   'Valid. Loss': 0.045712731004263575,
   'Accuracy': 0.01695510849512431,
   'Hamming loss': 0.011514970250991935,
   'AUC': 0.5176809714597462,
   'F1 score macro': 0.03597131995776486,
   'F1 score micro': 0.22330721942342266,
   'F1 score weighted': 0.12964389450532468},
  {'epoch': 3,
   'Training Loss': 2.8293338242128762e-05,
   'Valid. Loss': 0.03623172285014324,
   'Accuracy': 0.037599929719757536,
   'Hamming loss': 0.01040516099143838,
   'AUC': 0.5572896435698722,
   'F1 score macro': 0.11925659326444367,
   'F1 score micro': 0.4058319555087931,
   'F1 score weighted': 0.2772669519586611},
  {'epoch': 4,
   'Training Loss': 2.3403512143986206e-05,
   'Valid. Loss': 