In [1]:
import json
import itertools
import torch
! pip install fairseq sentencepiece annoy

In [2]:
with open("data/nature-de-fait.json") as f:
    nf = json.load(f)

In [46]:
def get_all_childs(codes):
    def _clean_label_codes(codes):
        return [{k:v for k,v in e.items() if k in ["code", "label"]} for e in codes]
        
    enfants = list(itertools.chain.from_iterable([elt.get("enfants", []) for elt in codes]))
    if enfants:
        return _clean_label_codes(codes) + get_all_childs(enfants)
    else:
        return _clean_label_codes(codes)

result = get_all_childs(nf)

In [48]:
from fairseq.models.roberta import CamembertModel
camembert = CamembertModel.from_pretrained('./camembert-base/')
camembert.eval();

In [32]:
def embed(sentence):
    tokens = camembert.encode(sentence)
    # Extract all layer's features (layer 0 is the embedding layer)
    all_layers = camembert.extract_features(tokens, return_all_hiddens=True)
    pooling_layer = all_layers[-2]
    embedded = pooling_layer.mean(1)  # 1 is the dimension you want to average ovber
    # note, using numpy to take the mean is bad if you want to stay on GPU
    return embedded

In [39]:
line = "feu d'usine"
line_2 = "J'aime beaucoup le camembert !"

cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
output = cos(embed(line), embed(line_2))


In [40]:
output

tensor([0.9026], grad_fn=<DivBackward0>)

In [43]:
from tqdm.auto import tqdm

In [53]:
nf_with_embedding = [{**res, **{"embedding":embed(res["label"])}} for res in tqdm(result)]

  0%|          | 0/299 [00:00<?, ?it/s]

In [59]:
from annoy import AnnoyIndex
import random

f = 768
t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
for i, elt in enumerate(nf_with_embedding):
    t.add_item(i, elt["embedding"][0])

t.build(10)

True

In [76]:
line = "Incendie"

output = embed(line)[0]
podium = t.get_nns_by_vector(output, 10)

[nf_with_embedding[p]["label"] for p in podium]

['Incendie',
 'Incident',
 'Explosion',
 'Incendie en milieu industriel',
 'Incendie en milieu agricole',
 'Attentat',
 'Traumatisme / Accident',
 'Infraction aérienne',
 'Accident de chasse',
 'Accident de pêche']

[97, 17, 93, 38, 52, 290, 195, 48, 218, 190]


{'label': 'Homicide ou infanticide', 'code': 'C02.03.00'}

In [77]:
line = "feu"
line_2 = "incendie"

cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
output = cos(embed(line), embed(line_2))
output

tensor([0.4822], grad_fn=<DivBackward0>)

In [74]:
result

[{'label': 'Accident de la circulation', 'code': 'C01.00.00'},
 {'label': 'Atteinte aux personnes', 'code': 'C02.00.00'},
 {'label': 'Atteinte aux biens / animal', 'code': 'C03.00.00'},
 {'label': 'Incendie', 'code': 'C04.00.00'},
 {'label': 'Explosion', 'code': 'C05.00.00'},
 {'label': "Suspicion d'infractions", 'code': 'C06.00.00'},
 {'label': 'Ordre public', 'code': 'C07.00.00'},
 {'label': 'Aléa naturel', 'code': 'C08.00.00'},
 {'label': 'Aléa technologique', 'code': 'C09.00.00'},
 {'label': 'Disparitions et découvertes', 'code': 'C10.00.00'},
 {'label': 'Autre nature de fait', 'code': 'C11.00.00'},
 {'label': 'Accident routier', 'code': 'C01.01.00'},
 {'label': 'Accident ferroviaire', 'code': 'C01.02.00'},
 {'label': 'Accident maritime ou fluvial', 'code': 'C01.03.00'},
 {'label': 'Accident aérien', 'code': 'C01.04.00'},
 {'label': "Autre type d'accident", 'code': 'C01.05.00'},
 {'label': 'Attentat', 'code': 'C02.01.00'},
 {'label': "Prise d'otage", 'code': 'C02.02.00'},
 {'label'

# Fasttext

In [80]:
!pip install fasttext

Collecting fasttext
  Using cached fasttext-0.9.2.tar.gz (68 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.2-cp38-cp38-macosx_10_16_x86_64.whl size=344675 sha256=d91f48210f235c9df9e65ce0219875350e772cf6e6cbbe7d600a7a8f7cdc3df4
  Stored in directory: /Users/raphael/Library/Caches/pip/wheels/93/61/2a/c54711a91c418ba06ba195b1d78ff24fcaad8592f2a694ac94
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2


In [1]:
import fasttext.util
#fasttext.util.download_model('fr', if_exists='ignore')  # English
ft = fasttext.load_model('cc.fr.300.bin')



In [2]:
fasttext.util.reduce_model(ft, 100)
ft.get_dimension()

100

In [3]:
ft.get_nearest_neighbors('usine')

[(0.81749027967453, 'aciérie'),
 (0.7903648614883423, 'usines'),
 (0.7874736785888672, 'Usine'),
 (0.7836554050445557, 'usine-pilote'),
 (0.7748782634735107, 'usine-mère'),
 (0.7568495273590088, 'fonderie'),
 (0.7543993592262268, 'raffinerie'),
 (0.752841591835022, 'mini-usine'),
 (0.7344776391983032, 'minoterie'),
 (0.72261643409729, 'ex-usine')]

In [4]:
ls

Build matching .ipynb  cc.fr.300.bin          [1m[36mdata.ms[m[m/
README.md              cc.fr.300.bin.gz
[1m[36mcamembert-base[m[m/        [1m[36mdata[m[m/


In [7]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.0.1-cp38-cp38-macosx_10_9_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 3.3 MB/s eta 0:00:01
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 11.2 MB/s eta 0:00:01
[?25hCollecting scipy>=0.18.1
  Downloading scipy-1.6.3-cp38-cp38-macosx_10_9_x86_64.whl (30.8 MB)
[K     |████████████████████████████████| 30.8 MB 31.8 MB/s eta 0:00:01
Installing collected packages: smart-open, scipy, gensim
Successfully installed gensim-4.0.1 scipy-1.6.3 smart-open-5.1.0


In [8]:
from gensim.models import FastText




In [9]:
model = FastText.load_fasttext_format('cc.fr.300.bin')

print(model.most_similar('usine'))


  model = FastText.load_fasttext_format('cc.fr.300.bin')


KeyboardInterrupt: 