In [96]:
import multiprocessing as mp

import string
import pickle
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix

In [97]:
language = 'fr'
assert(language in ['en', 'fr', 'es'])

In [98]:
numberbatch = pickle.load(open(f"/data/zeste_cache/numberbatch-{language}-19.08.pickle", 'rb'))

In [99]:
len(numberbatch.vocab)

1409935

In [100]:
! ls datasets/silknow

zeste_material_en_post.csv	       zeste_technique_es_post.csv
zeste_material_es-no-imatex_post.csv   zeste_technique_fr_post.csv
zeste_material_es_post.csv	       zeste_visual_item_es-no-imatex_post.csv
zeste_technique_en_post.csv	       zeste_visual_item_es_post.csv
zeste_technique_es-no-imatex_post.csv


In [101]:
ds_techniques = pd.read_csv('datasets/silknow/zeste_technique_fr_post.csv').rename(columns={'technique':'label'})
ds_techniques.label = ds_techniques.label.str.lower()
ds_techniques.head()

Unnamed: 0,text,label
0,"Elément de tenture, section rectangulaire. Dam...",http://data.silknow.org/vocabulary/168
1,Damas vert à casques et à roses pour deux meub...,http://data.silknow.org/vocabulary/168
2,VELOURS,http://data.silknow.org/vocabulary/379
3,VELOURS,http://data.silknow.org/vocabulary/379
4,VELOURS,http://data.silknow.org/vocabulary/379


In [102]:
ds_techniques.label.value_counts()

http://data.silknow.org/vocabulary/379    10
http://data.silknow.org/vocabulary/168     3
http://data.silknow.org/vocabulary/366     3
Name: label, dtype: int64

In [103]:
len(ds_techniques)

16

In [104]:
def get_word_neighborhood(word, depth=2, allowed_rels='all', keep='top20000', language=language):
    neighborhood = pickle.load(open('/data/zeste_cache/neighborhoods_'+language+'/'+word+'.pickle', 'rb'))
    neighborhood_words = list(neighborhood.keys())
    
    if allowed_rels != 'all':
        for n in neighborhood_words:
            if all(rel not in neighborhood[n]['rels'] for rel in allowed_rels):
                del neighborhood[n]
                continue

    to_visit_next = list(neighborhood.keys())
    
    while depth > 1:
        additions = []
        while len(to_visit_next) > 0:
            w = to_visit_next.pop()
            nn = get_word_neighborhood(w, depth=1, allowed_rels=allowed_rels)
            for ww in nn:
                if ww in neighborhood:
                    neighborhood[ww]['from'].append(w)
                    neighborhood[ww]['rels'].extend(['<>'] + nn[ww]['rels'])
                else:
                    neighborhood[ww] = {}
                    neighborhood[ww]['from'] = [w]
                    neighborhood[ww]['rels'] = nn[ww]['rels']
                    if '/c/'+language+'/'+word in numberbatch and '/c/'+language+'/'+ww in numberbatch:
                        neighborhood[ww]['sim'] = numberbatch.similarity('/c/'+language+'/'+word, '/c/'+language+'/'+ww)
                    else:
                        neighborhood[ww]['sim'] = 0.0
                    additions.append(ww)
        to_visit_next = additions
        depth -= 1
    
    if keep.startswith('top'):
        k = int(keep.split('top')[1])
        all_scores = [neighborhood[kw]['sim'] for kw in neighborhood]
        all_words = list(neighborhood.keys())
        if k < len(all_scores):
            lowest_top = sorted(all_scores, reverse=True)[k]
            for kw in all_words:
                if neighborhood[kw]['sim'] <= lowest_top:
                    del neighborhood[kw]
    
    return neighborhood

In [105]:
def preprocess(doc):
    doc = ''.join(c for c in doc if c not in string.punctuation or c == ' ').lower()
    doc_tokens = doc.split(' ')
    doc_tokens = [w.lower() for w in doc_tokens]
    doc_tokens = [w for w in doc_tokens if w not in stopwords.words('english')]
    doc_tokens = [w.replace('"', '').replace('?', '') for w in doc_tokens]
    doc_tokens = [w.replace('-', '_') for w in doc_tokens if w != '']

    return doc_tokens

In [106]:
def get_words_neighborhood(words, depth=2, allowed_rels='all', keep='top20000', language=language):
    words = words.split(';')
    ns = []
    
    for word in words:
        ns.append(get_word_neighborhood(word, depth=depth, allowed_rels=allowed_rels, keep=keep, language=language))
    neighborhood = ns[0].copy()
    
    for w, nn in zip(words[1:], ns[1:]):
        for ww in nn:
            if ww in neighborhood:
                neighborhood[ww]['from'].append(w)
                neighborhood[ww]['rels'].extend(['<>'] + nn[ww]['rels'])
                neighborhood[ww]['sim'] = max(neighborhood[ww]['sim'], nn[ww]['sim'])
            else:
                neighborhood[ww] = {}
                neighborhood[ww]['from'] = [w]
                neighborhood[ww]['rels'] = nn[ww]['rels']
                neighborhood[ww]['sim']  = nn[ww]['sim']

    return neighborhood

In [107]:
sorted(ds_techniques.label.unique())

['http://data.silknow.org/vocabulary/168',
 'http://data.silknow.org/vocabulary/366',
 'http://data.silknow.org/vocabulary/379']

In [108]:
get_word_neighborhood('rose', 1, 'all').keys()

dict_keys(['rose', 'rhodologue', 'saponaire_de_montpellier', 'ornement', 'rosifier', 'saumon', 'sexualité', 'roser', 'fronteval', 'vulve', 'picpoul', 'commune', 'primerose', 'bébé_cadum', 'lunel', 'vin_rosé', 'clairette', 'rosiériste', 'rose_noisette', 'nacarat', 'politique', 'flamant_des_caraïbes', 'cathédrale', 'fuchsia', 'clitopile', 'frangipanier', 'pointue', 'guitare', 'adénostyle_à_feuilles_blanches', 'italie', 'gesse_tubéreuse', 'feuille', 'œillette', 'anémone_de_mer_verte', 'abaque', 'flamant_du_chili', 'rosier', 'sureau_hièble', 'épineux', 'trèfle_alpin', 'enjugeraie', 'cuisse_de_nymphe', 'liseron', 'liseron_de_biscaye', 'roses', 'vitrail', 'bignone_rose', 'liseron_des_dunes', 'bruyère_cendrée', 'pélican_blanc', 'rosalie', 'ciste', 'cosenza', 'rose_des_sables', 'transept', 'par_dessus', 'bijouterie', 'rosette', 'bruyère_quaternée', 'rosace', 'nénufar', 'pêche', 'persicaire', 'flamant_rose', 'calabre', 'rhodomite', 'plat', 'oignon', 'vipérine_commune', 'rosacée', 'rhodographie'

In [109]:
"""mapping = {  'brocade': 'brocade',
             'brocatelle': 'brocatelle',
             'chiné': 'chiné',
             'damask': 'damask',
             'embroidery': "embroidery",
             'florentine': "florentine",
             'gros': "gros",
             'jacquard weave': "jacquard_loom",
             'lampas': "lampas",
             'moiré' : 'moiré',
             'pattern weft': "weft",
             'plain': "plain",
             'velvet': "velvet"}"""

mapping = { 
             'http://data.silknow.org/vocabulary/168': 'damas',
             'http://data.silknow.org/vocabulary/379': 'velours',
             'http://data.silknow.org/vocabulary/259': 'taffetas',
             'http://data.silknow.org/vocabulary/355': 'mousseline',
             'http://data.silknow.org/vocabulary/366': 'satin' }

labels = sorted(set(mapping.values()))
print(len(labels))
print(labels)

5
['damas', 'mousseline', 'satin', 'taffetas', 'velours']


In [110]:
%%time
pbar = tqdm(labels)

labels_cgr = {}
for label in pbar:
    pbar.set_description(label)
    labels_cgr[label] = get_words_neighborhood(label, 2, 'all', keep='top20000')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


CPU times: user 1.83 s, sys: 116 ms, total: 1.95 s
Wall time: 1.94 s


In [111]:
{l:len(labels_cgr[l]) for l in labels_cgr}

{'damas': 6066,
 'mousseline': 4509,
 'satin': 7550,
 'taffetas': 3585,
 'velours': 4190}

In [112]:
labels_cgr.keys()

dict_keys(['damas', 'mousseline', 'satin', 'taffetas', 'velours'])

In [113]:
def get_document_score(doc, label_neighborhood):
    # tokens = preprocess(doc)
    if type(doc) == str:
        doc = doc.split(' ')
    tokens = doc
    related_words = []
    score = 0
    for token in tokens: 
        if token in label_neighborhood:
            similarity = label_neighborhood[token]['sim']
            if similarity > 0:
                related_words.append((token, label_neighborhood[token]['rels'][0], similarity))
                score += similarity
        
    return score# , sorted(related_words, key=lambda t: -t[-1])

In [114]:
ds_techniques.text[0]

"Elément de tenture, section rectangulaire. Damas 3 couleurs fond bleu, liseré en 2 couleurs d’or, dessin à compartiments de feuilles de lierre, couronne de roses, papillon, jatte de fruit, gerbe de blé.\n\nHistorique :\nTissu exécuté par Fournel, à Lyon, pour le premier salon de l’Empereur au château de Versailles. \nLa commande ne comprenait que des étoffes pour tenture sans bordures et le fabriquant s'engageait à les tisser sur régulateur. Ce damas ne fut pas utilisé sous l'Empire mais une partie des étoffes fut employée pour les Tuileries, entre 1815 et 1845, puis à Saint-Cloud en 1845, et enfin au Grand Trianon en 1874. En 1910, 1929 et 1933, plusieurs métrages en furent envoyés à Malmaison."

In [115]:
data_silknow = ds_techniques.text.tolist()

In [116]:
print("Number of processors: ", mp.cpu_count())

Number of processors:  32


In [117]:
%%time
with mp.Pool(mp.cpu_count()) as pool:
    data_preprocessed = pool.map(preprocess, ds_techniques.text)

CPU times: user 17.4 ms, sys: 2.61 s, total: 2.62 s
Wall time: 2.72 s


In [118]:
def scoring_silknow(doc, labels_ns=labels_cgr):
    return np.argmax([get_document_score(doc, labels_ns[l]) for l in sorted(labels_ns.keys())])

In [119]:
def generate_predictions(corpus_preprocessed, labels_cgr):
    with mp.Pool(processes=mp.cpu_count()) as pool:
        predictions = pool.map(scoring_silknow, corpus_preprocessed)
    return predictions

In [120]:
%%time
predictions = generate_predictions(data_preprocessed, labels_cgr)

CPU times: user 37.6 ms, sys: 2.49 s, total: 2.53 s
Wall time: 2.47 s


In [121]:


predictions[:10]

[1, 0, 4, 4, 4, 4, 4, 4, 2, 2]

In [122]:

prediction_labels = [labels[p] for p in predictions]

In [123]:
set(prediction_labels)

{'damas', 'mousseline', 'satin', 'velours'}

In [124]:
true_labels = [mapping[l] for l in ds_techniques.label.tolist()]

# Techniques

In [125]:
print(classification_report(prediction_labels, true_labels, digits=3))

              precision    recall  f1-score   support

       damas      0.667     1.000     0.800         2
  mousseline      0.000     0.000     0.000         1
       satin      0.667     1.000     0.800         2
     velours      1.000     0.909     0.952        11

    accuracy                          0.875        16
   macro avg      0.583     0.727     0.638        16
weighted avg      0.854     0.875     0.855        16



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [126]:
cm = confusion_matrix(prediction_labels, true_labels)
cm_df = pd.DataFrame(cm, columns=labels, index=labels)
plt.figure(figsize = (5,5))
sns.heatmap(cm_df)

ValueError: Shape of passed values is (4, 4), indices imply (5, 5)

# "Top-k" evaluation

In [127]:
k = 2
def scoring_silknow_top_n(doc, labels_ns=labels_cgr, n=k):
    return np.argsort([get_document_score(doc, labels_ns[l]) for l in sorted(labels_ns.keys())])[:-(n+1):-1]
def generate_top_predictions(corpus_preprocessed, labels_cgr):
    with mp.Pool(processes=mp.cpu_count()) as pool:
        predictions = pool.map(scoring_silknow_top_n, corpus_preprocessed)
    return predictions
true_labels = [mapping[l] for l in ds_techniques.label.tolist()]
topk_predictions_lists = generate_top_predictions(data_preprocessed, labels_cgr)
prediction_labels_lists = [[labels[l] for l in p] for p in topk_predictions_lists]
topk_predictions = []

for i, gt_label in enumerate(true_labels):
    topk_predictions.append(gt_label if gt_label in prediction_labels_lists[i] else prediction_labels_lists[i][0])
print(classification_report(topk_predictions, true_labels, digits=3))

              precision    recall  f1-score   support

       damas      1.000     1.000     1.000         3
       satin      1.000     1.000     1.000         3
     velours      1.000     1.000     1.000        10

    accuracy                          1.000        16
   macro avg      1.000     1.000     1.000        16
weighted avg      1.000     1.000     1.000        16



In [128]:
k = 3
def scoring_silknow_top_n(doc, labels_ns=labels_cgr, n=k):
    return np.argsort([get_document_score(doc, labels_ns[l]) for l in sorted(labels_ns.keys())])[:-(n+1):-1]
def generate_top_predictions(corpus_preprocessed, labels_cgr):
    with mp.Pool(processes=mp.cpu_count()) as pool:
        predictions = pool.map(scoring_silknow_top_n, corpus_preprocessed)
    return predictions
true_labels = [mapping[l] for l in ds_techniques.label.tolist()]
topk_predictions_lists = generate_top_predictions(data_preprocessed, labels_cgr)
prediction_labels_lists = [[labels[l] for l in p] for p in topk_predictions_lists]
topk_predictions = []

for i, gt_label in enumerate(true_labels):
    topk_predictions.append(gt_label if gt_label in prediction_labels_lists[i] else prediction_labels_lists[i][0])
print(classification_report(topk_predictions, true_labels, digits=3))

              precision    recall  f1-score   support

       damas      1.000     1.000     1.000         3
       satin      1.000     1.000     1.000         3
     velours      1.000     1.000     1.000        10

    accuracy                          1.000        16
   macro avg      1.000     1.000     1.000        16
weighted avg      1.000     1.000     1.000        16

