## 1. Lecture et préparation 

In [9]:
import json
import urllib.parse


def import_index(chemin: str):
    """Load one JSON index file from disk.

    Args:
        chemin: Path of the JSON file to read; it is opened as UTF-8 text.

    Returns:
        The deserialized content of the file, exactly as ``json.load``
        produces it.
    """
    with open(chemin, mode="r", encoding="utf-8") as index_file:
        return json.load(index_file)

In [12]:
# Load each JSON index file from the input/ directory into a module-level
# variable; the filtering and ranking cells below read these names directly.
brand_index = import_index(chemin="input/brand_index.json")
description_index = import_index(chemin="input/description_index.json")
origin_index = import_index(chemin="input/origin_index.json")
origin_synonyms = import_index(chemin="input/origin_synonyms.json")
reviews_index = import_index(chemin="input/reviews_index.json")
title_index = import_index(chemin="input/title_index.json")

In [13]:
import spacy

# Load the "en_core_web_md" spaCy pipeline once; create_token and
# find_synonyms both call this shared `nlp` object.
nlp = spacy.load("en_core_web_md")

def create_token(title: str):
    """Tokenize a title with spaCy and keep only the meaningful tokens.

    Args:
        title: Raw text to tokenize.

    Returns:
        list[str]: The token texts in document order, excluding stop words,
        punctuation and whitespace-only tokens.
    """
    return [
        token.text
        for token in nlp(title)
        # spaCy emits dedicated tokens for runs of extra whitespace; they are
        # neither stop words nor punctuation, so they must be dropped
        # explicitly or they end up in the token list.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

In [32]:
from spacy_wordnet.wordnet_annotator import WordnetAnnotator
import nltk 

# Download the NLTK "wordnet" package (a no-op when it is already present);
# presumably required by the spacy_wordnet annotator used in find_synonyms.
nltk.download("wordnet")


def find_synonyms(word: str):
    """Print and return the WordNet synonyms of a word.

    Only the first token of ``word`` is looked up. Every synset found for it
    is printed together with its lemma names, and the lemma names are also
    collected and returned.

    Args:
        word: The word to look up.

    Returns:
        list[str]: The lemma names of all synsets, without duplicates, in
        first-seen order. Empty when ``word`` produces no token.
    """
    # NOTE(review): `doc[0]._.wordnet` needs the spacy-wordnet component to be
    # registered on `nlp`; the registration call below is commented out in
    # this notebook - confirm it is done elsewhere before a fresh-kernel run.
    # nlp.add_pipe("wordnet", after="tagger")

    doc = nlp(word)

    # An empty string yields a document with no tokens; doc[0] would then
    # raise IndexError.
    if len(doc) == 0:
        return []

    synonyms = []
    for synset in doc[0]._.wordnet.synsets():
        lemma_names = synset.lemma_names()
        print("Synset :", synset.name())
        print("Synonymes :", lemma_names)
        # The same lemma can appear in several synsets; keep its first
        # occurrence only.
        for lemma in lemma_names:
            if lemma not in synonyms:
                synonyms.append(lemma)

    return synonyms


[nltk_data] Downloading package wordnet to /home/ensai/nltk_data...


In [43]:
# Example lookup: prints every WordNet synset found for "China" together with
# its lemma names.
find_synonyms("China")

Synset : china.n.01
Synonymes : ['China', "People's_Republic_of_China", 'mainland_China', 'Communist_China', 'Red_China', 'PRC', 'Cathay']
Synset : china.n.02
Synonymes : ['china']
Synset : taiwan.n.01
Synonymes : ['Taiwan', 'China', 'Nationalist_China', 'Republic_of_China']
Synset : chinaware.n.01
Synonymes : ['chinaware', 'china']


## 2. Filtrage des documents 

In [38]:
def tokenisation(query: str):
    """Split a query into whitespace-separated tokens.

    Args:
        query: The query string to split.

    Returns:
        list[str]: The non-empty tokens of ``query`` in order; an empty list
        when the query is empty or contains only whitespace.
    """
    # split() with no argument splits on any run of whitespace and never
    # yields empty strings, whereas split(" ") returns '' for every repeated
    # space and [''] for an empty query.
    return query.split()

In [39]:
import unicodedata
import re

def normalize_query(query: str):
    """Normalize a raw query string.

    The query is trimmed and lower-cased, its combining marks (Unicode
    category "Mn") are removed after NFD decomposition, and every run of
    whitespace is collapsed into a single space.

    Args:
        query: The raw query string.

    Returns:
        str: The normalized query.
    """
    lowered = query.strip().lower()
    decomposed = unicodedata.normalize("NFD", lowered)
    # NFD separates base letters from their combining marks, so dropping
    # category "Mn" keeps the unaccented base letters.
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    unaccented = "".join(base_chars)
    return re.sub(r"\s+", " ", unaccented)

# Sanity check: accents are removed, case is lowered and whitespace is
# collapsed.
print(normalize_query("  Éléphant   à Noël  "))
# Expected output: "elephant a noel"

# Normalizing first, then tokenizing, gives one token per word.
print(tokenisation(normalize_query("  Éléphant   à Noël  ")))

elephant a noel
['elephant', 'a', 'noel']


In [47]:
def find_token_in_brand_index(tokens: list):
    """Tell whether at least one token is a key of the brand index.

    Args:
        tokens: The query tokens to look up.

    Returns:
        bool: True as soon as one token is found among the keys of the
        module-level ``brand_index``; False otherwise, including for an
        empty token list.
    """
    return any(token in brand_index.keys() for token in tokens)

In [None]:
def find_token_in_description_index(tokens: list):
    """Tell whether at least one token is a key of the description index.

    Args:
        tokens: The query tokens to look up.

    Returns:
        bool: True as soon as one token is found among the keys of the
        module-level ``description_index``; False otherwise, including for
        an empty token list.
    """
    return any(token in description_index.keys() for token in tokens)

In [50]:
def find_token_in_origin_index(tokens: list):
    """Tell whether at least one token is a key of the origin index.

    Args:
        tokens: The query tokens to look up.

    Returns:
        bool: True as soon as one token is found among the keys of the
        module-level ``origin_index``; False otherwise, including for an
        empty token list.
    """
    return any(token in origin_index.keys() for token in tokens)

In [51]:
def find_token_in_title_index(tokens: list):
    """Tell whether at least one token is a key of the title index.

    Args:
        tokens: The query tokens to look up.

    Returns:
        bool: True as soon as one token is found among the keys of the
        module-level ``title_index``; False otherwise, including for an
        empty token list.
    """
    return any(token in title_index.keys() for token in tokens)

## 3. Ranking

In [92]:
def create_corpus(corpus: list, index):
    """Append the keys of one index to the corpus as a single entry.

    Args:
        corpus: The list to extend; it is modified in place.
        index: A mapping whose keys are collected, in iteration order.

    Returns:
        list: The same ``corpus`` object, now ending with one new list that
        holds every key of ``index``.
    """
    corpus.append([word for word in index.keys()])
    return corpus

In [93]:
# All loaded indexes, in the order in which their key lists are appended to
# the BM25 corpus: position i in the corpus (and in the BM25 score array)
# corresponds to list_index[i].
list_index = [
    brand_index,
    description_index,
    origin_index,
    origin_synonyms,
    reviews_index,
    title_index
]

In [96]:
# NOTE(review): `corpus` is only assigned in the following cell (execution
# count 95, lower than this cell's 96), so this cell raises NameError on a
# fresh top-to-bottom run - it should come after the cell that builds it.
len(corpus)

6

In [95]:
# Build the BM25 corpus: one entry per index, each entry being the list of
# that index's keys. create_corpus appends to `corpus` in place, so its
# return value is not needed here.
corpus = []
for index in list_index:
    create_corpus(corpus=corpus, index=index)

In [97]:
import rank_bm25


# Each BM25 "document" here is the key list of one whole index (see
# create_corpus), so the scores below rank the six indexes themselves against
# the query, one score per entry of list_index.
# NOTE(review): confirm this is the intended granularity for ranking.
bm25 = rank_bm25.BM25Okapi(corpus=corpus)
query = "chocolate italy".split()
results = bm25.get_scores(query=query)

print(results)

[0.         0.2901406  2.08205007 0.         0.         0.68664027]


In [75]:
# NOTE(review): `doc_scores` is not assigned anywhere in the visible notebook;
# this cell presumably relies on leftover kernel state and would raise
# NameError on a fresh run - confirm, then define it or remove the cell.
doc_scores

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [69]:
# Scratch check: the keys of brand_index, once wrapped in list(), are a plain
# Python list.
type(list(brand_index.keys()))

list