## 1. Lecture et prÃ©paration 

In [5]:
import json
import urllib.parse


def import_index(path: str):
    """
    This function read a json file.

    Args:
        path (str): The path that leads to the file

    Returns:
        dict: The imported file
    """
    
    with open(path, "r", encoding="utf-8") as f:
        index = json.load(f)
    
    return index

In [2]:
brand_index = import_index(path="input/brand_index.json")
description_index = import_index(path="input/description_index.json")
origin_index = import_index(path="input/origin_index.json")
origin_synonyms = import_index(path="input/origin_synonyms.json")
reviews_index = import_index(path="input/reviews_index.json")
title_index = import_index(path="input/title_index.json")

In [4]:
type(brand_index)

dict

In [6]:
import spacy

nlp = spacy.load("en_core_web_md")

def create_token(query: str):
    doc = nlp(query.lower())

    tokens = [
        token.text
        for token in doc
        if not token.is_stop and not token.is_punct
    ]

    return tokens

In [7]:
from spacy_wordnet.wordnet_annotator import WordnetAnnotator
import nltk 

nltk.download("wordnet")


def find_synonyms(word: str):
    """ 
    This fonction find the list of all synonyms of a given word
    """

    # nlp.add_pipe("wordnet", after="tagger")

    doc = nlp(word)

    for synset in doc[0]._.wordnet.synsets():
        print("Synset :", synset.name())
        print("Synonymes :", synset.lemma_names())


[nltk_data] Downloading package wordnet to /home/ensai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2. Filtrage des documents 

In [13]:
def tokenisation(query: str):

    tokens = query.split(" ")

    return tokens

In [18]:
import unicodedata
import re

def normalize_query(query: str) -> list[str]:
    """
    Normalize a given query (removes special 
    characters, punctuation and spaces)

    Args:
        query (str): The query that we want to normalize

    Returns:
        list[str]: The list of all words of the query
    """

    query = query.strip().lower()
    query = "".join(
        c for c in unicodedata.normalize("NFD", query)
        if unicodedata.category(c) != "Mn"
    )
    query = re.sub(r"\s+", " ", query)
    return query.split(" ")

In [17]:
def find_token_in_brand_index(tokens: list, url: str) -> list[bool]:
    """
    For each word, this function tells if it is in the
    brand of the document.

    Args:
        tokens (list): List of words

        url (str): The url of the document

    Returns:
        list[bool]: List of bool 
    """

    presence_token = []

    for token in tokens: 
        if token in brand_index.keys():
    
            if url in brand_index[token]:
                presence_token.append(True)

            else:
                presence_token.append(False)
        
    return presence_token

In [9]:
find_token_in_brand_index(tokens=["timelessfootwear", "magicsteps"], url="https://web-scraping.dev/product/11")

[True, False]

In [10]:
def find_token_in_description_index(tokens: list, url: str) -> list[bool]:
    """
    For each word, this function tells if it is in the
    description of the document.

    Args:
        tokens (list): List of words

        url (str): The url of the document

    Returns:
        list[bool]: List of bool 
    """

    presence_token = []

    for token in tokens: 
        if token in description_index.keys():
    
            if url in description_index[token]:
                presence_token.append(True)

            else:
                presence_token.append(False)
        
    return presence_token

In [11]:
def find_token_in_origin_index(tokens: list, url: str) -> list[bool]:
    """
    For each word, this function tells if it is in the
    origin of the document.

    Args:
        tokens (list): List of words

        url (str): The url of the document

    Returns:
        list[bool]: List of bool 
    """

    presence_token = []

    for token in tokens: 
        if token in origin_index.keys():
    
            if url in origin_index[token]:
                presence_token.append(True)

            else:
                presence_token.append(False)
        
    return presence_token

In [12]:
def find_token_in_title_index(tokens: list, url: str) -> list[bool]:
    """
    For each word, this function tells if it is in the
    title of the document.

    Args:
        tokens (list): List of words

        url (str): The url of the document

    Returns:
        list[bool]: List of bool 
    """

    presence_token = []

    for token in tokens: 
        if token in brand_index.keys():
    
            if url in brand_index[token]:
                presence_token.append(True)

            else:
                presence_token.append(False)
        
    return presence_token

## 3. Ranking

In [None]:
import rank_bm25

rank_bm25.BM25Okapi

In [14]:
def create_corpus(corpus: list, index):

    words_index = []

    for word in index.keys():
        words_index.append(word)

    corpus.append(words_index)

    return corpus

In [16]:
list_index = [
    brand_index,
    description_index,
    origin_index,
    origin_synonyms,
    reviews_index,
    title_index
]

In [67]:
documents = []
with open("rearranged_products.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        documents.append(json.loads(line))

In [None]:
def get_score_presence_brand(tokens: list, url: str):
    """
    This function computes the score associated with
    the presence of each token in the brand of a given
    document.

    Args:
        tokens (list): List of words

        url (str): The url of the document

    Returns:
        int: The score
    """

    brand_rate = 5
    
    presence_brand = find_token_in_brand_index(
        tokens=tokens,
        url=url
    )

    return brand_rate*sum(presence_brand)

In [2]:
def get_score_presence_description(tokens: list, url: str):
    """
    This function computes the score associated with
    the presence of each token in the description of
    a given document.

    Args:
        tokens (list): List of words

        url (str): The url of the document

    Returns:
        int: The score
    """

    description_rate = 5
    
    presence_description = find_token_in_description_index(
        tokens=tokens,
        url=url
    )

    return description_rate*sum(presence_description)

In [4]:
def get_score_presence_origin(tokens: list, url: str):
    """
    This function computes the score associated with
    the presence of each token in the origin country 
    of a given document.

    Args:
        tokens (list): List of words

        url (str): The url of the document

    Returns:
        int: The score
    """

    origin_rate = 5
    
    presence_origin = find_token_in_origin_index(
        tokens=tokens,
        url=url
    )

    return origin_rate*sum(presence_origin)

In [5]:
def get_score_presence_title(tokens: list, url: str):
    """
    This function computes the score associated with
    the presence of each token in the title of
    a given document.

    Args:
        tokens (list): List of words

        url (str): The url of the document

    Returns:
        int: The score
    """

    title_rate = 5
    
    presence_title = find_token_in_title_index(
        tokens=tokens,
        url=url
    )

    if all(x == True for x in presence_title):
        return title_rate*(sum(presence_title) + 1)

    return title_rate*sum(presence_title)

In [None]:
import rank_bm25

rank_bm25.BM25Okapi

In [6]:
def get_score_rewiews(url: str):
    """
    This function computes the score associated with
    the marks of a given document.

    Args:
        url (str): The url of the document

    Returns:
        int: The score
    """

    review = reviews_index[url]

    return review["mean_mark"] + review["last_rating"]

In [83]:
def get_score_presence_all(query: str, url: str):

    tokens = create_token(query=query)

    score_brand = get_score_presence_brand(
        tokens=tokens,
        url=url
    )

    score_description = get_score_presence_description(
        tokens=tokens,
        url=url
    )

    score_origin = get_score_presence_origin(
        tokens=tokens,
        url=url
    )

    score_title = get_score_presence_title(
        tokens=tokens,
        url=url
    )

    score_reviews = get_score_rewiews(url=url)

    # We compute the score associated

    score = (score_brand +
        score_description +
        score_origin +
        score_title +
        score_reviews
    )

    return score


In [85]:
def get_score_for_all_url(query: str, documents: list):
    
    scores = {}
    
    for document in documents:
        scores[document["url"]] = get_score_presence_all(
            query=query,
            url=document["url"]
        )

    return scores

In [68]:
documents

[{'url': 'https://web-scraping.dev/products',
  'title': 'web-scraping.dev product page 1',
  'description': '',
  'product_features': {'made in': 'switzerland'},
  'links': ['https://web-scraping.dev/',
   'https://web-scraping.dev/products',
   'https://web-scraping.dev/docs',
   'https://web-scraping.dev/api/graphql',
   'https://web-scraping.dev/products',
   'https://web-scraping.dev/reviews',
   'https://web-scraping.dev/testimonials',
   'https://web-scraping.dev/login',
   'https://web-scraping.dev/cart',
   'https://web-scraping.dev/products?category=apparel',
   'https://web-scraping.dev/products?category=consumables',
   'https://web-scraping.dev/products?category=household',
   'https://web-scraping.dev/product/1',
   'https://web-scraping.dev/product/2',
   'https://web-scraping.dev/product/3',
   'https://web-scraping.dev/product/4',
   'https://web-scraping.dev/product/5',
   'https://web-scraping.dev/products?page=1',
   'https://web-scraping.dev/products?page=2',
   'h

In [87]:
query = "Energy drink"

data = get_score_for_all_url(query=query, documents=documents)

sorted_urls = sorted(data, key=data.get, reverse=True)


In [88]:
sorted_urls

['https://web-scraping.dev/product/14',
 'https://web-scraping.dev/product/14?variant=one',
 'https://web-scraping.dev/product/14?variant=six-pack',
 'https://web-scraping.dev/product/15',
 'https://web-scraping.dev/product/15?variant=one',
 'https://web-scraping.dev/product/15?variant=six-pack',
 'https://web-scraping.dev/product/16?variant=six-pack',
 'https://web-scraping.dev/product/2',
 'https://web-scraping.dev/product/26',
 'https://web-scraping.dev/product/26?variant=six-pack',
 'https://web-scraping.dev/product/27?variant=one',
 'https://web-scraping.dev/product/27?variant=six-pack',
 'https://web-scraping.dev/product/28',
 'https://web-scraping.dev/product/28?variant=six-pack',
 'https://web-scraping.dev/product/2?variant=one',
 'https://web-scraping.dev/product/2?variant=six-pack',
 'https://web-scraping.dev/product/3',
 'https://web-scraping.dev/product/3?variant=one',
 'https://web-scraping.dev/product/3?variant=six-pack',
 'https://web-scraping.dev/product/4',
 'https://w