## 1. Lecture et traitement de l’url  

In [1]:
import json
import urllib.parse


def parse_jsonl(chemin: str):
    
    input = []
    with open(chemin, "r", encoding="utf-8") as f:
        for line in f:
            input.append(json.loads(line))
    return input


In [3]:
doc_products = parse_jsonl("input/products.jsonl")

In [3]:
def extract_information_from_url(url: str):

    path = urllib.parse.urlparse(url=url).path.strip("/")   # ex: '12345-red'
    parts = path.split("/")                # ex: ['12345-red']

    product_id = None
    variant = None

    for part in parts:
        # chercher un segment qui commence par un nombre
        if part and part[0].isdigit():
            # séparer ID et variante
            for sep in ["-", "_"]:
                if sep in part:
                    pid, var = part.split(sep, 1)
                    if pid.isdigit():
                        return pid, var

            # cas sans variante
            if part.isdigit():
                return part, None

    return None, None

In [4]:
import re

def extract_info(url):
    path = urllib.parse.urlparse(url).path  # ex: /12345-red
    match = re.search(r"^https?:\/\/[^\/]+\/(?:[^\/]+\/)*?(?P<id>\d+)(?:\?.*[?&]variant=(?P<variant>[^&]+))?", path)

    if match:
        return match.group("id"), match.group("variant")
    return None, None

In [51]:
extract_info(url=json[6]["url"])

(None, None)

In [32]:
json[6]["url"]

'https://web-scraping.dev/product/10?variant=red-5'

## 2. Création des index inversés

In [57]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [50]:
import spacy

nlp = spacy.load("en_core_web_md")

def create_token(title: str):
    doc = nlp(title.lower())

    tokens = [
        token.text
        for token in doc
        if not token.is_stop and not token.is_punct
    ]

    return tokens

In [47]:
"Titre Est".lower()

'titre est'

In [25]:
create_token(doc_products[0]["title"])

['web-scraping.dev', 'product', 'page', '1']

In [7]:
def get_position_from_tokens(tokens: list):

    return [
        (token, position)
        for position, token in enumerate(tokens)
    ]

In [18]:
print(get_position_from_tokens(create_token(json[0]["title"])+ ['product']))

0
1
2
3
1
[('web-scraping.dev', 0), ('product', 1), ('page', 2), ('1', 3), ('product', 4)]


In [37]:
from collections import defaultdict


def create_inverse_index_for_title(documents: dict):
    
    index = defaultdict(lambda: defaultdict(list))

    for document in documents: 
        url = document["url"]
        titre = document["title"]
        tokens = create_token(titre)
        mots_positions = get_position_from_tokens(tokens)

        for mot, position in mots_positions:
            index[mot][url].append(position)

    return index

In [39]:
import json


def stocker_all_index_title(all_index_title: dict):

    with open("title_index.json", 'w') as file:
        json.dump(all_index_title, file, indent=4)

stocker_all_index_title(all_index_title=create_inverse_index_for_title(doc_products))

In [None]:
print(doc_products[0]["title"])

web-scraping.dev product page 1


In [38]:
print(create_inverse_index_for_title(doc_products))

defaultdict(<function create_inverse_index_for_title.<locals>.<lambda> at 0x7ea0d0a6f380>, {'web-scraping.dev': defaultdict(<class 'list'>, {'https://web-scraping.dev/products': [0], 'https://web-scraping.dev/products?category=apparel': [0], 'https://web-scraping.dev/products?category=apparel&page=1': [0], 'https://web-scraping.dev/products?category=apparel&page=2': [0], 'https://web-scraping.dev/products?category=apparel&page=3': [0], 'https://web-scraping.dev/products?category=apparel&page=4': [0], 'https://web-scraping.dev/products?category=apparel&page=5': [0], 'https://web-scraping.dev/products?category=consumables': [0], 'https://web-scraping.dev/products?category=consumables&page=1': [0], 'https://web-scraping.dev/products?category=consumables&page=2': [0], 'https://web-scraping.dev/products?category=consumables&page=3': [0], 'https://web-scraping.dev/products?category=consumables&page=4': [0], 'https://web-scraping.dev/products?category=consumables&page=5': [0], 'https://web-sc

In [40]:
def create_inverse_index_for_description(documents: dict):
    
    index = defaultdict(lambda: defaultdict(list))

    for document in documents: 
        url = document["url"]
        description = document["description"]
        tokens = create_token(description)
        mots_positions = get_position_from_tokens(tokens)

        for mot, position in mots_positions:
            index[mot][url].append(position)

    return index

In [51]:
def stocker_all_index_description(all_index_description: dict):

    with open("description_index.json", 'w') as file:
        json.dump(all_index_description, file, indent=4)

stocker_all_index_description(all_index_description=create_inverse_index_for_description(doc_products))

## 3. Index des reviews

In [52]:
def get_average_rating_reviews(reviews: list):
    """ 
    Compute the average rate of reviews
    """
    
    total_reviews = len(reviews)

    total_rating = 0

    for feedback in reviews:
        total_rating += feedback["rating"]

    return total_rating / total_reviews

In [54]:
from datetime import datetime

def create_review_index(reviews):
    total_reviews = len(reviews)
    if total_reviews == 0:
        return {"total_reviews": 0, "average_rating": 0, "last_rating": None}
    
    average_rating = get_average_rating_reviews(reviews=reviews)
    
    last_rating = reviews[-1]["rating"]
    
    return {
        "total_reviews": total_reviews,
        "average_rating": average_rating,
        "last_rating": last_rating
    }

# Utilisation
index = create_review_index(doc_products[50]["product_reviews"])
print(index)


{'total_reviews': 5, 'average_rating': 4.6, 'last_rating': 4}


In [56]:
def create_all_index_reviews(input: dict):

    all_index = {}

    for i in range(len(input)):

        url = input[i]["url"]
        all_index[url] = create_review_index(input[i]["product_reviews"])

    return all_index

data = {
    "nom": "Alice",
    "age": 30,
    "ville": "Paris"
}

all_index_reviews = create_all_index_reviews(doc_products)

In [57]:
import json


def stocker_all_index_reviews(all_index_reviews: dict):

    with open("reviews_index.json", 'w') as file:
        json.dump(all_index_reviews, file, indent=4)

stocker_all_index_reviews(all_index_reviews=all_index_reviews)

## 4. Index des features

### Les origines

In [62]:
def create_index_origin(documents: list): 

    index_origin = defaultdict(list)

    for document in documents:
        if "made in" in document["product_features"].keys():
            origin = document["product_features"]["made in"].lower()

            index_origin[origin].append(document["url"])

    return index_origin

In [63]:
def stocker_all_index_origin(all_index_origin: dict):

    with open("origin_index.json", 'w') as file:
        json.dump(all_index_origin, file, indent=4)

stocker_all_index_origin(all_index_origin=create_index_origin(doc_products))

### Les marques

In [64]:
def create_index_brand(documents: list): 

    index_brand = defaultdict(list)

    for document in documents:
        if "brand" in document["product_features"].keys():
            brand = document["product_features"]["brand"].lower()

            index_brand[brand].append(document["url"])

    return index_brand

In [66]:
def stocker_all_index_brand(all_index_brand: dict):

    with open("brand_index.json", 'w') as file:
        json.dump(all_index_brand, file, indent=4)

stocker_all_index_brand(all_index_brand=create_index_brand(doc_products))