## 1. Lecture et traitement de l’url  

In [10]:
import json
import urllib.parse


def parse_jsonl(chemin: str):
    
    input = []
    with open(chemin, "r", encoding="utf-8") as f:
        for line in f:
            input.append(json.loads(line))
    return input


In [13]:
json = parse_jsonl("input/products.jsonl")

In [23]:
def extract_information_from_url(url: str):

    path = urllib.parse.urlparse(url=url).path.strip("/")   # ex: '12345-red'
    parts = path.split("/")                # ex: ['12345-red']

    product_id = None
    variant = None

    for part in parts:
        # chercher un segment qui commence par un nombre
        if part and part[0].isdigit():
            # séparer ID et variante
            for sep in ["-", "_"]:
                if sep in part:
                    pid, var = part.split(sep, 1)
                    if pid.isdigit():
                        return pid, var

            # cas sans variante
            if part.isdigit():
                return part, None

    return None, None

In [48]:
import re

def extract_info(url):
    path = urllib.parse.urlparse(url).path  # ex: /12345-red
    match = re.search(r"^https?:\/\/[^\/]+\/(?:[^\/]+\/)*?(?P<id>\d+)(?:\?.*[?&]variant=(?P<variant>[^&]+))?", path)

    if match:
        return match.group("id"), match.group("variant")
    return None, None

In [51]:
extract_info(url=json[6]["url"])

(None, None)

In [32]:
json[6]["url"]

'https://web-scraping.dev/product/10?variant=red-5'

## 2. Création des index inversés

In [57]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [74]:
import spacy

nlp = spacy.load("en_core_web_md")

def create_token(title: str):
    doc = nlp(title)

    tokens = [
        token.text
        for token in doc
        if not token.is_stop and not token.is_punct
    ]

    return tokens

In [75]:
create_token(json[0]["title"])

['web-scraping.dev', 'product', 'page', '1']

In [None]:
def create_inverse_index_for_title(doc: list):
    

## 3. Index des reviews

In [70]:
def get_average_rating_reviews(reviews: list):
    """ 
    Compute the average rate of reviews
    """
    
    total_reviews = len(reviews)

    total_rating = 0

    for feedback in reviews:
        total_rating += feedback["rating"]

    return total_rating / total_reviews

In [79]:
from datetime import datetime

def create_review_index(reviews):
    total_reviews = len(reviews)
    if total_reviews == 0:
        return {"total_reviews": 0, "average_rating": 0, "last_rating": None}
    
    average_rating = get_average_rating_reviews(reviews=reviews)
    
    last_rating = reviews[-1]["rating"]
    
    return {
        "total_reviews": total_reviews,
        "average_rating": average_rating,
        "last_rating": last_rating
    }

# Utilisation
index = create_review_index(json[50]["product_reviews"])
print(index)


{'total_reviews': 5, 'average_rating': 4.6, 'last_rating': 4}


In [85]:
def create_all_index_reviews(input: dict):

    all_index = {}

    for i in range(len(input)):

        url = input[i]["url"]
        all_index[url] = create_review_index(input[i]["product_reviews"])

    return all_index

data = {
    "nom": "Alice",
    "age": 30,
    "ville": "Paris"
}

all_index_reviews = create_all_index_reviews(json)

TypeError: object of type 'module' has no len()

In [83]:
import json


def stocker_all_index_reviews(all_index_reviews: dict):

    with open("reviews_index.json", 'w') as file:
        json.dump(all_index_reviews, file, indent=4)

stocker_all_index_reviews(all_index_reviews=all_index_reviews)

## 4. Index des features