In [279]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m9.1 MB/s[0m  [33m0:00:03[0mm0:00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [280]:
import urllib
import json
import pandas as pd
import spacy
import string
import numpy as np
from tqdm import tqdm

In [281]:
# Load input jsonl

path_in = "input/products.jsonl"
with open(path_in, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)
df.head()

Unnamed: 0,url,title,description,product_features,links,product_reviews
0,https://web-scraping.dev/products,web-scraping.dev product page 1,,{},"[https://web-scraping.dev/, https://web-scrapi...",[]
1,https://web-scraping.dev/product/1,Box of Chocolate Candy,Indulge your sweet tooth with our Box of Choco...,"{'material': 'Premium quality chocolate', 'fla...","[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-07-22', 'id': 'chocolate-candy..."
2,https://web-scraping.dev/product/16,Red Energy Potion,"Elevate your game with our 'Red Potion', an ex...","{'flavor': 'Intense berry fusion', 'caffeine_c...","[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2023-02-10', 'id': 'red-potion-1', ..."
3,https://web-scraping.dev/product/10,Kids' Light-Up Sneakers,Make your child's every step magical with thes...,{'material': 'Breathable fabric upper with syn...,"[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-07-01', 'id': 'kids-light-up-s..."
4,https://web-scraping.dev/product/10?variant=bl...,Kids' Light-Up Sneakers,Make your child's every step magical with thes...,{'material': 'Breathable fabric upper with syn...,"[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-07-01', 'id': 'kids-light-up-s..."


In [282]:
def get_product_id(url:str) -> str:
    '''
    Extracts the product's ID from a page with a given url.
    '''
    url_parsed = urllib.parse.urlparse(url)
    path = url_parsed.path.split('/')
    if path[1] == 'product':
        id = path[2]
    else:
        id = None
    return id

In [283]:
def get_product_variant(url:str) -> list[str]:
    '''
    Extracts the product's variant (if it exists) from a page with a given url.
    '''
    url_parsed = urllib.parse.urlparse(url)
    url_parsed = urllib.parse.urlparse(url)
    query = url_parsed.query
    params = urllib.parse.parse_qs(query)
    if query and 'variant' in params.keys():
        variant = params['variant']
    else:
        variant = None
    return variant

In [284]:
tqdm.pandas()

df['product_id'] = df['url'].progress_apply(get_product_id)
df['product_variant'] = df['url'].progress_apply(get_product_variant)

df.head()

100%|██████████| 156/156 [00:00<00:00, 121687.08it/s]
100%|██████████| 156/156 [00:00<00:00, 171106.54it/s]


Unnamed: 0,url,title,description,product_features,links,product_reviews,product_id,product_variant
0,https://web-scraping.dev/products,web-scraping.dev product page 1,,{},"[https://web-scraping.dev/, https://web-scrapi...",[],,
1,https://web-scraping.dev/product/1,Box of Chocolate Candy,Indulge your sweet tooth with our Box of Choco...,"{'material': 'Premium quality chocolate', 'fla...","[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-07-22', 'id': 'chocolate-candy...",1.0,
2,https://web-scraping.dev/product/16,Red Energy Potion,"Elevate your game with our 'Red Potion', an ex...","{'flavor': 'Intense berry fusion', 'caffeine_c...","[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2023-02-10', 'id': 'red-potion-1', ...",16.0,
3,https://web-scraping.dev/product/10,Kids' Light-Up Sneakers,Make your child's every step magical with thes...,{'material': 'Breathable fabric upper with syn...,"[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-07-01', 'id': 'kids-light-up-s...",10.0,
4,https://web-scraping.dev/product/10?variant=bl...,Kids' Light-Up Sneakers,Make your child's every step magical with thes...,{'material': 'Breathable fabric upper with syn...,"[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-07-01', 'id': 'kids-light-up-s...",10.0,[blue-5]


In [285]:
nlp = spacy.load("en_core_web_md", disable=["parser"])
print("Loading spaCy model...")

Loading spaCy model...


In [286]:
def get_processed_doc(doc:str) -> list[str]:
    '''
    Gives the tokenized version of the given doc, without stopwords and punctuation.
    '''
    text = doc.lower().translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    text = nlp(text.lower())
    tokens = [token.text for token in text if not token.is_stop and not token.is_punct and not token.is_space]
    return tokens

In [287]:
def get_titles_index(df:pd.DataFrame, path_out:str=None):
    '''
    Creates the reversed index for titles.
    '''
    df['processed_title'] = df['title'].progress_apply(get_processed_doc)

    reversed_index = {}
    
    for _, row in df.iterrows():
        current_url = row['url']
        unique_tokens = set(row['processed_title'])

        for token in unique_tokens:
            if token not in reversed_index:
                reversed_index[token] = []
            reversed_index[token].append(current_url)
    
    if path_out:
        with open(path_out, 'w', encoding='utf-8') as f:
            for token in sorted(reversed_index.keys()):
                line = json.dumps({str(token): reversed_index[token]}, ensure_ascii=False)
                f.write(line + "\n")

        print(f"The reversed index title file has been saved ({path_out})!")

get_titles_index(df)

100%|██████████| 156/156 [00:00<00:00, 429.38it/s]


In [288]:
def get_description_index(df:pd.DataFrame, path_out:str=None):
    '''
    Creates the reversed index for description.
    '''
    df['processed_description'] = df['description'].progress_apply(get_processed_doc)

    reversed_index = {}
    
    for _, row in df.iterrows():
        current_url = row['url']
        unique_tokens = set(row['processed_description'])

        for token in unique_tokens:
            if token not in reversed_index:
                reversed_index[token] = []
            reversed_index[token].append(current_url)
    
    if path_out:
        with open(path_out, 'w', encoding='utf-8') as f:
            for token in sorted(reversed_index.keys()):
                line = json.dumps({str(token): reversed_index[token]}, ensure_ascii=False)
                f.write(line + "\n")

    print(f"The reversed index description file has been saved ({path_out})!")

get_description_index(df)

100%|██████████| 156/156 [00:01<00:00, 99.80it/s]

The reversed index description file has been saved (None)!





In [289]:
def get_review_index(df:pd.DataFrame, path_out:str):
    '''
    Creates the index for reviews.
    '''

    index = {}

    for _, row in df.iterrows():
        current_url = row['url']
        all_reviews = row['product_reviews']
        all_ratings = [r['rating'] for r in all_reviews]

        index[current_url] = {}
        index[current_url]['total_reviews'] = len(all_reviews)
        if len(all_reviews) > 0:
            index[current_url]['mean_marks'] = np.mean(all_ratings)
            index[current_url]['last_rating'] = all_ratings[-1]
        else:
            index[current_url]['mean_marks'] = None
            index[current_url]['last_rating'] = None

    print(f"The index review file has been saved ({path_out})!")

    with open(path_out, 'w', encoding='utf-8') as f:
        for token in sorted(index.keys()):
            line = json.dumps({str(token): index[token]}, ensure_ascii=False)
            f.write(line + "\n")

get_review_index(df, "output/reviews_index.jsonl")

The index review file has been saved (output/reviews_index.jsonl)!


In [290]:
def get_brand_index(df:pd.DataFrame, path_out:str):
    '''
    Creates the reversed index for brands.
    '''
    df['brand'] = df['product_features'].progress_apply(lambda x: x['brand'] if 'brand' in x.keys() else None)

    reversed_index = {}
    
    for _, row in df.iterrows():
        current_url = row['url']
        if 'brand' in row['product_features']:
            brand = row['product_features']['brand'].lower()
            if brand not in reversed_index:
                reversed_index[brand] = []
            reversed_index[brand].append(current_url)
    
    with open(path_out, 'w', encoding='utf-8') as f:
        for token in sorted(reversed_index.keys()):
            line = json.dumps({str(token): reversed_index[token]}, ensure_ascii=False)
            f.write(line + "\n")

    print(f"The reversed index brand file has been saved ({path_out})!")

get_brand_index(df, "output/brand_index.jsonl")

100%|██████████| 156/156 [00:00<00:00, 452184.81it/s]

The reversed index brand file has been saved (output/brand_index.jsonl)!





In [291]:
def get_origin_index(df:pd.DataFrame, path_out:str):
    '''
    Creates the reversed index for origin.
    '''
    df['origin'] = df['product_features'].progress_apply(lambda x: x['made in'] if 'made in' in x.keys() else None)

    reversed_index = {}
    
    for _, row in df.iterrows():
        current_url = row['url']
        if 'made in' in row['product_features']:
            origin = row['product_features']['made in'].lower()
            if origin not in reversed_index:
                reversed_index[origin] = []
            reversed_index[origin].append(current_url)
    
    with open(path_out, 'w', encoding='utf-8') as f:
        for token in sorted(reversed_index.keys()):
            line = json.dumps({str(token): reversed_index[token]}, ensure_ascii=False)
            f.write(line + "\n")

    print(f"The reversed index origin file has been saved ({path_out})!")

get_origin_index(df, "output/origin_index.jsonl")

100%|██████████| 156/156 [00:00<00:00, 416228.64it/s]

The reversed index origin file has been saved (output/origin_index.jsonl)!





In [292]:
def get_token_pos(token:str, doc:list['str']):
    '''
    Gives the postions of a token in a document.
    '''
    pos = []
    for i, t in enumerate(doc):
        if t == token:
            pos.append(i)

    return pos

In [293]:
def get_titles_index_with_pos(df:pd.DataFrame, path_out:str):
    '''
    Creates the reversed index for titles with position.
    '''
    df['processed_title'] = df['title'].progress_apply(get_processed_doc)

    reversed_index = {}
    
    for _, row in df.iterrows():
        current_url = row['url']
        unique_tokens = set(row['processed_title'])

        for token in unique_tokens:
            if token not in reversed_index:
                reversed_index[token] = []
            positions = get_token_pos(token, row['processed_title'])
            reversed_index[token].append({current_url: positions})
    
    with open(path_out, 'w', encoding='utf-8') as f:
        for token in sorted(reversed_index.keys()):
            line = json.dumps({str(token): reversed_index[token]}, ensure_ascii=False)
            f.write(line + "\n")

    print(f"The reversed index title with position file has been saved ({path_out})!")

get_titles_index_with_pos(df, "output/title_index.jsonl")

100%|██████████| 156/156 [00:00<00:00, 480.00it/s]

The reversed index title with position file has been saved (output/title_index.jsonl)!





In [294]:
def get_description_index_with_pos(df:pd.DataFrame, path_out:str):
    '''
    Creates the reversed index for description with position.
    '''
    df['processed_description'] = df['description'].progress_apply(get_processed_doc)

    reversed_index = {}
    
    for _, row in df.iterrows():
        current_url = row['url']
        unique_tokens = set(row['processed_description'])

        for token in unique_tokens:
            if token not in reversed_index:
                reversed_index[token] = []
            positions = get_token_pos(token, row['processed_description'])
            reversed_index[token].append({current_url: positions})
    
    with open(path_out, 'w', encoding='utf-8') as f:
        for token in sorted(reversed_index.keys()):
            line = json.dumps({str(token): reversed_index[token]}, ensure_ascii=False)
            f.write(line + "\n")

    print(f"The reversed index description with position file has been saved ({path_out})!")

get_description_index_with_pos(df, "output/description_index.jsonl")

100%|██████████| 156/156 [00:01<00:00, 98.88it/s]

The reversed index description with position file has been saved (output/description_index.jsonl)!



