In [334]:
import urllib
import json
import pandas as pd
import spacy
import string
import numpy as np
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

nltk.download("stopwords")
STOPWORDS = stopwords.words('english')
nlp = spacy.load("en_core_web_md", disable=["parser"])
tqdm.pandas()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sayansuos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [318]:
PATH = "input"
SYNONYMS_PATH = 'origin_synonyms.json'
DOC_FIELD = ['title', 'description', 'origin', 'brand']

In [319]:
def load_json(path_in: str, lines=False) -> dict:
    if lines:
        data = []
        with open(path_in, "r", encoding="utf-8") as f:
            for line in f:
                data.append(json.loads(line))
    else:
        with open(path_in, "r", encoding="utf-8") as f:
            data = json.load(f)
    return data

In [320]:
def load_jsonl_as_df(path_in: str) -> pd.DataFrame:
    """
    Load the 'path_in' JSONL file and turns it into a pandas Dataframe.

    :param path_in: Path of the JSONL file.
    :type path_in: str
    :return: A dataframe corresponding to the JSONL file.
    :rtype: DataFrame
    """
    return pd.read_json(path_in, lines=True)

df = load_jsonl_as_df("input/rearranged_products.jsonl")
df.head()

Unnamed: 0,url,title,description,product_features,links,product_reviews
0,https://web-scraping.dev/products,web-scraping.dev product page 1,,{'made in': 'switzerland'},"[https://web-scraping.dev/, https://web-scrapi...",[]
1,https://web-scraping.dev/product/1,Box of Chocolate Candy,Whether you're looking for the perfect gift or...,"{'material': 'Premium quality chocolate', 'fla...","[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-07-22', 'id': 'chocolate-candy..."
2,https://web-scraping.dev/product/11,Classic Leather Sneakers,Whether you're dressing up for a formal event ...,"{'material': 'Premium genuine leather', 'made ...","[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-06-15', 'id': 'classic-leather..."
3,https://web-scraping.dev/product/11?variant=bl...,Classic Leather Sneakers - Black40,Whether you're dressing up for a formal event ...,"{'material': 'Premium genuine leather', 'made ...","[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-06-15', 'id': 'classic-leather..."
4,https://web-scraping.dev/product/11?variant=bl...,Classic Leather Sneakers - Black41,"Made from premium genuine leather, these sneak...","{'material': 'Premium genuine leather', 'made ...","[https://web-scraping.dev/, https://web-scrapi...","[{'date': '2022-06-15', 'id': 'classic-leather..."


In [321]:
def tokenize(text:str):
    return text.split()

def normalize(tokens:list[str]):
    translator = str.maketrans('', '', string.punctuation)
    return  [x.lower().translate(translator) for x in tokens]
    
def remove_stopwords(tokens:list[str]):
    return [x for x in tokens if x not in STOPWORDS]

In [322]:
def process_doc(doc: str) -> list[str]:
    return remove_stopwords(normalize(tokenize(doc)))

In [323]:
def get_synonyms(token: str) -> list[str]:
    origin_synonyms = load_json(PATH+'/'+SYNONYMS_PATH)
    synonyms = None
    for k, v in origin_synonyms.items():
        if k == token:
            synonyms = origin_synonyms[token]
        if token in v:
            synonyms = origin_synonyms[k]
            synonyms.append(k)
            synonyms.remove(token)
    return synonyms

In [324]:
def process_query(query:str) -> list[str]:
    q = process_doc(query)
    augment = []
    for token in q:
        synonyms = get_synonyms(token)
        if synonyms:
            augment += synonyms
    return q + augment


In [325]:
def get_occ_in_doc(token:str, doc_url:str, field:str) -> bool:
    index = load_json(PATH+'/'+field+'_index.json')
    if token in index.keys() and doc_url in index[token]:
            if isinstance(index[token], dict):
                return len(index[token][doc_url])
            else:
                return 1
    return 0

In [326]:
def get_occ(token:str, field:str) -> bool:
    index = load_json(PATH+'/'+field+'_index.json')
    occ = 0
    if token in index.keys():
            if isinstance(index[token], dict):
                for v in index[token].values():
                    occ += len(v)
            else:
                occ += len(index[token])
    return occ

In [327]:
def is_in_doc(token:str, doc_url:str) -> bool:
    for field in DOC_FIELD:
        index = load_json(PATH+'/'+field+'_index.json')
        if token in index.keys() and doc_url in index[token]:
            return True
    return False

In [328]:
def contain_1_token(query:str, doc_url:str) -> True:
    query = process_query(query)
    return any([is_in_doc(x, doc_url) for x in query])

In [329]:
def contain_all_tokens(query:str, doc_url:str):
    query = process_query(query)
    return all([is_in_doc(x, doc_url) for x in query])

In [362]:
df['title'].progress_apply(lambda x: len(process_doc(x)))

100%|██████████| 156/156 [00:00<00:00, 58931.05it/s]


0      4
1      3
2      3
3      5
4      5
      ..
151    4
152    4
153    4
154    4
155    4
Name: title, Length: 156, dtype: int64

In [None]:
def bm25(query:str, doc_url:str, field:str, b:float=.75, k:float=1.2, df:pd.DataFrame=df):
    query = process_query(query)
    if field in ['title', 'description']:
        len_content = len(process_doc(df.loc[df['url'] == doc_url][field]))
        avg_len_content = df[field].progress_apply(lambda x: len(process_doc(x)))

    bm25 = 0
    for token in query:
        n_docs = len(df['url'])
        doc_freq = sum(df['url'].progress_apply(lambda x: is_in_doc(token, x)))
        idf = np.log(n_docs/doc_freq)
        f = get_occ_in_doc(token, doc_url, field)
        bm25 += idf * (f*(k+1)) / (f + k * (1 - b + b * len_content / avg_len_content))
    return doc_freq

bm25("chocolate", "https://web-scraping.dev/product/1", "title")

100%|██████████| 156/156 [00:00<00:00, 487.40it/s]


21