In [2]:
import re
import math
import json
from collections import defaultdict, Counter
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional

import pandas as pd
import numpy as np


In [4]:
import requests
import pandas as pd

# irvine coordinates
lat, lng = 33.6846, -117.8265

# API request
response = requests.get(
    "https://www.refugerestrooms.org/api/v1/restrooms/by_location",
    params={"lat": lat, "lng": lng}
)

# req = requests.Request(
#     "GET",
#     "https://www.refugerestrooms.org/api/v1/restrooms/by_location",
#     params={"lat": lat, "lng": lng}
# ).prepare()

# print(req.url)

if response.status_code == 200:
    data = response.json()
    df = pd.DataFrame(data)
    df["doc_id"] = range(len(df))
else:
    print("Download failed:", response.status_code)
    df = pd.DataFrame({
        "name": ["Starbucks", "Library", "Mall"],
        "street": ["123 Main St", "456 College Ave", "789 Center Rd"],
        "city": ["Irvine", "Irvine", "Irvine"],
        "state": ["CA", "CA", "CA"]
    })
    df["doc_id"] = range(len(df))

df.head()

Unnamed: 0,id,name,street,city,state,accessible,unisex,directions,comment,latitude,...,updated_at,downvote,upvote,country,changing_table,edit_id,approved,distance,bearing,doc_id
0,45697,Arby's,76 Corporate Park,Irvine,Ca,True,True,,,33.692565,...,2020-04-14T21:10:59.500Z,0,0,US,False,45697,True,0.550674,2.341482619368,0
1,434,Ralphs Westpark,17605 Harvard Avenue,IRVINE,CA,False,False,Walk into the store and straight down aisle se...,24 Hour store. The bathrooms are checked every...,33.677309,...,2014-02-02T20:49:36.640Z,1,0,US,False,434,True,0.628213,221.835946085132,1
2,48340,Peet's Coffee and Tea,3720-3992 Barranca Pkwy,Irvine,California,True,True,"Right next to the side entrance, labelled ""All...",,33.68499,...,2019-03-22T23:23:49.560Z,0,0,US,False,48340,True,0.907721,88.583641647825,2
3,21711,"2457 Park Ave, Tustin, CA 92782",2437 Park Ave,Tustin,California,False,False,Family restroom stall next to gendered restrooms,,33.697761,...,2016-04-29T01:48:22.212Z,0,0,US,False,21711,True,0.915697,351.891779455553,3
4,39339,Target,2300 Park Ave,Tustin,CA,False,True,Take a left from the front door,,33.700145,...,2018-01-07T01:12:41.976Z,0,1,US,True,39339,True,1.075717,356.142650694003,4


In [5]:
# Stopwords & tokenizer (same as your example)
STOPWORDS = set((
    "a an the and or but if then else for of to in on at by with without from as "
    "is are was were be been being this that these those it its im youre we you "
    "they he she them our your their"
).split())

TOKEN_RE = re.compile(r"[a-z0-9]+")

def tokenize(text: str) -> list[str]:
    text = str(text).lower()
    toks = TOKEN_RE.findall(text)
    return [t for t in toks if t not in STOPWORDS]

def token_positions(tokens: list[str]) -> dict[str,list[int]]:
    pos = defaultdict(list)
    for i, t in enumerate(tokens):
        pos[t].append(i)
    return dict(pos)

# name + description + amenities in one field
df["token_text"] = df["name"].astype(str) + " " + df["comment"].astype(str)
df["tokens"] = df["token_text"].map(tokenize)
df["pos"] = df["tokens"].map(token_positions)

df[["doc_id","name","tokens","pos","token_text"]].head()

Unnamed: 0,doc_id,name,tokens,pos,token_text
0,0,Arby's,"[arby, s]","{'arby': [0], 's': [1]}",Arby's
1,1,Ralphs Westpark,"[ralphs, westpark, 24, hour, store, bathrooms,...","{'ralphs': [0], 'westpark': [1], '24': [2], 'h...",Ralphs Westpark 24 Hour store. The bathrooms a...
2,2,Peet's Coffee and Tea,"[peet, s, coffee, tea]","{'peet': [0], 's': [1], 'coffee': [2], 'tea': ...",Peet's Coffee and Tea
3,3,"2457 Park Ave, Tustin, CA 92782","[2457, park, ave, tustin, ca, 92782]","{'2457': [0], 'park': [1], 'ave': [2], 'tustin...","2457 Park Ave, Tustin, CA 92782"
4,4,Target,[target],{'target': [0]},Target


In [6]:
@dataclass(frozen=True)
class Posting:
    doc_id: int
    tf: int
    positions: Tuple[int, ...]

InvertedIndex = Dict[str, List[Posting]]

def build_inverted_index(df: pd.DataFrame) -> InvertedIndex:
    idx: Dict[str, List[Posting]] = defaultdict(list)
    for row in df.itertuples(index=False):
        doc_id = int(row.doc_id)
        pos_map = row.pos  # dict term -> [positions]
        for term, positions in pos_map.items():
            idx[term].append(Posting(doc_id=doc_id, tf=len(positions), positions=tuple(positions)))
    for term in idx:
        idx[term].sort(key=lambda p: p.doc_id)
    return dict(idx)

inv = build_inverted_index(df)

for t in ["good", "bad", "irvine"]:
    print(t, "->", inv.get(t, [])[:5])

good -> []
bad -> []
irvine -> [Posting(doc_id=6, tf=1, positions=(4,))]


In [7]:
def postings_docs(term: str) -> List[int]:
    return [p.doc_id for p in inv.get(term, [])]

def and_query(terms: List[str]) -> List[int]:
    sets = [set(postings_docs(t)) for t in terms]
    if not sets:
        return []
    return sorted(set.intersection(*sets))

def or_query(terms: List[str]) -> List[int]:
    out = set()
    for t in terms:
        out |= set(postings_docs(t))
    return sorted(out)

def show_docs(doc_ids: List[int], n: int = 5):
    for doc_id in doc_ids[:n]:
        row = df.loc[df["doc_id"] == doc_id].iloc[0]
        print(f"[{doc_id}] {row['comment']}")

q1 = ["project", "meeting"]
hits1 = and_query(q1)
print("AND", q1, "->", hits1); show_docs(hits1)

q2 = ["win", "free", "prize"]
hits2 = or_query(q2)
print("\nOR", q2, "->", hits2); show_docs(hits2)


AND ['project', 'meeting'] -> []

OR ['win', 'free', 'prize'] -> []


In [8]:
def phrase_query(phrase: str) -> List[int]:
    terms = tokenize(phrase)
    if not terms:
        return []
    candidates = and_query(terms)
    if not candidates:
        return []
    results = []
    for doc_id in candidates:
        pos_map = df.loc[df.doc_id == doc_id, "pos"].iloc[0]
        pos_lists = [pos_map.get(t, []) for t in terms]
        base_positions = pos_lists[0]
        offsets = list(range(len(terms)))
        ok = False
        for p in base_positions:
            if all((p + off) in set(pos_lists[i]) for i, off in enumerate(offsets)):
                ok = True
                break
        if ok:
            results.append(doc_id)
    return results

hits = phrase_query("24 hour")
print("24 hour:", hits)
show_docs(hits, n=10)


24 hour: [1]
[1] 24 Hour store. The bathrooms are checked every hour for cleanliness.


In [9]:
N = len(df)
dfreq = {t: len(inv[t]) for t in inv.keys()}

def idf(term: str) -> float:
    return math.log((N + 1) / (dfreq.get(term, 0) + 1)) + 1.0  # smoothed

doc_vecs: List[Dict[str, float]] = [defaultdict(float) for _ in range(N)]
doc_norms = np.zeros(N, dtype=float)

for term, postings in inv.items():
    w_idf = idf(term)
    for p in postings:
        w = (1 + math.log(p.tf)) * w_idf
        doc_vecs[p.doc_id][term] = w

for doc_id in range(N):
    norm = math.sqrt(sum(v*v for v in doc_vecs[doc_id].values()))
    doc_norms[doc_id] = norm if norm > 0 else 1.0

def rank(query: str, top_k: int = 10) -> List[Tuple[int, float]]:
    q_terms = tokenize(query)
    if not q_terms:
        return []
    q_tf = Counter(q_terms)
    q_vec = {t: (1 + math.log(tf)) * idf(t) for t, tf in q_tf.items()}
    q_norm = math.sqrt(sum(v*v for v in q_vec.values())) or 1.0

    scores = []
    for doc_id in range(N):
        dv = doc_vecs[doc_id]
        dot = 0.0
        for term, qw in q_vec.items():
            dw = dv.get(term)
            if dw is not None:
                dot += qw * dw
        score = dot / (q_norm * doc_norms[doc_id])
        if score > 0:
            scores.append((doc_id, score))
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]

results = rank("24", top_k=8)
results


[(1, np.float64(0.30335434132750927))]

In [10]:
for doc_id, score in results:
    row = df.loc[df.doc_id == doc_id].iloc[0]
    print(f"{score:0.3f}  [{doc_id}] {row['comment']}")


0.303  [1] 24 Hour store. The bathrooms are checked every hour for cleanliness.


In [11]:
from pandas.core import base
# Activity #2A (BM25) — ready-to-run baseline (modify/extend)
def proximity_bonus(doc_id: int, q_terms: List[str], max_distance: int = 3) -> float:
  pos_map = df.loc[df.doc_id == doc_id, "pos"].iloc[0]
  bonus = 0.0
  for i in range(len(q_terms) - 1):
      t1, t2 = q_terms[i], q_terms[i+1]
      for p1 in pos_map.get(t1, []):
          for p2 in pos_map.get(t2, []):
            d = p2 - p1
            if d > max_distance: break
            if abs(d) <= max_distance:
                bonus += 0.1
  return bonus


results = []
q_terms = tokenize("24 hour")
q_tf = Counter(q_terms)
q_vec = {t: (1 + math.log(tf)) * idf(t) for t, tf in q_tf.items()}
q_norm = math.sqrt(sum(v*v for v in q_vec.values())) or 1.0

for doc_id in range(N):
    dv = doc_vecs[doc_id]
    dot = sum(q_vec[t]*dv.get(t,0) for t in q_vec)
    score = dot / (q_norm * doc_norms[doc_id])
    score += proximity_bonus(doc_id, q_terms)  # add proximity bonus
    if score > 0:
        results.append((doc_id, score))


results.sort(key=lambda x: x[1], reverse = True)
for doc_id, score in results:
    row = df.loc[df.doc_id == doc_id].iloc[0]
    print(f"{score:0.3f}  [{doc_id}] {row['comment']}")


0.678  [1] 24 Hour store. The bathrooms are checked every hour for cleanliness.
