In [None]:
from dotenv import load_dotenv
# Run python-dotenv's loader before any cell reads os.environ.
# NOTE(review): presumably this supplies the db_* variables used by the
# database cell from a local .env file — confirm the file exists and is not committed.
load_dotenv()

In [None]:
import os
import mysql.connector as connection
import pandas as pd

# Load the most recent row per recipe URL from `dim_recipes` into `df`.
#
# Connection settings come from the environment variables db_host, db_name,
# db_user and db_password. Errors are printed rather than raised (same as
# before), so on failure `df` is left undefined or stale from a previous run.
mydb = None  # bound up front so the cleanup below is safe even if connect() raises
try:
    mydb = connection.connect(
        host=os.environ.get("db_host"),
        database=os.environ.get("db_name"),
        user=os.environ.get("db_user"),
        passwd=os.environ.get("db_password"),
        use_pure=True,
    )
    # `latest_recipes` picks the highest id per url; joining back on that id
    # keeps exactly one (the newest) row for every url.
    query = """
    with latest_recipes as (
        select url as latest_url, max(id) as latest_id 
        from dim_recipes 
        group by url
    )
    select id, ds, url, type, description, directions, ingredients, tags, title, image, serves, time, host 
    from dim_recipes 
    right join latest_recipes as lts 
    on lts.latest_id = dim_recipes.id
    """
    df = pd.read_sql(query, mydb)
    print(df)
except Exception as e:
    print(str(e))
finally:
    # Close exactly once, on success and on failure alike. Previously the
    # except branch called mydb.close() unconditionally, which raised
    # NameError (hiding the real error) whenever connect() itself failed.
    if mydb is not None:
        mydb.close()

In [None]:
# Show the recipes frame loaded by the database cell above.
# NOTE(review): this renders the full frame; df.head() would keep the output small.
df

In [None]:
# Scratch check of str.lower(), which tokenize() relies on.
# NOTE(review): leftover experiment; candidate for removal.
'A'.lower()

In [None]:
import re
def tokenize(sentence):
  """Lower-case ``sentence`` and split it on single space characters.

  Only the literal space is a separator: runs of spaces yield empty-string
  tokens, and tabs or newlines stay inside the neighbouring token.
  """
  lowered = sentence.lower()
  return lowered.split(' ')

def clean_token(token):
  """Return the matched token if it starts and ends with a lowercase ASCII letter.

  The pattern needs at least two characters, so one-letter tokens, empty
  strings and tokens with leading/trailing punctuation or digits give None.
  Characters in the middle are unrestricted (except newlines).
  """
  found = re.match("^([a-z].*[a-z])$", token)
  if found is None:
    return None
  return found.group(0)

def valid_tokens(sentence):
  """Tokenize ``sentence`` and keep only the tokens that clean_token accepts.

  Returns the accepted tokens as a list, in their original order and with
  duplicates preserved.
  """
  cleaned = (clean_token(piece) for piece in tokenize(sentence))
  return [tok for tok in cleaned if tok]

def get_entry(id):
  """Look up the single row of the global ``df`` whose 'id' column equals ``id``.

  Returns that row as yielded by ``DataFrame.iterrows``, or None when the id
  matches no row or more than one row.
  """
  rows = [row for _, row in df[df['id'] == id].iterrows()]
  return rows[0] if len(rows) == 1 else None

In [None]:
import json
# Build the inverted index: token -> list of recipe ids whose title or tags
# contain that token. Rebuilt from scratch so re-running the cell is safe.
# NOTE(review): a token that occurs several times for one recipe appends the
# id once per occurrence, so list lengths count occurrences, not distinct recipes.
inv_index = {}
for _, row in df.iterrows():
  # Named recipe_id (not `id`) so the builtin id() is not rebound in the
  # notebook's global namespace.
  recipe_id = row['id']
  tokens = valid_tokens(row['title'])
  # The 'tags' column is JSON text; every decoded item is tokenized like the title.
  for tag in json.loads(row['tags']):
    tokens.extend(valid_tokens(tag))

  for token in tokens:
    inv_index.setdefault(token, []).append(recipe_id)

In [None]:
class Features:
    """Match counters for one recipe, used to build its ranking key."""

    def __init__(self, title):
        """Store ``title`` and start every counter at zero."""
        self.matches = 0
        self.title = title
        self.ingredient_matches = self.tag_matches = 0
        self.description_matches = self.direction_matches = 0

    def get_key(self):
        """Return the sort key as a tuple.

        Order: overall matches, tag, ingredient, description and direction
        matches, then the title as the last element.
        """
        counters = (
            self.matches,
            self.tag_matches,
            self.ingredient_matches,
            self.description_matches,
            self.direction_matches,
        )
        return counters + (self.title,)

def get_features(entry, entries, query_tokens):
    """Compute the match counters for one recipe.

    Args:
        entry: row-like object indexed by "id", "title", "description",
            "tags", "ingredients" and "directions". The last three are decoded
            with json.loads and iterated (presumably JSON arrays of strings —
            confirm against the table schema).
        entries: mapping of token -> collection of recipe ids; ``matches``
            counts how many of those collections contain this recipe's id.
        query_tokens: collection of tokens; each valid token found in a text
            field that is also in this collection counts as one hit.

    Returns:
        A Features instance carrying the title and the five counters.
    """
    def count_hits(texts):
        # One hit per valid token occurrence that is part of the query.
        return sum(
            1
            for text in texts
            for tk in valid_tokens(text)
            if tk in query_tokens
        )

    recipe_id = entry["id"]
    features = Features(entry["title"])
    features.matches += sum(1 for ids in entries.values() if recipe_id in ids)
    features.description_matches += count_hits([entry["description"]])
    features.tag_matches += count_hits(json.loads(entry["tags"]))
    features.ingredient_matches += count_hits(json.loads(entry["ingredients"]))
    features.direction_matches += count_hits(json.loads(entry["directions"]))
    return features

def likely(prefix):
    """List the indexed tokens that start with ``prefix``.

    Reads the global ``inv_index`` and skips the words 'or', 'the' and 'this'.
    Returns ``(token, posting_count)`` pairs ordered by descending posting
    count, then shorter tokens first, then reverse alphabetical order.
    """
    stop_words = ('or', 'the', 'this')
    candidates = [
        (word, len(ids))
        for word, ids in inv_index.items()
        if word not in stop_words and word.startswith(prefix)
    ]
    return sorted(
        candidates,
        key=lambda pair: (pair[1], -len(pair[0]), pair[0]),
        reverse=True,
    )

def search(query):
    """Rank recipes against a free-text query, auto-completing its last word.

    The last valid query token is treated as a prefix. The query is evaluated
    once per top-3 completion from ``likely`` and once with no completion.
    Each candidate recipe is scored with ``get_features`` and only the
    best-ranked variant of each recipe is kept.

    Args:
        query: raw query string.

    Returns:
        ``(results, query_tokens)``. ``results`` is a list of
        ``(entry, extension, feature_key)`` tuples, best first, where
        ``extension`` is the completion used (None for the plain query).
        ``query_tokens`` is the list of valid tokens from ``query``.
        A query with no valid tokens returns ``([], [])``.
    """
    query_tokens = valid_tokens(query)
    if not query_tokens:
        # Nothing searchable (empty string, one-letter words, punctuation).
        # Indexing query_tokens[-1] below used to raise IndexError here.
        return [], query_tokens

    stop_words = ('or', 'the', 'this')
    extended = [lk for (lk, _) in likely(query_tokens[-1])[:3]]

    with_features = []
    for ext in extended + [None]:
        # None stands for "no completion": the query tokens are used as typed
        # and the placeholder is not added to the token list.
        tokens = query_tokens + ([ext] if ext is not None else [])
        entries = {}
        ids = set()
        for token in tokens:
            if token in stop_words:
                continue
            postings = set(inv_index.get(token, []))
            entries[token] = postings
            ids.update(postings)
        for recipe_id in ids:
            entry = get_entry(recipe_id)
            if entry is None:
                # get_entry gives None when the id is missing or ambiguous in
                # df; skip it rather than crash in get_features.
                continue
            features = get_features(entry, entries, tokens)
            with_features.append((entry, ext, features.get_key()))

    # Best feature key first. On equal keys the plain query (0) beats a
    # completion, and shorter completions beat longer ones.
    with_features.sort(reverse=True, key=lambda e: (e[2], -len(e[1]) if e[1] else 0))

    # Keep only the first (best-ranked) occurrence of every recipe.
    filtered = []
    used = set()
    for (entry, ext, f) in with_features:
        if entry['id'] in used:
            continue
        used.add(entry['id'])
        filtered.append((entry, ext, f))

    return filtered, query_tokens

# Demo query: show the parsed tokens, a compact view of each hit
# ((title, tags), completion used, ranking key) and the number of hits.
result, tokens = search("bowl beef")
(
    tokens,
    [((entry["title"], entry["tags"]), ext, key) for entry, ext, key in result],
    len(result),
)

In [None]:
# Demo: list indexed tokens starting with "te" together with their posting counts.
likely("te")