In [8]:
# !pip install gradio pandas

In [9]:
"""
BakedBot AI: Cannabis Strain Recommendation (RAG using BM25)
- Indexes text fields (dominant_terpenes, flavor_notes, effects) with BM25
- Retrieves top 3 strains for a free-text query
- Ranks with BM25 and a gentle review-based boost
- Generates a short recommendation blurb per result
- Provides a small Gradio UI for exploring the recommendations
"""
import re
import math
import pandas as pd
from collections import Counter
import gradio as gr


In [10]:
# Location of the strains dataset (CSV).
CSV_PATH = "sample_cannabis_strains.csv"

# CSV columns whose text is concatenated and indexed for retrieval.
TEXT_FIELDS = [
    "dominant_terpenes",
    "flavor_notes",
    "effects",
]

# Query-side synonym tables: base term -> words that signal the same intent.
# Terpene names and the flavour words commonly associated with them.
TERP_SYNONYMS = {
    "myrcene": ["myrcene"],
    "limonene": ["limonene", "citrus"],
    "pinene": ["pinene", "pine"],
    "caryophyllene": ["caryophyllene", "pepper", "spicy"],
    "humulene": ["humulene", "hoppy"],
    "bisabolol": ["bisabolol"],
}

# Desired effects and the everyday words users type for them.
EFFECT_SYNONYMS = {
    "sleep": ["sleep", "sleepy", "insomnia"],
    "anxiety": ["anxiety", "calm", "relax", "relaxed", "stress"],
    "focus": ["focus", "focused"],
    "energy": ["energy", "energetic", "uplifted"],
    "happy": ["happy", "euphoric"],
    "creative": ["creative"],
    "pain": ["pain", "analgesic"],
}

In [11]:
# Core retrieval: BM25
def bm25(corpus, query, k1=1.5, b=0.75):
    """
    Compute BM25 scores for a tokenized query against a tokenized corpus.

    Arguments:
        corpus: list[list[str]] where each inner list is a tokenized document.
        query:  list[str] tokenized query terms. A term that appears more than
                once in the query contributes once per occurrence.
        k1, b:  Standard BM25 hyperparameters (term-frequency saturation and
                document-length normalisation strength).
    Returns:
        list[float]: BM25 score per document in same order as corpus.
        An empty corpus yields an empty list.
    """
    n_docs = len(corpus)
    if n_docs == 0:
        # Nothing to score; also avoids dividing by zero below.
        return []

    avgdl = sum(len(doc) for doc in corpus) / n_docs
    if avgdl == 0:
        # Every document is empty, so no term can match anything.
        return [0.0] * n_docs

    term_counts = [Counter(doc) for doc in corpus]

    # Document frequency is only needed for terms that occur in the query,
    # so skip building it for the whole vocabulary.
    doc_freq = {
        term: sum(1 for counts in term_counts if term in counts)
        for term in set(query)
    }
    # Inverse document frequency ("+ 1" inside the log keeps it non-negative).
    idf = {
        term: math.log((n_docs - n_hits + 0.5) / (n_hits + 0.5) + 1)
        for term, n_hits in doc_freq.items()
        if n_hits
    }

    scores = []
    for doc, counts in zip(corpus, term_counts):
        # Length normalisation is the same for every term of this document.
        length_norm = 1 - b + b * len(doc) / avgdl
        score = 0.0
        for term in query:
            tf = counts.get(term, 0)
            if tf:
                numerator = tf * (k1 + 1)
                denominator = tf + k1 * length_norm
                score += idf[term] * (numerator / denominator)
        scores.append(score)
    return scores

In [12]:
# Tokenization & expansion

# Matches maximal runs of ASCII letters and digits; everything else separates tokens.
token_re = re.compile(r"[a-zA-Z0-9]+")

def tokenize(text: str):
    """
    Split text into lowercase ASCII-alphanumeric tokens.
    Falsy input (None or "") produces an empty list.
    Example: "Berry, Earthy" -> ["berry","earthy"]
    """
    source = text if text else ""
    return [match.group(0).lower() for match in token_re.finditer(source)]

def expand_terms(tokens):
    """
    Light synonym expansion: if a token matches a known key or synonym
    (Example: "sleep" or "insomnia"), include the base term and all of its
    synonyms too. This gently improves recall without heavy "NLP" dependencies.

    Arguments:
        tokens: list[str] of query tokens (as produced by tokenize).
    Returns:
        list[str]: de-duplicated terms. The original tokens come first in
        their original order, followed by any added terms in synonym-table
        order, so the result is the same on every run.
    """
    original = set(tokens)  # O(1) membership instead of scanning the list

    # dict.fromkeys drops duplicates while keeping first-seen order.
    expanded = list(dict.fromkeys(tokens))
    seen = set(expanded)

    for synonym_map in (TERP_SYNONYMS, EFFECT_SYNONYMS):
        for base, alts in synonym_map.items():
            # Only the tokens the user typed trigger an expansion; terms
            # added by an earlier group never trigger a further one.
            if base in original or not original.isdisjoint(alts):
                for term in (base, *alts):
                    if term not in seen:
                        seen.add(term)
                        expanded.append(term)
    return expanded

In [13]:
# Data loading & corpus build
def load_data(csv_path: str) -> pd.DataFrame:
    """
    Read the strains CSV and coerce the numeric columns to numbers.

    Expected columns: strain_name, type, thc_pct, cbd_pct, dominant_terpenes,
                      flavor_notes, effects, avg_review_score, review_count
    Numeric columns that are absent are skipped; values that cannot be
    parsed as numbers become NaN.
    """
    numeric_columns = ("thc_pct", "cbd_pct", "avg_review_score", "review_count")

    frame = pd.read_csv(csv_path)
    present = [name for name in numeric_columns if name in frame.columns]
    for name in present:
        frame[name] = pd.to_numeric(frame[name], errors="coerce")
    return frame

def row_to_doc(row: pd.Series) -> str:
    """
    Concatenate the text fields we want to index (TEXT_FIELDS) into a single
    space-separated string (later tokenized).

    Missing values (None / NaN, e.g. from empty CSV cells) are skipped so
    that the literal word "nan" never ends up in the index.
    """
    parts = []
    for field in TEXT_FIELDS:
        value = row.get(field, "")
        if value is None:
            continue
        # Strings are never "missing"; only run the NaN check on other scalars.
        if not isinstance(value, str) and pd.isna(value):
            continue
        parts.append(str(value))
    return " ".join(parts)

In [14]:
# Scoring and Ranking

def review_boost(row: pd.Series, alpha: float = 0.25, beta: float = 0.25) -> float:
    """
    Compute a multiplicative boost based on review quality & volume.
    - avg_review_score -> score_norm: 3.0 or below -> 0.0, 5.0 -> 1.0
      (below-neutral scores are clamped to 0, so they never penalise)
    - review_count -> log-scaled against the dataset maximum, clamped to 0..1
    Boost = (1 + alpha * score_norm) * (1 + beta * count_norm)

    Missing, non-numeric or NaN inputs are treated as 0, so the result is
    always a finite number and never below 1.0 for non-negative alpha/beta.

    Arguments:
        row:   mapping-like record exposing .get() for "avg_review_score"
               and "review_count".
        alpha: weight of the review-score component.
        beta:  weight of the review-count component.
    Returns:
        float: the multiplicative boost.
    """
    def _finite(value) -> float:
        # None, unparsable values, NaN and infinities all collapse to 0.0.
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        return number if math.isfinite(number) else 0.0

    score = _finite(row.get("avg_review_score", 0))
    # Negative counts make no sense and would break log1p; floor at 0.
    count = max(0.0, _finite(row.get("review_count", 0)))

    # Map 3.0 (neutral) -> 0.0, 5.0 -> 1.0
    score_norm = max(0.0, (score - 3.0) / 2.0)

    # The dataset-wide maximum is a module-level value set after the data is
    # loaded. Fall back to 1.0 if it has not been defined yet.
    try:
        dataset_max = _MAX_REVIEW_COUNT
    except NameError:
        dataset_max = 1.0
    max_count = max(1.0, _finite(dataset_max))

    # Log-scale review counts to 0..1; the clamp only matters when a count
    # exceeds the recorded maximum.
    count_norm = min(1.0, math.log1p(count) / math.log1p(max_count))

    return (1 + alpha * score_norm) * (1 + beta * count_norm)

def score_query(query_text: str, df: pd.DataFrame, corpus: list[list[str]], top_k: int = 3) -> list[dict]:
    """
    Score a free-text query against the corpus and return the best results.

    Arguments:
        query_text: the user's free-text query.
        df:         strains dataframe; row i must correspond to corpus[i].
        corpus:     tokenized documents, one per dataframe row.
        top_k:      how many results to return (default 3).
    Returns:
        list[dict]: up to top_k dicts, best first, each holding the strain
        metadata plus "rank", "bm25", "boost", "final_score" and
        "recommendation".
    """
    # Tokenize & lightly expand query
    q_tokens = expand_terms(tokenize(query_text))

    # BM25 over our prebuilt tokenized corpus
    base_scores = bm25(corpus, q_tokens)

    # Pair every result with its exact score. Sorting on the exact value
    # avoids artificial ties that the 4-decimal rounding used for display
    # would create.
    scored = []
    for i, s in enumerate(base_scores):
        row = df.iloc[i]
        b = review_boost(row)
        final = s * b
        # NaN compares false with everything, which makes sort order
        # undefined; push such rows to the bottom instead.
        sort_key = float("-inf") if math.isnan(final) else final
        scored.append((sort_key, {
            "rank": None,
            "strain_name": row.get("strain_name"),
            "type": row.get("type"),
            "thc_pct": row.get("thc_pct"),
            "cbd_pct": row.get("cbd_pct"),
            "bm25": round(float(s), 4),
            "boost": round(float(b), 4),
            "final_score": round(float(final), 4),
            "dominant_terpenes": row.get("dominant_terpenes"),
            "flavor_notes": row.get("flavor_notes"),
            "effects": row.get("effects"),
            "avg_review_score": row.get("avg_review_score"),
            "review_count": row.get("review_count"),
        }))

    # Stable sort: rows with equal scores keep their dataset order.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    # Fill in rank + generate a short blurb (only for the rows we return)
    top = [item for _, item in scored[:max(0, top_k)]]
    for r_idx, r in enumerate(top, start=1):
        r["rank"] = r_idx
        r["recommendation"] = generate_recommendation(r, query_text)
    return top

In [15]:
# Generate Recommendation
def generate_recommendation(item: dict, query_text: str) -> str:
    """
    Produce a short, readable explanation tying the result to the query:
      - references terpenes/effects and THC/CBD
      - calls out common intents (sleep, anxiety, focus, energy, creativity)
      - mentions review summary for credibility

    Arguments:
        item:       result dict; must contain "strain_name", "type",
                    "avg_review_score" and "review_count". The keys
                    "dominant_terpenes", "effects", "thc_pct" and "cbd_pct"
                    are read with .get().
        query_text: the user's original query (None is treated as empty).
    Returns:
        str: "<summary>. [<Hints>.] Avg review ..." as a single line.
    """
    def _text(value) -> str:
        # Missing values (None, or float NaN from pandas) become "" so they
        # neither crash .lower() nor show up as the word "nan".
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    hints = []
    q = (query_text or "").lower()
    terp_text = _text(item.get("dominant_terpenes"))
    terp_lower = terp_text.lower()

    if "sleep" in q or "insomnia" in q:
        hints.append("reports sleepy/relaxing effects")
    if any(w in q for w in ["anxiety", "stress", "calm"]):
        hints.append("often chosen to ease stress or anxiety")
    if "focus" in q:
        hints.append("reviewers mention focus/clarity")
    if any(w in q for w in ["energy", "energetic"]):
        hints.append("uplifting/energizing profile")
    if "creative" in q:
        hints.append("can support creativity")
    if "myrcene" in q and "myrcene" in terp_lower:
        hints.append("dominant in myrcene as requested")
    if "limonene" in q and "limonene" in terp_lower:
        hints.append("limonene-forward profile matches query")

    terp = terp_text or "—"
    eff = _text(item.get("effects")) or "—"
    thc = item.get("thc_pct")
    cbd = item.get("cbd_pct")
    preface = f"{item['strain_name']} ({item['type']}) with {terp}; effects include {eff}. THC {thc}%, CBD {cbd}%."

    # Render the hints as one proper sentence so they do not run into the
    # review summary that follows.
    if hints:
        joined = "; ".join(hints)
        tail = " " + joined[0].upper() + joined[1:] + "."
    else:
        tail = ""

    meta = f" Avg review {item['avg_review_score']}/5 from {item['review_count']} reviews."
    return preface + tail + meta

In [18]:
# Gradio UI
def build_ui(df: pd.DataFrame, corpus: list[list[str]]):
    """
    Assemble a minimal Gradio app: a query textbox and a Search button that
    writes the top results (scores plus recommendation blurb) into a
    results textbox. Returns the gr.Blocks object without launching it.
    """
    example_queries = [
        "High in myrcene, good for sleep and anxiety",
        "Limonene citrus notes, uplifting and creative",
        "Low THC, higher CBD for focus",
    ]

    def _format_hit(hit: dict) -> str:
        # Three-line summary for one result; every line ends with a newline.
        header = f"#{hit['rank']} — {hit['strain_name']} | Final: {hit['final_score']} (BM25 {hit['bm25']} × Boost {hit['boost']})\n"
        profile = f"Terpenes: {hit['dominant_terpenes']} | Effects: {hit['effects']} | THC {hit['thc_pct']}% | CBD {hit['cbd_pct']}%\n"
        blurb = f"Recommendation: {hit['recommendation']}\n"
        return header + profile + blurb

    def _search(query: str) -> str:
        # Click handler: run the query and join the per-result summaries.
        hits = score_query(query, df, corpus)
        return "\n".join(_format_hit(hit) for hit in hits)

    with gr.Blocks() as demo:
        gr.Markdown("# BakedBot AI - Cannabis Strain RAG (BM25)")
        gr.Markdown("Type what you're looking for (terpenes/effects). Returns top 3 with scores.")
        query_box = gr.Textbox(label="Your query", value=example_queries[0])
        search_button = gr.Button("Search")
        results_box = gr.Textbox(label="Results", lines=15)
        search_button.click(_search, inputs=query_box, outputs=results_box)
        gr.Examples(example_queries, inputs=query_box)
    return demo



In [19]:
# Read the strains table from disk
df = load_data(CSV_PATH)

# Largest review count in the dataset; review_boost() reads this global
# to normalise per-strain review counts.
if "review_count" in df.columns:
    _MAX_REVIEW_COUNT = float(df["review_count"].max())
else:
    _MAX_REVIEW_COUNT = 1.0

# One raw text document per row, then one token list per document
docs_raw = [row_to_doc(record) for _idx, record in df.iterrows()]
corpus = list(map(tokenize, docs_raw))

# Build the Gradio app and start serving it
app = build_ui(df, corpus)
app.launch(debug=False)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://387ce35a0635c7de3b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


