In [None]:
import os
import sys
from pathlib import Path
from transformers import pipeline

import torch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
CSV_RELATIVE_PATH = Path("AB_NYC_2019.csv") / "AB_NYC_2019.csv"


def load_local_airbnb_dataset() -> pd.DataFrame:
    """
    Looks for the dataset at the specified local path, loads it, and cleans it
    using Pandas/NumPy for statistical analysis and cleaning[cite: 68].
    """
    csv_path = CSV_RELATIVE_PATH

    if not csv_path.exists():
        # Fallback for if the file is placed directly in the root (less likely based on image)
        if Path("AB_NYC_2019.csv").exists() and Path("AB_NYC_2019.csv").is_file():
             csv_path = Path("AB_NYC_2019.csv")
        else:
            print(
                f"[ERROR] Could not find the dataset at the expected local path: {CSV_RELATIVE_PATH}\n"
                f"Please ensure '{CSV_RELATIVE_PATH}' exists relative to the script."
            )
            sys.exit(1)

    print(f"[INFO] Loading dataset from {csv_path}")
    df = pd.read_csv(csv_path)

    # Filtering columns to keep relevant data (as shown in the Dataset table)
    cols = [
        "name",
        "neighbourhood_group", # Location [cite: 60]
        "neighbourhood",       # Location [cite: 60]
        "room_type",           # Listing Info [cite: 60]
        "price",               # Listing Info [cite: 60]
        "minimum_nights",      # Activity [cite: 60]
        "number_of_reviews",   # Activity [cite: 60]
        "reviews_per_month",   # Quality/Review frequency [cite: 60]
    ]
    df = df[cols].copy()

    # Basic cleaning
    df["name"] = df["name"].fillna("Unknown listing")
    df["reviews_per_month"] = df["reviews_per_month"].fillna(0.0)

    # Drop rows with missing key fields
    df = df.dropna(subset=["neighbourhood_group", "price", "room_type"])

    # Ensure price is numeric and positive
    df = df[df["price"] > 0]

    print(f"[INFO] Loaded {len(df)} listings after cleaning.")
    return df

In [3]:
# 2. Search Module (TF-IDF) [cite: 33, 66]
# -------------------------------------------------------------------

class SearchModule:
    """Implements the Search Module using TF-IDF for keyword filtering[cite: 33]."""
    def __init__(self, df: pd.DataFrame):
        self.df = df.reset_index(drop=True)
        self.vectorizer = TfidfVectorizer(stop_words="english")
        print("[INFO] Building TF-IDF index over listing names...")
        self.X = self.vectorizer.fit_transform(self.df["name"].astype(str))
        print("[INFO] TF-IDF index built.")

    def search(self, query: str, k: int = 10) -> pd.DataFrame:
        """
        Return top-k listings whose 'name' is most similar to the query.
        """
        q_vec = self.vectorizer.transform([query])
        sims = cosine_similarity(q_vec, self.X).flatten()
        idx = sims.argsort()[::-1][:k]
        return self.df.iloc[idx].copy()

In [4]:
BOROUGHS = ["brooklyn", "manhattan", "queens", "bronx", "staten island"]


class PromptingModule:
    """
    Implements the Prompting Module to parse user queries into structured 
    search conditions[cite: 34, 35].
    """
    @staticmethod
    def parse_query(query: str):
        """
        Convert natural language query into a simple action spec:
            {
              "borough": <str or None>,
              "task": "cheapest" | "average" | "search"
            }
        """
        q = query.lower()
        spec = {"borough": None, "task": None}

        for b in BOROUGHS:
            if b in q:
                spec["borough"] = b.title()
                break

        if "cheapest" in q or "lowest price" in q:
            spec["task"] = "cheapest"
        elif "average" in q or "avg" in q or "typical" in q:
            spec["task"] = "average"
        else:
            spec["task"] = "search"

        return spec

In [5]:
class LanguageModel:
    """
    Uses the Hugging Face pipeline (FLAN-T5-small) to generate readable answers 
    and understand natural language[cite: 36, 67].
    """
    def __init__(self):
        self.model_name = "google/flan-t5-small"
        self.pipe = None

        if pipeline is None:
            print(
                "[WARN] transformers not installed. "
                "Answers will be plain text without LLM refinement."
            )
        else:
            print(f"[INFO] Loading Hugging Face model: {self.model_name}...")
            self.pipe = pipeline(
                "text2text-generation",
                model=self.model_name
            )
            print("[INFO] Model loaded.")

    def generate(self, summary: str) -> str:
        """
        If transformers is available, use FLAN-T5 to rewrite the summary.
        Otherwise, just return the summary.
        """
        if self.pipe is None:
            return summary

        prompt = (
            "You are an assistant summarizing Airbnb price information. "
            "Rewrite the following summary into a concise, user-friendly sentence:\n\n"
            f"{summary}"
        )
        # LLM generation pipeline [cite: 48]
        out = self.pipe(prompt, max_length=128, num_beams=4)[0]["generated_text"]
        return out.strip()

In [6]:
class AirbnbAgent:
    def __init__(self, df: pd.DataFrame):
        self.df = df.reset_index(drop=True)
        # Components matching the workflow [cite: 32]
        self.search_module = SearchModule(self.df)
        self.prompting_module = PromptingModule()
        self.language_model = LanguageModel()

    def _filter_by_borough(self, borough: str | None) -> pd.DataFrame:
        if borough is None:
            return self.df
        subset = self.df[self.df["neighbourhood_group"] == borough]
        if subset.empty:
            return self.df
        return subset

    def _handle_cheapest(self, data: pd.DataFrame, borough: str | None) -> str:
        # Response Formatter / Summarize insights (cheapest price) [cite: 37]
        row = data.sort_values("price").iloc[0]
        b_label = borough if borough is not None else "New York City"
        summary = (
            f"The cheapest listing in {b_label} is '{row['name']}' in "
            f"{row['neighbourhood']} ({row['room_type']}) priced at "
            f"${row['price']} per night, with {row['number_of_reviews']} reviews."
        )
        return self.language_model.generate(summary)

    def _handle_average(self, data: pd.DataFrame, borough: str | None) -> str:
        # Response Formatter / Summarize insights (average price) [cite: 37]
        avg_price = round(data["price"].mean(), 2)
        n = len(data)
        b_label = borough if borough is not None else "New York City"
        summary = (
            f"The average nightly price in {b_label} is about "
            f"${avg_price} based on {n} listings."
        )
        return self.language_model.generate(summary)

    def _handle_search(self, query: str) -> str:
        # Uses the Search Module [cite: 33]
        results = self.search_module.search(query, k=5)
        names = results["name"].tolist()
        prices = results["price"].tolist()
        neighbourhoods = results["neighbourhood"].tolist()

        items = [
            f"'{n}' in {nbhd} (${p}/night)"
            for n, nbhd, p in zip(names, neighbourhoods, prices)
        ]
        joined = "; ".join(items)
        # Response Formatter [cite: 37]
        summary = (
            f"For your query '{query}', some relevant listings are: {joined}."
        )
        return self.language_model.generate(summary)

    def answer(self, query: str) -> str:
        """
        Main entry point: processes a natural language query and returns an answer.
        """
        # Uses the Prompting Module [cite: 34]
        spec = self.prompting_module.parse_query(query)
        borough = spec["borough"]
        task = spec["task"]

        data = self._filter_by_borough(borough)

        if task == "cheapest":
            return self._handle_cheapest(data, borough)
        elif task == "average":
            return self._handle_average(data, borough)
        else:
            return self._handle_search(query)

In [7]:
def main():
    # 1. Load dataset
    df = load_local_airbnb_dataset()

    # 2. Create agent
    agent = AirbnbAgent(df)

    # 3. Demo queries (Examples similar to the proposal's expected answers) [cite: 82]
    queries = [
        "What is the cheapest room in Brooklyn?", # Similar to "Cheapest rooms in Boston" [cite: 82]
        "What is the average price in Manhattan?", # Similar to "Average price in Boston" [cite: 82]
        "Find a cheap private room near the park",
        "What is the average price in Queens?",
        "Show me the cheapest listing in New York City",
    ]

    print("\n=== Airbnb Price Intelligence Agent Demo ===\n")
    for q in queries:
        print(f"Q: {q}")
        try:
            ans = agent.answer(q)
        except Exception as e:
            ans = f"[ERROR answering query: {e}]"
        print(f"A: {ans}\n")


if __name__ == "__main__":
    main()

[INFO] Loading dataset from AB_NYC_2019.csv\AB_NYC_2019.csv
[INFO] Loaded 48884 listings after cleaning.
[INFO] Building TF-IDF index over listing names...
[INFO] TF-IDF index built.
[INFO] Loading Hugging Face model: google/flan-t5-small...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[INFO] Model loaded.

=== Airbnb Price Intelligence Agent Demo ===

Q: What is the cheapest room in Brooklyn?


Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


A: The cheapest listing in Brooklyn is 'Beautiful room in Bushwick' in Bushwick (Private room) priced at $10 per night, with 2 reviews.

Q: What is the average price in Manhattan?


Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


A: The average nightly price in Manhattan is about $196.88 based on 21660 listings.

Q: Find a cheap private room near the park


Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


A: Cheap, furnished private room in Bensonhurst, Washington Heights, Washington Heights, Washington Heights, Washington Heights, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Williamsburg, Wil

Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


A: The average nightly price in Queens is about $99.52 based on 5666 listings.

Q: Show me the cheapest listing in New York City
A: The cheapest listing in New York City is 'IT'S SIMPLY CONVENIENT!' in Jamaica (Entire home/apt) priced at $10 per night, with 43 reviews.

