In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# CSV of bookings to analyse; the script checks for this file with
# os.path.exists and generates synthetic data when it is absent.
DATA_PATH = "airbnb_bookings.csv"
# Directory that receives the summary CSVs and PNG plots the script writes.
OUTPUT_DIR = "airbnb_analysis_outputs"
# Create the output directory up front; exist_ok=True keeps re-runs from failing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def generate_synthetic_airbnb(n_listings=200, n_bookings=5000, seed=42):
    """Generate a synthetic Airbnb-like bookings dataset for testing.

    A pool of listings is created first (id, room type, neighbourhood, base
    nightly price); each booking then picks one listing at random and draws
    its dates, price, fees and cancellation flag.

    Args:
        n_listings: Number of distinct listings in the pool.
        n_bookings: Number of booking rows to generate.
        seed: Value passed to ``np.random.seed`` so the output is
            reproducible. The default (42) reproduces the dataset this
            function has always produced; pass ``None`` to reseed from
            fresh entropy.

    Returns:
        A ``pandas.DataFrame`` with one row per booking and the columns
        ``booking_id``, ``listing_id``, ``room_type``, ``neighborhood``,
        ``book_date``, ``checkin``, ``checkout``, ``nights``,
        ``price_per_night``, ``cleaning_fee``, ``service_fee``, ``revenue``
        and ``cancelled``. The columns are present even when
        ``n_bookings`` is 0.

    Raises:
        ValueError: If bookings are requested but there are no listings to
            draw them from.

    Note:
        This seeds NumPy's *global* random state as a side effect.
    """
    if n_bookings > 0 and n_listings < 1:
        raise ValueError("n_listings must be at least 1 when n_bookings > 0")

    # Fixed schema so an empty result still exposes every expected column.
    columns = [
        "booking_id",
        "listing_id",
        "room_type",
        "neighborhood",
        "book_date",
        "checkin",
        "checkout",
        "nights",
        "price_per_night",
        "cleaning_fee",
        "service_fee",
        "revenue",
        "cancelled",
    ]

    # NOTE: the order of the random draws below is deliberately unchanged so
    # that the default seed keeps yielding the same dataset as before.
    np.random.seed(seed)
    listing_ids = [f"L{1000+i}" for i in range(n_listings)]
    room_types = ["Entire home/apt", "Private room", "Shared room"]
    # Listings are spread round-robin over ten neighbourhoods.
    neighborhoods = [f"Neighborhood_{i%10}" for i in range(n_listings)]

    listings = pd.DataFrame({
        "listing_id": listing_ids,
        "room_type": np.random.choice(room_types, n_listings, p=[0.6, 0.35, 0.05]),
        "neighborhood": neighborhoods,
        "base_price": np.random.randint(50, 300, n_listings)
    })

    bookings = []
    start_date = pd.Timestamp("2023-01-01")
    for i in range(n_bookings):
        # DataFrame.sample draws from NumPy's global state, so it is seeded too.
        listing = listings.sample(1).iloc[0]
        # Check-in falls within 730 days (two years) of the start date.
        checkin = start_date + pd.to_timedelta(np.random.randint(0, 730), unit="D")
        nights = np.random.choice([1, 2, 3, 4, 5, 7, 10])
        checkout = checkin + pd.to_timedelta(nights, unit="D")
        # Nightly price varies +/-20% around the listing's base price.
        price = listing.base_price * np.random.uniform(0.8, 1.2)
        cleaning_fee = np.random.uniform(10, 30)
        # Service fee is 12% of the nightly price.
        service_fee = price * 0.12
        # Roughly one booking in ten is cancelled.
        cancelled = np.random.choice([0, 1], p=[0.9, 0.1])
        # Cancelled bookings earn nothing.
        revenue = 0 if cancelled else (price * nights + cleaning_fee + service_fee)

        bookings.append({
            "booking_id": f"B{i+1:05d}",
            "listing_id": listing.listing_id,
            "room_type": listing.room_type,
            "neighborhood": listing.neighborhood,
            # Booked 5 to 59 days before check-in.
            "book_date": checkin - pd.to_timedelta(np.random.randint(5, 60), unit="D"),
            "checkin": checkin,
            "checkout": checkout,
            "nights": nights,
            "price_per_night": round(price, 2),
            "cleaning_fee": round(cleaning_fee, 2),
            "service_fee": round(service_fee, 2),
            "revenue": round(revenue, 2),
            "cancelled": cancelled
        })
    return pd.DataFrame(bookings, columns=columns)

# Use the bookings CSV when it is on disk; otherwise fall back to generated data.
date_columns = ["book_date", "checkin", "checkout"]
if not os.path.exists(DATA_PATH):
    df = generate_synthetic_airbnb()
    print("No CSV found — generated synthetic dataset.")
else:
    df = pd.read_csv(DATA_PATH, parse_dates=date_columns)
    print(f"Loaded dataset: {len(df)} records from {DATA_PATH}")

# Coerce the stay dates once and derive both helper columns from them.
checkout_ts = pd.to_datetime(df["checkout"])
checkin_ts = pd.to_datetime(df["checkin"])
df["nights"] = (checkout_ts - checkin_ts).dt.days
# First day of the check-in month, used as the monthly grouping key.
df["year_month"] = checkin_ts.dt.to_period("M").dt.to_timestamp()

# Derive revenue only when the dataset does not already provide it.
if "revenue" not in df.columns:
    stay_cost = df["price_per_night"] * df["nights"]
    df["revenue"] = stay_cost + df["cleaning_fee"] + df["service_fee"]

# Integer 0/1 flag so sums and means give cancellation counts and rates.
df["is_cancelled"] = df["cancelled"].astype(int)


# Headline metrics; revenue counts only bookings that were not cancelled.
is_confirmed = df["is_cancelled"] == 0
overview = {}
overview["Total Bookings"] = len(df)
overview["Unique Listings"] = df["listing_id"].nunique()
overview["Total Revenue (Confirmed)"] = df.loc[is_confirmed, "revenue"].sum()
overview["Average Price per Night"] = df["price_per_night"].mean()
# Expressed as a percentage.
overview["Cancellation Rate"] = 100 * df["is_cancelled"].mean()

# One-row table: written to disk and echoed to the console.
summary = pd.DataFrame([overview])
summary_path = f"{OUTPUT_DIR}/summary_overview.csv"
summary.to_csv(summary_path, index=False)

print("\n=== AIRBNB ANALYSIS SUMMARY ===")
print(summary.to_string(index=False))


# Monthly roll-up keyed on the check-in month.
# Revenue is counted for confirmed bookings only, matching the overview and
# top-listings calculations; cancelled rows may still carry a non-zero
# revenue value (e.g. when revenue was derived from price and nights).
confirmed_revenue = df["revenue"].where(df["is_cancelled"] == 0, 0)
monthly = (
    df.assign(_confirmed_revenue=confirmed_revenue)
    .groupby("year_month")
    .agg(
        total_bookings=("booking_id", "count"),
        cancelled_bookings=("is_cancelled", "sum"),
        total_revenue=("_confirmed_revenue", "sum"),
    )
    .reset_index()
)

monthly["confirmed_bookings"] = monthly["total_bookings"] - monthly["cancelled_bookings"]
# Fraction (0-1) of the month's bookings that were cancelled.
monthly["cancel_rate"] = monthly["cancelled_bookings"] / monthly["total_bookings"]


# Line chart: all bookings vs. confirmed bookings per check-in month.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(monthly["year_month"], monthly["total_bookings"], label="Total Bookings")
ax.plot(monthly["year_month"], monthly["confirmed_bookings"], label="Confirmed")
ax.set_title("Monthly Bookings Trend")
ax.set_xlabel("Month")
ax.set_ylabel("Bookings")
ax.legend()
fig.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/bookings_trend.png")
# Close the figure so it is not kept open after saving.
plt.close(fig)

# Line chart: revenue per check-in month.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(monthly["year_month"], monthly["total_revenue"])
ax.set_title("Monthly Revenue")
ax.set_xlabel("Month")
ax.set_ylabel("Revenue")
fig.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/monthly_revenue.png")
plt.close(fig)

# Box plot of nightly prices across all bookings.
# Missing prices are dropped first: NaN values propagate into the quartile
# statistics and leave the box undrawn. A plain array is passed so the gaps
# left in the index by dropna cannot matter.
nightly_prices = df["price_per_night"].dropna().to_numpy()
fig, ax = plt.subplots(figsize=(6, 4))
# Vertical is the default orientation, so the explicit `vert=True` (deprecated
# in recent Matplotlib releases in favour of `orientation`) is not needed.
ax.boxplot(nightly_prices)
ax.set_title("Price per Night Distribution")
# Same layout pass as the other figures so the title is not clipped.
fig.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/price_distribution.png")
plt.close(fig)

# Rank listings by revenue from confirmed bookings and keep the ten highest.
confirmed_df = df.loc[df["is_cancelled"] == 0]
per_listing = confirmed_df.groupby("listing_id").agg(
    total_revenue=("revenue", "sum"),
    total_bookings=("booking_id", "count"),
)
top_listings = per_listing.sort_values("total_revenue", ascending=False).head(10)
# listing_id is the index, so it is written as the first CSV column.
top_listings.to_csv(f"{OUTPUT_DIR}/top_listings_by_revenue.csv")

# Bar chart of the same ten listings.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(top_listings.index, top_listings["total_revenue"])
ax.set_title("Top 10 Listings by Revenue")
ax.set_xlabel("Listing ID")
ax.set_ylabel("Revenue")
fig.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/top10_listings.png")
plt.close(fig)

# Final report: where the outputs went and what the output directory contains.
print(
    "\nAll analysis complete.",
    f"Results and plots saved in: {OUTPUT_DIR}",
    "Generated files:",
    sep="\n",
)
# Lists everything currently in the directory, in the order os.listdir returns it.
for entry in os.listdir(OUTPUT_DIR):
    print(f" - {entry}")


No CSV found — generated synthetic dataset.

=== AIRBNB ANALYSIS SUMMARY ===
 Total Bookings  Unique Listings  Total Revenue (Confirmed)  Average Price per Night  Cancellation Rate
           5000              200                 3774204.81               172.706774              10.38

All analysis complete.
Results and plots saved in: airbnb_analysis_outputs
Generated files:
 - summary_overview.csv
 - price_distribution.png
 - bookings_trend.png
 - monthly_revenue.png
 - top_listings_by_revenue.csv
 - top10_listings.png
