In [6]:
# ============================================================
# 0. Setup & Imports
# ============================================================

import sys
from pathlib import Path
import pandas as pd
import numpy as np

# --- Fix path so src/ modules import correctly ---
# Assumes the notebook's working directory sits one level below the project root
# (e.g. <root>/notebooks), so the parent of the CWD is the root that contains src/.
project_root = Path.cwd().parent

# Guard the append: re-running this cell must not stack duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.extract import load_csv


# ============================================================
# 1. Load Cleaned Airbnb Data
# ============================================================

# Read the cleaned listings through the project's load_csv helper (imported from src.extract).
# NOTE(review): folder="processed" presumably resolves to <project_root>/data/processed —
# confirm against src/extract.py.
df = load_csv("listings_cleaned.csv", folder="processed")
# Report the loaded frame's dimensions (rows × columns) as a quick sanity check.
print(f"Loaded dataframe: {df.shape[0]} rows × {df.shape[1]} columns")

# Display preview (a bare last expression renders as the cell's rich output)
df.head()


Loaded 17,730 rows from C:\Users\Ross\Desktop\Data Analytics Proj\AirbnbProject\data\processed\listings_cleaned.csv
Loaded dataframe: 17730 rows × 32 columns


Unnamed: 0,host_since,last_review,first_review,accommodates,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,...,bedrooms,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,shared_bath,host_response_time,neighbourhood_cleansed,property_type,room_type
0,2009-09-23,2020-03-13,2009-12-05,1,90,180,29,59,89,364,...,,0.0,,False,False,True,a few days or more,sydney,private room in rental unit,private room
1,2009-12-03,2025-09-01,2012-02-23,2,1,90,14,42,63,295,...,,1.0,1.0,False,True,False,within an hour,sydney,private room in condo,private room
2,2010-04-22,2025-08-31,2010-10-20,2,2,180,0,0,0,0,...,1.0,1.0,0.82,True,True,False,within a few hours,sydney,entire loft,entire home/apt
3,2010-11-06,2025-08-31,2010-12-29,4,2,90,10,16,23,138,...,1.0,1.0,0.94,True,True,False,within a day,mosman,entire guest suite,entire home/apt
4,2011-01-03,2025-06-08,2011-06-21,4,1,30,0,2,15,265,...,2.0,1.0,0.9,True,True,True,within an hour,hornsby,private room in home,private room


In [7]:
# Columns that must be datetime
date_cols = ["host_since", "first_review", "last_review"]

# Resolve once which of those columns this frame actually has, so that both the
# conversion loop and the dtype check below tolerate a missing column
# (indexing df with an absent label would raise KeyError).
present_date_cols = [name for name in date_cols if name in df.columns]

for col in present_date_cols:
    # errors="coerce" turns unparseable values into NaT instead of raising.
    df[col] = pd.to_datetime(df[col], errors="coerce")

# Show the resulting dtypes as the cell output.
df[present_date_cols].dtypes

host_since      datetime64[ns]
first_review    datetime64[ns]
last_review     datetime64[ns]
dtype: object

In [8]:
# 3.1 Host Tenure (days & years)
# The reference date is assigned unconditionally because a later cell (time since
# last review) also reads `today`; leaving it inside the `if` would make that cell
# fail with NameError whenever "host_since" is absent.
# normalize() truncates to midnight so every run on the same calendar day uses the
# same reference instant.
today = pd.Timestamp.today().normalize()

if "host_since" in df.columns:
    # Whole days elapsed between the host's sign-up date and the reference date.
    df["host_tenure_days"] = (today - df["host_since"]).dt.days
    # Approximate years using a flat 365-day year (leap days are ignored).
    df["host_tenure_years"] = df["host_tenure_days"] / 365


In [9]:
# 3.2 Host Experience Buckets
if "host_tenure_years" in df.columns:
    df["host_experience_level"] = pd.cut(
        df["host_tenure_years"],
        # Right-closed intervals: (-1, 1], (1, 3], (3, 7], (7, inf).
        # The top bin is open-ended: with a fixed upper edge of 20, any host with
        # more than 20 years of tenure would fall outside every bin and become NaN.
        bins=[-1, 1, 3, 7, np.inf],
        labels=["New", "Intermediate", "Experienced", "Veteran"]
    )

In [10]:
# 4.1 Average review score (row-wise mean of the review_scores_* columns)
score_cols = [
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
]

# Keep only the score columns this dataframe actually contains, in the order listed above.
existing_scores = [name for name in score_cols if name in df]

# Average across the selected columns for each listing (one value per row).
row_means = df[existing_scores].mean(axis="columns")
df["review_score_mean"] = row_means

In [11]:
# 4.2 Time since last review
if "last_review" in df.columns:
    # Set the reference date locally so this cell does not depend on an earlier
    # cell having defined `today`; normalize() truncates it to midnight so runs on
    # the same calendar day agree.
    today = pd.Timestamp.today().normalize()
    # Whole days elapsed since the most recent review (NaT stays missing).
    df["days_since_last_review"] = (today - df["last_review"]).dt.days
    # Approximate months using a flat 30-day month.
    df["months_since_last_review"] = df["days_since_last_review"] / 30


In [12]:
# 4.3 Review Activity Bucket
if "reviews_per_month" in df.columns:
    df["review_activity_level"] = pd.cut(
        df["reviews_per_month"],
        # Right-closed intervals: (-1, 0.1], (0.1, 1], (1, 5], (5, inf).
        # The top bin is open-ended: with a fixed upper edge of 30, any listing
        # above 30 reviews per month would fall outside every bin and become NaN.
        bins=[-1, 0.1, 1, 5, np.inf],
        labels=["Rare", "Occasional", "Regular", "High"]
    )


In [13]:
# 5.1 Listing Size Indicator
# Boolean flag: True when the listing accommodates six or more guests.
if "accommodates" in df:
    guest_capacity = df["accommodates"]
    df["is_large_listing"] = guest_capacity.ge(6)


In [14]:
# 5.2 Bedroom to bathroom ratio
if {"bedrooms", "bathrooms"}.issubset(df.columns):
    # Swap zero bathroom counts for NaN before dividing, so those rows end up
    # missing instead of producing a division by zero.
    nonzero_bathrooms = df["bathrooms"].replace(0, np.nan)
    df["bed_bath_ratio"] = df["bedrooms"].div(nonzero_bathrooms)


In [15]:
# 6.1 High-level latitude/longitude buckets
# Both coordinates must be present; each is split into (up to) five quantile bins,
# stored as integer bin codes (labels=False). duplicates="drop" merges repeated
# bin edges rather than raising.
if "latitude" in df.columns and "longitude" in df.columns:
    for source_col, bin_col in (("latitude", "lat_bin"), ("longitude", "lon_bin")):
        df[bin_col] = pd.qcut(df[source_col], q=5, labels=False, duplicates="drop")


In [16]:
# Normalize the key categorical columns: cast to pandas' string dtype, trim
# surrounding whitespace, and lower-case every value.
cat_cols = ["property_type", "room_type", "neighbourhood_cleansed"]

for col in cat_cols:
    # Columns missing from this frame are skipped.
    if col not in df.columns:
        continue
    as_text = df[col].astype("string")
    df[col] = as_text.str.strip().str.lower()


In [17]:
# Final sanity check: report the transformed frame's (rows, columns) shape, then
# preview the first rows (the bare last expression renders as the cell output).
print("Final DF Shape:", df.shape)
df.head()


Final DF Shape: (17730, 43)


Unnamed: 0,host_since,last_review,first_review,accommodates,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,...,host_tenure_years,host_experience_level,review_score_mean,days_since_last_review,months_since_last_review,review_activity_level,is_large_listing,bed_bath_ratio,lat_bin,lon_bin
0,2009-09-23,2020-03-13,2009-12-05,1,90,180,29,59,89,364,...,16.158904,Veteran,4.636,2074.0,69.133333,Regular,False,,2,3
1,2009-12-03,2025-09-01,2012-02-23,2,1,90,14,42,63,295,...,15.964384,Veteran,4.63,76.0,2.533333,Regular,False,,2,3
2,2010-04-22,2025-08-31,2010-10-20,2,2,180,0,0,0,0,...,15.580822,Veteran,4.888,77.0,2.566667,Occasional,False,1.0,2,2
3,2010-11-06,2025-08-31,2010-12-29,4,2,90,10,16,23,138,...,15.038356,Veteran,4.68,77.0,2.566667,Regular,False,1.0,3,3
4,2011-01-03,2025-06-08,2011-06-21,4,1,30,0,2,15,265,...,14.879452,Veteran,4.946,161.0,5.366667,Occasional,False,0.8,4,0


In [18]:
# Export step — currently DISABLED: every statement below is commented out, so
# running this cell writes nothing and prints nothing.
# Uncomment the three lines to save the transformed frame (without the index) to
# <project_root>/data/processed/listings_transformed.csv and print the destination.
# output_path = project_root / "data" / "processed" / "listings_transformed.csv"
# df.to_csv(output_path, index=False)

# print(f"Transformed dataset saved to:\n{output_path}")


Transformed dataset saved to:
C:\Users\Ross\Desktop\Data Analytics Proj\AirbnbProject\data\processed\listings_transformed.csv
