# NYC Airbnb Price Analysis & Prediction

This notebook explores Airbnb prices in NYC, performs exploratory analysis, 
and builds machine learning models to predict nightly price.

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Create the figure output folder up front: every plt.savefig call in this
# notebook writes into "images/" and fails if the folder does not exist.
os.makedirs("images", exist_ok=True)

# Load the listings table (path is relative to the notebook's working directory).
df = pd.read_csv("data/listings.csv")

# Only the last bare expression of a cell is rendered, so the intermediate
# results are displayed explicitly instead of being silently discarded.
# `display` is provided by the Jupyter/IPython kernel.
display(df.shape)
display(df.head())
df.info()

## 1. Load data and initial inspection

In [None]:
# Only the last bare expression of a cell is rendered, so each inspection
# result is displayed explicitly (`display` is provided by the Jupyter kernel).
display(df.columns)          # column labels
display(df.isna().sum())     # number of missing values per column
df.describe(include="all")   # summary statistics for every column, numeric or not

## 2. Data cleaning
- Select relevant columns.
- Handle missing values.
- Remove/limit extreme prices.

In [None]:
# Columns carried forward into cleaning, EDA and modelling; all other
# columns of the loaded table are dropped from `df` here.
cols_keep = [
    # identifiers and free text
    "id",
    "name",
    "host_id",
    "neighborhood_overview",
    # location
    "neighbourhood",
    "latitude",
    "longitude",
    # listing attributes and target
    "room_type",
    "price",
    "minimum_nights",
    # review activity
    "number_of_reviews",
    "last_review",
    "reviews_per_month",
    # host / availability
    "calculated_host_listings_count",
    "availability_365",
]

# Label-based column selection; raises KeyError if any listed column is absent.
df = df.loc[:, cols_keep]
df.head()

In [None]:
# Normalise the price text: strip the currency symbol and the thousands
# separator, then parse the remainder as numbers (unparseable values -> NaN).
price_text = df["price"].astype(str)
for symbol in ("$", ","):
    price_text = price_text.str.replace(symbol, "", regex=False)
df["price"] = pd.to_numeric(price_text, errors="coerce")

# Keep only rows with a strictly positive price; NaN prices fail the
# comparison and are dropped as well.
has_positive_price = df["price"] > 0
df = df[has_positive_price]

# Missing reviews_per_month values are replaced with 0.
if "reviews_per_month" in df.columns:
    df["reviews_per_month"] = df["reviews_per_month"].fillna(0)

# Parse last_review as datetimes; values that cannot be parsed become NaT.
if "last_review" in df.columns:
    df["last_review"] = pd.to_datetime(df["last_review"], errors="coerce")

df.info()

## 3. Exploratory data analysis
### 3.1 Price distribution

In [None]:
# Bin only the prices inside the plotted 0-1000 window. Passing the full
# column makes histplot spread its 100 bins (and the KDE) over the entire
# price range, so a long tail of expensive listings leaves only a handful
# of very wide bars visible once the x-axis is limited.
price_in_view = df.loc[df["price"] <= 1000, "price"]

plt.figure(figsize=(8,4))
sns.histplot(price_in_view, bins=100, kde=True)
plt.xlim(0, 1000)
plt.title("Price distribution (capped at 1000)")
plt.savefig("images/histogram.png", dpi=300, bbox_inches="tight")
plt.show()

# Subset used by the remaining EDA and modelling cells: listings priced at
# or below 500 per night (more expensive listings are excluded, not clipped).
df_eda = df[df["price"] <= 500]

### 3.2 Price by room type

In [None]:
# Box plot of nightly price for each room type, drawn on an explicit Axes
# and saved to disk before being shown.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=df_eda, x="room_type", y="price", ax=ax)
ax.set_title("Price by Room Type")
fig.savefig("images/boxplot.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Listing count, mean price and median price per room type,
# ordered from the highest mean price to the lowest.
(
    df_eda
    .groupby("room_type")["price"]
    .agg(["count", "mean", "median"])
    .sort_values(by="mean", ascending=False)
)

In [None]:
# Listing count, mean price and median price per neighbourhood,
# ordered from the highest mean price to the lowest.
(
    df_eda
    .groupby("neighbourhood")["price"]
    .agg(["count", "mean", "median"])
    .sort_values(by="mean", ascending=False)
)

### 3.3 Location scatter (price on map)

In [None]:
# Plot at most 5000 listings so the scatter stays readable; the fixed
# random_state makes the sample reproducible.
n_points = min(5000, len(df_eda))
df_map = df_eda.sample(n=n_points, random_state=42)

fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    data=df_map,
    x="longitude",
    y="latitude",
    hue="price",
    palette="magma",
    s=10,
    ax=ax,
)
ax.set_title("NYC listings coloured by price")
# An empty, frameless legend replaces the one seaborn generates for `hue`.
ax.legend([], [], frameon=False)
fig.savefig("images/scatterplot.png", dpi=300, bbox_inches="tight")
plt.show()

### 3.4 Numeric correlations

In [None]:
# Columns included in the correlation matrix. "room_type" is deliberately
# not listed: it holds category labels rather than numbers, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
# (older versions silently dropped them).
numeric_cols = [
    "price",
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "availability_365",
]

# Pairwise correlation (pandas default: Pearson) on the EDA subset.
corr = df_eda[numeric_cols].corr()

plt.figure(figsize=(6,4))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation between numeric variables")
plt.savefig("images/heatmap.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Column names of the cleaned frame as a plain Python list.
list(df.columns)

## 4. Feature engineering and modelling
### 4.1 Prepare features and target

In [None]:
# Column the models learn to predict.
target = "price"

# Predictor columns: two categorical (neighbourhood, room_type), the rest numeric.
features = [
    "neighbourhood",
    "latitude", "longitude",
    "room_type",
    "minimum_nights", "number_of_reviews", "reviews_per_month",
    "calculated_host_listings_count", "availability_365",
]

# Modelling table: predictors plus target taken from the EDA subset,
# keeping only rows where none of these columns is missing.
model_cols = [*features, target]
data_ml = df_eda[model_cols].dropna()
data_ml.shape

In [None]:
# One-hot encode the two categorical predictors. drop_first=True omits the
# first level of each so the dummy columns are not perfectly collinear.
X = pd.get_dummies(
    data_ml[features],
    columns=["neighbourhood", "room_type"],
    drop_first=True
)
y = data_ml[target]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible. train_test_split is provided by sklearn.model_selection
# and must be imported alongside the other sklearn imports.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Sanity check: the split must partition the rows without losing any.
assert len(X_train) + len(X_test) == len(X)

X_train.shape, X_test.shape

### 4.2 Baseline and Linear Regression

In [None]:
# --- Baseline: predict the mean of the training target for every test row ---
y_mean = y_train.mean()
y_pred_baseline = np.full(y_test.shape, y_mean, dtype=float)

mse_baseline = mean_squared_error(y_test, y_pred_baseline)
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
rmse_baseline = np.sqrt(mse_baseline)

print("Baseline MAE:", mae_baseline)
print("Baseline RMSE:", rmse_baseline)

# --- Linear regression fitted on the encoded training features ---
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred_lr = lin_reg.predict(X_test)

mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

print("Linear Regression MAE:", mae_lr)
print("Linear Regression RMSE:", rmse_lr)

### 4.3 Random Forest

In [None]:
# Forest settings: 200 trees, no depth limit, all CPU cores, fixed seed.
rf_params = dict(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
)
rf = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Score the fitted forest on the held-out rows.
y_pred_rf = rf.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

print("Random Forest MAE:", mae_rf)
print("Random Forest RMSE:", rmse_rf)

In [None]:
# Collect each model's test-set metrics as (MAE, RMSE), in the order the
# models were fitted.
results = {}
results["Baseline"] = (mae_baseline, rmse_baseline)
results["Linear Regression"] = (mae_lr, rmse_lr)
results["Random Forest"] = (mae_rf, rmse_rf)
results

### 4.4 Feature importance

In [None]:
# Pair each encoded feature column with the importance the fitted forest
# assigned to it, then rank from most to least important.
feat_names = X_train.columns
importances = rf.feature_importances_

fi = pd.DataFrame({"feature": feat_names, "importance": importances})
fi = fi.sort_values(by="importance", ascending=False)

fi.head(20)

In [None]:
# Number of highest-ranked features to show in the chart.
top_n = 15

# The output folder must exist before savefig runs; previously it was only
# created after the figure had already been written.
os.makedirs("images", exist_ok=True)

plt.figure(figsize=(8,6))
# `fi` is sorted by descending importance; reversing the top rows places the
# most important feature at the top of the horizontal bar chart.
plt.barh(fi["feature"].head(top_n)[::-1],
         fi["importance"].head(top_n)[::-1])
plt.title("Top feature importances (Random Forest)")
plt.xlabel("Importance")
# NOTE(review): the file name says "price_distribution" although this figure
# shows feature importances - looks like a copy-paste leftover. Left unchanged
# in case other documents link to this path; confirm and rename if safe.
plt.savefig("images/price_distribution.png", dpi=300, bbox_inches="tight")
plt.show()

## KEY FINDINGS


### EDA insights
1. Airbnb prices in NYC are highly right‑skewed, with most listings below roughly 200–300 per night and a long tail of expensive properties, so listings priced above 500 per night were excluded (not clipped) from the EDA subset and from modelling.
2. Entire homes/apartments and hotel rooms have much higher median prices than private and shared rooms, confirming that room type is a major driver of price.
3. Visualizing listings on a longitude–latitude scatter shows high‑priced listings concentrated in specific central areas of the city, reflecting location premiums.


### Model performance
1. A simple baseline model that always predicts the mean price achieves MAE ≈ 65.7, while Linear Regression improves this to ≈ 48.6 and Random Forest reduces it further to ≈ 42.7, a substantial error reduction compared to the baseline.
2. Random Forest also achieves the lowest RMSE, indicating it handles large price deviations better than the simpler models.


### Feature importance insights
1. Random Forest feature importances show room_type and precise geographic coordinates (longitude, latitude) as the strongest predictors of price, highlighting the combined effect of property type and location.
2. Booking behaviour and host activity features such as availability_365, reviews_per_month, number_of_reviews, and calculated_host_listings_count also contribute meaningfully, suggesting that frequently booked or highly reviewed listings tend to follow different pricing patterns.