## Pre-processing

In [697]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from geopy.distance import geodesic

In [189]:
airbnb_data = pd.read_csv("listings.csv")

In [190]:
airbnb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37784 entries, 0 to 37783
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            37784 non-null  int64  
 1   listing_url                                   37784 non-null  object 
 2   scrape_id                                     37784 non-null  int64  
 3   last_scraped                                  37784 non-null  object 
 4   source                                        37784 non-null  object 
 5   name                                          37782 non-null  object 
 6   description                                   36811 non-null  object 
 7   neighborhood_overview                         20607 non-null  object 
 8   picture_url                                   37783 non-null  object 
 9   host_id                                       37784 non-null 

In [191]:
selected_features = ['host_response_rate', 'host_listings_count', 'host_acceptance_rate', 'host_is_superhost', 'host_has_profile_pic',
                     'host_total_listings_count', 'host_identity_verified', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
                     'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bedrooms', 'beds', 'amenities', 'price',
                     'bathrooms', 'minimum_nights', 'availability_30', 'maximum_nights', 'has_availability', 'availability_60',
                     'availability_90', 'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
                     'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                     'review_scores_checkin', 'review_scores_communication', 'review_scores_location','review_scores_value',
                     'instant_bookable', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
                     'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'reviews_per_month']

airbnb_selected_data = airbnb_data[selected_features]
airbnb_selected_data = airbnb_selected_data.dropna(subset=['price'])
airbnb_selected_data['price'] = airbnb_selected_data['price'].str.replace(r'[$,]', '', regex=True).astype(float)

In [192]:
airbnb_selected_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22969 entries, 0 to 37783
Data columns (total 44 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   host_response_rate                            19030 non-null  object 
 1   host_listings_count                           22968 non-null  float64
 2   host_acceptance_rate                          19409 non-null  object 
 3   host_is_superhost                             22635 non-null  object 
 4   host_has_profile_pic                          22968 non-null  object 
 5   host_total_listings_count                     22968 non-null  float64
 6   host_identity_verified                        22968 non-null  object 
 7   neighbourhood_cleansed                        22969 non-null  object 
 8   neighbourhood_group_cleansed                  22969 non-null  object 
 9   latitude                                      22969 non-null  floa

In [193]:
airbnb_selected_data['host_response_rate'] = airbnb_selected_data['host_response_rate'].str.rstrip('%').astype(float) / 100
airbnb_selected_data['host_acceptance_rate'] = airbnb_selected_data['host_acceptance_rate'].str.rstrip('%').astype(float) / 100

In [194]:
binary_columns = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable', 'has_availability']
for col in binary_columns:
    airbnb_selected_data[col] = airbnb_selected_data[col].map({'t': 1, 'f': 0})

In [195]:
airbnb_selected_data['first_review'] = pd.to_datetime(airbnb_selected_data['first_review']).astype(int) // 10**9
airbnb_selected_data['last_review'] = pd.to_datetime(airbnb_selected_data['last_review']).astype(int) // 10**9

In [196]:
# Convert 'neighbourhood_cleansed' to categorical and store the mapping
neighborhood_categories = dict(enumerate(airbnb_selected_data["neighbourhood_cleansed"].astype('category').cat.categories))
airbnb_selected_data['neighbourhood_cleansed'] = airbnb_selected_data['neighbourhood_cleansed'].astype('category').cat.codes

# Convert 'neighbourhood_group_cleansed' to categorical and store the mapping
neighborhood_group_categories = dict(enumerate(airbnb_selected_data["neighbourhood_group_cleansed"].astype('category').cat.categories))
airbnb_selected_data['neighbourhood_group_cleansed'] = airbnb_selected_data['neighbourhood_group_cleansed'].astype('category').cat.codes

# Convert 'property_type' to categorical and store the mapping
property_categories = dict(enumerate(airbnb_selected_data["property_type"].astype('category').cat.categories))
airbnb_selected_data['property_type'] = airbnb_selected_data['property_type'].astype('category').cat.codes

# Convert 'room_type' to categorical and store the mapping
room_categories = dict(enumerate(airbnb_selected_data["room_type"].astype('category').cat.categories))
airbnb_selected_data['room_type'] = airbnb_selected_data['room_type'].astype('category').cat.codes

In [197]:
important_amenities = [
    "Wi-Fi", "Kitchen", "Air conditioning", "Heating", "Pool", "Gym", "Free parking", "TV",
    "Laptop-friendly workspace", "Smoke alarm", "Carbon monoxide alarm", "First aid kit",
    "Washer", "Dryer", "Dishwasher", "Hot tub", "Netflix", "Hulu", "Pet-friendly",
    "Elevator", "Wheelchair Accessible", "Balcony", "BBQ grill", "Beachfront"
]

# Convert amenities column from string to list
airbnb_selected_data['amenities'] = airbnb_selected_data['amenities'].apply(lambda x: eval(x))  # Convert string representation to list

# Create binary columns for important amenities
for amenity in important_amenities:
    airbnb_selected_data[amenity] = airbnb_selected_data['amenities'].apply(lambda x: 1 if amenity in x else 0)

# Drop original 'amenities' column
airbnb_selected_data.drop(columns=['amenities'], inplace=True)

In [198]:
airbnb_selected_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22969 entries, 0 to 37783
Data columns (total 67 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   host_response_rate                            19030 non-null  float64
 1   host_listings_count                           22968 non-null  float64
 2   host_acceptance_rate                          19409 non-null  float64
 3   host_is_superhost                             22635 non-null  float64
 4   host_has_profile_pic                          22968 non-null  float64
 5   host_total_listings_count                     22968 non-null  float64
 6   host_identity_verified                        22968 non-null  float64
 7   neighbourhood_cleansed                        22969 non-null  int16  
 8   neighbourhood_group_cleansed                  22969 non-null  int8   
 9   latitude                                      22969 non-null  floa

## Drop NaNs

In [228]:
df1 = airbnb_selected_data.copy()

In [229]:
df_without_nan = df1.dropna()

In [230]:
X = df_without_nan.drop(columns=["price"])
y = df_without_nan["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [231]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

R² Score: 0.3888
Mean Absolute Error (MAE): 70.7272
Root Mean Squared Error (RMSE): 136.6350


## Replace NaNs with Mean and Median

In [232]:
df2 = airbnb_selected_data.copy()

In [233]:
skewness = df2.skew()
skewness

host_response_rate      -3.355471
host_listings_count      3.462790
host_acceptance_rate    -1.415844
host_is_superhost        1.041877
host_has_profile_pic    -4.807627
                           ...   
Elevator                 1.390800
Wheelchair Accessible    0.000000
Balcony                  0.000000
BBQ grill                3.170862
Beachfront               0.000000
Length: 67, dtype: float64

In [234]:
for col in df2.columns:
    if skewness[col] > -1 and skewness[col] < 1:
        df2[col].fillna(airbnb_selected_data[col].mean(), inplace=True)  # Use mean
    else:
        df2[col].fillna(airbnb_selected_data[col].median(), inplace=True)  # Use median

df2.isna().sum()
df_without_nan = df2.copy()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2[col].fillna(airbnb_selected_data[col].median(), inplace=True)  # Use median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2[col].fillna(airbnb_selected_data[col].median(), inplace=True)  # Use median
The behavior will change in pandas 3.0. This inplace method will never w

In [235]:
X = df_without_nan.drop(columns=["price"])
y = df_without_nan["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [236]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

R² Score: 0.1081
Mean Absolute Error (MAE): 105.6712
Root Mean Squared Error (RMSE): 420.7003


## Replace Outliers

In [268]:
df3 = df_without_nan.copy()

In [269]:
def winsorize_all_columns(df, lower=20, upper=80):
    for col in df.select_dtypes(include=[np.number]).columns:  # Loop over numerical columns
        lower_limit = np.percentile(df[col], lower)
        upper_limit = np.percentile(df[col], upper)
        df[col] = np.clip(df[col], lower_limit, upper_limit)  # Apply capping
    return df

df_without_outliers = winsorize_all_columns(df3)  # Apply Winsorization to all numeric columns

In [270]:
X = df_without_outliers.drop(columns=["price"])
y = df_without_outliers["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [271]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

R² Score: 0.5040
Mean Absolute Error (MAE): 38.5285
Root Mean Squared Error (RMSE): 48.0723


## Latitude/Longitude handing from center

In [698]:
df4 = df_without_outliers.copy()

In [699]:
import pandas as pd
import numpy as np

# Define NYC center
nyc_center = (40.7128, -74.0060)

# Compute distance from city center
df4['distance_from_center'] = df4.apply(lambda row: geodesic((row['latitude'], row['longitude']), nyc_center).km, axis=1)

# KMeans clustering on lat/lon
kmeans = KMeans(n_clusters=50, random_state=42, n_init=10)
df4['location_cluster'] = kmeans.fit_predict(df4[['latitude', 'longitude']])

df_with_center = df4.copy()

In [700]:
X = df4.drop(columns=["price"])
y = df4["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [701]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

R² Score: 0.5155
Mean Absolute Error (MAE): 37.9693
Root Mean Squared Error (RMSE): 47.5096


## Latitude/Longitude handing from popular landmarks

In [702]:
df5 = df_with_center.copy()

In [703]:
import pandas as pd
import numpy as np
from haversine import haversine

# Define popular NYC landmarks with (latitude, longitude)
landmarks = {
    "Times_Square": (40.7580, -73.9855),
    "Central_Park": (40.7851, -73.9683),
    "Statue_of_Liberty": (40.6892, -74.0445),
    "Brooklyn_Bridge": (40.7061, -73.9969),
    "Empire_State_Building": (40.7488, -73.9854),
    "JFK_Airport": (40.6413, -73.7781),
    "LaGuardia_Airport": (40.7769, -73.8740),
    "Wall_Street": (40.7074, -74.0113),
    "Fifth_Avenue": (40.7750, -73.9650),
    "Broadway_Theater_District": (40.7590, -73.9845),
    "Rockefeller_Center": (40.7587, -73.9787),
    "Chrysler_Building": (40.7516, -73.9755),
    "Madison_Square_Garden": (40.7505, -73.9934),
    "Grand_Central_Terminal": (40.7527, -73.9772),
    "SoHo": (40.7229, -74.0007),
    "Little_Italy": (40.7191, -73.9973),
    "Chinatown": (40.7158, -73.9970),
    "Greenwich_Village": (40.7336, -74.0027),
    "DUMBO_Brooklyn": (40.7033, -73.9896),
    "Coney_Island": (40.5749, -73.9850),
    "Yankee_Stadium": (40.8296, -73.9262),
    "Metropolitan_Museum": (40.7794, -73.9632),
    "Brooklyn_Museum": (40.6714, -73.9638),
    "One_World_Trade_Center": (40.7127, -74.0134),
    "High_Line": (40.7479, -74.0049)
}


# Compute Haversine distance from each landmark
for name, coord in landmarks.items():
    df5[f'distance_from_{name}'] = df5.apply(lambda row: haversine((row['latitude'], row['longitude']), coord), axis=1)

df_with_landmarks = df5.copy()

In [704]:
X = df5.drop(columns=["price"])
y = df5["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [705]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

R² Score: 0.5386
Mean Absolute Error (MAE): 36.9662
Root Mean Squared Error (RMSE): 46.3643


## Adding Derived Features

#### Room-related Features

In [687]:
df = df_with_landmarks.copy()

In [688]:
# Feature 1: Beds per Room
df['beds_per_room'] = df['beds'] / (df['bedrooms'] + 1)  # Avoid division by zero

# Feature 2: Bathrooms per Bedroom
df['bathrooms_per_bedroom'] = df['bathrooms'] / (df['bedrooms'] + 1)

# Feature 3: Guests per Bed
df['guests_per_bed'] = df['accommodates'] / (df['beds'] + 1)

# Feature 4: Guests per Bedroom
df['guests_per_bedroom'] = df['accommodates'] / (df['bedrooms'] + 1)

# Feature 5: Bathrooms per Guest
df['bathrooms_per_guest'] = df['bathrooms'] / (df['accommodates'] + 1)

df.fillna(0, inplace=True)

In [627]:
X = df.drop(columns=["price"])
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [628]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

R² Score: 0.5478
Mean Absolute Error (MAE): 36.4502
Root Mean Squared Error (RMSE): 45.8989


#### Geographical Features

In [629]:
df = df_with_landmarks.copy()

In [689]:
# Feature 1: Listings per Neighbourhood
df['listings_per_neighbourhood'] = df.groupby('neighbourhood_cleansed')['calculated_host_listings_count'].transform('sum')

# Feature 2: Reviews per Neighbourhood
df['reviews_per_neighbourhood'] = df.groupby('neighbourhood_cleansed')['number_of_reviews'].transform('mean')

# Feature 3: Distance from City Center
city_center_coords = (40.7580, -73.9855)
df['distance_from_city_center'] = df.apply(lambda row: geodesic((row['latitude'], row['longitude']), city_center_coords).km, axis=1)

# Feature 4: Neighbourhood Popularity Score
df['neighbourhood_popularity'] = df['reviews_per_neighbourhood'] * df['listings_per_neighbourhood']

# Feature 5: Room Density
df['listings_per_neighbourhood_group'] = df.groupby('neighbourhood_group_cleansed')['calculated_host_listings_count'].transform('sum')

# Feature 6: Average Price per Neighbourhood
df['avg_price_neighbourhood'] = df.groupby('neighbourhood_cleansed')['price'].transform('mean')

In [631]:
X = df.drop(columns=["price"])
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [632]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

R² Score: 0.5430
Mean Absolute Error (MAE): 36.8363
Root Mean Squared Error (RMSE): 46.1422


#### Review-based Features

In [706]:
df = df_with_landmarks.copy()

In [707]:
# Feature 1: Log-transformed Reviews (Handles Skewness)
df['log_reviews'] = df['number_of_reviews'].apply(lambda x: np.log1p(x))

# Feature 2: Recent Reviews Intensity (Last 30 Days)
df['reviews_last_30_days'] = df['number_of_reviews_l30d'] / (df['number_of_reviews'] + 1)

# Feature 3: Review Growth Rate (Last 12 Months)
df['review_growth_rate'] = (df['number_of_reviews_ltm'] + 1) / (df['number_of_reviews'] + 1)

# Feature 4: Review Score Aggregation (Overall Rating)
df['avg_review_score'] = df[['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                             'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                             'review_scores_value']].mean(axis=1)

# Feature 5: Review Sentiment Proxy (Binary Score)
df['positive_reviews'] = (df['review_scores_rating'] >= 4.5).astype(int)

# Feature 6: Review Variance (Guest Satisfaction Stability)
df['review_variance'] = df[['review_scores_accuracy', 'review_scores_cleanliness',
                            'review_scores_checkin', 'review_scores_communication',
                            'review_scores_location', 'review_scores_value']].std(axis=1)

# Feature 7: Time Since First Review (Longevity Indicator)
df['days_since_first_review'] = (pd.to_datetime('today') - pd.to_datetime(df['first_review'])).dt.days

# Feature 8: Time Since Last Review (Recent Activity Indicator)
df['days_since_last_review'] = (pd.to_datetime('today') - pd.to_datetime(df['last_review'])).dt.days

# Feature 9: Review Frequency (Total Reviews per Availability)
df['review_frequency'] = df['number_of_reviews'] / (df['availability_365'] + 1)

# Feature 14: Review-Based Demand in Neighbourhood
df['reviews_per_neighbourhood'] = df.groupby('neighbourhood_cleansed')['number_of_reviews'].transform('mean')

In [708]:
X = df.drop(columns=["price"])
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [709]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

R² Score: 0.5446
Mean Absolute Error (MAE): 36.7526
Root Mean Squared Error (RMSE): 46.0620


#### Amentities-based Features

In [710]:
df = df_with_landmarks.copy()

In [711]:
# Feature 1: Total Number of Amenities (Luxury & Convenience Score)
df['total_amenities'] = df[['Wi-Fi', 'Kitchen', 'Air conditioning', 'Heating', 'Pool', 'Gym', 
                            'Free parking', 'TV', 'Laptop-friendly workspace', 'Smoke alarm', 
                            'Carbon monoxide alarm', 'First aid kit', 'Washer', 'Dryer', 
                            'Dishwasher', 'Hot tub', 'Netflix', 'Hulu', 'Pet-friendly', 
                            'Elevator', 'Wheelchair Accessible', 'Balcony', 'BBQ grill', 
                            'Beachfront']].sum(axis=1)

# Feature 2: Entertainment Score (Availability of Streaming Services & TV)
df['entertainment_score'] = df[['TV', 'Netflix', 'Hulu']].sum(axis=1)

# Feature 3: Safety Score (Availability of Safety-Related Amenities)
df['safety_score'] = df[['Smoke alarm', 'Carbon monoxide alarm', 'First aid kit']].sum(axis=1)

# Feature 4: Luxury Score (Premium Features like Pool, Gym, Hot Tub, Beachfront)
df['luxury_score'] = df[['Pool', 'Gym', 'Hot tub', 'Beachfront']].sum(axis=1)

# Feature 5: Accessibility Score (Features Supporting Mobility Needs)
df['accessibility_score'] = df[['Elevator', 'Wheelchair Accessible']].sum(axis=1)

# Feature 6: Work-Friendly Score (Remote Work Essentials)
df['work_friendly_score'] = df[['Wi-Fi', 'Laptop-friendly workspace']].sum(axis=1)

# Feature 7: Pet-Friendly Indicator (Whether Pets are Allowed)
df['is_pet_friendly'] = df['Pet-friendly'].astype(int)

# Feature 8: Essential Utilities Score (Basic Comfort Features)
df['essential_utilities_score'] = df[['Heating', 'Air conditioning', 'Kitchen']].sum(axis=1)

In [712]:
X = df.drop(columns=["price"])
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [713]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

R² Score: 0.5389
Mean Absolute Error (MAE): 36.9542
Root Mean Squared Error (RMSE): 46.3511


## With All Derived Features

In [718]:
df = df_with_landmarks.copy()

In [719]:
df['beds_per_room'] = df['beds'] / (df['bedrooms'] + 1)  # Avoid division by zero
df['bathrooms_per_bedroom'] = df['bathrooms'] / (df['bedrooms'] + 1)
df['guests_per_bed'] = df['accommodates'] / (df['beds'] + 1)
df['guests_per_bedroom'] = df['accommodates'] / (df['bedrooms'] + 1)
df['bathrooms_per_guest'] = df['bathrooms'] / (df['accommodates'] + 1)
df.fillna(0, inplace=True)

df['listings_per_neighbourhood'] = df.groupby('neighbourhood_cleansed')['calculated_host_listings_count'].transform('sum')
df['reviews_per_neighbourhood'] = df.groupby('neighbourhood_cleansed')['number_of_reviews'].transform('mean')
city_center_coords = (40.7580, -73.9855)
df['distance_from_city_center'] = df.apply(lambda row: geodesic((row['latitude'], row['longitude']), city_center_coords).km, axis=1)
df['neighbourhood_popularity'] = df['reviews_per_neighbourhood'] * df['listings_per_neighbourhood']
df['listings_per_neighbourhood_group'] = df.groupby('neighbourhood_group_cleansed')['calculated_host_listings_count'].transform('sum')
df['avg_price_neighbourhood'] = df.groupby('neighbourhood_cleansed')['price'].transform('mean')

df['log_reviews'] = df['number_of_reviews'].apply(lambda x: np.log1p(x))
df['reviews_last_30_days'] = df['number_of_reviews_l30d'] / (df['number_of_reviews'] + 1)
df['review_growth_rate'] = (df['number_of_reviews_ltm'] + 1) / (df['number_of_reviews'] + 1)
df['avg_review_score'] = df[['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                             'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                             'review_scores_value']].mean(axis=1)
df['positive_reviews'] = (df['review_scores_rating'] >= 4.5).astype(int)
df['review_variance'] = df[['review_scores_accuracy', 'review_scores_cleanliness',
                            'review_scores_checkin', 'review_scores_communication',
                            'review_scores_location', 'review_scores_value']].std(axis=1)
df['days_since_first_review'] = (pd.to_datetime('today') - pd.to_datetime(df['first_review'])).dt.days
df['days_since_last_review'] = (pd.to_datetime('today') - pd.to_datetime(df['last_review'])).dt.days
df['review_frequency'] = df['number_of_reviews'] / (df['availability_365'] + 1)
df['reviews_per_neighbourhood'] = df.groupby('neighbourhood_cleansed')['number_of_reviews'].transform('mean')

df['total_amenities'] = df[['Wi-Fi', 'Kitchen', 'Air conditioning', 'Heating', 'Pool', 'Gym', 
                            'Free parking', 'TV', 'Laptop-friendly workspace', 'Smoke alarm', 
                            'Carbon monoxide alarm', 'First aid kit', 'Washer', 'Dryer', 
                            'Dishwasher', 'Hot tub', 'Netflix', 'Hulu', 'Pet-friendly', 
                            'Elevator', 'Wheelchair Accessible', 'Balcony', 'BBQ grill', 
                            'Beachfront']].sum(axis=1)
df['entertainment_score'] = df[['TV', 'Netflix', 'Hulu']].sum(axis=1)
df['safety_score'] = df[['Smoke alarm', 'Carbon monoxide alarm', 'First aid kit']].sum(axis=1)
df['luxury_score'] = df[['Pool', 'Gym', 'Hot tub', 'Beachfront']].sum(axis=1)
df['accessibility_score'] = df[['Elevator', 'Wheelchair Accessible']].sum(axis=1)
df['work_friendly_score'] = df[['Wi-Fi', 'Laptop-friendly workspace']].sum(axis=1)
df['is_pet_friendly'] = df['Pet-friendly'].astype(int)
df['essential_utilities_score'] = df[['Heating', 'Air conditioning', 'Kitchen']].sum(axis=1)

In [720]:
X = df.drop(columns=["price"])
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [721]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

R² Score: 0.5566
Mean Absolute Error (MAE): 36.1693
Root Mean Squared Error (RMSE): 45.4492
