In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, classification_report
from scipy.stats import pointbiserialr

# airbnb = pd.read_csv("/content/Airbnb_Open_Data.csv", header=0)
airbnb = pd.read_csv("Airbnb_Open_Data.csv", header=0)
airbnb.head()
df_subset = airbnb.drop(['id', 'NAME', 'neighbourhood', 'house_rules', 'host name', 'host id', 'lat', 'long', 'country',
                         'calculated host listings count', 'country code', 'license'], axis=1)
df_subset.head()


print(df_subset.dtypes) # checked to see types --> a few that should be int are float

df_subset = df_subset.dropna(subset=['Construction year', 'minimum nights', 'number of reviews', 'review rate number', 'availability 365'])

df_type_clean = df_subset.astype({'Construction year': 'int32',
                                  'minimum nights': 'int32',
                                  'number of reviews': 'int32',
                                  'review rate number': 'int32',
                                  'availability 365': 'int32'})

print(df_type_clean.dtypes)
# Remove any non-numeric characters (e.g., $) and convert to float
df_type_clean['price'] = df_type_clean['price'].replace('[\$,]', '', regex=True).astype(float)
df_type_clean['service fee'] = df_type_clean['service fee'].replace('[\$,]', '', regex=True).astype(float)

df_type_clean['reviews per month'] = df_type_clean['reviews per month'].fillna(0) # Fill with appropriate defaults
df_type_clean = df_type_clean.dropna(subset=['last review', 'host_identity_verified']) # Drop rows with critical missing data

df_type_clean['last review'] = pd.to_datetime(df_type_clean['last review'], errors='coerce') #Convert last review to a datetime object

#Convert categorical columns (host_identity_verified, neighbourhood group, room type, etc.) to category
categorical_columns = ['host_identity_verified', 'neighbourhood group', 'instant_bookable', 'cancellation_policy', 'room type']
for col in categorical_columns:
    df_type_clean[col] = df_type_clean[col].astype('category')

#df_type_clean['house_rules'] = df_type_clean['house_rules'].str.strip() #Clean up text columns like house_rules to remove extra spaces or inconsistent casing
#df_type_clean['house_rules'] = df_type_clean['house_rules'].replace('#NAME?', np.nan) #this was an excel error I believe --> wanted to get rid of it

print(df_type_clean.info())
print(df_type_clean.describe(include='all'))
# wanted to ensure that all of the prices had to be greater than 0
filter(df_type_clean, df_type_clean['price'] > 0)
print(df_type_clean['price'].min())
# for a few of the built-in functions, we can't have any NaN values
# so, we will create this new Pandas DataFrame, but keep our old one for other analyis
df_cleaned = df_type_clean.dropna()


# Load data
data = df_cleaned
data['host_identity_verified'] = data['host_identity_verified'].astype(str)
data['host_identity_verified'] = data['host_identity_verified'].map(
    {'unconfirmed': 0, 'verified': 1, '0': 0, '1': 1})


# One-hot encode 'neighbourhood group' and 'room type'
#data = pd.get_dummies(data, columns=['neighbourhood group', 'room type', 'cancellation_policy'], drop_first=True)
data['cancellation_policy'] = data['cancellation_policy'].astype(str)
cancellation_policy_mapping = {'strict': 0, 'moderate': 1, 'flexible': 2}
data['cancellation_policy'] = data['cancellation_policy'].map(cancellation_policy_mapping)

data = pd.get_dummies(data, columns=['neighbourhood group', 'room type', 'instant_bookable'], drop_first=True)

# 4. Handle the 'last review' column (extract year and month)
data['last_review'] = pd.to_datetime(data['last review'], errors='coerce')
data['review_year'] = data['last_review'].dt.year
data['review_month'] = data['last_review'].dt.month
data.drop(['last review'], axis=1, inplace=True)
print(data.dtypes)

# Preprocessing: Transform the dataset
# Ensure datetime is converted to numeric and drop non-numeric columns
data['days_since_last_review'] = (pd.to_datetime('today') - data['last_review']).dt.days
data.drop(columns=[ 'last_review','predicted_price', 'price_category'], inplace=True, errors='ignore')  # Drop non-numeric columns
bool_columns = data.select_dtypes(include='bool').columns
data[bool_columns] = data[bool_columns].astype(int)

# Check for non-numeric columns
non_numeric_cols = data.select_dtypes(include=['object']).columns
print(f"Non-numeric columns: {list(non_numeric_cols)}")  # Ensure no strings remain
assert len(non_numeric_cols) == 0, "Dataset contains non-numeric columns!"

# Separate features and target variable
X = data.drop(columns=['price','service fee'])
y = data['price']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  df_type_clean['price'] = df_type_clean['price'].replace('[\$,]', '', regex=True).astype(float)
  df_type_clean['service fee'] = df_type_clean['service fee'].replace('[\$,]', '', regex=True).astype(float)
  airbnb = pd.read_csv("Airbnb_Open_Data.csv", header=0)


host_identity_verified     object
neighbourhood group        object
instant_bookable           object
cancellation_policy        object
room type                  object
Construction year         float64
price                      object
service fee                object
minimum nights            float64
number of reviews         float64
last review                object
reviews per month         float64
review rate number        float64
availability 365          float64
dtype: object
host_identity_verified     object
neighbourhood group        object
instant_bookable           object
cancellation_policy        object
room type                  object
Construction year           int32
price                      object
service fee                object
minimum nights              int32
number of reviews           int32
last review                object
reviews per month         float64
review rate number          int32
availability 365            int32
dtype: object
<class 'pandas.core.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['host_identity_verified'] = data['host_identity_verified'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['host_identity_verified'] = data['host_identity_verified'].map(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cancellation_policy'] = data['cancellation_policy'].as

In [6]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Build the model
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Train the model
history = model.fit(X_train_scaled, y_train, validation_split=0.1, epochs=100, batch_size=32, verbose=1)

# Evaluate the model
y_pred_nn = model.predict(X_test_scaled).flatten()
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print(f"Deep Learning Model Evaluation:\nMSE: {mse_nn}\nR²: {r2_nn}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


NameError: name 'X_train_scaled' is not defined

In [12]:
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np



# Base learners for Stacking and Voting Regressor
base_learners = [
    ('rf', RandomForestRegressor(max_depth=17, max_features=None, min_samples_leaf=3,
                                  min_samples_split=5, n_estimators=666, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, random_state=42, verbosity=0)),
    ('lgbm', LGBMRegressor(n_estimators=100, random_state=42)),
    ('lr', Ridge())
]

# Stacking Regressor
stacking_regressor = StackingRegressor(
    estimators=base_learners,
    final_estimator=Ridge(),
    cv=5
)

# Voting Regressor
voting_regressor = VotingRegressor(estimators=base_learners)

# Train Stacking Regressor
stacking_regressor.fit(X_train, y_train)

# Train Voting Regressor
voting_regressor.fit(X_train, y_train)

# Evaluate models
stacking_predictions = stacking_regressor.predict(X_test)
voting_predictions = voting_regressor.predict(X_test)

stacking_r2 = r2_score(y_test, stacking_predictions)
voting_r2 = r2_score(y_test, voting_predictions)

print(f"R^2 Score for Stacking Regressor: {stacking_r2}")
print(f"R^2 Score for Voting Regressor: {voting_r2}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1169
[LightGBM] [Info] Number of data points in the train set: 68018, number of used features: 19
[LightGBM] [Info] Start training from score 626.328222
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1166
[LightGBM] [Info] Number of data points in the train set: 54414, number of used features: 19
[LightGBM] [Info] Start training from score 626.221358
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e