In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, classification_report
from scipy.stats import pointbiserialr

# airbnb = pd.read_csv("/content/Airbnb_Open_Data.csv", header=0)
airbnb = pd.read_csv("Airbnb_Open_Data.csv", header=0)
airbnb.head()
df_subset = airbnb.drop(['id', 'NAME', 'neighbourhood', 'house_rules', 'host name', 'host id', 'lat', 'long', 'country',
                         'calculated host listings count', 'country code', 'license'], axis=1)
df_subset.head()


print(df_subset.dtypes) # checked to see types --> a few that should be int are float

df_subset = df_subset.dropna(subset=['Construction year', 'minimum nights', 'number of reviews', 'review rate number', 'availability 365'])

df_type_clean = df_subset.astype({'Construction year': 'int32',
                                  'minimum nights': 'int32',
                                  'number of reviews': 'int32',
                                  'review rate number': 'int32',
                                  'availability 365': 'int32'})

print(df_type_clean.dtypes)
# Remove any non-numeric characters (e.g., $) and convert to float
df_type_clean['price'] = df_type_clean['price'].replace('[\$,]', '', regex=True).astype(float)
df_type_clean['service fee'] = df_type_clean['service fee'].replace('[\$,]', '', regex=True).astype(float)

df_type_clean['reviews per month'] = df_type_clean['reviews per month'].fillna(0) # Fill with appropriate defaults
df_type_clean = df_type_clean.dropna(subset=['last review', 'host_identity_verified']) # Drop rows with critical missing data

df_type_clean['last review'] = pd.to_datetime(df_type_clean['last review'], errors='coerce') #Convert last review to a datetime object

#Convert categorical columns (host_identity_verified, neighbourhood group, room type, etc.) to category
categorical_columns = ['host_identity_verified', 'neighbourhood group', 'instant_bookable', 'cancellation_policy', 'room type']
for col in categorical_columns:
    df_type_clean[col] = df_type_clean[col].astype('category')

#df_type_clean['house_rules'] = df_type_clean['house_rules'].str.strip() #Clean up text columns like house_rules to remove extra spaces or inconsistent casing
#df_type_clean['house_rules'] = df_type_clean['house_rules'].replace('#NAME?', np.nan) #this was an excel error I believe --> wanted to get rid of it

print(df_type_clean.info())
print(df_type_clean.describe(include='all'))
# wanted to ensure that all of the prices had to be greater than 0
filter(df_type_clean, df_type_clean['price'] > 0)
print(df_type_clean['price'].min())
# for a few of the built-in functions, we can't have any NaN values
# so, we will create this new Pandas DataFrame, but keep our old one for other analyis
df_cleaned = df_type_clean.dropna()


# Load data
data = df_cleaned
data['host_identity_verified'] = data['host_identity_verified'].astype(str)
data['host_identity_verified'] = data['host_identity_verified'].map(
    {'unconfirmed': 0, 'verified': 1, '0': 0, '1': 1})


# One-hot encode 'neighbourhood group' and 'room type'
#data = pd.get_dummies(data, columns=['neighbourhood group', 'room type', 'cancellation_policy'], drop_first=True)
data['cancellation_policy'] = data['cancellation_policy'].astype(str)
cancellation_policy_mapping = {'strict': 0, 'moderate': 1, 'flexible': 2}
data['cancellation_policy'] = data['cancellation_policy'].map(cancellation_policy_mapping)

data = pd.get_dummies(data, columns=['neighbourhood group', 'room type', 'instant_bookable'], drop_first=True)

# 4. Handle the 'last review' column (extract year and month)
data['last_review'] = pd.to_datetime(data['last review'], errors='coerce')
data['review_year'] = data['last_review'].dt.year
data['review_month'] = data['last_review'].dt.month
data.drop(['last review'], axis=1, inplace=True)
print(data.dtypes)

# Preprocessing: Transform the dataset
# Ensure datetime is converted to numeric and drop non-numeric columns
data['days_since_last_review'] = (pd.to_datetime('today') - data['last_review']).dt.days
data.drop(columns=[ 'last_review','predicted_price', 'price_category'], inplace=True, errors='ignore')  # Drop non-numeric columns
bool_columns = data.select_dtypes(include='bool').columns
data[bool_columns] = data[bool_columns].astype(int)

# Check for non-numeric columns
non_numeric_cols = data.select_dtypes(include=['object']).columns
print(f"Non-numeric columns: {list(non_numeric_cols)}")  # Ensure no strings remain
assert len(non_numeric_cols) == 0, "Dataset contains non-numeric columns!"

# Separate features and target variable
X = data.drop(columns=['price','service fee'])
y = data['price']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  df_type_clean['price'] = df_type_clean['price'].replace('[\$,]', '', regex=True).astype(float)
  df_type_clean['service fee'] = df_type_clean['service fee'].replace('[\$,]', '', regex=True).astype(float)
  airbnb = pd.read_csv("Airbnb_Open_Data.csv", header=0)


host_identity_verified     object
neighbourhood group        object
instant_bookable           object
cancellation_policy        object
room type                  object
Construction year         float64
price                      object
service fee                object
minimum nights            float64
number of reviews         float64
last review                object
reviews per month         float64
review rate number        float64
availability 365          float64
dtype: object
host_identity_verified     object
neighbourhood group        object
instant_bookable           object
cancellation_policy        object
room type                  object
Construction year           int32
price                      object
service fee                object
minimum nights              int32
number of reviews           int32
last review                object
reviews per month         float64
review rate number          int32
availability 365            int32
dtype: object
<class 'pandas.core.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['host_identity_verified'] = data['host_identity_verified'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['host_identity_verified'] = data['host_identity_verified'].map(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cancellation_policy'] = data['cancellation_policy'].as

In [15]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Build the model
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Train the model
history = model.fit(X_train_scaled, y_train, validation_split=0.1, epochs=100, batch_size=32, verbose=1)

# Evaluate the model
y_pred_nn = model.predict(X_test_scaled).flatten()
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print(f"Deep Learning Model Evaluation:\nMSE: {mse_nn}\nR²: {r2_nn}")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 163124.2344 - mse: 163124.2344 - val_loss: 112315.3594 - val_mse: 112315.3594
Epoch 2/100
[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 111247.3750 - mse: 111247.3750 - val_loss: 112963.7266 - val_mse: 112963.7266
Epoch 3/100
[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 111163.8438 - mse: 111163.8438 - val_loss: 111798.4844 - val_mse: 111798.4844
Epoch 4/100
[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 110287.1094 - mse: 110287.1094 - val_loss: 113298.1094 - val_mse: 113298.1094
Epoch 5/100
[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 110591.6172 - mse: 110591.6172 - val_loss: 112084.8516 - val_mse: 112084.8516
Epoch 6/100
[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 110611.0469 - mse: 110611.0469 - val_los

In [14]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Downloading tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorf

In [20]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import layers, models

# Build the neural network model
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Define the early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(X_train_scaled, y_train, validation_split=0.1, epochs=200, batch_size=32, 
                    verbose=1, callbacks=[early_stopping])

# Evaluate the model
y_pred_nn = model.predict(X_test_scaled).flatten()
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print(f"Optimized Deep Learning Model Evaluation:\nMSE: {mse_nn}\nR²: {r2_nn}")


Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 230793.4844 - mse: 230793.4844 - val_loss: 112452.6406 - val_mse: 112452.6406
Epoch 2/200
[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 883us/step - loss: 111251.0312 - mse: 111251.0312 - val_loss: 111909.7656 - val_mse: 111909.7656
Epoch 3/200
[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 888us/step - loss: 109721.7188 - mse: 109721.7188 - val_loss: 111638.7031 - val_mse: 111638.7031
Epoch 4/200
[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 945us/step - loss: 110337.0703 - mse: 110337.0703 - val_loss: 111392.7266 - val_mse: 111392.7266
Epoch 5/200
[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 881us/step - loss: 110484.9062 - mse: 110484.9062 - val_loss: 111629.8047 - val_mse: 111629.8047
Epoch 6/200
[1m1913/1913[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 911us/step - loss: 109582.4297 - mse: 109582.4297

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Train the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the price
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest MSE: {mse_rf}, R²: {r2_rf}")

# Feature Importance
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

Random Forest MSE: 75850.95538064688, R²: 0.31294704342705637
                              Feature    Importance
20             days_since_last_review  1.633569e-01
5                   reviews per month  1.594750e-01
7                    availability 365  1.442803e-01
4                   number of reviews  1.230880e-01
2                   Construction year  1.131421e-01
3                      minimum nights  6.599337e-02
6                  review rate number  5.351550e-02
19                       review_month  4.544594e-02
1                 cancellation_policy  2.823612e-02
0              host_identity_verified  1.677581e-02
15             room type_Private room  1.675198e-02
17              instant_bookable_True  1.603570e-02
8        neighbourhood group_Brooklyn  1.497091e-02
9       neighbourhood group_Manhattan  1.402143e-02
10         neighbourhood group_Queens  1.119641e-02
18                        review_year  7.354575e-03
16              room type_Shared room  3.998412e-03
11

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

# Train the model
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

# Predict the price
y_pred_gb = gb_model.predict(X_test)

# Evaluate the model
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print(f"Gradient Boosting MSE: {mse_gb}, R²: {r2_gb}")

# Feature Importance
importances = gb_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

Gradient Boosting MSE: 110188.67376975996, R²: 0.0019182525197151135
                              Feature  Importance
5                   reviews per month    0.225872
20             days_since_last_review    0.224425
7                    availability 365    0.151287
4                   number of reviews    0.121533
2                   Construction year    0.081205
3                      minimum nights    0.055851
19                       review_month    0.033347
6                  review rate number    0.025795
15             room type_Private room    0.021052
9       neighbourhood group_Manhattan    0.011818
16              room type_Shared room    0.010302
18                        review_year    0.009751
10         neighbourhood group_Queens    0.009640
1                 cancellation_policy    0.007971
14               room type_Hotel room    0.004950
17              instant_bookable_True    0.003439
0              host_identity_verified    0.001570
8        neighbourhood group_Br

In [25]:
from sklearn.neural_network import MLPRegressor

# Train the model
nn_model = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
nn_model.fit(X_train, y_train)

# Predict the price
y_pred_nn = nn_model.predict(X_test)

# Evaluate the model
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print(f"Neural Network MSE: {mse_nn}, R²: {r2_nn}")

Neural Network MSE: 110523.27739776392, R²: -0.001112565098157603


In [30]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Train the model
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict the price
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost MSE: {mse_xgb}, R²: {r2_xgb}")

XGBoost MSE: 105922.51119100528, R²: 0.0405608720918097


In [28]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/124.9 MB ? eta -:--:--
   ---------------------------------------- 1.0/124.9 MB 2.8 MB/s eta 0:00:45
    --------------------------------------- 1.6/124.9 MB 3.0 MB/s eta 0:00:42
    --------------------------------------- 2.4/124.9 MB 3.0 MB/s eta 0:00:42
   - -------------------------------------- 3.4/124.9 MB 3.4 MB/s eta 0:00:37
   - -------------------------------------- 4.5/124.9 MB 3.7 MB/s eta 0:00:33
   - -------------------------------------- 5.2/124.9 MB 3.7 MB/s eta 0:00:33
   -- ------------------------------------- 6.6/124.9 MB 4.0 MB/s eta 0:00:30
   -- ------------------------------------- 8.1/124.9 MB 4.4 MB/s eta 0:00:27
   --- ------------------------------------ 9.4/124.9 MB 4.7 MB/s eta 0:00:25
   --- -

In [32]:
import xgboost as xgb

model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f'R^2 Score: {r2_score(y_test, y_pred)}')

R^2 Score: 0.0405608720918097


In [34]:
from sklearn.neural_network import MLPRegressor

model = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f'R^2 Score: {r2_score(y_test, y_pred)}')


R^2 Score: -0.001112565098157603


In [36]:
from sklearn.neural_network import MLPRegressor

model = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f'R^2 Score: {r2_score(y_test, y_pred)}')


R^2 Score: -0.001112565098157603


In [38]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X_train)


In [43]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

# Base models
base_learners = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('lr', Ridge())
]
model = StackingRegressor(estimators=base_learners, final_estimator=Ridge())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f'R^2 Score: {r2_score(y_test, y_pred)}')


R^2 Score: 0.35391885745449203


In [44]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10)  # Reduce to 10 components
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Fit a model on the transformed data
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_pca, y_train)
y_pred = model.predict(X_test_pca)

print(f'R^2 Score: {r2_score(y_test, y_pred)}')


R^2 Score: 0.2797427658246271


In [45]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

y_pred = model.predict(X_test)
print(f'R^2 Score: {r2_score(y_test, y_pred)}')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2126/2126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 876us/step - loss: 117388.3438
Epoch 2/100
[1m2126/2126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 839us/step - loss: 110933.4609
Epoch 3/100
[1m2126/2126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 807us/step - loss: 110954.2188
Epoch 4/100
[1m2126/2126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 806us/step - loss: 110354.2109
Epoch 5/100
[1m2126/2126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 823us/step - loss: 110325.0625
Epoch 6/100
[1m2126/2126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 839us/step - loss: 110283.8750
Epoch 7/100
[1m2126/2126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 826us/step - loss: 109585.5625
Epoch 8/100
[1m2126/2126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 832us/step - loss: 110503.9141
Epoch 9/100
[1m2126/2126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 873us/step - loss: 111096.3594
Epoch 10/100


In [53]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

# Define model
model = RandomForestRegressor()

# Define hyperparameter space with corrected 'max_features'
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_features': ['sqrt', 'log2', None],  # Corrected this line
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}

# Apply RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=50, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Get best model
best_model = random_search.best_estimator_
print(best_model)


KeyboardInterrupt: 

In [55]:
print(best_model)


RandomForestRegressor(max_depth=17, max_features=None, min_samples_leaf=3,
                      min_samples_split=5, n_estimators=666)


In [58]:
from sklearn.ensemble import StackingRegressor, VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define optimized Random Forest model
optimized_rf = RandomForestRegressor(
    max_depth=17,
    max_features=None,
    min_samples_leaf=3,
    min_samples_split=5,
    n_estimators=666,
    random_state=42
)

# Base models for stacking and voting
base_learners = [
    ('rf', optimized_rf),
    ('gb', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('lr', Ridge())
]

# Stacking Regressor
stacking_model = StackingRegressor(estimators=base_learners, final_estimator=Ridge())
stacking_model.fit(X_train, y_train)
stacking_y_pred = stacking_model.predict(X_test)
stacking_r2 = r2_score(y_test, stacking_y_pred)
print(f'Stacking Regressor R^2 Score: {stacking_r2}')

# Voting Regressor
voting_model = VotingRegressor(estimators=base_learners)
voting_model.fit(X_train, y_train)
voting_y_pred = voting_model.predict(X_test)
voting_r2 = r2_score(y_test, voting_y_pred)
print(f'Voting Regressor R^2 Score: {voting_r2}')

# Hyperparameter tuning for RandomForest
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_features': ['sqrt', 'log2', None],
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}

random_search = RandomizedSearchCV(RandomForestRegressor(random_state=42), param_distributions=param_dist, 
                                   n_iter=50, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Get best RandomForest model from RandomizedSearchCV
best_rf = random_search.best_estimator_
print(f'Optimized RandomForestRegressor: {best_rf}')

# Evaluate the best RF model
rf_y_pred = best_rf.predict(X_test)
rf_r2 = r2_score(y_test, rf_y_pred)
print(f'Optimized RandomForest R^2 Score: {rf_r2}')


Stacking Regressor R^2 Score: 0.21211268025798147
Voting Regressor R^2 Score: 0.027502923193305273
Optimized RandomForestRegressor: RandomForestRegressor(max_depth=17, max_features=None, min_samples_leaf=3,
                      min_samples_split=5, n_estimators=666, random_state=42)
Optimized RandomForest R^2 Score: 0.0815121287804611


In [59]:
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Pipeline for scaling and stacking
scaler = StandardScaler()

# Base models with tuned parameters
base_learners = [
    ('rf', RandomForestRegressor(max_depth=17, n_estimators=666, min_samples_leaf=3, min_samples_split=5, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42))
]

# Stacking with a stronger final estimator
stacking_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42)
)

# Fit and predict
stacking_model.fit(X_train, y_train)
y_pred = stacking_model.predict(X_test)

# Evaluate with R^2
r2 = r2_score(y_test, y_pred)
print(f'Stacking Regressor R^2 Score (Improved): {r2}')

# Cross-validation for robust evaluation
cv_scores = cross_val_score(stacking_model, X, y, cv=5, scoring='r2')
print(f'Cross-Validation R^2 Scores: {cv_scores}')
print(f'Mean Cross-Validation R^2: {cv_scores.mean()}')


Stacking Regressor R^2 Score (Improved): 0.23241462809994806
Cross-Validation R^2 Scores: [ 0.2553755   0.31268431 -0.21025603  0.28602443  0.47723312]
Mean Cross-Validation R^2: 0.2242122677765887


In [64]:
model = RandomForestRegressor(max_depth=17, n_estimators=666, min_samples_leaf=3, min_samples_split=5, random_state=42)
model.fit(X_train, y_train)

# Predict prices
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Model Evaluation:\nMSE: {mse}\nR²: {r2}")

Model Evaluation:
MSE: 101401.47403636416
R²: 0.0815121287804611
