In [None]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 999

In [175]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor


# Features Engineering

In [238]:
df = pd.read_csv(r'C:\Users\rergu\OneDrive\IronHack\Projects\London Airbnb\df_cleaned.csv', index_col=False)

In [223]:
'''1. Target Transformation 
price is often right-skewed. Use log transform to reduce skewness and improve model fit:'''
df['log_price'] = np.log1p(df['price'])


In [191]:
'''borough_group: Use one-hot encoding'''

df = pd.get_dummies(df, columns=['borough_group'], drop_first=True)


In [164]:
'''weighted_amenities_bin: Use one-hot encoding'''

df = pd.get_dummies(df, columns=['weighted_amenities_bin'], drop_first=True)

In [239]:
'''neighborhood: Use one-hot encoding'''

# One-hot encode neighbourhood_cleansed with 1/0 instead of True/False
df = pd.get_dummies(df, columns=['neighbourhood_cleaned'], prefix='', prefix_sep='', dtype=int)


In [192]:
'''price_tier: Use one-hot encoding'''

df = pd.get_dummies(df, columns=['price_tier'], drop_first=True)

In [138]:
'''weighted_amenities_bin: Map to ordinal scale (e.g., Low=0, Medium=1, High=2)'''

ordinal_map = {'Low': 0, 'Medium': 1, 'High': 2}
df['weighted_amenities_bin'] = df['weighted_amenities_bin'].map(ordinal_map)


In [241]:
df.drop(columns=['latitude', 'longitude', 'room_type', 'accommodates', 'bathrooms',
       'bedrooms', 'number_of_reviews', 'review_scores_rating', 'reviews_per_month', 'toiletries', 'wifi',
       'entertainment', 'cooker', 'fridge', 'housekeeping', 'sound_system',
       'essentials', 'HDTV', 'pool_hottube', 'parking', 'avg_selling_price_2024', 'price_index',
       'price_per_bed', 'price_per_person', 'borough_group', 'weighted_amenities'], inplace=True)

In [242]:
df.reset_index(inplace=True, drop=True)

In [187]:
df.columns

Index(['latitude', 'longitude', 'room_type', 'accommodates', 'bathrooms',
       'bedrooms', 'beds', 'price', 'number_of_reviews',
       'review_scores_rating', 'reviews_per_month', 'toiletries', 'wifi',
       'entertainment', 'cooker', 'fridge', 'housekeeping', 'sound_system',
       'essentials', 'HDTV', 'pool_hottube', 'parking',
       'neighbourhood_cleaned', 'avg_selling_price_2024', 'price_index',
       'price_per_bed', 'price_per_person', 'borough_group', 'price_tier',
       'weighted_amenities', 'weighted_amenities_bin'],
      dtype='object')

In [243]:
df.head()

Unnamed: 0,beds,price,price_tier,weighted_amenities_bin,barking and dagenham,barnet,bexley,brent,bromley,camden,city of london,croydon,ealing,enfield,greenwich,hackney,hammersmith and fulham,haringey,harrow,havering,hillingdon,hounslow,islington,kensington and chelsea,kingston upon thames,lambeth,lewisham,merton,newham,redbridge,richmond upon thames,southwark,sutton,tower hamlets,waltham forest,wandsworth,westminster
0,1.0,94.0,High,Low,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,1.0,86.0,High,High,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3.0,175.0,High,High,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2.0,153.0,High,Medium,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.0,213.0,High,High,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [251]:
df.to_csv(r"C:\Users\rergu\OneDrive\IronHack\Projects\London Airbnb\df_ml3.csv", index=False)

# Machine Learning

In [None]:
""" Feature Engineering

# Add number_of_reviews, review_scores_rating, minimum_nights, etc.

# Derive price_per_bed, beds_per_guest, or amenities count 

# histograms of locations for price and violin 
# group neghboorhood
# 
# 
# """



In [None]:
# Re-import and re-save model after reset
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.preprocessing import OrdinalEncoder
import joblib

# Load dataset
df = pd.read_csv("df_ml3.csv")

# Encode categorical columns
categorical_cols = ['price_tier', 'weighted_amenities_bin']
encoder = OrdinalEncoder()
df[categorical_cols] = encoder.fit_transform(df[categorical_cols])

scaler = StandardScaler()
df[['beds']] = scaler.fit_transform(df[['beds']])

# Prepare features and target
features = df.drop(columns=['price'], errors='ignore')
target = df['price']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=42)

# Train model
model = XGBRegressor(
    subsample=0.8,
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    objective='reg:squarederror',
    random_state=42
)
model.fit(features, target)
pred_test = model.predict(X_test)

print("r2 score: ", r2_score(y_test, pred_test))
print("mae: ", mean_absolute_error(y_test, pred_test))


r2 score:  0.24824033544863677
mae:  46.98799161202567


In [280]:
#using flask export model and model features
joblib.dump(model, 'xgb_airbnb_model_final.pkl')
joblib.dump(features.columns, 'xgb_airbnb_model_features.pkl')


['xgb_airbnb_model_features.pkl']