In [59]:
import pandas as pd
import numpy as np

#Import data preprocessing packages
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler

from statsmodels import regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb


#Import random forest package
from sklearn.ensemble import RandomForestRegressor

#Import scoring metrics
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split

In [60]:
# Load the Airbnb data from CSV; the path is relative to the notebook's working directory.
airbnb_df = pd.read_csv("airbnb_data_13-03-2024.csv")

In [61]:
# Preview the first rows to check which columns were loaded.
airbnb_df.head()

Unnamed: 0,listing_id,date,price,property_type,room_type,accomodates,bedrooms,beds
0,40334325,2022-08-03,56.0,Entire rental unit,Entire home/apt,2,1.0,2.0
1,22742449,2022-11-13,95.0,Entire rental unit,Entire home/apt,4,2.0,2.0
2,22742449,2022-06-28,112.0,Entire rental unit,Entire home/apt,4,2.0,2.0
3,22742449,2022-01-02,155.0,Entire rental unit,Entire home/apt,4,2.0,2.0
4,22742449,2022-09-05,125.0,Entire rental unit,Entire home/apt,4,2.0,2.0


### Prediction using Linear Regression Model

In [62]:
# Columns that are one-hot encoded below. The count columns (accomodates,
# bedrooms, beds) are treated as categorical too, so each distinct value gets
# its own dummy column.
categorical_variables = airbnb_df.loc[
    :, ['property_type', 'room_type', 'accomodates', 'bedrooms', 'beds']
].columns

In [63]:
# One-hot encode the categorical variables.
# pd.get_dummies returns a new DataFrame, so no explicit copy of airbnb_df is
# needed beforehand. drop_first=True drops the first level of every variable
# and sparse=True stores the dummy columns as sparse arrays.
airbnb_df_expanded = pd.get_dummies(
    airbnb_df,
    sparse=True,
    columns=categorical_variables,
    drop_first=True,
)

pd.get_dummies(): The categorical variables in the DataFrame are converted into sets of binary (dummy) variables, where each category of the original variable becomes a new binary column. Because drop_first=True is used, the first category of each variable is dropped and acts as the baseline.

In [64]:
# Build the feature matrix X and the target y.
# Every column except the identifier, the date and the target itself is a
# feature; Index.difference returns the remaining column names in sorted order.
X_vars = airbnb_df_expanded.columns.difference(['listing_id', 'price', 'date'])
X = airbnb_df_expanded.loc[:, X_vars]
# y is kept as a one-column DataFrame; later cells select y_train['price'].
y = airbnb_df_expanded.loc[:, ['price']]

In [65]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31,
)

In [66]:
# Scale every feature by its maximum absolute value (the notebook notes that
# MinMaxScaler failed on this sparse data).
scaler = MaxAbsScaler()
# Ordinary least squares estimator. Bound to `linear_regression` rather than
# `regression` so the `regression` module imported from statsmodels is not shadowed.
linear_regression = LinearRegression()
# Chain scaling and regression so both steps are fitted together.
pipeline_ols = Pipeline([("Scaler", scaler), ("Regression", linear_regression)])

Tried using MinMaxScaler: it raised an error asking to convert the data to a dense NumPy array, so MaxAbsScaler is used instead.
OLS (Ordinary Least Squares) is used because multiple predictor variables are involved. OLS can handle more complex relationships between the predictors.

In [67]:
# Fit the scaler and the linear regression on the training split; the target
# is passed as the 1-D 'price' column.
pipeline_ols.fit(X_train, y_train['price'])

In [68]:
# R-squared of the OLS pipeline on the training and the test split
# (Pipeline.score delegates to the final estimator's score).
ols_train_r2, ols_test_r2 = (
    pipeline_ols.score(features, target['price'])
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [69]:
# Report both R-squared values on one line.
print(
    "train r-squared is ",
    ols_train_r2,
    " and test r-squared is ",
    ols_test_r2,
)

train r-squared is  0.3304400536152847  and test r-squared is  0.36966449299508464


In [70]:
# Predicted prices for both splits (despite the x_ prefix these hold predictions).
x_train_predictor = pipeline_ols.predict(X_train)
x_test_predictor = pipeline_ols.predict(X_test)

# Mean squared error between the observed and the predicted prices.
mse_train = mean_squared_error(y_true=y_train['price'], y_pred=x_train_predictor)
mse_test = mean_squared_error(y_true=y_test['price'], y_pred=x_test_predictor)

In [71]:
# RMSE is the square root of the MSE, which puts the error back in price units.
rmse_train, rmse_test = np.sqrt([mse_train, mse_test])

In [72]:
# One-row summary of the OLS metrics, later concatenated with the other models.
model_scores_df = pd.DataFrame({
    'Model': ['OLS_Airbnb_Price'],
    'R2_train': [ols_train_r2],
    'R2_test': [ols_test_r2],
    'RMSE_train': [rmse_train],
    'RMSE_test': [rmse_test],
})

In [73]:
# Show the OLS scores with the columns in a fixed order.
model_scores_df.loc[:, ['Model', 'R2_train', 'R2_test', 'RMSE_train', 'RMSE_test']]

Unnamed: 0,Model,R2_train,R2_test,RMSE_train,RMSE_test
0,OLS_Airbnb_Price,0.33044,0.369664,35.438557,35.633949


Here we can see that the R2 value is higher on the test data than on the training data (from ~0.33 to ~0.37).

In [74]:
# Pair every feature name with the coefficient learned by the fitted OLS step.
features_coefficients = pd.DataFrame({
    "Feature": X_train.columns,
    "Coefficients_RawPrice": pipeline_ols.named_steps['Regression'].coef_,
})

# Count the features whose coefficient is not exactly zero.
features_coefficients.loc[features_coefficients['Coefficients_RawPrice'].ne(0)].count()

Feature                  59
Coefficients_RawPrice    59
dtype: int64

The model has 59 feature variables with a valid (non-zero) coefficient value.

In [75]:
# Display the feature/coefficient table built from the fitted OLS model.
features_coefficients

Unnamed: 0,Feature,Coefficients_RawPrice
0,accomodates_10,34.407319
1,accomodates_11,46.326094
2,accomodates_12,22.374357
3,accomodates_14,16.289967
4,accomodates_15,7.82251
5,accomodates_16,71.772219
6,accomodates_2,12.092056
7,accomodates_3,18.656846
8,accomodates_4,34.546831
9,accomodates_5,27.605391


### Using Decision Trees Model

In [76]:
# Re-create the train/test split for the decision tree, using the same
# test_size and random_state as for the linear model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31,
)

In [77]:
# Decision tree estimator; random_state makes the fitted tree reproducible.
tree_regressor = DecisionTreeRegressor(random_state=31)

# Fresh scaler instance for the decision tree pipeline.
scaler = MaxAbsScaler()

In [78]:
# Chain the scaler and the decision tree into one estimator.
pipeline_tree = Pipeline([("Scaler", scaler), ("DecisionTreeRegressor", tree_regressor)])

# Fit on the training split. The target is passed as the 1-D 'price' column,
# as for the other models, instead of the one-column DataFrame.
pipeline_tree.fit(X_train, y_train['price'])

In [79]:
# Predicted prices from the fitted decision tree pipeline for both splits.
tree_train_predictor, tree_test_predictor = (
    pipeline_tree.predict(X_train),
    pipeline_tree.predict(X_test),
)

In [80]:
# R-squared of the decision tree pipeline on the training and the test split.
tree_train_r2, tree_test_r2 = (
    pipeline_tree.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

print("Decision Tree - train r-squared:", tree_train_r2, "and test r-squared:", tree_test_r2)

Decision Tree - train r-squared: 0.41081782898778574 and test r-squared: 0.40509862082603765


In [81]:
# Mean squared error between the observed and the predicted prices.
tree_mse_train = mean_squared_error(y_true=y_train, y_pred=tree_train_predictor)
tree_mse_test = mean_squared_error(y_true=y_test, y_pred=tree_test_predictor)

# RMSE is the square root of the MSE, which puts the error back in price units.
tree_rmse_train, tree_rmse_test = np.sqrt([tree_mse_train, tree_mse_test])

In [82]:
# One-row summary of the decision tree metrics, in the same layout as the
# score frames of the other models.
tree_model_scores_df = pd.DataFrame({
    'Model': ['DecisionTree_Airbnb_Price'],
    'R2_train': [tree_train_r2],
    'R2_test': [tree_test_r2],
    'RMSE_train': [tree_rmse_train],
    'RMSE_test': [tree_rmse_test],
})

# Show the summary row.
print(tree_model_scores_df)


                       Model  R2_train   R2_test  RMSE_train  RMSE_test
0  DecisionTree_Airbnb_Price  0.410818  0.405099   33.243451  34.617887


In [83]:
# Retrieve the feature importances from the fitted DecisionTreeRegressor step.
# feature_importances_ is already a 1-D array aligned with the columns of
# X_train, so no transpose is needed.
features_importances_tree = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": pipeline_tree.named_steps['DecisionTreeRegressor'].feature_importances_,
})

# Sort so the most important features come first.
sorted_features_importances_tree = features_importances_tree.sort_values(by='Importance', ascending=False)

# Display the sorted DataFrame.
print(sorted_features_importances_tree)

                                              Feature  Importance
27            property_type_Entire serviced apartment    0.233786
6                                       accomodates_2    0.170493
57                             room_type_Private room    0.122520
14                                       bedrooms_2.0    0.073634
15                                       bedrooms_3.0    0.051446
16                                           beds_2.0    0.045740
10                                      accomodates_6    0.038022
25                   property_type_Entire rental unit    0.033222
8                                       accomodates_4    0.032178
48                        property_type_Room in hotel    0.025323
9                                       accomodates_5    0.017195
24                          property_type_Entire loft    0.016034
46                   property_type_Room in aparthotel    0.014020
17                                           beds_3.0    0.012818
47        

### Using Random Forest Regression Model

In [84]:
# Re-create the train/test split for the random forest, using the same
# test_size and random_state as for the previous models.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31,
)

In [85]:
# Initialize RandomForestRegressor with its default hyperparameters;
# random_state makes the fitted forest reproducible.
rf_regressor = RandomForestRegressor(random_state=31)

In [86]:
# Create the random forest pipeline with its own scaler instance.
# Pipeline keeps the estimator objects it is given without cloning them, so
# reusing the existing `scaler` variable would share (and refit) the instance
# that is already part of the decision tree pipeline.
pipeline_rf = Pipeline([("Scaler", MaxAbsScaler()), ("RandomForestRegressor", rf_regressor)])

In [87]:
# Fit the scaler and the random forest on the training split; the target is
# passed as the 1-D 'price' column.
pipeline_rf.fit(X_train, y_train['price'])

In [88]:
# R-squared of the random forest pipeline on the training and the test split.
rf_train_r2, rf_test_r2 = (
    pipeline_rf.score(features, target['price'])
    for features, target in ((X_train, y_train), (X_test, y_test))
)

print("Random Forest - train r-squared:", rf_train_r2, "and test r-squared:", rf_test_r2)

Random Forest - train r-squared: 0.40798058863710496 and test r-squared: 0.41490240748806884


In [89]:
# Predicted prices from the fitted random forest pipeline for both splits.
rf_train_predictor, rf_test_predictor = (
    pipeline_rf.predict(X_train),
    pipeline_rf.predict(X_test),
)

# Mean squared error between the observed and the predicted prices.
rf_mse_train = mean_squared_error(y_true=y_train['price'], y_pred=rf_train_predictor)
rf_mse_test = mean_squared_error(y_true=y_test['price'], y_pred=rf_test_predictor)

In [90]:
# RMSE is the square root of the MSE, which puts the error back in price units.
rf_rmse_train, rf_rmse_test = np.sqrt([rf_mse_train, rf_mse_test])

In [91]:
# One-row summary of the random forest metrics, in the same layout as the
# score frames of the other models.
regression_model_scores_df = pd.DataFrame({
    'Model': ['RandomForest_Airbnb_Price'],
    'R2_train': [rf_train_r2],
    'R2_test': [rf_test_r2],
    'RMSE_train': [rf_rmse_train],
    'RMSE_test': [rf_rmse_test],
})

# Show the summary row.
print(regression_model_scores_df)

                       Model  R2_train   R2_test  RMSE_train  RMSE_test
0  RandomForest_Airbnb_Price  0.407981  0.414902   33.323398  34.331456


In [92]:
# Pair every feature name with the importance reported by the fitted
# RandomForestRegressor step.
features_importances = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": pipeline_rf.named_steps['RandomForestRegressor'].feature_importances_,
})

# Most important features first.
sorted_features_importances = features_importances.sort_values('Importance', ascending=False)

# Show the ranked table.
print(sorted_features_importances)





                                              Feature  Importance
27            property_type_Entire serviced apartment    0.230249
6                                       accomodates_2    0.128570
57                             room_type_Private room    0.118500
14                                       bedrooms_2.0    0.083859
15                                       bedrooms_3.0    0.070001
16                                           beds_2.0    0.046840
25                   property_type_Entire rental unit    0.030249
8                                       accomodates_4    0.027587
10                                      accomodates_6    0.027554
48                        property_type_Room in hotel    0.023901
24                          property_type_Entire loft    0.022943
17                                           beds_3.0    0.018622
9                                       accomodates_5    0.017605
7                                       accomodates_3    0.013678
47        

### Using Gradient Boosting Machine for modelling

In [93]:
# Re-create the train/test split for XGBoost, using the same test_size and
# random_state as for the previous models.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31,
)

In [94]:
# Gradient-boosted tree regressor trained with a squared-error objective;
# random_state makes the fit reproducible.
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=31,
)

# Fresh scaler instance for the XGBoost pipeline.
scaler = MaxAbsScaler()

In [95]:
# Chain the scaler and the XGBoost regressor into one estimator.
pipeline_xgb = Pipeline([("Scaler", scaler), ("XGBRegressor", xgb_regressor)])

# Fit on the training split. The target is passed as the 1-D 'price' column,
# as for the other models, instead of the one-column DataFrame.
pipeline_xgb.fit(X_train, y_train['price'])

In [96]:
# Predicted prices from the fitted XGBoost pipeline for both splits.
xgb_train_predictor, xgb_test_predictor = (
    pipeline_xgb.predict(X_train),
    pipeline_xgb.predict(X_test),
)

In [97]:
# R-squared of the XGBoost pipeline on the training and the test split.
xgb_train_r2, xgb_test_r2 = (
    pipeline_xgb.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

print("XGBoost - train r-squared:", xgb_train_r2, "and test r-squared:", xgb_test_r2)

XGBoost - train r-squared: 0.4076899406133351 and test r-squared: 0.4165100607158291


In [98]:
# Mean squared error between the observed and the predicted prices.
xgb_mse_train = mean_squared_error(y_true=y_train, y_pred=xgb_train_predictor)
xgb_mse_test = mean_squared_error(y_true=y_test, y_pred=xgb_test_predictor)

# RMSE is the square root of the MSE, which puts the error back in price units.
xgb_rmse_train, xgb_rmse_test = np.sqrt([xgb_mse_train, xgb_mse_test])

In [99]:
# One-row summary of the XGBoost metrics, in the same layout as the score
# frames of the other models.
GBM_model_scores_df = pd.DataFrame({
    'Model': ['XGBoost_Airbnb_Price'],
    'R2_train': [xgb_train_r2],
    'R2_test': [xgb_test_r2],
    'RMSE_train': [xgb_rmse_train],
    'RMSE_test': [xgb_rmse_test],
})

# Show the summary row.
print(GBM_model_scores_df)


                  Model  R2_train  R2_test  RMSE_train  RMSE_test
0  XGBoost_Airbnb_Price   0.40769  0.41651   33.331576  34.284258


## Using Ridge Regression Model to predict prices

In [100]:
from sklearn.linear_model import Ridge

In [101]:
# Initialize Ridge regression (linear regression with an L2 penalty).
# alpha sets the regularization strength; 1.0 is used here without tuning.
ridge_regression = Ridge(alpha=1.0)  # You can adjust the alpha parameter as needed

In [102]:
# Create the Ridge pipeline with its own scaler instance.
# Pipeline keeps the estimator objects it is given without cloning them, so
# reusing the existing `scaler` variable would share (and refit) the instance
# that is already part of the XGBoost pipeline.
pipeline_ridge = Pipeline([("Scaler", MaxAbsScaler()), ("Regression", ridge_regression)])

In [103]:
# Fit the scaler and the Ridge regression model on the training split; the
# target is passed as the 1-D 'price' column.
pipeline_ridge.fit(X_train, y_train['price'])

In [104]:
# R-squared of the Ridge pipeline on the training and the test split.
ridge_train_r2, ridge_test_r2 = (
    pipeline_ridge.score(features, target['price'])
    for features, target in ((X_train, y_train), (X_test, y_test))
)

print("Train r-squared is ", ridge_train_r2, " and test r-squared is ", ridge_test_r2)

Train r-squared is  0.3289879302236264  and test r-squared is  0.3698972943641


In [105]:
# Predicted prices from the fitted Ridge pipeline for both splits.
y_train_predictor_ridge, y_test_predictor_ridge = (
    pipeline_ridge.predict(X_train),
    pipeline_ridge.predict(X_test),
)

# Mean squared error between the observed and the predicted prices.
mse_train_ridge = mean_squared_error(y_true=y_train['price'], y_pred=y_train_predictor_ridge)
mse_test_ridge = mean_squared_error(y_true=y_test['price'], y_pred=y_test_predictor_ridge)

In [106]:
# RMSE is the square root of the MSE, which puts the error back in price units.
rmse_train_ridge, rmse_test_ridge = np.sqrt([mse_train_ridge, mse_test_ridge])


In [107]:
# One-row summary of the Ridge metrics, in the same layout as the score
# frames of the other models.
model_scores_df_ridge = pd.DataFrame({
    'Model': ['Ridge_Airbnb_Price'],
    'R2_train': [ridge_train_r2],
    'R2_test': [ridge_test_r2],
    'RMSE_train': [rmse_train_ridge],
    'RMSE_test': [rmse_test_ridge],
})

print(model_scores_df_ridge)

                Model  R2_train   R2_test  RMSE_train  RMSE_test
0  Ridge_Airbnb_Price  0.328988  0.369897   35.476966  35.627368


In [108]:
# Pair every feature name with the coefficient learned by the fitted Ridge step.
coefficients_ridge = pd.DataFrame({
    "Feature": X_train.columns,
    "Coefficients_Price1": pipeline_ridge.named_steps['Regression'].coef_,
})

print(coefficients_ridge)


                                              Feature  Coefficients_Price1
0                                      accomodates_10            28.200457
1                                      accomodates_11            38.251595
2                                      accomodates_12            16.600318
3                                      accomodates_14            10.177457
4                                      accomodates_15             1.767599
5                                      accomodates_16            56.021220
6                                       accomodates_2             7.528906
7                                       accomodates_3            13.583664
8                                       accomodates_4            29.201260
9                                       accomodates_5            21.819879
10                                      accomodates_6            36.785710
11                                      accomodates_7            37.187476
12                       

In [109]:
# Stack the one-row score frames of all five models (OLS, random forest,
# decision tree, XGBoost, Ridge) into one comparison table with a fresh index.
merged_model_scores_df = pd.concat(
    [
        model_scores_df,
        regression_model_scores_df,
        tree_model_scores_df,
        GBM_model_scores_df,
        model_scores_df_ridge,
    ],
    ignore_index=True,
)

In [110]:
# Display the combined comparison table of all model scores.
merged_model_scores_df

Unnamed: 0,Model,R2_train,R2_test,RMSE_train,RMSE_test
0,OLS_Airbnb_Price,0.33044,0.369664,35.438557,35.633949
1,RandomForest_Airbnb_Price,0.407981,0.414902,33.323398,34.331456
2,DecisionTree_Airbnb_Price,0.410818,0.405099,33.243451,34.617887
3,XGBoost_Airbnb_Price,0.40769,0.41651,33.331576,34.284258
4,Ridge_Airbnb_Price,0.328988,0.369897,35.476966,35.627368


To select the model, the metric values (R-squared and Root Mean Squared Error) are compared. The standard procedure is to select the model with a high R-squared value and a low RMSE value.

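If the above practice is followed on the test data, XGBoost has the highest test R-squared (~0.417) and the lowest test RMSE (~34.28), closely followed by Random Forest. Decision Tree Regression has the highest R-squared and the lowest RMSE on the training data, but it is the only model whose test R-squared is lower than its train R-squared, so it seems to overfit the training data more than the others, which could limit its performance on unseen data.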

However, Linear Regression (OLS) is the model that provides coefficients, which let us predict the price directly. When selecting the best model, it is essential to consider the context of the problem and the specific requirements of the application. Considering these two important factors, the Linear Regression model is selected.
