In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.preprocessing import StandardScaler
from ydata_profiling import ProfileReport

plt.style.use("seaborn-v0_8")

In [2]:
df = pd.read_csv("airbnb_dataset.csv.csv",encoding="latin1",low_memory=False)
df

Unnamed: 0,listing_id,name,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,...,minimum_nights,maximum_nights,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable
0,281420,"Beautiful Flat in le Village Montmartre, Paris",1466919,2011-12-03,"Paris, Ile-de-France, France",,,,f,1.0,...,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f
1,3705183,39 mÃÂ² Paris (Sacre CÃâur),10328771,2013-11-29,"Paris, Ile-de-France, France",,,,f,1.0,...,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f
2,4082273,"Lovely apartment with Terrace, 60m2",19252768,2014-07-31,"Paris, Ile-de-France, France",,,,f,1.0,...,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f
3,4797344,Cosy studio (close to Eiffel tower),10668311,2013-12-17,"Paris, Ile-de-France, France",,,,f,1.0,...,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f
4,4823489,Close to Eiffel Tower - Beautiful flat : 2 rooms,24837558,2014-12-14,"Paris, Ile-de-France, France",,,,f,1.0,...,2,1125,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279707,38338635,Appartement T2 neuf prÃÂ¨s du tram T3a Porte ...,31161181,2015-04-13,"Paris, Ile-de-France, France",,,,f,1.0,...,1,7,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f
279708,38538692,Cozy Studio in Montmartre,10294858,2013-11-27,"Paris, Ile-de-France, France",,,,f,1.0,...,7,15,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f
279709,38683356,Nice and cosy mini-appartement in Paris,2238502,2012-04-27,"Paris, Ile-de-France, France",,,,f,1.0,...,6,30,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f
279710,39659000,Charming apartment near Rue Saint Maur / Oberk...,38633695,2015-07-16,"Paris, Ile-de-France, France",,,,f,1.0,...,3,18,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f


In [3]:
selected_cols = [
    'price',
    'room_type',
    'neighbourhood',
    'minimum_nights',
    'review_scores_rating',
    'instant_bookable'
]

df = df[selected_cols]
df

Unnamed: 0,price,room_type,neighbourhood,minimum_nights,review_scores_rating,instant_bookable
0,53,Entire place,Buttes-Montmartre,2,100.0,f
1,120,Entire place,Buttes-Montmartre,2,100.0,f
2,89,Entire place,Elysee,2,100.0,f
3,58,Entire place,Vaugirard,2,100.0,f
4,60,Entire place,Passy,2,100.0,f
...,...,...,...,...,...,...
279707,120,Entire place,Observatoire,1,100.0,f
279708,60,Entire place,Buttes-Montmartre,7,100.0,f
279709,50,Entire place,Buttes-Montmartre,6,100.0,f
279710,105,Entire place,Popincourt,3,100.0,f


In [4]:
df.head()

Unnamed: 0,price,room_type,neighbourhood,minimum_nights,review_scores_rating,instant_bookable
0,53,Entire place,Buttes-Montmartre,2,100.0,f
1,120,Entire place,Buttes-Montmartre,2,100.0,f
2,89,Entire place,Elysee,2,100.0,f
3,58,Entire place,Vaugirard,2,100.0,f
4,60,Entire place,Passy,2,100.0,f


In [5]:
df.describe()

Unnamed: 0,price,minimum_nights,review_scores_rating
count,279712.0,279712.0,188307.0
mean,608.792737,8.050967,93.405195
std,3441.826611,31.518946,10.070437
min,0.0,1.0,20.0
25%,75.0,1.0,91.0
50%,150.0,2.0,96.0
75%,474.0,5.0,100.0
max,625216.0,9999.0,100.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279712 entries, 0 to 279711
Data columns (total 6 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   price                 279712 non-null  int64  
 1   room_type             279712 non-null  object 
 2   neighbourhood         279712 non-null  object 
 3   minimum_nights        279712 non-null  int64  
 4   review_scores_rating  188307 non-null  float64
 5   instant_bookable      279712 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 12.8+ MB


In [7]:
df.shape

(279712, 6)

In [8]:
df.columns

Index(['price', 'room_type', 'neighbourhood', 'minimum_nights',
       'review_scores_rating', 'instant_bookable'],
      dtype='object')

In [9]:
ProfileReport(df)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|                                                                                            | 0/6 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.11it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [10]:
df['price'].head()


0     53
1    120
2     89
3     58
4     60
Name: price, dtype: int64

In [11]:
df['review_scores_rating'] = df['review_scores_rating'].fillna(df['review_scores_rating'].median())
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review_scores_rating'] = df['review_scores_rating'].fillna(df['review_scores_rating'].median())


Unnamed: 0,price,room_type,neighbourhood,minimum_nights,review_scores_rating,instant_bookable
0,53,Entire place,Buttes-Montmartre,2,100.0,f
1,120,Entire place,Buttes-Montmartre,2,100.0,f
2,89,Entire place,Elysee,2,100.0,f
3,58,Entire place,Vaugirard,2,100.0,f
4,60,Entire place,Passy,2,100.0,f
...,...,...,...,...,...,...
279707,120,Entire place,Observatoire,1,100.0,f
279708,60,Entire place,Buttes-Montmartre,7,100.0,f
279709,50,Entire place,Buttes-Montmartre,6,100.0,f
279710,105,Entire place,Popincourt,3,100.0,f


In [12]:
df=df[df['price']>0]   #zero prices are removed as they are unrealistic $ introduce noise into datsset

In [13]:
upper_limit =df['price'].quantile(0.99) #outliers are handled using %-based capping to retain meaningful high values 
df = df[df['price']<=upper_limit]

In [14]:
df['price_log']=np.log1p(df['price'])  #log transformation is used to normalize the distribution $ improve model performance

In [15]:
df

Unnamed: 0,price,room_type,neighbourhood,minimum_nights,review_scores_rating,instant_bookable,price_log
0,53,Entire place,Buttes-Montmartre,2,100.0,f,3.988984
1,120,Entire place,Buttes-Montmartre,2,100.0,f,4.795791
2,89,Entire place,Elysee,2,100.0,f,4.499810
3,58,Entire place,Vaugirard,2,100.0,f,4.077537
4,60,Entire place,Passy,2,100.0,f,4.110874
...,...,...,...,...,...,...,...
279707,120,Entire place,Observatoire,1,100.0,f,4.795791
279708,60,Entire place,Buttes-Montmartre,7,100.0,f,4.110874
279709,50,Entire place,Buttes-Montmartre,6,100.0,f,3.931826
279710,105,Entire place,Popincourt,3,100.0,f,4.663439


In [16]:
upper_limit=df['minimum_nights'].quantile(.99)
df['minimum_nights']=df['minimum_nights'].clip(upper=upper_limit)  


In [18]:
df=df.drop('price', axis=1)#keep only 1 aspect not both price cols
df


Unnamed: 0,room_type,neighbourhood,minimum_nights,review_scores_rating,instant_bookable,price_log
0,Entire place,Buttes-Montmartre,2,100.0,f,3.988984
1,Entire place,Buttes-Montmartre,2,100.0,f,4.795791
2,Entire place,Elysee,2,100.0,f,4.499810
3,Entire place,Vaugirard,2,100.0,f,4.077537
4,Entire place,Passy,2,100.0,f,4.110874
...,...,...,...,...,...,...
279707,Entire place,Observatoire,1,100.0,f,4.795791
279708,Entire place,Buttes-Montmartre,7,100.0,f,4.110874
279709,Entire place,Buttes-Montmartre,6,100.0,f,3.931826
279710,Entire place,Popincourt,3,100.0,f,4.663439


In [19]:
df=df.drop_duplicates().reset_index(drop=True) # removes duplicate rows to prevent biad and overfitting caused by repeated obs influencing model 

In [20]:
df.duplicated().sum() #checks duplicate is removed or not

np.int64(0)

In [21]:
ProfileReport(df)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|                                                                                            | 0/6 [00:00<?, ?it/s]
[A%|██████████████                                                                      | 1/6 [00:00<00:02,  2.10it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00,  7.48it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [22]:
df['instant_bookable'].value_counts()

instant_bookable
f    130845
t     95628
Name: count, dtype: int64

In [23]:
df['instant_bookable']=df['instant_bookable'].map({'t':1,'f':0})  # convert t/f to 1/0 for binary classification

In [24]:
pd.get_dummies(df['room_type'], drop_first=True)


Unnamed: 0,Hotel room,Private room,Shared room
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
226468,False,False,False
226469,False,False,False
226470,False,False,False
226471,False,False,False


In [25]:
df=pd.get_dummies(df, columns=['room_type'], drop_first=True) #creating dummpy data(one hot encoding) of room_type in the form of bool as model cannot read text data



In [26]:
bool_cols=df.select_dtypes(include='bool').columns #converting hotel/private/shared cols from bool to 0/1
df[bool_cols]=df[bool_cols].astype(int)

In [27]:
freq_map=df['neighbourhood'].value_counts(normalize=True)    # doing frequency encoding of neighbourhood col bc it has 660 distinct values
df['neighbourhood_freq']=df['neighbourhood'].map(freq_map)
df.drop(columns=['neighbourhood'], inplace=True)

In [28]:
df.columns


Index(['minimum_nights', 'review_scores_rating', 'instant_bookable',
       'price_log', 'room_type_Hotel room', 'room_type_Private room',
       'room_type_Shared room', 'neighbourhood_freq'],
      dtype='object')

In [29]:
df

Unnamed: 0,minimum_nights,review_scores_rating,instant_bookable,price_log,room_type_Hotel room,room_type_Private room,room_type_Shared room,neighbourhood_freq
0,2,100.0,0,3.988984,0,0,0,0.020042
1,2,100.0,0,4.795791,0,0,0,0.020042
2,2,100.0,0,4.499810,0,0,0,0.007166
3,2,100.0,0,4.077537,0,0,0,0.015644
4,2,100.0,0,4.110874,0,0,0,0.011533
...,...,...,...,...,...,...,...,...
226468,6,100.0,0,4.709530,0,0,0,0.008504
226469,6,100.0,0,4.189655,0,0,0,0.008862
226470,1,100.0,0,3.433987,0,0,0,0.008862
226471,6,100.0,0,4.394449,0,0,0,0.008504


In [30]:
df

Unnamed: 0,minimum_nights,review_scores_rating,instant_bookable,price_log,room_type_Hotel room,room_type_Private room,room_type_Shared room,neighbourhood_freq
0,2,100.0,0,3.988984,0,0,0,0.020042
1,2,100.0,0,4.795791,0,0,0,0.020042
2,2,100.0,0,4.499810,0,0,0,0.007166
3,2,100.0,0,4.077537,0,0,0,0.015644
4,2,100.0,0,4.110874,0,0,0,0.011533
...,...,...,...,...,...,...,...,...
226468,6,100.0,0,4.709530,0,0,0,0.008504
226469,6,100.0,0,4.189655,0,0,0,0.008862
226470,1,100.0,0,3.433987,0,0,0,0.008862
226471,6,100.0,0,4.394449,0,0,0,0.008504


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226473 entries, 0 to 226472
Data columns (total 8 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   minimum_nights          226473 non-null  int64  
 1   review_scores_rating    226473 non-null  float64
 2   instant_bookable        226473 non-null  int64  
 3   price_log               226473 non-null  float64
 4   room_type_Hotel room    226473 non-null  int64  
 5   room_type_Private room  226473 non-null  int64  
 6   room_type_Shared room   226473 non-null  int64  
 7   neighbourhood_freq      226473 non-null  float64
dtypes: float64(3), int64(5)
memory usage: 13.8 MB


In [37]:
df

Unnamed: 0,minimum_nights,review_scores_rating,instant_bookable,price_log,room_type_Hotel room,room_type_Private room,room_type_Shared room,neighbourhood_freq
0,2,100.0,0,3.988984,0,0,0,0.020042
1,2,100.0,0,4.795791,0,0,0,0.020042
2,2,100.0,0,4.499810,0,0,0,0.007166
3,2,100.0,0,4.077537,0,0,0,0.015644
4,2,100.0,0,4.110874,0,0,0,0.011533
...,...,...,...,...,...,...,...,...
226468,6,100.0,0,4.709530,0,0,0,0.008504
226469,6,100.0,0,4.189655,0,0,0,0.008862
226470,1,100.0,0,3.433987,0,0,0,0.008862
226471,6,100.0,0,4.394449,0,0,0,0.008504


In [38]:
x=df.drop(columns=['price_log'])
x

Unnamed: 0,minimum_nights,review_scores_rating,instant_bookable,room_type_Hotel room,room_type_Private room,room_type_Shared room,neighbourhood_freq
0,2,100.0,0,0,0,0,0.020042
1,2,100.0,0,0,0,0,0.020042
2,2,100.0,0,0,0,0,0.007166
3,2,100.0,0,0,0,0,0.015644
4,2,100.0,0,0,0,0,0.011533
...,...,...,...,...,...,...,...
226468,6,100.0,0,0,0,0,0.008504
226469,6,100.0,0,0,0,0,0.008862
226470,1,100.0,0,0,0,0,0.008862
226471,6,100.0,0,0,0,0,0.008504


In [39]:
x=x.drop(columns=['review_scores_rating'])
x

Unnamed: 0,minimum_nights,instant_bookable,room_type_Hotel room,room_type_Private room,room_type_Shared room,neighbourhood_freq
0,2,0,0,0,0,0.020042
1,2,0,0,0,0,0.020042
2,2,0,0,0,0,0.007166
3,2,0,0,0,0,0.015644
4,2,0,0,0,0,0.011533
...,...,...,...,...,...,...
226468,6,0,0,0,0,0.008504
226469,6,0,0,0,0,0.008862
226470,1,0,0,0,0,0.008862
226471,6,0,0,0,0,0.008504


In [81]:
feature_columns = x.columns


In [82]:
print(feature_columns)


Index(['minimum_nights', 'instant_bookable', 'room_type_Hotel room',
       'room_type_Private room', 'room_type_Shared room',
       'neighbourhood_freq'],
      dtype='object')


In [40]:
y=df['price_log']
y

0         3.988984
1         4.795791
2         4.499810
3         4.077537
4         4.110874
            ...   
226468    4.709530
226469    4.189655
226470    3.433987
226471    4.394449
226472    4.912655
Name: price_log, Length: 226473, dtype: float64

In [41]:
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=.20,random_state=70)

In [42]:
model= LinearRegression()
model.fit(x_train,y_train)

In [43]:
y_pred= model.predict(x_test)

In [44]:
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
r2=r2_score(y_test,y_pred)
rmse,r2

(np.float64(1.1733194967595137), 0.0703364357454529)

In [45]:
scaler=StandardScaler()
x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.transform(x_test)


In [46]:
model= LinearRegression()
model.fit(x_train,y_train)

In [47]:
y_pred= model.predict(x_test_scaled)



In [48]:
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
r2=r2_score(y_test,y_pred)
rmse,r2

(np.float64(20.43671704759267), -281.0431131672169)

In [49]:
y.describe()

count    226473.000000
mean          5.333358
std           1.219090
min           0.693147
25%           4.382027
50%           5.111988
75%           6.216606
max           8.779711
Name: price_log, dtype: float64

In [50]:
x.head()

Unnamed: 0,minimum_nights,instant_bookable,room_type_Hotel room,room_type_Private room,room_type_Shared room,neighbourhood_freq
0,2,0,0,0,0,0.020042
1,2,0,0,0,0,0.020042
2,2,0,0,0,0,0.007166
3,2,0,0,0,0,0.015644
4,2,0,0,0,0,0.011533


In [51]:
x.shape

(226473, 6)

In [52]:
x.tail()

Unnamed: 0,minimum_nights,instant_bookable,room_type_Hotel room,room_type_Private room,room_type_Shared room,neighbourhood_freq
226468,6,0,0,0,0,0.008504
226469,6,0,0,0,0,0.008862
226470,1,0,0,0,0,0.008862
226471,6,0,0,0,0,0.008504
226472,1,0,0,0,0,0.014028


In [54]:
from sklearn.ensemble import RandomForestRegressor


rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=25,
    random_state=42,
    n_jobs=-1
)

rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

print("R2:", r2_score(y_test, y_pred))

R2: 0.6204518108455521


In [55]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=25,
    min_samples_leaf=25,
    random_state=42,
    n_jobs=-1
)

rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

print("R2:", r2_score(y_test, y_pred))

R2: 0.6091850982012754


In [56]:
new_listing= pd.DataFrame([{'minimum_nights':2,'instant_bookable':1,'room_type_Hotel room':0 ,'room_type_Private room':1,	'room_type_Shared room':0,'neighbourhood_freq':0.015}])

In [57]:
log_price_pred = rf.predict(new_listing)
log_price_pred

array([3.98641303])

In [58]:
price_pred = np.expm1(log_price_pred)
price_pred

array([52.86134347])

In [59]:
sample = x_test.sample(5, random_state=42)

log_preds = rf.predict(sample)
price_preds = np.expm1(log_preds)

actual_prices = np.expm1(y_test.loc[sample.index])

pd.DataFrame({
    'Actual Price': actual_prices,
    'Predicted Price': price_preds
})

Unnamed: 0,Actual Price,Predicted Price
61134,84.0,116.118421
216996,134.0,101.111312
215849,233.0,330.837704
83505,100.0,312.968079
123188,81.0,153.344428


In [87]:
import joblib

joblib.dump(rf, "airbnb_price_model.pkl")
joblib.dump(feature_columns, "feature_columns.pkl")

['feature_columns.pkl']

In [88]:
import os
os.listdir()

['.ipynb_checkpoints',
 'airbnb_dataset.csv.csv',
 'Airbnb_price_estimator_EDA.ipynb',
 'airbnb_price_model.pkl',
 'feature_columns.pkl']