In [410]:
# Import dependencies
import pandas as pd
from pathlib import Path
import hvplot.pandas
import tensorflow as tf

# Import machine learning dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

Read CSV File

In [411]:
# Read in the CSV file as a Pandas DataFrame
Airbnb_df = pd.read_csv(Path(r"C:\Users\Sezy\OneDrive\last_project_for_it\Airbnb_Score_Model\Resources\Airbnb_Data.csv"))

# Review the DataFrame
Airbnb_df.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027.0,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009.0,0.0,1.0


In [412]:
Airbnb_df.shape[0]

74111

In [413]:
Airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      74111 non-null  int64  
 1   log_price               74111 non-null  float64
 2   property_type           74111 non-null  object 
 3   room_type               74111 non-null  object 
 4   amenities               74111 non-null  object 
 5   accommodates            74111 non-null  int64  
 6   bathrooms               73911 non-null  float64
 7   bed_type                74111 non-null  object 
 8   cancellation_policy     74111 non-null  object 
 9   cleaning_fee            74111 non-null  bool   
 10  city                    74111 non-null  object 
 11  description             74111 non-null  object 
 12  first_review            58247 non-null  object 
 13  host_has_profile_pic    73923 non-null  object 
 14  host_identity_verified  73923 non-null

New Rating Category

In [414]:
Airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      74111 non-null  int64  
 1   log_price               74111 non-null  float64
 2   property_type           74111 non-null  object 
 3   room_type               74111 non-null  object 
 4   amenities               74111 non-null  object 
 5   accommodates            74111 non-null  int64  
 6   bathrooms               73911 non-null  float64
 7   bed_type                74111 non-null  object 
 8   cancellation_policy     74111 non-null  object 
 9   cleaning_fee            74111 non-null  bool   
 10  city                    74111 non-null  object 
 11  description             74111 non-null  object 
 12  first_review            58247 non-null  object 
 13  host_has_profile_pic    73923 non-null  object 
 14  host_identity_verified  73923 non-null

Getting Rid of bad data

In [415]:

Airbnb_df = Airbnb_df.drop(['id', 'description','first_review', 'host_has_profile_pic','host_since', 'last_review', 'latitude', 'longitude', 'name', 'neighbourhood',
                         'thumbnail_url', 'zipcode'], axis = 1)

making host_response_rate a float instead, used for one_hot encoding

In [416]:
Airbnb_df['host_response_rate']= Airbnb_df['host_response_rate'].str.rstrip('%').astype('float')/100
Airbnb_df['host_response_rate'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 74111 entries, 0 to 74110
Series name: host_response_rate
Non-Null Count  Dtype  
--------------  -----  
55812 non-null  float64
dtypes: float64(1)
memory usage: 579.1 KB


Filling in missing values

In [417]:
from sklearn.impute import SimpleImputer
# Replace missing values with median
SI = SimpleImputer(strategy = 'median')
SI.fit(Airbnb_df.select_dtypes(include = 'number'))
# creating a side df to seperate data
imputer = SI.transform(Airbnb_df.select_dtypes(include = 'number'))
airbnb_num = pd.DataFrame(imputer, columns = Airbnb_df.select_dtypes(include = 'number').columns)
airbnb_num.head()

Unnamed: 0,log_price,accommodates,bathrooms,host_response_rate,number_of_reviews,review_scores_rating,bedrooms,beds
0,5.010635,3.0,1.0,1.0,2.0,100.0,1.0,1.0
1,5.129899,7.0,1.0,1.0,6.0,93.0,3.0,3.0
2,4.976734,5.0,1.0,1.0,10.0,92.0,1.0,3.0
3,6.620073,4.0,1.0,1.0,0.0,96.0,2.0,2.0
4,4.744932,2.0,1.0,1.0,4.0,40.0,0.0,1.0


In [418]:
airbnb_num.isnull().sum()

log_price               0
accommodates            0
bathrooms               0
host_response_rate      0
number_of_reviews       0
review_scores_rating    0
bedrooms                0
beds                    0
dtype: int64

In [419]:
SI = SimpleImputer(strategy = 'most_frequent')
SI.fit(Airbnb_df.select_dtypes(include = 'object'))
imputer = SI.transform(Airbnb_df.select_dtypes(include = 'object'))
airbnb_obj = pd.DataFrame(imputer, columns = Airbnb_df.select_dtypes(include = 'object').columns)
airbnb_obj.head()

Unnamed: 0,property_type,room_type,amenities,bed_type,cancellation_policy,city,host_identity_verified,instant_bookable
0,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",Real Bed,strict,NYC,t,f
1,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",Real Bed,strict,NYC,f,t
2,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",Real Bed,moderate,NYC,t,t
3,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",Real Bed,flexible,SF,t,f
4,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",Real Bed,moderate,DC,t,t


In [420]:
airbnb_obj.isnull().sum()

property_type             0
room_type                 0
amenities                 0
bed_type                  0
cancellation_policy       0
city                      0
host_identity_verified    0
instant_bookable          0
dtype: int64

Merge Data
Combine clean data

In [421]:
Airbnb_df_v2 = pd.concat([airbnb_num, airbnb_obj], axis = 1)
Airbnb_df_v2['cleaning_fee'] = Airbnb_df['cleaning_fee']
Airbnb_df_v2.isnull().sum()

log_price                 0
accommodates              0
bathrooms                 0
host_response_rate        0
number_of_reviews         0
review_scores_rating      0
bedrooms                  0
beds                      0
property_type             0
room_type                 0
amenities                 0
bed_type                  0
cancellation_policy       0
city                      0
host_identity_verified    0
instant_bookable          0
cleaning_fee              0
dtype: int64

Creating a list of amenities count, another feature

In [422]:
aminities_count = []
for i in Airbnb_df_v2['amenities']:
  aminities_count.append(len(i))

Airbnb_df_v2['amenities'] = aminities_count
Airbnb_df_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   log_price               74111 non-null  float64
 1   accommodates            74111 non-null  float64
 2   bathrooms               74111 non-null  float64
 3   host_response_rate      74111 non-null  float64
 4   number_of_reviews       74111 non-null  float64
 5   review_scores_rating    74111 non-null  float64
 6   bedrooms                74111 non-null  float64
 7   beds                    74111 non-null  float64
 8   property_type           74111 non-null  object 
 9   room_type               74111 non-null  object 
 10  amenities               74111 non-null  int64  
 11  bed_type                74111 non-null  object 
 12  cancellation_policy     74111 non-null  object 
 13  city                    74111 non-null  object 
 14  host_identity_verified  74111 non-null

Making a rare category for properties that are not listed as often

In [423]:
value_count = Airbnb_df_v2['property_type'].value_counts()
other_values = value_count[value_count < 50].index
Airbnb_df_v2['property_type'] = Airbnb_df_v2['property_type'].replace(other_values, 'Other')

Combine the super strict categories that are extremely rare to avoid multicollinearity. 

In [424]:
Airbnb_df_v2['cancellation_policy'] = Airbnb_df_v2['cancellation_policy'].replace(['super_strict_30', 'super_strict_60'], 'super_strict')
Airbnb_df_v2['cancellation_policy'].value_counts()

cancellation_policy
strict          32374
flexible        22545
moderate        19063
super_strict      129
Name: count, dtype: int64

making the final DF to turn into a usable csv

In [425]:
airbnb_v3 = Airbnb_df_v2.copy()

In [426]:
airbnb_v3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   log_price               74111 non-null  float64
 1   accommodates            74111 non-null  float64
 2   bathrooms               74111 non-null  float64
 3   host_response_rate      74111 non-null  float64
 4   number_of_reviews       74111 non-null  float64
 5   review_scores_rating    74111 non-null  float64
 6   bedrooms                74111 non-null  float64
 7   beds                    74111 non-null  float64
 8   property_type           74111 non-null  object 
 9   room_type               74111 non-null  object 
 10  amenities               74111 non-null  int64  
 11  bed_type                74111 non-null  object 
 12  cancellation_policy     74111 non-null  object 
 13  city                    74111 non-null  object 
 14  host_identity_verified  74111 non-null

Bin the target variable

In [427]:
# Define bins and labels
bins = [0, 60 , 70, 80, 100]
labels = [0, 1, 2, 3]

# Create a new column with categorized ratings
airbnb_v3['rating_category'] = pd.cut(airbnb_v3['review_scores_rating'], bins=bins, labels=labels, include_lowest=True)

In [428]:
y = airbnb_v3['rating_category']  # Target variable
X = airbnb_v3.drop(columns=['rating_category', 'review_scores_rating'])

In [429]:
X_encoded = pd.get_dummies(X, drop_first=True, dtype=int)

In [430]:
airbnb_v3_encoded = pd.concat([X_encoded, y], axis=1)

In [431]:
airbnb_v3_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 44 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   log_price                         74111 non-null  float64 
 1   accommodates                      74111 non-null  float64 
 2   bathrooms                         74111 non-null  float64 
 3   host_response_rate                74111 non-null  float64 
 4   number_of_reviews                 74111 non-null  float64 
 5   bedrooms                          74111 non-null  float64 
 6   beds                              74111 non-null  float64 
 7   amenities                         74111 non-null  int64   
 8   cleaning_fee                      74111 non-null  bool    
 9   property_type_Bed & Breakfast     74111 non-null  int32   
 10  property_type_Boat                74111 non-null  int32   
 11  property_type_Boutique hotel      74111 non-null  int3

In [432]:
airbnb_v3_encoded['cleaning_fee'] = airbnb_v3['cleaning_fee'].astype('int')

In [433]:
airbnb_v3_encoded.drop(['property_type_Boat', 'room_type_Shared room', 'cancellation_policy_super_strict'], axis = 1, inplace = True)

In [434]:
airbnb_v3_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 41 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   log_price                      74111 non-null  float64 
 1   accommodates                   74111 non-null  float64 
 2   bathrooms                      74111 non-null  float64 
 3   host_response_rate             74111 non-null  float64 
 4   number_of_reviews              74111 non-null  float64 
 5   bedrooms                       74111 non-null  float64 
 6   beds                           74111 non-null  float64 
 7   amenities                      74111 non-null  int64   
 8   cleaning_fee                   74111 non-null  int32   
 9   property_type_Bed & Breakfast  74111 non-null  int32   
 10  property_type_Boutique hotel   74111 non-null  int32   
 11  property_type_Bungalow         74111 non-null  int32   
 12  property_type_Cabin            7

creating another csv

In [435]:
airbnb_v3_encoded.to_csv('airbnb_v3_encoded.csv', index=False)

In [371]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [372]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(random_state=42)


In [374]:
model.fit(X_train, y_train)

In [375]:
# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Detailed report
print(classification_report(y_test, y_pred))

# Confusion Matrix
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.95
              precision    recall  f1-score   support

           0       0.25      0.01      0.01       143
           1       0.00      0.00      0.00        72
           2       0.27      0.01      0.02       578
           3       0.95      1.00      0.97     14030

    accuracy                           0.95     14823
   macro avg       0.37      0.25      0.25     14823
weighted avg       0.91      0.95      0.92     14823

[[    1     0     0   142]
 [    0     0     0    72]
 [    0     0     7   571]
 [    3     0    19 14008]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [344]:
airbnb_v3.drop(['property_type_Boat', 'room_type_Shared room', 'cancellation_policy_super_strict'], axis = 1, inplace = True)

In [381]:
airbnb_v3_encoded['cleaning_fee'] = airbnb_v3['cleaning_fee'].astype('int')

In [268]:
airbnb_v3.head()

Unnamed: 0,log_price,accommodates,bathrooms,host_response_rate,number_of_reviews,review_scores_rating,bedrooms,beds,amenities,bed_type,...,property_type_Loft,property_type_Other,property_type_Timeshare,property_type_Townhouse,property_type_Villa,room_type_Entire home/apt,room_type_Private room,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict
0,5.010635,3.0,1.0,1.0,2.0,100.0,1.0,1.0,152,Real Bed,...,0,0,0,0,0,1,0,0,0,1
1,5.129899,7.0,1.0,1.0,6.0,93.0,3.0,3.0,218,Real Bed,...,0,0,0,0,0,1,0,0,0,1
2,4.976734,5.0,1.0,1.0,10.0,92.0,1.0,3.0,311,Real Bed,...,0,0,0,0,0,1,0,0,1,0
3,6.620073,4.0,1.0,1.0,0.0,96.0,2.0,2.0,210,Real Bed,...,0,0,0,0,0,1,0,1,0,0
4,4.744932,2.0,1.0,1.0,4.0,40.0,0.0,1.0,174,Real Bed,...,0,0,0,0,0,1,0,0,1,0


In [380]:
airbnb_v3_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 44 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   log_price                         74111 non-null  float64 
 1   accommodates                      74111 non-null  float64 
 2   bathrooms                         74111 non-null  float64 
 3   host_response_rate                74111 non-null  float64 
 4   number_of_reviews                 74111 non-null  float64 
 5   bedrooms                          74111 non-null  float64 
 6   beds                              74111 non-null  float64 
 7   amenities                         74111 non-null  int64   
 8   cleaning_fee                      74111 non-null  bool    
 9   property_type_Bed & Breakfast     74111 non-null  int32   
 10  property_type_Boat                74111 non-null  int32   
 11  property_type_Boutique hotel      74111 non-null  int3