In [1]:
# Importing Libraries
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned listings export into the working DataFrame.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(
    'Total_Listing_cleaned_V1.6.csv',
    low_memory=False,
)

In [3]:
# Remove the columns that a host would not fill in when asking for a price
# prediction, so that they are not used as model features.
columns_to_drop = [
    # listing identity, free text and review statistics
    'id', 'name', 'description', 'neighborhood_overview',
    'host_location', 'host_about',
    'number_of_reviews', 'number_of_reviews_ltm',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'reviews_per_month', 'property_type',
    # coordinates, host response/profile fields and per-type listing counts
    'latitude', 'longitude',
    'host_response_rate', 'host_response_time', 'host_acceptance_rate',
    'host_has_profile_pic', 'host_identity_verified', 'has_availability',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    # host tenure/status flags and the longer availability windows
    'host_since', 'host_is_superhost', 'instant_bookable',
    'availability_60', 'availability_90', 'availability_365',
]
df = df.drop(columns=columns_to_drop)

In [4]:
# Show the remaining columns, their non-null counts and dtypes after the drops above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18016 entries, 0 to 18015
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   host_total_listings_count  18016 non-null  int64  
 1   neighbourhood_cleansed     18016 non-null  object 
 2   new_property_type          18016 non-null  object 
 3   room_type                  18016 non-null  object 
 4   accommodates               18016 non-null  int64  
 5   bathrooms_count            18016 non-null  float64
 6   Shared/Category            18016 non-null  object 
 7   bedrooms                   18016 non-null  int64  
 8   beds                       18016 non-null  int64  
 9   amenities                  18016 non-null  object 
 10  price                      18016 non-null  int64  
 11  minimum_nights             18016 non-null  int64  
 12  maximum_nights             18016 non-null  int64  
 13  availability_30            18016 non-null  int

In [5]:
# Count missing values per column.
df.isnull().sum()

host_total_listings_count    0
neighbourhood_cleansed       0
new_property_type            0
room_type                    0
accommodates                 0
bathrooms_count              0
Shared/Category              0
bedrooms                     0
beds                         0
amenities                    0
price                        0
minimum_nights               0
maximum_nights               0
availability_30              0
dtype: int64

In [6]:
# Keep listings priced below 500 (the majority of the Airbnb data lies between 0 and 500)
# and leave out the 'Hotel room' room type, which has only 29 entries.
keep_rows = (df['price'] < 500) & (df['room_type'] != 'Hotel room')
df = df[keep_rows]

In [7]:
# neighbourhood_cleansed is replaced by a coarse neighbourhood level derived from
# the mean listing price of each neighbourhood (highest average first).
# NOTE(review): the averages use the price of every remaining row, including rows
# that are later written to the hold-out file - confirm this is intended.
neighbourhood_avg_price = (
    df.groupby('neighbourhood_cleansed')['price']
    .mean()
    .sort_values(ascending=False)
)

# One-column frame (column 'price') indexed by neighbourhood name.
neighbourhood_class_df = neighbourhood_avg_price.to_frame()

# Converting neighbourhoods to Levels
def neigbourhood_class(row):
    """Return the neighbourhood level (1-4) for a row holding an average price.

    ``row`` must support ``row['price']``. Levels by price:
    0 to 70 inclusive -> 1, above 70 up to 130 -> 2, above 130 up to 180 -> 3,
    anything else (including values below 0) -> 4.
    """
    avg_price = row['price']
    if 0 <= avg_price <= 70:
        return 1
    if 70 < avg_price <= 130:
        return 2
    if 130 < avg_price <= 180:
        return 3
    return 4
  
# Classify every neighbourhood by its average price.
neighbourhood_class_df['neigbourhood_level'] = neighbourhood_class_df.apply(
    neigbourhood_class, axis=1
)

# Both frames have a 'price' column, so the merge suffixes them: price_x is the
# listing's own price, price_y the neighbourhood average. Only price_x and the
# level are kept; the neighbourhood name itself is dropped.
df = (
    df.merge(neighbourhood_class_df, on='neighbourhood_cleansed')
    .drop(columns=['price_y', 'neighbourhood_cleansed'])
)

In [8]:
# Replace the listing price (price_x) with its natural logarithm as 'log_price'.
# NOTE(review): the earlier filter only bounds price from above, so a price of 0
# would become -inf here - confirm no such rows exist.
log_price = np.log(df['price_x'])
df = df.drop(columns=['price_x'])
df['log_price'] = log_price

In [9]:
# Amenities column transformed into useful features for prediction:
#   * amenities_num       - number of amenities listed
#   * amen_group_<name>   - how many amenities of each thematic group a listing has
import re
from collections import Counter

# Number of amenities per listing, counted as comma-separated entries of the raw string.
df['amenities_num'] = [len(parts) for parts in df['amenities'].str.split(',')]


def _split_amenities(raw):
    """Return the amenity names contained in one raw ``amenities`` string.

    Every character that is not a letter, digit, whitespace, ',', '/' or '-' is
    removed, the remainder is split on ',' and each name is stripped. Without the
    strip, every name except the first keeps a leading space, so the same amenity
    would compare differently depending on its position in the list.
    """
    cleaned = re.sub(r'[^a-zA-Z,\/\s\d-]*', '', raw)
    return [name.strip() for name in cleaned.split(',')]


df['amenities'] = df['amenities'].apply(_split_amenities)

# Indicator (0/1) columns for the 70 most frequent amenities.
# NOTE(review): amenity_columns is built here but is not merged into df in this cell.
amenities_list = [name for names in df['amenities'] for name in names]
amenity_counts = Counter(amenities_list).most_common()
amenities_of_interest = [name for name, _ in amenity_counts[:70]]

amenities_col_names = ['amen_' + name for name in amenities_of_interest]
amenities_cols = [
    [int(name in names) for names in df['amenities']]
    for name in amenities_of_interest
]
amenity_columns = pd.DataFrame(
    dict(zip(amenities_col_names, amenities_cols)), index=df.index
)

# Thematic amenity groups. Names are written without surrounding whitespace to
# match the stripped names produced by _split_amenities.
kitchen_amenities = [
    'Kitchen', 'Refrigerator', 'Dishes and silverware', 'Microwave', 'Cooking basics',
    'Coffee maker', 'Hot water kettle', 'Freezer', 'Stove', 'Oven', 'Dishwasher',
    'Toaster', 'Dining table', 'Wine glasses', 'Baking sheet', 'BBQ grill']

safety_amenities = [
    'Smoke alarm', 'Carbon monoxide alarm', 'Fire extinguisher', 'First aid kit',
    'Lock on bedroom door', 'Lockbox', 'Security cameras on property', 'Keypad']

household_amenities = [
    'Long term stays allowed', 'Essentials', 'Air conditioning', 'Heating',
    'Dedicated workspace', 'Free parking on premises', 'Private entrance',
    'Luggage dropoff allowed', 'Elevator', 'Free street parking',
    'Paid parking off premises', 'Room-darkening shades', 'Conditioner',
    'Indoor fireplace', 'Host greets you', 'Paid parking on premises',
    'Outdoor furniture']

bedroom_amenities = ['Hangers', 'Bed linens', 'Extra pillows and blankets']

electronics_amenities = ['TV', 'Wifi', 'TV with standard cable', 'Ethernet connection']

# 'BBQ grill' is listed in both the kitchen and the extra-spaces group, as in the
# original lists, so it contributes to both counts.
extra_spaces_amenities = [
    'Patio or balcony', 'Pool', 'Private patio or balcony', 'Backyard', 'Hot tub',
    'BBQ grill', 'Gym', 'Single level home']

cleaning_amenities = [
    'Cleaning products', 'Shampoo', 'Hair dryer', 'Washer', 'Dryer', 'Iron',
    'Hot water', 'Bathtub', 'Shower gel', 'Body soap']

# Output column -> group members. Keeping label and group side by side guarantees
# that each count is stored under the label of the group it was computed from.
amenity_group_map = {
    'amen_group_kitchen': kitchen_amenities,
    'amen_group_cleaning': cleaning_amenities,
    'amen_group_safety': safety_amenities,
    'amen_group_household': household_amenities,
    'amen_group_bedroom': bedroom_amenities,
    'amen_group_electronics': electronics_amenities,
    'amen_group_extra_spaces': extra_spaces_amenities,
}
amenity_groups = list(amenity_group_map.values())

# For every group: number of distinct group members present in each listing.
amenity_group_cols = [
    df['amenities'].apply(
        lambda names, members=frozenset(group): len(members.intersection(names)))
    for group in amenity_groups
]

amenity_group_columns = pd.DataFrame(
    dict(zip(amenity_group_map, amenity_group_cols)), index=df.index
)

# Attach the group counts by index and drop the tokenised amenities column.
df = df.merge(amenity_group_columns, how='left', left_index=True, right_index=True)
df = df.drop("amenities", axis=1)

In [10]:
# Show the columns, non-null counts and dtypes after the feature engineering above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17548 entries, 0 to 17547
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   host_total_listings_count  17548 non-null  int64  
 1   new_property_type          17548 non-null  object 
 2   room_type                  17548 non-null  object 
 3   accommodates               17548 non-null  int64  
 4   bathrooms_count            17548 non-null  float64
 5   Shared/Category            17548 non-null  object 
 6   bedrooms                   17548 non-null  int64  
 7   beds                       17548 non-null  int64  
 8   minimum_nights             17548 non-null  int64  
 9   maximum_nights             17548 non-null  int64  
 10  availability_30            17548 non-null  int64  
 11  neigbourhood_level         17548 non-null  int64  
 12  log_price                  17548 non-null  float64
 13  amenities_num              17548 non-null  int

In [11]:
# Work on an independent copy so later steps do not modify df.
df3 = df.copy(deep=True)

In [12]:
# Split into 'modeling data' (a 90% random sample) and 'unseen data' (the remaining
# 10%), then write each part to its own CSV file.
# The hold-out rows are selected by dropping the sampled rows' original index labels,
# so the sample's index must not be reset before the drop: after a reset its labels
# are 0..n-1, and the drop would remove the first 90% of rows by position instead of
# the sampled rows, leaving sampled rows in the "unseen" set.
data = df3.sample(frac=0.9, random_state=123)
data_unseen = df3.drop(data.index).reset_index(drop=True)
data = data.reset_index(drop=True)

# Every row must end up in exactly one of the two parts.
assert len(data) + len(data_unseen) == len(df3), "modeling/unseen split does not partition df3"

data.to_csv('new_pycaret2.csv', index=False)
data_unseen.to_csv('test.csv', index=False)