## Clean Raw Data Scraped from Airbnb's Site

### Import packages

In [107]:
import os
import pandas as pd

### Read in the files from the search folder

In [109]:
# Keep only the CSV exports and sort them by name. os.listdir returns entries
# in arbitrary order and can include stray non-CSV entries, which would break
# pd.read_csv later. A stable order also makes the later
# drop_duplicates(keep='first') step reproducible between runs.
results = sorted(f for f in os.listdir('search/') if f.endswith('.csv'))
#results

### Open the files and add the dates of the search performed to each file

In [110]:
# Load the first search file only to capture the column layout; the zero-row
# slice taken below is the empty frame that the next cell extends.
first_path = 'search/' + results[0]
open_file = pd.read_csv(first_path).assign(dates=str(results[0]))
weekends = open_file.iloc[:0]

In [111]:
# Read every search file, tag its rows with the file name (the name encodes
# the check-in / check-out dates), and combine everything with one concat.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every iteration.
frames = []
for file in results:
    open_file = pd.read_csv('search/' + file)
    open_file['dates'] = str(file)
    frames.append(open_file)

# With no files to read, the empty template frame is left untouched.
# ignore_index=True gives the combined frame one continuous row index instead
# of repeating each file's 0..n labels.
if frames:
    weekends = pd.concat(frames, ignore_index=True)

In [112]:
# reset_index returns a new frame rather than modifying in place, so the
# result has to be assigned; otherwise the call has no effect and the row
# labels carried over from the individual files stay as they were.
weekends = weekends.reset_index(drop=True)
weekends.head(1)

Unnamed: 0,name,nightly cost,category,superhost,url,guests_rental,total cost,reviews,dates
0,Private Room & Bath 5 minutes to Downtown Nash,Price:$35 / night,Private room in Nashville,SUPERHOST,/rooms/46832964?check_in=2020-12-20&check_out=...,2 guests · 1 bedroom · 1 bed · 1 private bath,Price:$177 totalShow details,Price:$35 / nightPrice:$177 totalShow details,ci2020-12-20co2020-12-24.csv


### Create columns for checkin and checkout dates

In [113]:
# new data frame with split value columns 
# Split the file-name text once at 'co': the left piece holds the check-in
# part, the right piece the check-out part.
dates = weekends['dates'].str.split(pat='co', n=1, expand=True)
weekends['checkin'], weekends['checkout'] = dates[0], dates[1]
#weekends.head()

In [114]:
# Strip the 'ci' prefix and the '.csv' suffix as literal text. Interpreted as
# a regular expression, the '.' in '.csv' matches any character, and whether
# patterns are regexes by default depends on the pandas version, so the
# literal matching is stated explicitly.
weekends['checkin'] = weekends['checkin'].str.replace('ci', '', regex=False)
weekends['checkout'] = weekends['checkout'].str.replace('.csv', '', regex=False)
# The raw file-name column is not needed once both dates are extracted.
weekends.drop('dates', axis=1, inplace=True)
#weekends.head()                                                    

In [115]:
# The dates taken from the file names are dash-separated (e.g. 2020-12-20), so
# the format string must use '-' as well. The previous '%Y/%m/%d' did not
# match the data; pandas versions that apply the format strictly raise a
# ValueError on it.
weekends['checkin'] = pd.to_datetime(weekends['checkin'], format='%Y-%m-%d')
weekends['checkout'] = pd.to_datetime(weekends['checkout'], format='%Y-%m-%d')
#weekends.head()

### Clean the columns scraped from Airbnb's site

#### Create a column for the unique listing ID

In [116]:
# Listing ID: the URL text before '?check_in', with '/rooms/' removed.
url_path = weekends['url'].str.partition('?check_in')[0]
weekends['ID'] = url_path.str.replace('/rooms/', '')
#weekends.head()

#### Create a column for the level of privacy (Entire, Private, Shared, etc.)

In [117]:
# The first word of the category text is the raw privacy label.
weekends['privacy'] = weekends['category'].str.partition(' ')[0]

# Combine types of stays to simplify categories. The substitutions are applied
# one after another, in the order listed.
privacy_substitutions = [
    ('Hotel', 'Private'),
    ('Room', 'Private'),
    ('Resort', 'Private'),
    ('Campsite', 'Entire'),
    ('Camper/RV', 'Entire'),
    ('Tiny', 'Entire'),
    ('Barn', 'Entire'),
    ('Farm', 'Entire'),
    ('Bus', 'Entire'),
    ('Tent', 'Entire'),
    ('Hostel', 'Shared'),
]
for raw_label, simplified in privacy_substitutions:
    weekends['privacy'] = weekends['privacy'].str.replace(raw_label, simplified)

#weekends['privacy'].value_counts()

Entire     68622
Private     9458
Shared       216
Name: privacy, dtype: int64

#### Create a column for the number of guests 

In [118]:
# Text before the first ' · ' separator of the rental summary (e.g. '2 guests').
rental_parts = weekends['guests_rental'].str.partition(' · ')
weekends['guests'] = rental_parts[0]
#weekends.head()

In [119]:
# Everything after the first ' · ' separator, kept in a helper column so the
# following cells can keep splitting it.
after_guests = weekends['guests_rental'].str.partition(' · ')
weekends['guest_rental2'] = after_guests[2]
#weekends.head()

#### Create a column for the number of bedrooms

In [120]:
# Leading piece of the remaining summary text (e.g. '1 bedroom' or 'Studio').
bedroom_parts = weekends['guest_rental2'].str.partition(' · ')
weekends['bedroom'] = bedroom_parts[0]
#weekends.head()

In [121]:
# Whatever follows the bedroom piece, stored in a second helper column.
after_bedroom = weekends['guest_rental2'].str.partition(' · ')
weekends['guest_rental3'] = after_bedroom[2]
#weekends.head()

#### Create a column for the number of beds

In [122]:
# Leading piece of the second helper column (e.g. '1 bed').
bed_parts = weekends['guest_rental3'].str.partition(' · ')
weekends['beds'] = bed_parts[0]
#weekends.head()

In [123]:
# Everything after the beds piece (e.g. '1 private bath').
bath_parts = weekends['guest_rental3'].str.partition(' · ')
weekends['bath'] = bath_parts[2]
#weekends.head()

#### Drop original columns that have been cleaned

In [124]:
# The raw summary and its two helper columns have been split into
# guests / bedroom / beds / bath, so they can go.
rental_helper_columns = ['guests_rental', 'guest_rental2', 'guest_rental3']
weekends.drop(columns=rental_helper_columns, inplace=True)
#weekends.head()

#### Create columns for the prices associated with each listing

In [125]:
# Text that precedes the 'Previous price:' marker in the nightly cost string.
nightly_parts = weekends['nightly cost'].str.partition('Previous price:')
weekends['not_discounted'] = nightly_parts[0]
#weekends.head()

In [126]:
# Text that follows the 'Previous price:' marker, kept in a scratch column
# that the next cells split further.
after_marker = weekends['nightly cost'].str.partition('Previous price:')
weekends['test'] = after_marker[2]
#weekends.head()

In [127]:
# Intentionally no assignment here. This cell used to store the text before
# the first ':' of the scratch column in 'discounted', but the following cell
# overwrites that whole column with the text after the ':' before anything
# reads it, so the earlier store had no effect on the result.
#weekends.head()

In [128]:
# Discounted price text: the part of the scratch column after the first ':'.
scratch_parts = weekends['test'].str.partition(':')
weekends['discounted'] = scratch_parts[2]
#weekends.head()

In [129]:
# Previous price text: the part of the scratch column before the first ':'.
scratch_head = weekends['test'].str.partition(':')
weekends['previous_price'] = scratch_head[0]
#weekends.head()

In [130]:
# The scratch column and the raw nightly cost text are fully parsed by now.
parsed_price_columns = ['test', 'nightly cost']
weekends.drop(columns=parsed_price_columns, inplace=True)
#weekends.head()

In [131]:
# Remove the 'Price:' label and the ' / night' suffix, leaving the amount.
weekends['not_discounted'] = (
    weekends['not_discounted']
    .str.replace('Price:', '')
    .str.replace(' / night', '')
)

#weekends.head()

In [132]:
# Remove the ' / night' suffix from the discounted price text.
discounted_text = weekends['discounted']
weekends['discounted'] = discounted_text.str.replace(pat=' / night', repl='')
#weekends.head()

In [133]:
# Remove the trailing 'Discounted price' label left over from the split.
previous_text = weekends['previous_price']
weekends['previous_price'] = previous_text.str.replace(pat='Discounted price', repl='')
#weekends.head()

In [134]:
# Join the two price texts end to end into one current nightly price column.
weekends['current_nightly'] = weekends['not_discounted'].add(weekends['discounted'])
#weekends.head()

#### Remove redundant words from values in columns with clear descriptions

In [135]:
# Strip the plural ' guests' suffix first; the singular form is handled in the
# next cell.
guest_text = weekends['guests']
weekends['guests'] = guest_text.str.replace(pat=' guests', repl='')
#weekends.head()

In [136]:
# Strip the singular ' guest' suffix.
guest_text = weekends['guests']
weekends['guests'] = guest_text.str.replace(pat=' guest', repl='')
#weekends.head(25)

In [137]:
# Strip ' beds' and then ' bed', plural before singular.
weekends['beds'] = (
    weekends['beds']
    .str.replace(' beds', '')
    .str.replace(' bed', '')
)
#weekends.head()

In [138]:
# Remove the 'Price:' label and the 'totalShow details' tail from the total.
weekends['total cost'] = (
    weekends['total cost']
    .str.replace('Price:', '')
    .str.replace('totalShow details', '')
)
#weekends.head()

#### Create columns for number of reviews and average stars

In [139]:
# Text after the word 'Rating' in the reviews string, kept in a scratch column.
review_parts = weekends['reviews'].str.partition('Rating')
weekends['review_test'] = review_parts[2]
#weekends.head()

In [140]:
# Average stars: the scratch text that precedes ' out of'.
stars_parts = weekends['review_test'].str.partition(' out of')
weekends['avg_stars'] = stars_parts[0]
#weekends.head()

In [141]:
# Text after the opening parenthesis, stored in a second scratch column.
paren_parts = weekends['review_test'].str.partition('(')
weekends['review_count2'] = paren_parts[2]
#weekends.head()

In [142]:
# Review count: the second scratch text up to the closing parenthesis.
count_parts = weekends['review_count2'].str.partition(')')
weekends['review_count'] = count_parts[0]
#weekends.head()

In [143]:
# The raw reviews text and both scratch columns are no longer needed.
review_helper_columns = ['review_count2', 'review_test', 'reviews']
weekends.drop(columns=review_helper_columns, inplace=True)
#weekends.head()

#### Keep only the cleaned columns, in their final order

In [144]:
# Keep only the cleaned columns, in presentation order.
final_columns = [
    'ID', 'name', 'category', 'privacy', 'superhost', 'url',
    'avg_stars', 'review_count',
    'guests', 'beds', 'bedroom', 'bath',
    'checkin', 'checkout',
    'previous_price', 'discounted', 'not_discounted', 'current_nightly',
    'total cost',
]
# .copy() makes the selection an independent frame; the later cells assign to
# its columns and modify it in place, which on a bare column selection can
# trigger pandas' SettingWithCopy warnings.
weekends = weekends[final_columns].copy()
weekends.head(2)

Unnamed: 0,ID,name,category,privacy,superhost,url,avg_stars,review_count,guests,beds,bedroom,bath,checkin,checkout,previous_price,discounted,not_discounted,current_nightly,total cost
0,46832964,Private Room & Bath 5 minutes to Downtown Nash,Private room in Nashville,Private,SUPERHOST,/rooms/46832964?check_in=2020-12-20&check_out=...,,,2,1,1 bedroom,1 private bath,2020-12-20,2020-12-24,,,$35,$35,$177
1,45934635,BW5 Downtown Nashville!! Do Bro 8!!,Entire apartment in Nashville,Entire,,/rooms/45934635?check_in=2020-12-20&check_out=...,,,1,2,Studio,1 bath,2020-12-20,2020-12-24,$62,$45,,$45,$265


#### Check data types before performing calculations on prices

In [145]:
# Display the dtype of every column, to check the price columns before the
# arithmetic in the cells below.
weekends.dtypes

ID                         object
name                       object
category                   object
privacy                    object
superhost                  object
url                        object
avg_stars                  object
review_count               object
guests                     object
beds                       object
bedroom                    object
bath                       object
checkin            datetime64[ns]
checkout           datetime64[ns]
previous_price             object
discounted                 object
not_discounted             object
current_nightly            object
total cost                 object
dtype: object

#### Remove extraneous characters from price columns and convert to integers

In [146]:
# Turn the nightly price text into integers: cast to str (missing values become
# the text 'nan'), strip '$' and thousands separators, map 'nan' to '0', then
# cast to int.
# regex=False is explicit because '$' is the end-of-string anchor in a regular
# expression; it must be removed as a literal character regardless of the
# pandas version's default.
weekends['current_nightly'] = (
    weekends['current_nightly']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('nan', '0', regex=False)
    .astype(int)
)

#weekends.head()

In [147]:
# Turn the total cost text into integers: cast to str (missing values become
# the text 'nan'), strip '$', thousands separators and any ' total' label, map
# 'nan' to '0', then cast to int.
# regex=False is explicit because '$' is the end-of-string anchor in a regular
# expression; it must be removed as a literal character regardless of the
# pandas version's default.
weekends['total cost'] = (
    weekends['total cost']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace(' total', '', regex=False)
    .str.replace('nan', '0', regex=False)
    .astype(int)
)

#weekends.head()

In [148]:
# Strip '$', thousands separators and any ' total' label from the previous
# price text; missing values (the text 'nan' after the str cast) become '0'.
# regex=False is explicit because '$' is the end-of-string anchor in a regular
# expression and must be removed as a literal character.
# NOTE(review): unlike the nightly and total columns, this column is not cast
# to int here and stays text — confirm that is intended.
weekends['previous_price'] = (
    weekends['previous_price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace(' total', '', regex=False)
    .str.replace('nan', '0', regex=False)
)

#weekends.head()

In [149]:
# Strip '$', thousands separators and any ' total' label from the discounted
# price text; missing values (the text 'nan' after the str cast) become '0'.
# regex=False is explicit because '$' is the end-of-string anchor in a regular
# expression and must be removed as a literal character.
# NOTE(review): unlike the nightly and total columns, this column is not cast
# to int here and stays text — confirm that is intended.
weekends['discounted'] = (
    weekends['discounted']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace(' total', '', regex=False)
    .str.replace('nan', '0', regex=False)
)

#weekends.head()

In [150]:
# Strip '$', thousands separators and any ' total' label from the undiscounted
# price text; missing values (the text 'nan' after the str cast) become '0'.
# regex=False is explicit because '$' is the end-of-string anchor in a regular
# expression and must be removed as a literal character.
# NOTE(review): unlike the nightly and total columns, this column is not cast
# to int here and stays text — confirm that is intended.
weekends['not_discounted'] = (
    weekends['not_discounted']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace(' total', '', regex=False)
    .str.replace('nan', '0', regex=False)
)

#weekends.head()

#### Calculate the room fee (nightly rate * # of nights staying) and the additional fees

In [151]:
# Room fee = nightly rate * number of nights. The stay length is taken from
# the check-in / check-out dates instead of a fixed 3, because not every
# search covers three nights (e.g. 2020-12-20 to 2020-12-24 is four).
nights = (weekends['checkout'] - weekends['checkin']).dt.days
weekends['room_fee'] = weekends['current_nightly'] * nights
#weekends.head()

In [152]:
# Whatever part of the total is not room fee is attributed to taxes, fees and
# cleaning.
weekends['taxes_fees_cleaning'] = weekends['total cost'].sub(weekends['room_fee'])
#weekends.head()

### Drop empty and duplicate rows

In [153]:
# Rows lacking a name or a URL are discarded.
required_columns = ['name', 'url']
weekends.dropna(subset=required_columns, inplace=True)
weekends.shape

(78296, 21)

In [154]:
# One row per listing and check-in date; the first occurrence is the one kept.
dedupe_keys = ['ID', 'checkin']
weekends.drop_duplicates(subset=dedupe_keys, keep='first', inplace=True)
weekends.shape

(62348, 21)

### Calculate occupancy rate for each listing

In [155]:
# For every row, the number of rows that share its listing ID.
ids_by_listing = weekends.groupby('ID')['ID']
weekends['occupancy count'] = ids_by_listing.transform('count')

In [156]:
# NOTE(review): 26 is presumably the number of searched stays a listing could
# have appeared in — confirm against the contents of the search folder.
denominator = 26
weekends['occupancy rate'] = (denominator - weekends['occupancy count']) / denominator

### Save to a csv

In [157]:
# Write the cleaned table (row index included) for the visualisation step.
output_path = 'data_for_viz/weekends.csv'
weekends.to_csv(output_path)