## Clean Raw Data Scraped from Airbnb's Site

### Import packages

In [518]:
import os
import pandas as pd

### Read in the files from the search folder

In [519]:
# List the per-search CSV files. Entries that are not CSVs (for example a
# notebook checkpoint directory) are skipped so pd.read_csv never receives
# them, and the names are sorted because os.listdir returns entries in
# arbitrary order, which would make the combined row order vary between runs.
results = sorted(
    name for name in os.listdir('weekday/') if name.endswith('.csv')
)
#results

### Open the files and add the dates of the search performed to each file

In [520]:
# Load the first search file, tag it with its own file name, and keep a
# zero-row slice of it as the starting frame that the files are appended to.
first_result = results[0]
open_file = pd.read_csv('weekday/' + first_result).assign(dates=str(first_result))
weekends = open_file.iloc[:0]

In [521]:
# Read every search file, tag each row with the file name (which encodes the
# check-in/check-out dates), and combine them with a single concat.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
frames = []
for file in results:
    open_file = pd.read_csv('weekday/' + file)
    open_file['dates'] = str(file)
    frames.append(open_file)
# Row labels are kept per file (no ignore_index), matching the old append.
weekends = pd.concat(frames)

In [522]:
# reset_index returns a new frame, so the result has to be assigned back.
# Without the assignment each file's rows keep their own labels and the
# combined index contains duplicates.
weekends = weekends.reset_index(drop=True)
weekends.head(1)

Unnamed: 0,name,nightly cost,category,superhost,url,guests_rental,total cost,reviews,dates
0,Private Room & Bath 5 minutes to Downtown Nash,Price:$35 / night,Private room in Nashville,SUPERHOST,/rooms/46832964?check_in=2020-12-20&check_out=...,2 guests · 1 bedroom · 1 bed · 1 private bath,Price:$177 totalShow details,Price:$35 / nightPrice:$177 totalShow details,ci2020-12-20co2020-12-24.csv


### Create columns for checkin and checkout dates

In [523]:
# Split each file name once at 'co': the left piece carries the check-in
# part, the right piece the check-out part.
dates = weekends['dates'].str.split('co', n=1, expand=True)
weekends['checkin'], weekends['checkout'] = dates[0], dates[1]
#weekends.head()

In [524]:
# Strip the 'ci' prefix and the '.csv' suffix left over from the file name.
# regex=False makes both patterns literal; as a regex, the '.' in '.csv'
# would match any character.
weekends['checkin'] = weekends['checkin'].str.replace('ci', '', regex=False)
weekends['checkout'] = weekends['checkout'].str.replace('.csv', '', regex=False)
# The raw file name is no longer needed once both dates are extracted.
weekends.drop('dates', axis=1, inplace=True)
#weekends.head()                                                    

### Clean the columns scraped from Airbnb's site

#### Create a column for the number of guests 

In [525]:
# The text before the first ' · ' separator becomes the guests column.
rental_parts = weekends['guests_rental'].str.partition(' · ')
weekends['guests'] = rental_parts[0]
#weekends.head()

In [526]:
# Keep whatever follows the first ' · ' for the next parsing step.
rental_parts = weekends['guests_rental'].str.partition(' · ')
weekends['guest_rental2'] = rental_parts.loc[:, 2]
#weekends.head()

#### Create a column for the number of bedrooms

In [527]:
# The leading piece of the remainder is the bedroom description.
remainder_parts = weekends['guest_rental2'].str.partition(' · ')
weekends['bedroom'] = remainder_parts[0]
#weekends.head()

In [528]:
# Carry the text after the bedroom piece forward for beds/bath parsing.
remainder_parts = weekends['guest_rental2'].str.partition(' · ')
weekends['guest_rental3'] = remainder_parts.loc[:, 2]
#weekends.head()

#### Create a column for the number of beds

In [529]:
# The text before the next ' · ' separator is the beds description.
tail_parts = weekends['guest_rental3'].str.partition(' · ')
weekends['beds'] = tail_parts[0]
#weekends.head()

In [530]:
# Whatever remains after the beds piece is stored as the bath description.
tail_parts = weekends['guest_rental3'].str.partition(' · ')
weekends['bath'] = tail_parts.loc[:, 2]
#weekends.head()

#### Drop original columns that have been cleaned

In [531]:
# The raw descriptor and its two intermediate remainders are fully parsed.
parsed_rental_columns = ['guests_rental', 'guest_rental2', 'guest_rental3']
weekends.drop(columns=parsed_rental_columns, inplace=True)
#weekends.head()

#### Create columns for the prices associated with each listing

In [532]:
# Text that precedes any 'Previous price:' marker is the undiscounted price.
nightly_parts = weekends['nightly cost'].str.partition('Previous price:')
weekends['not_discounted'] = nightly_parts[0]
#weekends.head()

In [533]:
# Scratch column: everything that follows the 'Previous price:' marker.
nightly_parts = weekends['nightly cost'].str.partition('Previous price:')
weekends['test'] = nightly_parts.loc[:, 2]
#weekends.head()

In [534]:
# Intentionally no assignment here: this cell used to store the text before
# the first ':' of 'test' in 'discounted', but the following cell overwrites
# that column with the text after the ':' before anything reads it.
#weekends.head()

In [535]:
# The discounted price is the text after the first ':' in the scratch column.
discount_parts = weekends['test'].str.partition(':')
weekends['discounted'] = discount_parts[2]
#weekends.head()

In [536]:
# The previous price is the text before the first ':' in the scratch column.
discount_parts = weekends['test'].str.partition(':')
weekends['previous_price'] = discount_parts.loc[:, 0]
#weekends.head()

In [537]:
# The scratch column and the raw nightly-cost text are now fully split out.
spent_price_columns = ['test', 'nightly cost']
weekends.drop(columns=spent_price_columns, inplace=True)
#weekends.head()

In [538]:
# Remove the 'Price:' label and the ' / night' unit, leaving the amount text.
weekends['not_discounted'] = (
    weekends['not_discounted']
    .str.replace('Price:', '')
    .str.replace(' / night', '')
)

#weekends.head()

In [539]:
# Drop the ' / night' unit from the discounted price text.
discounted_text = weekends['discounted']
weekends['discounted'] = discounted_text.str.replace(' / night', '')
#weekends.head()

In [540]:
# Remove the 'Discounted price' label that trails the previous price.
previous_text = weekends['previous_price']
weekends['previous_price'] = previous_text.str.replace('Discounted price', '')
#weekends.head()

In [541]:
# Concatenate the undiscounted and discounted price strings into one column
# holding the current nightly price text.
full_price_text = weekends['not_discounted']
sale_price_text = weekends['discounted']
weekends['current_nightly'] = full_price_text + sale_price_text
#weekends.head()

#### Remove redundant words from values in columns with clear descriptions

In [542]:
# Strip the plural ' guests' suffix from the guest count text.
guest_text = weekends['guests']
weekends['guests'] = guest_text.str.replace(' guests', '')
#weekends.head()

In [543]:
# Strip the singular ' guest' suffix from the guest count text.
guest_text = weekends['guests']
weekends['guests'] = guest_text.str.replace(' guest', '')
#weekends.head(25)

In [544]:
# Strip the plural suffix first, then the singular one, leaving the count.
weekends['beds'] = (
    weekends['beds']
    .str.replace(' beds', '')
    .str.replace(' bed', '')
)
#weekends.head()

In [545]:
# Remove the leading 'Price:' label and the trailing 'totalShow details' text.
weekends['total cost'] = (
    weekends['total cost']
    .str.replace('Price:', '')
    .str.replace('totalShow details', '')
)
#weekends.head()

#### Create columns for number of reviews and average stars

In [546]:
# Scratch column: the review text that follows the word 'Rating'.
review_parts = weekends['reviews'].str.partition('Rating')
weekends['review_test'] = review_parts[2]
#weekends.head()

In [547]:
# The text ahead of ' out of' is stored as the average star rating.
rating_parts = weekends['review_test'].str.partition(' out of')
weekends['avg_stars'] = rating_parts.loc[:, 0]
#weekends.head()

In [548]:
# Scratch column: whatever follows the opening parenthesis.
paren_parts = weekends['review_test'].str.partition('(')
weekends['review_count2'] = paren_parts[2]
#weekends.head()

In [549]:
# The review count is the text up to the closing parenthesis.
count_parts = weekends['review_count2'].str.partition(')')
weekends['review_count'] = count_parts.loc[:, 0]
#weekends.head()

In [550]:
# The raw review text and both scratch columns have served their purpose.
spent_review_columns = ['review_count2', 'review_test', 'reviews']
weekends.drop(columns=spent_review_columns, inplace=True)
#weekends.head()

#### Keep only the cleaned columns, in their final order

In [551]:
# Keep only the cleaned columns, in their final order. The explicit .copy()
# makes the selection an independent frame, so the column assignments and
# in-place operations that follow do not trigger SettingWithCopyWarning.
final_columns = [
    'name', 'category', 'superhost', 'url', 'avg_stars', 'review_count',
    'guests', 'beds', 'bedroom', 'bath', 'checkin', 'checkout',
    'previous_price', 'discounted', 'not_discounted', 'current_nightly',
    'total cost',
]
weekends = weekends[final_columns].copy()
weekends.head(2)

Unnamed: 0,name,category,superhost,url,avg_stars,review_count,guests,beds,bedroom,bath,checkin,checkout,previous_price,discounted,not_discounted,current_nightly,total cost
0,Private Room & Bath 5 minutes to Downtown Nash,Private room in Nashville,SUPERHOST,/rooms/46832964?check_in=2020-12-20&check_out=...,,,2,1,1 bedroom,1 private bath,2020-12-20,2020-12-24,,,$35,$35,$177
1,BW5 Downtown Nashville!! Do Bro 8!!,Entire apartment in Nashville,,/rooms/45934635?check_in=2020-12-20&check_out=...,,,1,2,Studio,1 bath,2020-12-20,2020-12-24,$62,$45,,$45,$265


#### Check data types before performing calculations on prices

In [552]:
# Display each column's dtype; the recorded output shows every column as
# object, i.e. the price columns are still text at this point.
weekends.dtypes

name               object
category           object
superhost          object
url                object
avg_stars          object
review_count       object
guests             object
beds               object
bedroom            object
bath               object
checkin            object
checkout           object
previous_price     object
discounted         object
not_discounted     object
current_nightly    object
total cost         object
dtype: object

#### Remove extraneous characters from price columns and convert the nightly and total costs to integers

In [553]:
# Convert the nightly price text to integers. Missing values turn into the
# string 'nan' under astype(str) and are mapped to '0'.
# regex=False is required for '$': as a regex it is an end-of-string anchor,
# so the dollar sign would not be removed and the integer cast would fail.
nightly_text = weekends['current_nightly'].astype(str)
for symbol in ('$', ','):
    nightly_text = nightly_text.str.replace(symbol, '', regex=False)
nightly_text = nightly_text.str.replace('nan', '0', regex=False)
weekends['current_nightly'] = nightly_text.astype(int)

#weekends.head()

In [554]:
# Convert the total cost text to integers. Missing values turn into the
# string 'nan' under astype(str) and are mapped to '0'.
# regex=False is required for '$': as a regex it is an end-of-string anchor,
# so the dollar sign would not be removed and the integer cast would fail.
total_text = weekends['total cost'].astype(str)
for fragment in ('$', ',', ' total'):
    total_text = total_text.str.replace(fragment, '', regex=False)
total_text = total_text.str.replace('nan', '0', regex=False)
weekends['total cost'] = total_text.astype(int)

#weekends.head()

In [555]:
# Strip currency formatting from the previous price. The column stays text:
# this cell performs no integer cast. Missing values become '0'.
# regex=False keeps '$' a literal character instead of a regex end anchor.
previous_text = weekends['previous_price'].astype(str)
for fragment in ('$', ',', ' total'):
    previous_text = previous_text.str.replace(fragment, '', regex=False)
weekends['previous_price'] = previous_text.str.replace('nan', '0', regex=False)

#weekends.head()

In [556]:
# Strip currency formatting from the discounted price. The column stays text:
# this cell performs no integer cast. Missing values become '0'.
# regex=False keeps '$' a literal character instead of a regex end anchor.
discounted_text = weekends['discounted'].astype(str)
for fragment in ('$', ',', ' total'):
    discounted_text = discounted_text.str.replace(fragment, '', regex=False)
weekends['discounted'] = discounted_text.str.replace('nan', '0', regex=False)

#weekends.head()

In [557]:
# Strip currency formatting from the undiscounted price. The column stays
# text: this cell performs no integer cast. Missing values become '0'.
# regex=False keeps '$' a literal character instead of a regex end anchor.
full_price_text = weekends['not_discounted'].astype(str)
for fragment in ('$', ',', ' total'):
    full_price_text = full_price_text.str.replace(fragment, '', regex=False)
weekends['not_discounted'] = full_price_text.str.replace('nan', '0', regex=False)

#weekends.head()

#### Calculate the room fee (nightly rate * # of nights staying) and the additional fees

In [558]:
# Room fee = nightly rate * number of nights. The stay length is derived from
# each row's check-in/check-out dates instead of assuming every search covers
# 4 nights; rows whose dates cannot be parsed fall back to the previous
# constant of 4.
stay_length = (
    pd.to_datetime(weekends['checkout'], errors='coerce')
    - pd.to_datetime(weekends['checkin'], errors='coerce')
).dt.days
nights = stay_length.fillna(4).astype(int)
weekends['room_fee'] = weekends['current_nightly'] * nights
#weekends.head()

In [559]:
# Whatever the total charges beyond the room fee is stored as the combined
# taxes, fees and cleaning amount.
extra_charges = weekends['total cost'].sub(weekends['room_fee'])
weekends['taxes_fees_cleaning'] = extra_charges
#weekends.head()

### Drop empty and duplicate rows

In [560]:
# Discard rows that are missing either the listing name or its URL.
required_columns = ['name', 'url']
weekends.dropna(subset=required_columns, inplace=True)
weekends.shape

(78296, 19)

In [561]:
# For rows sharing the same URL, retain only the first occurrence.
dedupe_key = ['url']
weekends.drop_duplicates(subset=dedupe_key, keep='first', inplace=True)
weekends.shape

(78296, 19)

### Save to a csv

In [562]:
# Write the cleaned table (index included) for the visualisation step.
output_path = 'data_for_viz/weekdays.csv'
weekends.to_csv(output_path)