## Clean Raw Data Scraped from Airbnb's Site

### Import packages

In [320]:
import os
import pandas as pd

### Read in the files from the search folder

In [321]:
# Collect only the CSV search exports, in a deterministic (sorted) order.
# os.listdir() returns entries in arbitrary order and also lists non-data
# entries (e.g. .ipynb_checkpoints, .DS_Store) that pd.read_csv cannot parse.
results = sorted(name for name in os.listdir('search/') if name.endswith('.csv'))

### Open the files and add the dates of the search performed to each file

In [322]:
# Load the first search export and tag each row with the name of the file it
# came from (the file name carries the check-in / check-out dates).
open_file = pd.read_csv(os.path.join('search/', results[0])).assign(dates=str(results[0]))
# Zero-row frame with the same columns; later cells stack every file onto it.
weekends = open_file.iloc[:0]

In [323]:
# Read every search export, tag its rows with the source file name, and stack
# everything in a single pd.concat call. DataFrame.append is deprecated (and
# removed in pandas 2.0), and appending inside a loop re-copies the growing
# frame on every iteration.
frames = []
for file in results:
    open_file = pd.read_csv('search/' + file)
    open_file['dates'] = str(file)
    frames.append(open_file)

# ignore_index gives the combined frame one unique 0..n-1 index instead of
# repeating each file's own row numbers.
weekends = pd.concat(frames, ignore_index=True)

In [324]:
# reset_index returns a new frame; the result must be assigned back, otherwise
# the per-file row numbers (0, 1, 2, ... repeated for every file) are kept.
weekends = weekends.reset_index(drop=True)
weekends.head(1)

Unnamed: 0,name,nightly cost,category,superhost,url,guests_rental,total cost,reviews,dates
0,NC5 The Joint!! Downtown Nashville Studio!,Price:$55 / night,Entire apartment in Nashville,,/rooms/46779266?check_in=2020-12-17&check_out=...,6 guests · Studio · 2 beds · 1 bath,Price:$245 totalShow details,Price:$55 / nightPrice:$245 totalShow details,ci2020-12-17co2020-12-20.csv


### Create columns for checkin and checkout dates

In [325]:
# 'dates' holds the source file name, e.g. 'ci2020-12-17co2020-12-20.csv'.
# Cut it once at the first 'co': column 0 is the check-in half, column 1 the
# check-out half.
dates = weekends['dates'].str.split(pat='co', n=1, expand=True)
weekends['checkin'], weekends['checkout'] = dates[0], dates[1]

In [326]:
# Strip the 'ci' prefix and the '.csv' suffix left over from the file name.
# regex=False makes both patterns literal: under the older pandas default the
# pattern was a regular expression, where '.' in '.csv' matches any character.
weekends['checkin'] = weekends['checkin'].str.replace('ci', '', regex=False)
weekends['checkout'] = weekends['checkout'].str.replace('.csv', '', regex=False)

# The raw file-name column is no longer needed once both dates are extracted.
weekends = weekends.drop(columns='dates')

### Clean the columns scraped from Airbnb's site

#### Create a column for the number of guests 

In [327]:
# The listing summary looks like '6 guests · Studio · 2 beds · 1 bath';
# the text before the first ' · ' is the guest count.
weekends = weekends.assign(
    guests=weekends['guests_rental'].str.partition(sep=' · ')[0]
)

In [328]:
# Everything after the first ' · ' is kept in a scratch column so the next
# field can be peeled off it.
weekends = weekends.assign(
    guest_rental2=weekends['guests_rental'].str.partition(sep=' · ')[2]
)

#### Create a column for the number of bedrooms

In [329]:
# The leading field of the remainder is the bedroom description
# (e.g. 'Studio', '1 bedroom').
weekends = weekends.assign(
    bedroom=weekends['guest_rental2'].str.partition(sep=' · ')[0]
)

In [330]:
# Second scratch column: what is left once the bedroom field is removed.
weekends = weekends.assign(
    guest_rental3=weekends['guest_rental2'].str.partition(sep=' · ')[2]
)

#### Create a column for the number of beds

In [331]:
# The leading field of the second remainder is the bed count (e.g. '2 beds').
weekends = weekends.assign(
    beds=weekends['guest_rental3'].str.partition(sep=' · ')[0]
)

In [332]:
# Whatever follows the bed count is stored as the bath description
# (e.g. '1 bath').
weekends = weekends.assign(
    bath=weekends['guest_rental3'].str.partition(sep=' · ')[2]
)

#### Drop original columns that have been cleaned

In [333]:
# The raw summary and both scratch columns have been fully parsed; remove them.
parsed_away = ['guests_rental', 'guest_rental2', 'guest_rental3']
weekends = weekends.drop(columns=parsed_away)

#### Create columns for the prices associated with each listing

In [334]:
# Text before the 'Previous price:' marker. When the marker is absent the
# whole price string lands here; when the string starts with it, this is ''.
weekends = weekends.assign(
    not_discounted=weekends['nightly cost'].str.partition(sep='Previous price:')[0]
)

In [335]:
# Scratch column: text after the 'Previous price:' marker (empty string when
# the marker is absent). It is split further in the following cells.
weekends = weekends.assign(
    test=weekends['nightly cost'].str.partition(sep='Previous price:')[2]
)

In [336]:
# 'test' looks like '<previous price>Discounted price:<discounted price> / night'.
# The discounted price is the text AFTER the ':' (partition field 2); field 0
# is the previous-price text and does not belong in the 'discounted' column.
weekends['discounted'] = weekends['test'].str.partition(':')[2]

In [337]:
# The discounted price is whatever follows the ':' in the scratch column.
weekends = weekends.assign(
    discounted=weekends['test'].str.partition(sep=':')[2]
)

In [338]:
# The text before the ':' holds the previous price (still followed by the
# 'Discounted price' label, which is removed in a later cell).
weekends = weekends.assign(
    previous_price=weekends['test'].str.partition(sep=':')[0]
)

In [339]:
# Both the scratch column and the raw price text are now redundant.
weekends = weekends.drop(columns=['test', 'nightly cost'])

In [340]:
# Remove the 'Price:' label and the ' / night' unit so only the amount remains.
weekends['not_discounted'] = (
    weekends['not_discounted']
    .str.replace('Price:', '')
    .str.replace(' / night', '')
)

In [341]:
# Remove the ' / night' unit from the discounted price.
weekends = weekends.assign(
    discounted=weekends['discounted'].str.replace(' / night', '')
)

In [342]:
# Remove the trailing 'Discounted price' label left behind by the split on ':'.
weekends = weekends.assign(
    previous_price=weekends['previous_price'].str.replace('Discounted price', '')
)

In [343]:
# Concatenate the two price strings: a listing is expected to populate only
# one of them (the other being empty), so the result is the price in effect.
weekends['current_nightly'] = weekends['not_discounted'].add(weekends['discounted'])

#### Remove redundant words from values in columns with clear descriptions

In [344]:
# Strip the plural unit so only the count remains.
weekends = weekends.assign(
    guests=weekends['guests'].str.replace(' guests', '')
)

In [345]:
# Strip the singular unit as well (values such as '1 guest').
weekends = weekends.assign(
    guests=weekends['guests'].str.replace(' guest', '')
)

In [346]:
# Strip the unit from the bed count: the plural form first, then the singular.
weekends['beds'] = (
    weekends['beds']
    .str.replace(' beds', '')
    .str.replace(' bed', '')
)

In [347]:
# Remove the 'Price:' label and the trailing 'totalShow details' link text
# from the total-cost string.
weekends['total cost'] = (
    weekends['total cost']
    .str.replace('Price:', '')
    .str.replace('totalShow details', '')
)

#### Create columns for number of reviews and average stars

In [351]:
# Scratch column: the part of the review text that follows the word 'Rating'.
weekends = weekends.assign(
    review_test=weekends['reviews'].str.partition(sep='Rating')[2]
)

In [352]:
# The average star rating is the text that precedes ' out of'.
weekends = weekends.assign(
    avg_stars=weekends['review_test'].str.partition(sep=' out of')[0]
)

In [353]:
# Scratch column: everything after the opening parenthesis.
weekends = weekends.assign(
    review_count2=weekends['review_test'].str.partition(sep='(')[2]
)

In [354]:
# The review count is the text up to the closing parenthesis.
weekends = weekends.assign(
    review_count=weekends['review_count2'].str.partition(sep=')')[0]
)

In [355]:
# The raw review text and both scratch columns are no longer needed.
review_leftovers = ['review_count2', 'review_test', 'reviews']
weekends = weekends.drop(columns=review_leftovers)

#### Select and reorder the cleaned columns

In [359]:
# Keep only the cleaned columns, in presentation order.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
weekends = weekends[[
    'name', 'category', 'superhost', 'url',
    'avg_stars', 'review_count',
    'guests', 'beds', 'bedroom', 'bath',
    'checkin', 'checkout',
    'previous_price', 'discounted', 'not_discounted', 'current_nightly',
    'total cost',
]].copy()
weekends.head(2)

Unnamed: 0,name,category,superhost,url,avg_stars,review_count,guests,beds,bedroom,bath,checkin,checkout,previous_price,discounted,not_discounted,current_nightly,total cost
0,NC5 The Joint!! Downtown Nashville Studio!,Entire apartment in Nashville,,/rooms/46779266?check_in=2020-12-17&check_out=...,,,6,2,Studio,1 bath,2020-12-17,2020-12-20,,,$55,$55,$245
1,"Brand-New Condo, 2 Blocks to Centennial Park",Entire apartment in Nashville,,/rooms/35440277?check_in=2020-12-17&check_out=...,4.64,14.0,4,2,1 bedroom,1 bath,2020-12-17,2020-12-20,$122,$58,,$58,$329


#### Check data types before performing calculations on prices

In [360]:
# Display the dtype of every column before doing arithmetic on the price columns.
weekends.dtypes

name               object
category           object
superhost          object
url                object
avg_stars          object
review_count       object
guests             object
beds               object
bedroom            object
bath               object
checkin            object
checkout           object
previous_price     object
discounted         object
not_discounted     object
current_nightly    object
total cost         object
dtype: object

#### Remove extraneous characters from price columns and convert to integers

In [364]:
# Turn the nightly price text (e.g. '$1,055') into an integer.
# regex=False makes every pattern literal: '$' is a regex end-of-string anchor,
# and whether str.replace treats the pattern as a regex by default depends on
# the pandas version.
weekends['current_nightly'] = (
    weekends['current_nightly']
    .astype(str)                            # missing values become the text 'nan'
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)      # thousands separator
    .str.replace('nan', '0', regex=False)   # missing price -> 0
    .astype(int)
)

weekends.head()

Unnamed: 0,name,category,superhost,url,avg_stars,review_count,guests,beds,bedroom,bath,checkin,checkout,previous_price,discounted,not_discounted,current_nightly,total cost,room_fee
0,NC5 The Joint!! Downtown Nashville Studio!,Entire apartment in Nashville,,/rooms/46779266?check_in=2020-12-17&check_out=...,,,6,2,Studio,1 bath,2020-12-17,2020-12-20,,,$55,55,$245,165
1,"Brand-New Condo, 2 Blocks to Centennial Park",Entire apartment in Nashville,,/rooms/35440277?check_in=2020-12-17&check_out=...,4.64,14.0,4,2,1 bedroom,1 bath,2020-12-17,2020-12-20,$122,$58,,58,$329,174
2,DOWNTOWN NASHVILLE - COMFY & COZY (C),Entire apartment in Nashville,SUPERHOST,/rooms/29543248?check_in=2020-12-17&check_out=...,4.93,76.0,6,2,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,$63,63,$302,189
3,Tiny haven,Entire apartment in Nashville,SUPERHOST,/rooms/20137844?check_in=2020-12-17&check_out=...,4.93,193.0,4,4,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,$54,54,$259,162
4,1 BR Apartment!! Just 2 miles to Downtown!,Entire apartment in Nashville,,/rooms/44064578?check_in=2020-12-17&check_out=...,4.0,10.0,4,1,1 bedroom,1 bath,2020-12-17,2020-12-20,$87,$62,,62,$257,186


In [366]:
# Turn the total cost text (e.g. '$1,245 ') into an integer.
# regex=False makes every pattern literal: '$' is a regex end-of-string anchor,
# and whether str.replace treats the pattern as a regex by default depends on
# the pandas version.
weekends['total cost'] = (
    weekends['total cost']
    .astype(str)                            # missing values become the text 'nan'
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)      # thousands separator
    .str.replace(' total', '', regex=False)
    .str.replace('nan', '0', regex=False)   # missing cost -> 0
    .astype(int)
)

weekends.head()

Unnamed: 0,name,category,superhost,url,avg_stars,review_count,guests,beds,bedroom,bath,checkin,checkout,previous_price,discounted,not_discounted,current_nightly,total cost,room_fee
0,NC5 The Joint!! Downtown Nashville Studio!,Entire apartment in Nashville,,/rooms/46779266?check_in=2020-12-17&check_out=...,,,6,2,Studio,1 bath,2020-12-17,2020-12-20,,,$55,55,245,165
1,"Brand-New Condo, 2 Blocks to Centennial Park",Entire apartment in Nashville,,/rooms/35440277?check_in=2020-12-17&check_out=...,4.64,14.0,4,2,1 bedroom,1 bath,2020-12-17,2020-12-20,$122,$58,,58,329,174
2,DOWNTOWN NASHVILLE - COMFY & COZY (C),Entire apartment in Nashville,SUPERHOST,/rooms/29543248?check_in=2020-12-17&check_out=...,4.93,76.0,6,2,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,$63,63,302,189
3,Tiny haven,Entire apartment in Nashville,SUPERHOST,/rooms/20137844?check_in=2020-12-17&check_out=...,4.93,193.0,4,4,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,$54,54,259,162
4,1 BR Apartment!! Just 2 miles to Downtown!,Entire apartment in Nashville,,/rooms/44064578?check_in=2020-12-17&check_out=...,4.0,10.0,4,1,1 bedroom,1 bath,2020-12-17,2020-12-20,$87,$62,,62,257,186


In [373]:
# Strip currency formatting from the previous price; the column stays text.
# regex=False makes every pattern literal: '$' is a regex end-of-string anchor,
# and whether str.replace treats the pattern as a regex by default depends on
# the pandas version.
weekends['previous_price'] = (
    weekends['previous_price']
    .astype(str)                            # missing values become the text 'nan'
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)      # thousands separator
    .str.replace(' total', '', regex=False)
    .str.replace('nan', '0', regex=False)   # missing price -> '0'
)

weekends.head()

Unnamed: 0,name,category,superhost,url,avg_stars,review_count,guests,beds,bedroom,bath,checkin,checkout,previous_price,discounted,not_discounted,current_nightly,total cost,room_fee,taxes_fees_cleaning
0,NC5 The Joint!! Downtown Nashville Studio!,Entire apartment in Nashville,,/rooms/46779266?check_in=2020-12-17&check_out=...,,,6,2,Studio,1 bath,2020-12-17,2020-12-20,,,55.0,55,245,165,80
1,"Brand-New Condo, 2 Blocks to Centennial Park",Entire apartment in Nashville,,/rooms/35440277?check_in=2020-12-17&check_out=...,4.64,14.0,4,2,1 bedroom,1 bath,2020-12-17,2020-12-20,122.0,58.0,,58,329,174,155
2,DOWNTOWN NASHVILLE - COMFY & COZY (C),Entire apartment in Nashville,SUPERHOST,/rooms/29543248?check_in=2020-12-17&check_out=...,4.93,76.0,6,2,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,63.0,63,302,189,113
3,Tiny haven,Entire apartment in Nashville,SUPERHOST,/rooms/20137844?check_in=2020-12-17&check_out=...,4.93,193.0,4,4,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,54.0,54,259,162,97
4,1 BR Apartment!! Just 2 miles to Downtown!,Entire apartment in Nashville,,/rooms/44064578?check_in=2020-12-17&check_out=...,4.0,10.0,4,1,1 bedroom,1 bath,2020-12-17,2020-12-20,87.0,62.0,,62,257,186,71


In [371]:
# Strip currency formatting from the discounted price; the column stays text.
# regex=False makes every pattern literal: '$' is a regex end-of-string anchor,
# and whether str.replace treats the pattern as a regex by default depends on
# the pandas version.
weekends['discounted'] = (
    weekends['discounted']
    .astype(str)                            # missing values become the text 'nan'
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)      # thousands separator
    .str.replace(' total', '', regex=False)
    .str.replace('nan', '0', regex=False)   # missing price -> '0'
)

weekends.head()

Unnamed: 0,name,category,superhost,url,avg_stars,review_count,guests,beds,bedroom,bath,checkin,checkout,previous_price,discounted,not_discounted,current_nightly,total cost,room_fee,taxes_fees_cleaning
0,NC5 The Joint!! Downtown Nashville Studio!,Entire apartment in Nashville,,/rooms/46779266?check_in=2020-12-17&check_out=...,,,6,2,Studio,1 bath,2020-12-17,2020-12-20,,,$55,55,245,165,80
1,"Brand-New Condo, 2 Blocks to Centennial Park",Entire apartment in Nashville,,/rooms/35440277?check_in=2020-12-17&check_out=...,4.64,14.0,4,2,1 bedroom,1 bath,2020-12-17,2020-12-20,122.0,58.0,,58,329,174,155
2,DOWNTOWN NASHVILLE - COMFY & COZY (C),Entire apartment in Nashville,SUPERHOST,/rooms/29543248?check_in=2020-12-17&check_out=...,4.93,76.0,6,2,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,$63,63,302,189,113
3,Tiny haven,Entire apartment in Nashville,SUPERHOST,/rooms/20137844?check_in=2020-12-17&check_out=...,4.93,193.0,4,4,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,$54,54,259,162,97
4,1 BR Apartment!! Just 2 miles to Downtown!,Entire apartment in Nashville,,/rooms/44064578?check_in=2020-12-17&check_out=...,4.0,10.0,4,1,1 bedroom,1 bath,2020-12-17,2020-12-20,87.0,62.0,,62,257,186,71


In [372]:
# Strip currency formatting from the undiscounted price; the column stays text.
# regex=False makes every pattern literal: '$' is a regex end-of-string anchor,
# and whether str.replace treats the pattern as a regex by default depends on
# the pandas version.
weekends['not_discounted'] = (
    weekends['not_discounted']
    .astype(str)                            # missing values become the text 'nan'
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)      # thousands separator
    .str.replace(' total', '', regex=False)
    .str.replace('nan', '0', regex=False)   # missing price -> '0'
)

weekends.head()

Unnamed: 0,name,category,superhost,url,avg_stars,review_count,guests,beds,bedroom,bath,checkin,checkout,previous_price,discounted,not_discounted,current_nightly,total cost,room_fee,taxes_fees_cleaning
0,NC5 The Joint!! Downtown Nashville Studio!,Entire apartment in Nashville,,/rooms/46779266?check_in=2020-12-17&check_out=...,,,6,2,Studio,1 bath,2020-12-17,2020-12-20,,,55.0,55,245,165,80
1,"Brand-New Condo, 2 Blocks to Centennial Park",Entire apartment in Nashville,,/rooms/35440277?check_in=2020-12-17&check_out=...,4.64,14.0,4,2,1 bedroom,1 bath,2020-12-17,2020-12-20,122.0,58.0,,58,329,174,155
2,DOWNTOWN NASHVILLE - COMFY & COZY (C),Entire apartment in Nashville,SUPERHOST,/rooms/29543248?check_in=2020-12-17&check_out=...,4.93,76.0,6,2,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,63.0,63,302,189,113
3,Tiny haven,Entire apartment in Nashville,SUPERHOST,/rooms/20137844?check_in=2020-12-17&check_out=...,4.93,193.0,4,4,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,54.0,54,259,162,97
4,1 BR Apartment!! Just 2 miles to Downtown!,Entire apartment in Nashville,,/rooms/44064578?check_in=2020-12-17&check_out=...,4.0,10.0,4,1,1 bedroom,1 bath,2020-12-17,2020-12-20,87.0,62.0,,62,257,186,71


#### Calculate the room fee (nightly rate * # of nights staying) and the additional fees

In [368]:
# Room fee = nightly rate * number of nights.
# The stay length is derived from each row's own check-in / check-out dates
# rather than a hard-coded 3, so searches of any length are priced correctly
# (a 2020-12-17 -> 2020-12-20 stay still yields 3 nights).
nights = (
    pd.to_datetime(weekends['checkout']) - pd.to_datetime(weekends['checkin'])
).dt.days
weekends['room_fee'] = weekends['current_nightly'] * nights
weekends.head()

Unnamed: 0,name,category,superhost,url,avg_stars,review_count,guests,beds,bedroom,bath,checkin,checkout,previous_price,discounted,not_discounted,current_nightly,total cost,room_fee
0,NC5 The Joint!! Downtown Nashville Studio!,Entire apartment in Nashville,,/rooms/46779266?check_in=2020-12-17&check_out=...,,,6,2,Studio,1 bath,2020-12-17,2020-12-20,,,$55,55,245,165
1,"Brand-New Condo, 2 Blocks to Centennial Park",Entire apartment in Nashville,,/rooms/35440277?check_in=2020-12-17&check_out=...,4.64,14.0,4,2,1 bedroom,1 bath,2020-12-17,2020-12-20,$122,$58,,58,329,174
2,DOWNTOWN NASHVILLE - COMFY & COZY (C),Entire apartment in Nashville,SUPERHOST,/rooms/29543248?check_in=2020-12-17&check_out=...,4.93,76.0,6,2,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,$63,63,302,189
3,Tiny haven,Entire apartment in Nashville,SUPERHOST,/rooms/20137844?check_in=2020-12-17&check_out=...,4.93,193.0,4,4,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,$54,54,259,162
4,1 BR Apartment!! Just 2 miles to Downtown!,Entire apartment in Nashville,,/rooms/44064578?check_in=2020-12-17&check_out=...,4.0,10.0,4,1,1 bedroom,1 bath,2020-12-17,2020-12-20,$87,$62,,62,257,186


In [369]:
# Whatever part of the total is not the room fee is attributed to taxes,
# service fees and cleaning.
weekends['taxes_fees_cleaning'] = weekends['total cost'].sub(weekends['room_fee'])
weekends.head()

Unnamed: 0,name,category,superhost,url,avg_stars,review_count,guests,beds,bedroom,bath,checkin,checkout,previous_price,discounted,not_discounted,current_nightly,total cost,room_fee,taxes_fees_cleaning
0,NC5 The Joint!! Downtown Nashville Studio!,Entire apartment in Nashville,,/rooms/46779266?check_in=2020-12-17&check_out=...,,,6,2,Studio,1 bath,2020-12-17,2020-12-20,,,$55,55,245,165,80
1,"Brand-New Condo, 2 Blocks to Centennial Park",Entire apartment in Nashville,,/rooms/35440277?check_in=2020-12-17&check_out=...,4.64,14.0,4,2,1 bedroom,1 bath,2020-12-17,2020-12-20,$122,$58,,58,329,174,155
2,DOWNTOWN NASHVILLE - COMFY & COZY (C),Entire apartment in Nashville,SUPERHOST,/rooms/29543248?check_in=2020-12-17&check_out=...,4.93,76.0,6,2,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,$63,63,302,189,113
3,Tiny haven,Entire apartment in Nashville,SUPERHOST,/rooms/20137844?check_in=2020-12-17&check_out=...,4.93,193.0,4,4,2 bedrooms,1 bath,2020-12-17,2020-12-20,,,$54,54,259,162,97
4,1 BR Apartment!! Just 2 miles to Downtown!,Entire apartment in Nashville,,/rooms/44064578?check_in=2020-12-17&check_out=...,4.0,10.0,4,1,1 bedroom,1 bath,2020-12-17,2020-12-20,$87,$62,,62,257,186,71
