# Creating a dataframe containing Airbnb listings data from 2015 to 2017

In [1]:
# Dependencies
import pandas as pd
import os
import re # for filtering list of files

In [2]:
# List down csv files
files = os.listdir() # shows all the files inside the folder
r = re.compile(".*listings.csv") # filters the files to those with endings of the data files
newlist = list(filter(r.match, files)) # creates a list of csv files

In [3]:
# list of csv file contents
Airbnb_dflist = [pd.read_csv(file) for file in newlist] # runs pd.read_csv() for all files at the same time

  if self.run_code(code, result):


In [4]:
# parsing the names for the dataframes
file_names = [file_name[0:8] for file_name in newlist] # get only the extraction dates

In [5]:
# put two lists into a dictionary (first list has names of files; second list has the contents of the csv files)
Airbnb_df_dict = dict(zip(file_names,Airbnb_dflist))

In [6]:
# Create a function that extracts a wanted variable from the dictionary
def extract(dict_name, str): # metric of interest is what is written as string
    x = []
    for key,value in dict_name.items():
        for k,v in dict_name[key].items(): # key = extraction date, k = metric, v = series of values
            if k == str: 
                for i in v: # to extract the values of the metric from the series of metric (each item has an index)
                    x.append(i)
    return x

In [7]:
# Create a list of variables of interest
metrics = ["last_scraped", "neighbourhood_cleansed", "property_type", "price"]

In [8]:
# Use a for-loop to get the values for each of the items in the metrics list using the extract function
impt_metrics = [extract(Airbnb_df_dict,metric) for metric in metrics]

# Convert dates to year-month format
impt_metrics[0] = [date[0:7] for date in impt_metrics[0]] # last-scraped is the date; it's the 0th item in metrics

In [9]:
# Create a list of keys
keys = ["date", "neighbourhood", "property type", "daily rate"]

In [10]:
# Create a new dataframe containing the lists of dates, neighbourhoods, property types, and daily rates
Airbnb_df = pd.DataFrame(dict(zip(keys, impt_metrics)))
Airbnb_df.head()

Unnamed: 0,date,neighbourhood,property type,daily rate
0,2016-10,Seacliff,House,$105.00
1,2016-10,Seacliff,House,$300.00
2,2016-10,Seacliff,Apartment,$175.00
3,2016-10,Seacliff,House,$90.00
4,2016-10,Seacliff,Condominium,$400.00


In [11]:
# Convert the price to annual rate from daily rate
daily_rate = Airbnb_df["daily rate"].str.replace("$","") # data is string
daily_rate = daily_rate.str.replace(",","") # data is still string
daily_rate = daily_rate.astype(float)
Airbnb_df["annual rate (USD)"] = daily_rate * 365 # calculate the yearly rate

In [12]:
# view file 
Airbnb_df.head()

Unnamed: 0,date,neighbourhood,property type,daily rate,annual rate (USD)
0,2016-10,Seacliff,House,$105.00,38325.0
1,2016-10,Seacliff,House,$300.00,109500.0
2,2016-10,Seacliff,Apartment,$175.00,63875.0
3,2016-10,Seacliff,House,$90.00,32850.0
4,2016-10,Seacliff,Condominium,$400.00,146000.0


In [13]:
# save as csv in the Data folder
Airbnb_df.to_csv("../Data/Airbnb_listings.csv", sep = ",", encoding = "utf-8", index = False)