In [1]:
import pandas as pd

In [2]:
def concat_file(files):
    lfs = []
    for path in files:
        lfs.append(pd.read_csv("Data/"+path, skipinitialspace=True, nrows=500))
    return pd.concat(lfs)

In [3]:
def create_csv(data_frame, file_name, rules = {}):
    for  rule,attributes in rules.items():
        for attribute in attributes:
            data_frame[attribute] = data_frame[attribute].map(rule)
    data_frame.to_csv("Csv_to_database/"+file_name)

In [4]:
listingsFiles = ["barcelona_listings.csv","berlin_listings_filtered.csv","madrid_listings_filtered.csv"]
listings = concat_file(listingsFiles).rename(columns={
    'host_id':'user_id',
    'host_name':'user_name'}).set_index('id')

In [5]:
calendarFiles = ["barcelona_calendar.csv","berlin_calendar.csv","madrid_calendar.csv"]
calendars = concat_file(calendarFiles).set_index('listing_id')

In [6]:
reviewsFiles = ["barcelona_reviews.csv","berlin_reviews.csv","madrid_reviews.csv"]
reviews = concat_file(reviewsFiles).rename(columns={
    'reviewer_id':'user_id',
    'reviewer_name':'user_name'}).set_index('id')

In [7]:
users = []
users.append(reviews[['user_id', 'user_name']])
users.append(listings[['user_id', 'user_name']])
users = pd.concat(users).drop_duplicates()
users = users.set_index('user_id')

In [8]:
att_columns = ['neighbourhood', 'city', 'country_code', 'country']
# Selects columns from listings which appears in att_columns 
neighbourhood = listings[att_columns]
# Deletes duplicated rows
neighbourhood = neighbourhood[~neighbourhood.index.duplicated(keep='first')]
# write new csv file
create_csv(neighbourhood, "neighbourhood.csv")

In [9]:
att_columns = ['date', 'available', 'price']
# Selects columns from listings which appears in att_columns 
calendar = calendars[att_columns]
calendar.dropna(inplace=True)
calendar['price'] = calendar['price'].replace( '[\$,.)]','', regex=True ).astype(int)
create_csv(calendar, "calendar.csv")

In [10]:
def concat_str(x):
    return str(x)[:3999]

In [11]:
att_columns = ['neighbourhood', 'listing_url', 'name', 'summary', 'space']
listing = listings[att_columns]
create_csv(listing, "listing.csv")

In [12]:
att_columns = ['description', 'neighborhood_overview', 'notes', 'transit', 'access','interaction','house_rules','picture_url','latitude','longitude']
listing = listings[att_columns]
create_csv(listing, "Listing_descr.csv")

In [13]:
att_columns = ['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms','beds','bed_type','amenities','square_feet']
listing = listings[att_columns]
create_csv(listing, "Listing_detail.csv")

In [14]:
att_columns = ['price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee','guests_included','extra_people']
listing = listings[att_columns]
listing = listing.applymap(lambda x: "$-1" if pd.isna(x) else x)
for att in att_columns:
    listing[att] = listing[att].replace( '[\$,.)]','', regex=True ).astype(int)
create_csv(listing, "Listing_price.csv")

In [15]:
att_columns = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication','review_scores_location','review_scores_value']
listing = listings[att_columns]
listing.dropna(inplace=True)
create_csv(listing, "Listing_score.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
att_columns = ['minimum_nights', 'maximum_nights', 'is_business_travel_ready', 'cancellation_policy', 'require_guest_profile_picture','require_guest_phone_verification']
listing = listings[att_columns]
create_csv(listing, "Listing_cond.csv")

In [17]:
create_csv(users, "Airbnb_user.csv")

In [18]:
att_columns = ['user_id', 'host_url', 'host_since', 'host_about', 'host_response_time', 'host_response_rate','host_thumbnail_url','host_picture_url','host_neighbourhood']
listing = listings[att_columns]
listing = listing.set_index('user_id')
listing = listing[~listing.index.duplicated(keep='first')]
create_csv(listing, "Host.csv", { concat_str: ['host_about']})

In [19]:
att_columns = ['listing_id', 'user_id', 'date', 'comments']
review = reviews[att_columns]
create_csv(review,"Review.csv")