## Imports

In [1]:
import pandas as pd

In [3]:
def create_csv(data_frame, file_name, rules=None):
    """Apply optional per-column transforms, then write the data to CSV.

    Args:
        data_frame: pandas object to export. Columns named in ``rules`` are
            overwritten on this object (in place) before it is written.
        file_name: Name of the output file, written inside the
            ``Csv_to_database/`` directory.
        rules: Optional mapping ``{transform: [column, ...]}``. Each
            transform is handed to ``Series.map`` for every listed column.
            ``None`` (the default) means no transforms.
    """
    # ``None`` replaces the former ``{}`` default so that no mutable object
    # is shared between calls.
    if rules is None:
        rules = {}
    for rule, attributes in rules.items():
        for attribute in attributes:
            data_frame[attribute] = data_frame[attribute].map(rule)
    data_frame.to_csv("Csv_to_database/" + file_name)

## Creating principal Dataframes

In [27]:
def concat_file(files, nrows=500):
    """Load several CSV files from ``Data/`` and stack them into one frame.

    Args:
        files: Iterable of file names, relative to the ``Data/`` directory.
        nrows: Maximum number of rows read from each file; ``None`` reads
            every row. Defaults to 500, the limit that was previously
            hard-coded.

    Returns:
        The ``pd.concat`` of the per-file frames. Frames read from a file
        whose name contains ``"listing"`` get an extra ``city`` column
        holding the capitalized text before the first ``_`` of the name.
    """
    frames = []
    for path in files:
        frame = pd.read_csv("Data/" + path, skipinitialspace=True, nrows=nrows)
        if "listing" in path:
            # e.g. "barcelona_listings.csv" -> "Barcelona"
            frame['city'] = path.split('_')[0].capitalize()
        frames.append(frame)
    return pd.concat(frames)

In [119]:
# Per-city listing exports, stacked into a single frame keyed by listing id.
listingsFiles = [
    "barcelona_listings.csv",
    "berlin_listings_filtered.csv",
    "madrid_listings_filtered.csv",
]
listings = concat_file(listingsFiles)
# Hosts are renamed to the user_id / user_name columns.
listings = listings.rename(columns={'host_id': 'user_id', 'host_name': 'user_name'})
listings = listings.set_index('id')
# Every missing value becomes an empty string from here on.
listings.fillna('', inplace=True)


In [5]:
# Per-city calendar exports, stacked into a single frame keyed by listing_id.
calendarFiles = [
    "barcelona_calendar.csv",
    "berlin_calendar.csv",
    "madrid_calendar.csv",
]
calendars = concat_file(calendarFiles)
calendars = calendars.set_index('listing_id')

In [6]:
# Per-city review exports, stacked into a single frame keyed by review id.
reviewsFiles = [
    "barcelona_reviews.csv",
    "berlin_reviews.csv",
    "madrid_reviews.csv",
]
reviews = concat_file(reviewsFiles)
# Reviewers are renamed to the user_id / user_name columns.
reviews = reviews.rename(columns={'reviewer_id': 'user_id', 'reviewer_name': 'user_name'})
reviews = reviews.set_index('id')

In [7]:
# Users come from two sources: the reviewers and the listing hosts.
# Stack both (user_id, user_name) tables, drop repeated pairs, key by user_id.
users = pd.concat([
    reviews[['user_id', 'user_name']],
    listings[['user_id', 'user_name']],
]).drop_duplicates().set_index('user_id')

## Normalize columns
Run this section only once: re-running it on already-normalized data gives wrong results.

In [104]:
def find_normal_values(dataframe, attribute):
    """Build the lookup table for the distinct values of one column.

    Args:
        dataframe: Frame holding the column.
        attribute: Name of the column to inspect.

    Returns:
        A ``(values, entity)`` pair. ``entity`` is the Series of distinct,
        non-null values of the column, re-indexed 0..n-1 in order of first
        appearance; ``values`` maps each of those values to its position.
    """
    entity = dataframe[attribute].drop_duplicates().dropna().reset_index(drop=True)
    position_to_value = entity.to_dict()
    # Invert position -> value into value -> position.
    value_to_position = dict(zip(position_to_value.values(), position_to_value.keys()))
    return value_to_position, entity

In [9]:
def normalize_value(val, normalized_values):
    """Look up ``val`` in ``normalized_values``; empty strings pass through.

    Args:
        val: Raw value to translate.
        normalized_values: Mapping from raw value to its replacement.

    Returns:
        ``val`` unchanged when it equals ``''``, otherwise
        ``normalized_values[val]`` (a missing key raises ``KeyError``).
    """
    return val if val == '' else normalized_values[val]

In [106]:
def create_normalized_value(dataframe, attribute):
    """Export a column's distinct values and return the column as ids.

    The distinct values are written to ``<attribute>.csv`` through
    ``create_csv``; the returned Series is the original column with every
    value replaced through ``normalize_value``.
    """
    lookup, entity = find_normal_values(dataframe, attribute)
    create_csv(entity, attribute + ".csv")
    column = dataframe[attribute]
    return column.apply(normalize_value, normalized_values=lookup)

In [109]:
# Columns of listings that are replaced by ids; each one also gets its own
# <attribute>.csv lookup file.
to_normalize_attributes = [
    'host_response_time',
    'room_type',
    'bed_type',
    'cancellation_policy',
    'neighbourhood',
]
for att in to_normalize_attributes:
    listings[att] = create_normalized_value(listings, att)

In [121]:
# The first entry is the column that gets replaced by an id; the remaining
# entries travel with it into city.csv and are then removed from listings.
city_attributes = ['city', 'country', 'country_code']
# Distinct attribute combinations, re-indexed 0..n-1.
df = listings[city_attributes].drop_duplicates().dropna().reset_index(drop=True)
# Map each value of the first column to its row position in df.
values = {name: position for position, name in df[city_attributes[0]].to_dict().items()}
create_csv(df, city_attributes[0] + ".csv")
listings[city_attributes[0]] = listings[city_attributes[0]].apply(
    normalize_value, normalized_values=values
)
listings = listings.drop(columns=city_attributes[1:])

In [123]:
# Display the city column to inspect the result of the normalization above.
listings['city']

id
18666       0
18674       0
21605       0
25786       0
31377       0
31380       0
31823       0
31958       0
26033978    0
32471       0
32711       0
32868       0
34981       0
35379       0
35388       0
35390       0
35392       0
36763       0
40983       0
46153       0
49213       0
49968       0
50066       0
58512       0
61444       0
6464311     0
66037       0
67065       0
68547       0
70099       0
           ..
846558      2
847252      2
847273      2
1003602     2
848480      2
849939      2
852607      2
852623      2
852636      2
852654      2
852660      2
853804      2
855155      2
856669      2
859800      2
861590      2
862462      2
862752      2
864488      2
26379046    2
867089      2
867286      2
867629      2
868776      2
869276      2
869674      2
870153      2
870704      2
870964      2
871387      2
Name: city, Length: 1500, dtype: int64

In [125]:
# Write the current state of listings to Csv_to_database/listings_temp.csv.
create_csv(listings,"listings_temp.csv")

In [12]:
att_columns = ['date', 'available', 'price']
# Select the columns listed in att_columns from calendars and drop incomplete
# rows. The explicit copy makes `calendar` an independent frame, so the price
# assignment below never writes into a slice of `calendars`.
calendar = calendars[att_columns].dropna().copy()
# Remove '$', ',', '.' and ')' from the price text and parse what is left as
# an integer (raw string: '\$' is not a valid escape in a normal literal).
# NOTE(review): because '.' is removed as well, "$80.00" becomes 8000 --
# presumably intended, confirm against the database schema.
calendar['price'] = calendar['price'].replace(r'[\$,.)]', '', regex=True).astype(int)
create_csv(calendar, "calendar.csv")

In [14]:
# Core listing attributes -> listing.csv
att_columns = [
    'neighbourhood',
    'listing_url',
    'name',
    'summary',
    'space',
]
listing = listings[att_columns]
create_csv(data_frame=listing, file_name="listing.csv")

In [15]:
# Descriptive text, picture and coordinates of each listing -> Listing_descr.csv
att_columns = [
    'description',
    'neighborhood_overview',
    'notes',
    'transit',
    'access',
    'interaction',
    'house_rules',
    'picture_url',
    'latitude',
    'longitude',
]
listing = listings[att_columns]
create_csv(data_frame=listing, file_name="Listing_descr.csv")

In [16]:
# Physical details of each listing -> Listing_detail.csv
att_columns = [
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'amenities',
    'square_feet',
]
listing = listings[att_columns]
create_csv(data_frame=listing, file_name="Listing_detail.csv")

In [13]:
def concat_str(x):
    """Return the string form of ``x`` cut down to at most 3999 characters."""
    text = str(x)
    return text[:3999]

In [19]:
def convert_currency(val):
    """Parse a currency value into a float.

    Empty strings are returned unchanged. Anything else is converted to
    text, stripped of every ``$`` and ``,``, and passed to ``float``.
    """
    if val == '':
        return val
    digits = str(val).replace('$', '').replace(',', '')
    return float(digits)

In [20]:
def convert_percentage(val):
    """Parse a percentage value into an int.

    Empty strings are returned unchanged. Anything else is converted to
    text, stripped of every ``%``, and passed to ``int``.
    """
    if val == '':
        return val
    return int(str(val).replace('%', ''))

In [21]:
def apply_conversion(dataframe, attributes, function):
    """Run ``function`` over each listed column of ``dataframe``.

    Args:
        dataframe: Frame to convert; the listed columns are overwritten on it.
        attributes: Names of the columns to convert.
        function: Callable applied element-wise through ``Series.apply``.

    Returns:
        The same ``dataframe`` object, after conversion.
    """
    for column in attributes:
        converted = dataframe[column].apply(function)
        dataframe[column] = converted
    return dataframe

In [22]:
# Strip the '%' sign from host_response_rate and parse it as an integer
# (empty strings are left as they are, see convert_percentage).
listings['host_response_rate'] = listings['host_response_rate'].apply(convert_percentage)

In [23]:
# Monetary columns of listings: strip '$' and ',' and parse them as floats
# (empty strings are left as they are, see convert_currency).
prices = [
    'price',
    'weekly_price',
    'monthly_price',
    'security_deposit',
    'cleaning_fee',
    'extra_people',
]
for att in prices:
    listings[att] = listings[att].apply(convert_currency)

In [18]:
# Cut host_about down to at most 3999 characters per entry (see concat_str).
listings['host_about'] = listings['host_about'].apply(concat_str)

In [25]:
# Write the converted listings frame to Csv_to_database/listings_test.csv.
create_csv(listings, "listings_test.csv")

In [None]:
att_columns = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]
# Missing values in listings were replaced by '' right after loading, so a
# plain dropna() would keep every row. Turn the empty strings back into NaN
# first, then drop the listings that lack any of the scores. Building a new
# frame also avoids dropping rows in place on a slice of listings.
listing = listings[att_columns].replace('', float('nan')).dropna()
create_csv(listing, "Listing_score.csv")

In [None]:
# Booking conditions of each listing -> Listing_cond.csv
att_columns = [
    'minimum_nights',
    'maximum_nights',
    'is_business_travel_ready',
    'cancellation_policy',
    'require_guest_profile_picture',
    'require_guest_phone_verification',
]
listing = listings[att_columns]
create_csv(data_frame=listing, file_name="Listing_cond.csv")

In [None]:
# Write the users frame (indexed by user_id) to Csv_to_database/Airbnb_user.csv.
create_csv(users, "Airbnb_user.csv")

In [None]:
# Host attributes, one row per user_id -> Host.csv
att_columns = [
    'user_id',
    'host_url',
    'host_since',
    'host_about',
    'host_response_time',
    'host_response_rate',
    'host_thumbnail_url',
    'host_picture_url',
    'host_neighbourhood',
]
listing = listings[att_columns].set_index('user_id')
# A host appears once per listing: keep only the first row for each user_id.
listing = listing[~listing.index.duplicated(keep='first')]
create_csv(listing, "Host.csv", {concat_str: ['host_about']})

In [None]:
# Review attributes -> Review.csv
att_columns = [
    'listing_id',
    'user_id',
    'date',
    'comments',
]
review = reviews[att_columns]
create_csv(data_frame=review, file_name="Review.csv")