## Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
def create_csv(data_frame, file_name, rules=None):
    """Write ``data_frame`` to ``Csv_to_database/<file_name>`` as CSV.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to export. It is never modified: rules are applied to a copy.
    file_name : str
        Name of the CSV file created inside ``Csv_to_database/``.
    rules : dict, optional
        Maps a callable to the list of column names it must be applied to
        (element-wise, via ``Series.map``) before the frame is written.

    Rows are terminated with a bare carriage return.
    """
    import os

    if rules:
        # Copy first so a rule never leaks back into the caller's frame
        # (which is frequently a slice of a bigger frame).
        data_frame = data_frame.copy()
        for rule, attributes in rules.items():
            for attribute in attributes:
                data_frame[attribute] = data_frame[attribute].map(rule)

    target = "Csv_to_database/" + file_name
    os.makedirs(os.path.dirname(target), exist_ok=True)
    try:
        # Keyword spelling used by pandas >= 1.5.
        data_frame.to_csv(target, lineterminator='\r')
    except TypeError:
        # Older pandas only knows the previous spelling of the keyword.
        data_frame.to_csv(target, line_terminator='\r')

## Creating principal Dataframes

In [3]:
def concat_file(files):
    """Read every CSV in ``files`` from the ``Data/`` folder and stack them.

    Files whose name contains "listing" get an extra ``city`` column holding
    the capitalized text that precedes the first underscore of the file name.
    The frames are concatenated in the order given, each keeping its own index.
    """
    def _load(name):
        # One frame per file; spaces following a delimiter are skipped.
        frame = pd.read_csv("Data/" + name, skipinitialspace=True)
        if "listing" in name:
            frame['city'] = name.split('_')[0].capitalize()
        return frame

    return pd.concat([_load(name) for name in files])

In [4]:
# Listing exports of the three cities, in load order.
listingsFiles = [
    "barcelona_listings.csv",
    "berlin_listings_filtered.csv",
    "madrid_listings_filtered.csv",
]
# Host columns take the generic user_* names; the listing id becomes the index.
listings = concat_file(listingsFiles)
listings = listings.rename(columns={'host_id': 'user_id', 'host_name': 'user_name'})
listings = listings.set_index('id')
# Missing values become '' -- the placeholder the conversion helpers test for.
listings = listings.fillna('')


In [5]:
# Calendar exports of the three cities, indexed by the listing they belong to.
calendarFiles = [
    "barcelona_calendar.csv",
    "berlin_calendar.csv",
    "madrid_calendar.csv",
]
calendars = concat_file(calendarFiles)
calendars = calendars.set_index('listing_id')

In [6]:
# Review exports of the three cities; reviewer columns take the generic
# user_* names and the review id becomes the index.
reviewsFiles = [
    "barcelona_reviews.csv",
    "berlin_reviews.csv",
    "madrid_reviews.csv",
]
reviews = concat_file(reviewsFiles)
reviews = reviews.rename(columns={'reviewer_id': 'user_id', 'reviewer_name': 'user_name'})
reviews = reviews.set_index('id')

## Normalize columns
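Run the cells in this section exactly once per kernel session: they overwrite columns of `listings` in place, so running them a second time would normalize already-normalized values and produce wrong results.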

In [7]:
def get_id(attribute):
    """Build a short id-column name from an attribute name.

    The first letter of each underscore-separated word is kept, lowercased and
    suffixed with 'id', e.g. 'host_response_time' -> 'hrtid'.

    Empty words (produced by leading, trailing or doubled underscores, or by
    an empty attribute) are skipped instead of raising IndexError.
    """
    initials = [word[0] for word in attribute.split('_') if word]
    return "".join(initials).lower() + 'id'

In [8]:
def find_normal_values(dataframe, attribute=''):
    """Collect the distinct values of a column and number them from 0.

    Parameters
    ----------
    dataframe : pandas.DataFrame or pandas.Series
        With a non-empty ``attribute``, the frame holding that column.
        With the default ``attribute=''``, the Series of values itself.
    attribute : str
        Column to read; '' means ``dataframe`` already is the Series.

    Returns
    -------
    (dict, pandas.DataFrame)
        ``{value: id}`` lookup, and a one-column frame of the distinct values
        indexed by those ids. When ``attribute`` is given, the index is named
        with ``get_id(attribute)``.

    Missing values are excluded. That covers NaN/None as well as '', the
    placeholder this notebook substitutes for missing values; otherwise the
    placeholder would get an id and a row of its own in the lookup table.
    """
    series = dataframe[attribute] if attribute != '' else dataframe
    series = series.drop_duplicates().dropna()
    series = series[series != '']
    # Ids are the positions in order of first appearance.
    series = series.reset_index(drop=True)
    values = {value: key for key, value in series.to_dict().items()}
    entity = series.to_frame()
    if attribute != '':
        entity = entity.rename_axis(get_id(attribute))
    return values, entity

In [9]:
def normalize_value(val, normalized_values):
    """Return the id stored for ``val`` in ``normalized_values``.

    The '' placeholder is passed through unchanged; any other value missing
    from the mapping raises KeyError.
    """
    return val if val == '' else normalized_values[val]

In [10]:
def create_normalized_value(dataframe, attribute):
    """Export the lookup table of ``attribute`` and return the column as ids.

    The distinct values are written to ``<attribute>.csv`` through
    ``create_csv``; the returned Series holds, for each row of
    ``dataframe[attribute]``, the result of ``normalize_value``.
    """
    lookup, entity = find_normal_values(dataframe, attribute)
    create_csv(entity, attribute + ".csv")
    column = dataframe[attribute]
    return column.apply(normalize_value, normalized_values=lookup)

In [11]:
# Columns replaced by ids; each one also gets its own <column>.csv lookup file.
to_normalize_attributes = [
    'host_response_time',
    'room_type',
    'property_type',
    'bed_type',
    'cancellation_policy',
]
for att in to_normalize_attributes:
    listings[att] = create_normalized_value(listings, att)

In [12]:
# Listing and host neighbourhoods share one lookup table, so both columns are
# stacked into a single Series before ids are assigned.
# A Series is renamed by passing the new name directly: `columns=` is a
# DataFrame-only keyword that Series.rename does not accept in current pandas.
neighbourhoods = pd.concat([
    listings['neighbourhood'],
    listings['host_neighbourhood'].rename('neighbourhood'),
])
normalized_values, normal_entity = find_normal_values(neighbourhoods)
create_csv(normal_entity, "neighbourhood.csv")
listings['neighbourhood'] = listings['neighbourhood'].apply(
    normalize_value, normalized_values=normalized_values)
listings['host_neighbourhood'] = listings['host_neighbourhood'].apply(
    normalize_value, normalized_values=normalized_values)

In [13]:
# The first entry is the column that gets replaced by an id; the remaining
# ones only live in city.csv and are dropped from listings afterwards.
city_attributes = ['city', 'country', 'country_code']
df = listings[city_attributes].drop_duplicates().dropna()
df.index = range(len(df))
# Invert {row id: city} into the {city: row id} lookup.
values = {name: key for key, name in df[df.columns[0]].to_dict().items()}
create_csv(df, city_attributes[0] + ".csv")
listings[city_attributes[0]] = listings[city_attributes[0]].apply(
    normalize_value, normalized_values=values)
listings = listings.drop(columns=city_attributes[1:])

In [14]:
def concat_str(val, size = 3000):
    """Truncate ``val`` (converted to text) to at most ``size`` characters.

    Missing values are returned unchanged: the '' placeholder as before, and
    NaN/None as well, so they are no longer turned into the literal text
    'nan' / 'None'.
    """
    # Checked first: comparing some missing markers to '' is not a plain bool.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return val
    if val == '':
        return val
    return str(val)[:size]

In [15]:
def convert_currency(val):
    """Convert a price such as '$1,234.50' to a float.

    Commas and dollar signs are removed before the conversion; the ''
    placeholder is returned unchanged.
    """
    if val == '':
        return val
    cleaned = str(val)
    for symbol in (',', '$'):
        cleaned = cleaned.replace(symbol, '')
    return float(cleaned)

In [16]:
def convert_percentage(val):
    """Convert a percentage such as '95%' to an int.

    Percent signs are removed before the conversion; the '' placeholder is
    returned unchanged.
    """
    return val if val == '' else int(str(val).replace('%', ''))

In [17]:
def apply_conversion(dataframe, attributes, function):
    """Apply ``function`` element-wise to each listed column of ``dataframe``.

    The columns are overwritten in place, one after the other, and the same
    frame object is returned.
    """
    for column in attributes:
        converted = dataframe[column].apply(function)
        dataframe[column] = converted
    return dataframe

In [18]:
# Cast the host_since column to the datetime64[ns] dtype.
listings['host_since'] = listings['host_since'].astype('datetime64[ns]')

In [19]:
# Strip the '%' sign and convert to int; '' placeholders are left untouched.
listings['host_response_rate'] = listings['host_response_rate'].apply(convert_percentage)

In [20]:
# Money columns: strip '$' and ',' and store floats ('' placeholders are kept).
prices = ['price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'extra_people']
# Reuse the shared helper instead of repeating its loop here.
listings = apply_conversion(listings, prices, convert_currency)

In [21]:
# Free-text columns, each capped at 500 characters.
descriptions = ['summary', 'space', 'neighborhood_overview', 'description', 'notes',
                'transit', 'access', 'interaction', 'house_rules']
# Reuse the shared helper instead of repeating its loop here.
listings = apply_conversion(listings, descriptions, lambda text: concat_str(text, size=500))

In [22]:
# Cap host_about at concat_str's default size (3000 characters).
listings['host_about'] = listings['host_about'].apply(concat_str)

In [23]:
# Only hosts are exported as users; reviewer accounts are currently left out
# (add reviews[['user_id', 'user_name']] to this list to include them).
users = [listings[['user_id', 'user_name']]]
# user_id becomes the index, so keep exactly one row per id. Deduplicating on
# the (id, name) pair would leave an id repeated whenever it shows up with two
# different names.
users = pd.concat(users).drop_duplicates(subset='user_id', keep='first')
users = users.set_index('user_id')

In [24]:
# Export the users table to Csv_to_database/Airbnb_user.csv.
create_csv(users, "Airbnb_user.csv")
# The drop below stays disabled: the host export further down still selects
# user_name from listings.
#listings = listings.drop(columns=['user_name'])

In [25]:
# Columns that describe the host rather than the listing.
host_columns = [
    'user_id', 'user_name', 'host_url', 'host_since', 'host_about',
    'host_response_time', 'host_response_rate', 'host_thumbnail_url',
    'host_picture_url', 'host_neighbourhood', 'host_verifications',
]
# One row per host: index by user_id and keep the first row seen for each id.
host = listings[host_columns].set_index('user_id')
host = host.loc[~host.index.duplicated(keep='first')]
create_csv(host, "host.csv", {concat_str: ['host_about']})
# listings keeps user_id as the link to its host; the other host columns go.
listings = listings.drop(columns=host_columns[1:])

In [26]:
# Export what is left of listings after the host columns were split off.
create_csv(listings,"listing.csv")

## Calendar

In [36]:
att_columns = ['date', 'available', 'price']
# Keep only the att_columns of calendars and drop rows with a missing value.
# dropna() returns a new frame and assign() builds the converted columns on a
# fresh copy, so nothing is written into a slice of `calendars`
# (no SettingWithCopyWarning).
calendar = calendars[att_columns].dropna()
calendar = calendar.assign(
    date=calendar['date'].astype('datetime64[ns]'),
    price=calendar['price'].apply(convert_currency),
)
create_csv(calendar, "calendar.csv")

In [37]:
# Number of rows in the exported calendar table.
print(len(calendar))

6093473


## Reviews

In [35]:
# Columns exported for each review (the review id is the index).
att_columns = ['listing_id', 'user_id', 'user_name', 'date', 'comments']
# Comments are capped at 1000 characters and dates cast to datetime64[ns].
reviews['comments'] = reviews['comments'].apply(lambda text: concat_str(text, size=1000))
reviews['date'] = reviews['date'].astype('datetime64[ns]')
review = reviews[att_columns]
create_csv(review, "Review.csv")

## Test


In [29]:
# Check that every listing id found in calendar also exists in listings:
# the two printed numbers must be equal.
A = listings.index.values
B = calendar[~calendar.index.duplicated(keep='first')].index.values
shared = np.intersect1d(A, B)
print(len(shared), len(B))

30801 30801


In [33]:
# Check that every listing id found in reviews also exists in listings:
# the two printed numbers must be equal.
A = listings.index.values
B = reviews.set_index('listing_id')
B = B[~B.index.duplicated(keep='first')].index.values
shared = np.intersect1d(A, B)
print(len(shared), len(B))

35012 35012


In [30]:
def Repeat(x):
    """Return the values that occur more than once in ``x``.

    Each repeated value is listed once, in order of its first occurrence.
    Hashable items are counted in a single pass; sequences holding unhashable
    items fall back to pairwise comparison.
    """
    from collections import Counter

    try:
        counts = Counter(x)
    except TypeError:
        # Unhashable items (e.g. lists): compare each item with the ones after it.
        items = list(x)
        repeated = []
        for position, item in enumerate(items):
            if item in repeated:
                continue
            if any(item == other for other in items[position + 1:]):
                repeated.append(item)
        return repeated
    # `value == value` keeps NaN out: it never compares equal to itself, so
    # it is not reported as a repeat.
    return [value for value, count in counts.items() if count > 1 and value == value]

In [31]:
#print(Repeat(users.index.values))

In [32]:
# Longest non-empty description; stays at -1 when there is none.
maxi = max((len(elem) for elem in listings['description'] if elem != ''), default=-1)
print(maxi)

500
