# **Exploratory Data Analysis - Airbnb Mexico City Dataset**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib import colors
import matplotlib.image as mpimg
import datetime as dt


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Carregando arquivos

In [None]:
# Read the three CSV tables (calendar, listings, reviews) in that order, then
# the PNG that is drawn behind the map plots.
data_calendar, data_listings, data_reviews = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/airbnb-mexico-city/calendar.csv',
        '/kaggle/input/airbnb-mexico-city/listings.csv',
        '/kaggle/input/airbnb-mexico-city/reviews.csv',
    )
)
city_img = mpimg.imread('/kaggle/input/mexico-city-image-png/mexico_city.png')

### Ajustando tipos de dados em preços e datas

In [None]:
# Parse calendar dates. errors='coerce' turns unparseable values into NaT;
# errors='ignore' would hand back the original strings unchanged on any
# failure (and is deprecated in recent pandas releases).
data_calendar['date'] = pd.to_datetime(data_calendar['date'], format='%Y-%m-%d', errors='coerce')

# Strip the currency symbol and the thousands separator before casting.
# regex=False makes '$' a literal character: as a regular expression it is the
# end-of-string anchor, so the dollar sign would be left in place and the
# float conversion below would fail.
data_calendar['price'] = data_calendar['price'].str.replace('$', '', regex=False).str.replace(',', '', regex=False)
data_calendar['price'] = data_calendar['price'].astype(float)

# Same clean-up for the listings table.
data_listings['price'] = data_listings['price'].str.replace('$', '', regex=False).str.replace(',', '', regex=False)
data_listings['price'] = data_listings['price'].astype(float)
data_listings['host_since'] = pd.to_datetime(data_listings['host_since'], format='%Y-%m-%d', errors='coerce')

### Reviews dos usuários

In [None]:
# Review-score columns plus price and coordinates. Listings with a missing
# value in any of these columns are dropped.
review_columns = [
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'price',
    'latitude',
    'longitude',
]
reviews = data_listings[review_columns].dropna()

### Histogramas de cada tipo de review

In [None]:
n_bins = 20

# (column, panel title) pairs, one per subplot from left to right.
score_panels = [
    ('review_scores_accuracy', 'Precisão'),
    ('review_scores_cleanliness', 'Limpeza'),
    ('review_scores_checkin', 'Check-in'),
    ('review_scores_communication', 'Comunicação'),
    ('review_scores_location', 'Localização'),
    ('review_scores_value', 'Custo-benefício'),
]

# One row of six histograms sharing the y axis.
fig, axs = plt.subplots(1, 6, sharey=True, tight_layout=True, figsize=(18, 4))
for panel, (column, heading) in zip(axs, score_panels):
    panel.hist(reviews[column], bins=n_bins)
    panel.set_title(heading)

print('')

### Histograma de Rating

In [None]:
# Single wide histogram of the overall rating score, using 100 bins.
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(18, 6))
rating_scores = reviews['review_scores_rating']
axs.hist(rating_scores, bins=100)
axs.set_title('Histograma de rating total')
print('')

### Localização de quartos avaliados como limpos e sujos

In [None]:
# Keep only the extremes of the cleanliness score: exactly 10 or below 6.
# .copy() makes an independent frame, so adding a column below does not write
# into a slice of `reviews` (which triggers pandas' SettingWithCopyWarning).
new_reviews = reviews[(reviews['review_scores_cleanliness'] == 10) | (reviews['review_scores_cleanliness'] < 6)].copy()

# The comparison already yields booleans, so np.where(..., True, False) is not needed.
new_reviews['limpo'] = new_reviews['review_scores_cleanliness'] == 10

color = ['blue', 'red']
label = ['Limpo', 'Sujo']
fig, ax = plt.subplots(figsize=(10, 10))
# One scatter call per group so each gets its own colour and legend entry.
for i, limpo in enumerate([True, False]):
    group = new_reviews[new_reviews['limpo'] == limpo]
    scatter_x = group['longitude']
    scatter_y = group['latitude']
    ax.scatter(scatter_x, scatter_y, c = color[i], label = label[i], s = 10, alpha=0.5)
ax.legend()

# Draw the city image behind the points, stretched over the given
# longitude/latitude extent.
plt.imshow(city_img, extent=[-99.33, -98.965, 19.17, 19.575], alpha=1)
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)
plt.title('Quartos limpos e sujos')
plt.show()

### Quartos avaliados como bem e mal localizados

In [None]:
# Keep only the extremes of the location score: exactly 10 or below 6.
# .copy() makes an independent frame, so adding a column below does not write
# into a slice of `reviews` (which triggers pandas' SettingWithCopyWarning).
new_reviews = reviews[(reviews['review_scores_location'] == 10) | (reviews['review_scores_location'] < 6)].copy()

# The comparison already yields booleans, so np.where(..., True, False) is not needed.
new_reviews['bem_loc'] = new_reviews['review_scores_location'] == 10

color = ['blue', 'red']
label = ['Bem localizado', 'Mal localizado']
fig, ax = plt.subplots(figsize=(10, 10))
# One scatter call per group so each gets its own colour and legend entry.
for i, bem_loc in enumerate([True, False]):
    group = new_reviews[new_reviews['bem_loc'] == bem_loc]
    scatter_x = group['longitude']
    scatter_y = group['latitude']
    ax.scatter(scatter_x, scatter_y, c = color[i], label = label[i], s = 10, alpha=0.5)
ax.legend()

# Draw the city image behind the points, stretched over the given
# longitude/latitude extent.
plt.imshow(city_img, extent=[-99.33, -98.965, 19.17, 19.575], alpha=1)
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)
plt.title('Bem localizados vs mal localizados')

plt.show()

In [None]:
def plot_map_prices(df, title=''):
    """Scatter listings over the city image, coloured and sized by price.

    Args:
        df: DataFrame with 'longitude', 'latitude' and 'price' columns.
        title: Text shown as the plot title.

    The module-level ``city_img`` is drawn as the background. The figure is
    shown with ``plt.show()``; nothing is returned.
    """
    cmap = plt.get_cmap("plasma")
    prices = df["price"]

    # Marker area grows with the square root of the price; colour follows the
    # price through the colormap (scaled to the data's min/max by default).
    ax = df.plot(kind="scatter", x="longitude", y="latitude", figsize=(10,10),
                 s=np.sqrt(prices), label="Preço",
                 c="price", cmap=cmap,
                 colorbar=False, alpha=0.4,
                 )
    plt.imshow(city_img, extent=[-99.33, -98.965, 19.17, 19.575], alpha=0.8)
    plt.ylabel("Latitude", fontsize=14)
    plt.xlabel("Longitude", fontsize=14)

    # Give the colorbar the same price range as the points and pin its ticks
    # to the values being labelled. Without an explicit norm and tick list the
    # labels only line up if the automatic locator happens to choose six
    # evenly spaced ticks.
    norm = plt.Normalize(vmin=prices.min(), vmax=prices.max())
    tick_values = np.linspace(prices.min(), prices.max(), 6, endpoint=True)
    cbar = plt.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax, ticks=tick_values)
    cbar.ax.set_yticklabels(["$%d" % v for v in tick_values], fontsize=14)
    cbar.set_label('Preços', fontsize=16)

    plt.legend(fontsize=16)
    plt.title(title)
    plt.show()

### Removendo outliers de preços

In [None]:
# Removing price outliers
# Keep listings whose price lies between the 15th and 85th percentiles
# (bounds inclusive).
review_price = reviews['price']
price_low = review_price.quantile(.15)
price_high = review_price.quantile(.85)
new_reviews = reviews[review_price.between(price_low, price_high)]


### Preços por região (bairro)

In [None]:
# Price map for the outlier-filtered listings.
plot_map_prices(new_reviews, title='Preços por região')

### Join da tabela listings com calendar

In [None]:
# Inner-join each calendar row to its listing's price and coordinates
# (calendar.listing_id == listings.id), sorted by the join key. Arguments that
# merely restated the defaults are omitted.
calendar_part = data_calendar[['listing_id', 'date', 'available']]
listing_part = data_listings[['id', 'price', 'latitude', 'longitude']]
df_cal_price = calendar_part.merge(
    listing_part,
    how="inner",
    left_on='listing_id',
    right_on='id',
    sort=True,
)


### DataFrame de calendar somente com os dias de Natal (25/12)

In [None]:
# Rows dated 25 December 2021. The .dt accessors compare the whole column at
# once instead of calling a Python lambda on every row three times over.
df_natal = df_cal_price[
    (df_cal_price['date'].dt.day == 25)
    & (df_cal_price['date'].dt.month == 12)
    & (df_cal_price['date'].dt.year == 2021)
]


In [None]:
# Split the Christmas rows by the availability flag: 't' vs 'f'.
natal_flag = df_natal['available']
df_natal_avail = df_natal.loc[natal_flag.eq('t')]
df_natal_not = df_natal.loc[natal_flag.eq('f')]

### Lugares mais procurados no Natal com preços próximos da média

In [None]:
# Removing price outliers
# Keep only unavailable listings priced between the 35th and 65th percentiles.
unavailable_price = df_natal_not['price']
mid_band = unavailable_price.between(
    unavailable_price.quantile(.35),
    unavailable_price.quantile(.65),
)
df_new = df_natal_not[mid_band]
plot_map_prices(df_new, title='Lugares mais procurados no Natal com preços próximos da média')

### Lugares disponíveis no Natal com preços próximos da média

In [None]:
# Removing price outliers
# Keep only available listings priced between the 35th and 65th percentiles.
available_price = df_natal_avail['price']
mid_band = available_price.between(
    available_price.quantile(.35),
    available_price.quantile(.65),
)
df_new = df_natal_avail[mid_band]
plot_map_prices(df_new, title='Lugares disponíveis no Natal com preços próximos da média')

In [None]:
# Host and listing attributes used by the plots below; rows with any missing
# value are dropped.
corr_columns = [
    'host_since',
    'host_response_time',
    'number_of_reviews',
    'instant_bookable',
    'review_scores_rating',
    'price',
    'latitude',
    'longitude',
]
df_corr = data_listings[corr_columns].dropna()
# Keep prices between the 15th and 85th percentiles.
corr_price = df_corr['price']
df_corr = df_corr[corr_price.between(corr_price.quantile(.15), corr_price.quantile(.85))]

### Correlação entre Preço e número de reviews

In [None]:
# Price (y) against number of reviews (x).
review_counts = df_corr['number_of_reviews']
listing_prices = df_corr['price']
plt.xlabel("Reviews", fontsize=14)
plt.ylabel("Preço", fontsize=14)
plt.title("Correlação entre Preço e número de reviews")
plt.scatter(review_counts, listing_prices)

### Correlação entre Reviews e Data de entrada

In [None]:
# Number of reviews (y) against the host's sign-up date (x).
host_joined = df_corr['host_since']
review_counts = df_corr['number_of_reviews']
plt.xlabel("Data de entrada no Airbnb", fontsize=14)
plt.ylabel("Reviews", fontsize=14)
plt.title("Correlação entre Reviews e Data de entrada")
plt.scatter(host_joined, review_counts)

### Disponibilidade instantânea

In [None]:
# Bar chart: how many listings have instant_bookable 't' versus 'f'.
instant_flag = df_corr['instant_bookable']
bar_heights = [(instant_flag == flag).sum() for flag in ('t', 'f')]

fig1, ax1 = plt.subplots(figsize=(10,5))
ax1.set_title('Disponibilidade instantânea')
ax1.bar(x=['Sim', 'Não'], height=bar_heights)
plt.show()

### Tempo de resposta do anfitrião

In [None]:
# value_counts() returns each response-time category together with its row
# count in one aligned Series (most frequent first). This replaces pairing the
# labels and heights from two separately built sets and re-filtering the frame
# once per category.
response_counts = df_corr['host_response_time'].value_counts()

fig1, ax1 = plt.subplots(figsize=(10,5))
ax1.set_title('Tempo de resposta do anfitrião')
ax1.bar(x=response_counts.index.tolist(), height=response_counts.tolist())
plt.show()

### Rating dos que demoram a responder

In [None]:
# Rating distribution for hosts whose response time is 'a few days or more'.
slow_mask = df_corr['host_response_time'].eq('a few days or more')
lerdos = df_corr.loc[slow_mask]
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(18, 6))
axs.hist(lerdos['review_scores_rating'], bins=100)
axs.set_title('Rating dos que demoram a responder')
print('')

In [None]:
# Property type, room type and price; rows with any missing value are dropped.
type_columns = ['property_type', 'room_type', 'price']
df_type = data_listings[type_columns].dropna()
# Keep prices between the 15th and 85th percentiles.
type_price = df_type['price']
df_type = df_type[type_price.between(type_price.quantile(.15), type_price.quantile(.85))]

### Tipos de quarto

In [None]:
# value_counts() gives every room type and its row count as one aligned
# Series, so labels and wedge sizes cannot fall out of step. Labels are passed
# as a list rather than an unordered set.
room_counts = df_type['room_type'].value_counts()
labels = room_counts.index.tolist()
sizes = room_counts.tolist()

fig1, ax1 = plt.subplots(figsize=(12,12))
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=0)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.title('Tipos de quarto')
plt.show()

### Tipos de propriedade

In [None]:
# value_counts() gives every property type and its row count as one aligned
# Series, so labels and wedge sizes cannot fall out of step. Labels are passed
# as a list rather than an unordered set.
property_counts = df_type['property_type'].value_counts()
labels = property_counts.index.tolist()
sizes = property_counts.tolist()

fig1, ax1 = plt.subplots(figsize=(12,12))
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=0)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Tipos de propriedade')
plt.show()

### Média de preço por tipo de quarto

In [None]:
# groupby().mean() computes the average price of every room type in one pass
# and keeps each label attached to its value. Bars are ordered by room-type
# name.
mean_price_by_room = df_type.groupby('room_type')['price'].mean()

fig1, ax1 = plt.subplots(figsize=(10,5))
ax1.set_title('Média de preço por tipo de quarto')
ax1.bar(x=mean_price_by_room.index.tolist(), height=mean_price_by_room.tolist())
plt.show()

### Taxa de ocupação ao longo do tempo

In [None]:
# Fraction of calendar rows marked unavailable ('f') on each date. Averaging
# the boolean flag yields a rate between 0 and 1, matching the plot title,
# rather than a raw count. Dates with no unavailable listing stay in the
# series as 0 instead of disappearing.
df_groupby = data_calendar[['date', 'available']]
df_groupby = df_groupby.assign(available=df_groupby['available'] == 'f')
df_groupby = df_groupby.groupby(by='date', as_index=False).mean()
plt.figure(figsize=(10, 7))
plt.title('Taxa de ocupação ao longo do tempo')
plt.plot(df_groupby['date'], df_groupby['available'])

### Média de preço ao longo do tempo

In [None]:
# Average calendar price per date, drawn as a line.
df_groupby = (
    data_calendar[['date', 'price']]
    .groupby(by='date', as_index=False)
    .mean()
)
plt.figure(figsize=(10, 7))
plt.title('Média de preço ao longo do tempo')
day_axis = df_groupby['date']
mean_price = df_groupby['price']
plt.plot(day_axis, mean_price)