In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [2]:
# Load the listings table and the calendar table from the local data folder.
listing_df = pd.read_csv('data/listings.csv')
price_df = pd.read_csv('data/calendar.csv')


def _parse_price(prices):
    """Strip '$' and ',' from a price column and cast it to float.

    Raw-string patterns are used on purpose: '\\$' written in a normal string
    literal is an invalid escape sequence and triggers a warning on recent
    Python versions.
    """
    return prices.replace({r'\$': '', ',': ''}, regex=True).astype(float)


def _mean_price_per_listing(calendar):
    """Return one row per listing with columns 'id' and 'average_price'.

    The calendar is keyed by 'listing_id'; it is renamed to 'id' so the result
    can be joined to the listings table on 'id'.
    """
    return (
        calendar.groupby('listing_id')['price']
        .mean()
        .reset_index()
        .rename(columns={'price': 'average_price', 'listing_id': 'id'})
    )


listing_df['price'] = _parse_price(listing_df['price'])
price_df['price'] = _parse_price(price_df['price'])

# Average over the calendar rows whose 'available' flag is 't' only ...
grouped_avail_price_df = _mean_price_per_listing(price_df[price_df['available'] == 't'])
# ... and over every calendar row regardless of availability.
grouped_all_price_df = _mean_price_per_listing(price_df)

# The raw calendar is no longer needed once the averages exist; release it.
del price_df





  listing_df['price'] = listing_df['price'].replace({'\$': '', ',': ''}, regex=True).astype(float)
  price_df['price'] = price_df['price'].replace({'\$': '', ',': ''}, regex=True).astype(float)


In [3]:
def _fill_missing_price(listings, average_prices):
    """Fill missing listing prices from a per-listing average price table.

    Args:
        listings: frame with at least the columns 'id' and 'price'.
        average_prices: frame with the columns 'id' and 'average_price'.

    Returns:
        A new frame with the same rows as `listings`. An existing 'price' is
        kept; only missing values are replaced by 'average_price'. The helper
        column 'average_price' is dropped again.

    Raises:
        pandas.errors.MergeError: if `average_prices` has duplicate ids, which
            would otherwise silently duplicate listing rows.
    """
    merged = listings.merge(average_prices, on='id', how='left', validate='m:1')
    merged['price'] = merged['price'].fillna(merged['average_price'])
    return merged.drop(columns=['average_price'])


def _print_price_summary(df, stage):
    """Print the row count and the number of missing prices of `df`."""
    # Double-quoted f-strings: reusing the outer quote type inside the
    # replacement field is a SyntaxError before Python 3.12.
    print(f" Number of rows {stage} merge {df.shape[0]}")
    print(f"number of missed price: {df['price'].isnull().sum()}")


_print_price_summary(listing_df, 'before')

# Variant 1: fall back to the average price over available calendar rows.
avail_merged_df = _fill_missing_price(listing_df, grouped_avail_price_df)
_print_price_summary(avail_merged_df, 'after')

# Variant 2: fall back to the average price over all calendar rows.
all_merged_df = _fill_missing_price(listing_df, grouped_all_price_df)
_print_price_summary(all_merged_df, 'after')

 Number of rows before merge 7281
number of missed price: 2554
 Number of rows after merge 7281
number of missed price: 2147
 Number of rows after merge 7281
number of missed price: 0


In [4]:
# Listings that accommodate this many guests or more share one category.
ACCOMMODATES_CAP = 6
# Listings priced above this amount for one night are treated as outliers.
MAX_NIGHTLY_PRICE = 1000


def _group_accommodates(guests):
    """Return the 'accommodates' value as a string, pooling 6 and more as '6+'."""
    return f'{ACCOMMODATES_CAP}+' if guests >= ACCOMMODATES_CAP else str(guests)


def _group_and_drop_outliers(listings):
    """Add 'accommodates_grouped' and drop rows priced above MAX_NIGHTLY_PRICE.

    The filter is computed from the frame's OWN 'price' column. Rows whose
    price is missing are kept: they are not outliers, merely unpriced.
    A copy is returned so later column assignments do not write to a slice.
    """
    grouped = listings.assign(
        accommodates_grouped=listings['accommodates'].apply(_group_accommodates)
    )
    return grouped[~(grouped['price'] > MAX_NIGHTLY_PRICE)].copy()


# Each frame is filtered on its own prices. Previously the 'avail' frame was
# filtered with the mask of the 'all' frame, whose filled-in prices differ.
avail_merged_df = _group_and_drop_outliers(avail_merged_df)
all_merged_df = _group_and_drop_outliers(all_merged_df)



In [5]:
# Reference point for the city centre of Munich, as (latitude, longitude).
_MUNICH_LATITUDE = 48.137154
_MUNICH_LONGITUDE = 11.576124
munich_center = (_MUNICH_LATITUDE, _MUNICH_LONGITUDE)

def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two points using the haversine formula.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.
            All four may be scalars or NumPy-compatible arrays/Series; the
            arithmetic is elementwise.
        radius_km: radius of the sphere; defaults to 6371 (Earth, in km).

    Returns:
        The distance in the unit of `radius_km`, with the shape produced by
        combining the inputs elementwise.
    """
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat / 2) ** 2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would turn sqrt(1 - a) into NaN; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius_km * c

# Distance bands used to categorise listings: each entry pairs the lower edge
# of a band (in km) with the label shown for that band.
_distance_bands = [
    (0, '0-3 km'),
    (3, '3-5 km'),
    (5, '5-10 km'),
    (10, '10+ km'),
]
# Bin edges: every lower edge plus an open upper end for the last band.
bins = [lower_edge for lower_edge, _band_label in _distance_bands] + [np.inf]
labels = [_band_label for _lower_edge, _band_label in _distance_bands]

In [6]:
def _add_distance_features(listings):
    """Return `listings` with 'distance_from_center' and 'distance_category' added.

    The distance is computed for the whole 'latitude'/'longitude' columns in
    one vectorised call to `haversine` instead of a Python-level call per row.
    `include_lowest=True` makes the first band closed on the left, so a
    distance of exactly 0 is categorised instead of becoming missing.
    """
    distance_km = haversine(
        listings['latitude'], listings['longitude'], munich_center[0], munich_center[1]
    )
    return listings.assign(
        distance_from_center=distance_km,
        distance_category=pd.cut(distance_km, bins=bins, labels=labels, include_lowest=True),
    )


# Calculate the distance of each listing from the center of Munich
avail_merged_df = _add_distance_features(avail_merged_df)
all_merged_df = _add_distance_features(all_merged_df)


In [8]:
# Write both processed listing frames to CSV, leaving out the index column.
for _frame, _csv_path in (
    (avail_merged_df, 'data/listings_avail_processed.csv'),
    (all_merged_df, 'data/listings_all_processed.csv'),
):
    _frame.to_csv(_csv_path, index=False)