In [1]:
import pandas as pd

In [2]:
# Read both raw inputs in one step:
#   airbnb_data - Airbnb listings (low_memory=False so pandas infers each
#                 column's dtype from the whole file instead of per chunk)
#   zhvi_data   - Zillow Home Value Index (ZHVI) table
airbnb_data, zhvi_data = (
    pd.read_csv('assets/Airbnb_Open_Data.csv', low_memory=False),
    pd.read_csv('assets/ZHVI_dataset.csv'),
)

In [3]:
# Quick look at the first rows of each raw dataset.
# A single print with a newline separator emits exactly the same text as
# printing the four pieces one after another.
print(
    "Airbnb Dataset Preview:",
    airbnb_data.head(),
    "\nZHVI Dataset Preview:",
    zhvi_data.head(),
    sep="\n",
)

Airbnb Dataset Preview:
        id                                              NAME      host id  \
0  1001254                Clean & quiet apt home by the park  80014485718   
1  1002102                             Skylit Midtown Castle  52335172823   
2  1002403               THE VILLAGE OF HARLEM....NEW YORK !  78829239556   
3  1002755                                               NaN  85098326012   
4  1003689  Entire Apt: Spacious Studio/Loft by central park  92037596077   

  host_identity_verified host name neighbourhood group neighbourhood  \
0            unconfirmed  Madaline            Brooklyn    Kensington   
1               verified     Jenna           Manhattan       Midtown   
2                    NaN     Elise           Manhattan        Harlem   
3            unconfirmed     Garry            Brooklyn  Clinton Hill   
4               verified    Lyndon           Manhattan   East Harlem   

        lat      long        country  ... service fee minimum nights  \
0  40.64

In [4]:
# Clean the Airbnb listings without mutating the loaded frame in place:
#   1. drop listings that have no neighbourhood,
#   2. strip '$' and ',' from 'price' and convert it to float,
#   3. keep only strictly positive prices (rows whose price is NaN fail the
#      `> 0` comparison, so they are removed here as well).
airbnb_data = (
    airbnb_data
    .dropna(subset=['neighbourhood'])
    .assign(
        # Raw string: '\$' is a regex escape, not a Python string escape.
        # Without the r-prefix Python warns about an invalid escape sequence.
        price=lambda frame: frame['price']
        .replace(r'[\$,]', '', regex=True)
        .astype(float)
    )
    .loc[lambda frame: frame['price'] > 0]
)

In [5]:
# --- Airbnb listings -------------------------------------------------------
# These steps repeat the cleaning done in the previous cell: drop rows that
# lack a neighbourhood or a price, normalise 'price' to float, and keep only
# strictly positive prices. The frame is rebound rather than mutated in place.
airbnb_data = (
    airbnb_data
    .dropna(subset=['neighbourhood', 'price'])
    .assign(
        # Raw string: '\$' is a regex escape, not a Python string escape.
        # Without the r-prefix Python warns about an invalid escape sequence.
        price=lambda frame: frame['price']
        .replace(r'[\$,]', '', regex=True)
        .astype(float)
    )
    .loc[lambda frame: frame['price'] > 0]
)

# --- Zillow ZHVI -----------------------------------------------------------
# Drop rows with no region name; 'RegionName' is the join key used when the
# two datasets are merged.
zhvi_data = zhvi_data.dropna(subset=['RegionName'])

# Keep only the rows whose 'City' is exactly 'New York'.
ny_zhvi_data = zhvi_data[zhvi_data['City'] == 'New York']

In [6]:
# Attach the New York ZHVI rows to each Airbnb listing by matching the
# listing's neighbourhood to the Zillow region name. The inner join keeps
# only listings whose neighbourhood has a matching ZHVI row.
merged_data = airbnb_data.merge(
    ny_zhvi_data,
    how='inner',
    left_on='neighbourhood',
    right_on='RegionName',
)

# Persist the joined table (without the row index) for later use.
merged_data.to_csv('cleaned_data.csv', index=False)

## Analysis

How does the prevalence of Airbnb listings correlate with the spatial distribution of housing prices across different neighborhoods in New York City over the last 5 years?

In [11]:
# Count the Airbnb listings in each neighbourhood.
# (On a map, this count is what drives the listing density.)
# Result: one row per neighbourhood with its count in 'grouped_listings'.
grouped_listings = (
    merged_data
    .groupby('neighbourhood')
    .size()
    .rename('grouped_listings')
    .reset_index()
)

Unnamed: 0,neighbourhood,grouped_listings
0,Arden Heights,9
1,Arrochar,51
2,Astoria,1871
3,Bath Beach,48
4,Bay Ridge,304
...,...,...
159,Windsor Terrace,330
160,Woodhaven,191
161,Woodlawn,29
162,Woodrow,3


In [None]:
# Average Zillow home value per neighbourhood over the past 5 years
# (month columns dated 1/31/2019 or later).
#
# The previous version grouped `grouped_listings` by a 'price' column that
# frame does not have, and passed `name=` to DataFrame.reset_index, which
# only Series.reset_index accepts, so the cell could not run.
ZHVI_START_DATE = pd.Timestamp('2019-01-31')

# The ZHVI month columns are the ones whose *label* is a date. Both the
# 'M/D/YYYY' and 'YYYY-MM-DD' spellings are tried; every other label
# (ids, names, Airbnb fields, ...) becomes NaT and is ignored.
column_labels = merged_data.columns.astype(str)
column_dates = pd.to_datetime(column_labels, format='%m/%d/%Y', errors='coerce')
iso_column_dates = pd.to_datetime(column_labels, format='%Y-%m-%d', errors='coerce')
column_dates = column_dates.where(column_dates.notna(), iso_column_dates)

# NaT compares False, so only real dates on/after the start date survive.
recent_zhvi_columns = merged_data.columns[column_dates >= ZHVI_START_DATE].tolist()
if not recent_zhvi_columns:
    raise ValueError(
        f"No ZHVI date columns on or after {ZHVI_START_DATE.date()} found in merged_data"
    )

# First average each month column within a neighbourhood, then average
# across the selected months to get one value per neighbourhood.
mean_price = (
    merged_data
    .groupby('neighbourhood')[recent_zhvi_columns]
    .mean()
    .mean(axis=1)
    .rename('average price')
    .reset_index()
)