In [1]:
import calendar

import pandas as pd

## Data Loading and Cleaning
- Load the Airbnb and ZHVI datasets.
- Perform initial data cleaning and preparation.

In [None]:
# Load the Airbnb listings and clean them in one chain (no in-place mutation):
#   1. drop listings that have no 'neighbourhood',
#   2. strip dollar signs and commas from 'price' and convert it to float,
#   3. keep only listings whose price is greater than 0.
# The regex is a raw string so that '\$' reaches the regex engine unchanged
# instead of being parsed as an (invalid) Python string escape.
# The trailing .copy() makes the result an independent DataFrame, so later
# column assignments on it do not raise SettingWithCopyWarning.
airbnb_data = (
    pd.read_csv('assets/Airbnb_Open_Data.csv', low_memory=False)
    .dropna(subset=['neighbourhood'])
    .assign(price=lambda df: df['price'].replace(r'[\$,]', '', regex=True).astype(float))
    .loc[lambda df: df['price'] > 0]
    .copy()
)

# Load the ZHVI table and drop rows with a missing 'RegionName'
zhvi_data = pd.read_csv('assets/ZHVI_dataset.csv').dropna(subset=['RegionName'])

# Keep only the ZHVI rows whose 'City' is 'New York' (as an independent copy)
ny_zhvi_data = zhvi_data.loc[zhvi_data['City'] == 'New York'].copy()

### Neighborhood Analysis
- Identify unique neighborhoods in Airbnb and ZHVI datasets.
- Check for discrepancies and find neighborhoods present in one dataset but not in the other.

In [26]:
# Distinct neighbourhood names found in each dataset
airbnb_neighbourhoods = airbnb_data['neighbourhood'].unique()
zhvi_neighbourhoods = ny_zhvi_data['RegionName'].unique()

# Convert to sets so the two name collections can be compared directly
airbnb_set = set(airbnb_neighbourhoods)
zhvi_set = set(zhvi_neighbourhoods)

# Each difference is computed exactly once (the cell previously computed both twice).

# Neighborhoods present in Airbnb but not in ZHVI
not_in_zhvi = airbnb_set.difference(zhvi_set)
print("Neighborhoods in Airbnb data not in ZHVI data:", not_in_zhvi)

# Neighborhoods present in ZHVI but not in Airbnb
not_in_airbnb = zhvi_set.difference(airbnb_set)
print("Neighborhoods in ZHVI data not in Airbnb data:", not_in_airbnb)

Neighborhoods in Airbnb data not in ZHVI data: {'Fieldston', 'Port Morris', 'Clason Point', 'Holliswood', 'Two Bridges', 'Concourse Village', 'Westchester Square', 'North Riverdale', 'Co-op City', 'Kips Bay', 'West Farms', "Bull's Head", 'Stuyvesant Town', 'Little Neck', 'University Heights', 'Arverne', 'Ditmars Steinway', 'Richmondtown', 'Randall Manor', 'South Slope', 'Bayswater', 'Rockaway Beach', 'Columbia St', 'Morrisania', 'Allerton', 'Sea Gate', 'NoHo', 'Edenwald', 'Cypress Hills', 'Claremont Village', 'Chelsea, Staten Island', 'Fort Wadsworth', 'Concord', 'Bronxdale', 'Chinatown', 'Mount Hope', 'Fort Hamilton', 'Breezy Point', 'Howland Hook', 'Navy Yard', 'Marble Hill', 'Edgemere', 'Vinegar Hill', 'Schuylerville', 'Civic Center', 'Spuyten Duyvil', 'East Morrisania', 'Prospect-Lefferts Gardens', 'Nolita', 'Theater District', 'Long Island City', 'Mariners Harbor', 'Downtown Brooklyn', 'Bay Terrace, Staten Island', 'Olinville', 'Mount Eden', 'Hunts Point', 'Unionport'}
Neighborhoo

#### Standardizing Neighborhood Names
Standardize neighborhood names by applying a mapping and re-extract unique neighborhoods.

In [17]:
# Mapping from the spelling found in the Airbnb data to the spelling to use
# instead (presumably the ZHVI 'RegionName' spelling -- confirm against the data).
rename_map = {
    'Battery Park City': 'Battery Park',
    'Throgs Neck': 'Throggs Neck',
}

# Apply the mapping to the Airbnb dataset. assign() builds a new DataFrame rather
# than writing a column into a frame that was produced by boolean filtering,
# which avoids SettingWithCopyWarning.
airbnb_data = airbnb_data.assign(
    neighbourhood=airbnb_data['neighbourhood'].replace(rename_map)
)

# Re-extract the unique neighbourhood names, this time as sets
airbnb_neighbourhoods = set(airbnb_data['neighbourhood'].unique())
zhvi_neighbourhoods = set(ny_zhvi_data['RegionName'].unique())

#### Common Neighborhoods
- Find the common neighborhoods between Airbnb and ZHVI datasets.
- Filter both datasets to include only the common neighborhoods.
- Merge the filtered Airbnb and ZHVI datasets based on the common neighborhood.

In [19]:
# Neighbourhood names that occur in both the Airbnb and the ZHVI name sets
common_neighborhoods = airbnb_neighbourhoods & zhvi_neighbourhoods

# Restrict the Airbnb listings to the shared neighbourhoods
filtered_airbnb_data = airbnb_data.loc[
    airbnb_data['neighbourhood'].isin(common_neighborhoods)
]

# Restrict the New York ZHVI rows to the shared neighbourhoods
filtered_zhvi_data = ny_zhvi_data.loc[
    ny_zhvi_data['RegionName'].isin(common_neighborhoods)
]

# Join listings to ZHVI rows: Airbnb 'neighbourhood' matches ZHVI 'RegionName'
merged_data = filtered_airbnb_data.merge(
    filtered_zhvi_data,
    left_on='neighbourhood',
    right_on='RegionName',
)


### Yearly Analysis
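- Convert the 'last review' column to datetime and extract its year into a new 'last_review_year' column.
- Find the minimum and maximum years in the 'last_review_year' column.
- Define a valid year range (2019–2023) and keep only the rows whose 'last_review_year' falls within it.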

In [20]:
# Parse 'last review' as datetimes; values that cannot be parsed are coerced
# to NaT rather than raising
review_dates = pd.to_datetime(merged_data['last review'], errors='coerce')
merged_data['last review'] = review_dates

# Store the calendar year of each review date in 'last_review_year'
merged_data['last_review_year'] = review_dates.dt.year

# Earliest and latest review years present in the merged data
min_year = merged_data['last_review_year'].min()
max_year = merged_data['last_review_year'].max()

# Inclusive (first, last) bounds of the years to keep
valid_year_range = (2019, 2023)

# Keep only rows whose review year lies within the bounds (both ends inclusive);
# rows with a missing year do not satisfy the condition and are dropped
filtered_data = merged_data[
    merged_data['last_review_year'].between(*valid_year_range)
]

### Yearly Average ZHVI Calculation
- Define a year range from 2019 to 2023 and generate column names representing the last day of each month within this range.
- Calculate the yearly average ZHVI for each neighborhood: group the merged data by 'neighbourhood', take the mean of each monthly column, then average those monthly means within each year.

In [22]:
# First and last year (both inclusive) covered by the ZHVI averages
start_year, end_year = 2019, 2023

# Function to get the last day of the month
def last_day_of_month(year, month):
    """Return the day number of the last day of the given month.

    Args:
        year: Calendar year, e.g. 2020.
        month: Month number, 1 (January) through 12 (December).

    Returns:
        int: 28, 29, 30 or 31.

    Raises:
        ValueError: If ``month`` is outside 1..12 (raised by
            ``calendar.monthrange``).
    """
    # calendar.monthrange applies the full Gregorian leap-year rule (century
    # years are leap years only when divisible by 400), which a bare
    # `year % 4 == 0` check gets wrong for years such as 1900 or 2100.
    return calendar.monthrange(year, month)[1]

# For every year of interest, collect the month-end column names
# ('month/day/year') that actually exist in merged_data
available_columns = set(merged_data.columns)
monthly_columns = {
    year: [
        name
        for name in (
            f'{month}/{last_day_of_month(year, month)}/{year}'
            for month in range(1, 13)
        )
        if name in available_columns
    ]
    for year in range(start_year, end_year + 1)
}

# Yearly average ZHVI per neighbourhood: mean of each monthly column within a
# neighbourhood, then the mean across that year's months. Years for which no
# monthly column was found are skipped. One column per year in the result.
by_neighbourhood = merged_data.groupby('neighbourhood')
yearly_avg_zhvi = pd.DataFrame({
    year: by_neighbourhood[columns].mean().mean(axis=1)
    for year, columns in monthly_columns.items()
    if columns
})

### Airbnb Listing Count and Data Combination
- Calculate the count of Airbnb listings for each neighborhood in the filtered data using the groupby method.
- Combine the Airbnb listing count data with the yearly average ZHVI data based on the 'neighbourhood' column.

In [25]:
# Number of rows in filtered_data per neighbourhood, as a two-column frame
# ('neighbourhood', 'airbnb_listing_count')
airbnb_listing_count = (
    filtered_data
    .groupby('neighbourhood')
    .size()
    .rename('airbnb_listing_count')
    .reset_index()
)

# Attach the yearly average ZHVI columns, matching on 'neighbourhood'
combined_data = airbnb_listing_count.merge(yearly_avg_zhvi, on='neighbourhood')
combined_data

Unnamed: 0,neighbourhood,airbnb_listing_count,2019,2020,2021,2022,2023
0,Arden Heights,9,4.160248e+05,420641.463358,437785.774917,4.790372e+05,4.980696e+05
1,Arrochar,48,6.514846e+05,652610.795350,665944.429200,7.230428e+05,7.304496e+05
2,Astoria,1099,8.116429e+05,790801.140167,769233.179542,7.723108e+05,7.107342e+05
3,Bath Beach,41,8.154337e+05,771852.675717,758463.846733,7.792290e+05,7.546974e+05
4,Battery Park,38,1.043128e+06,998984.391575,962001.675417,1.044703e+06,1.027472e+06
...,...,...,...,...,...,...,...
171,Windsor Terrace,360,6.076832e+05,597522.099082,602737.405725,6.376818e+05,5.673517e+05
172,Woodhaven,725,2.742991e+05,288975.498747,312688.889565,3.515546e+05,3.632531e+05
173,Woodlawn,160,2.524543e+05,258477.365592,274234.074313,2.817326e+05,2.683101e+05
174,Woodrow,1,6.826850e+05,689971.712717,719158.754358,7.805972e+05,7.925477e+05
