In [1]:
import pandas as pd
from fuzzywuzzy import process

## Data Loading and Cleaning
- Load the Airbnb and ZHVI datasets.
- Perform initial data cleaning and preparation.

In [2]:
# Load the raw Airbnb listings and the ZHVI table from the local assets folder
airbnb_data = pd.read_csv('assets/Airbnb_Open_Data.csv', low_memory=False)
zhvi_data = pd.read_csv('assets/ZHVI_dataset.csv')

# Discard listings that have no 'neighbourhood' value
# (reassignment instead of inplace=True keeps the step explicit)
airbnb_data = airbnb_data.dropna(subset=['neighbourhood'])

# Strip dollar signs and thousands separators from 'price', then convert to float.
# The pattern is a raw string: '\$' inside a normal string literal is an invalid
# escape sequence and triggers a Deprecation/SyntaxWarning on recent Python versions.
airbnb_data['price'] = airbnb_data['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Keep only listings with a strictly positive price
airbnb_data = airbnb_data[airbnb_data['price'] > 0]

# Discard ZHVI rows that have no 'RegionName' value
zhvi_data = zhvi_data.dropna(subset=['RegionName'])

# Restrict the ZHVI table to rows whose 'City' is exactly 'New York'
ny_zhvi_data = zhvi_data[zhvi_data['City'] == 'New York']

### Neighborhood Analysis
- Identify unique neighborhoods in Airbnb and ZHVI datasets.
- Check for discrepancies and find neighborhoods present in one dataset but not in the other.

In [3]:
# Distinct neighbourhood names on each side: Airbnb listings and New York ZHVI regions
airbnb_neighbourhoods = airbnb_data['neighbourhood'].unique()
zhvi_neighbourhoods = ny_zhvi_data['RegionName'].unique()

# Convert to sets for comparison
airbnb_set = set(airbnb_neighbourhoods)
zhvi_set = set(zhvi_neighbourhoods)

# Neighborhoods present in Airbnb but not in ZHVI (computed once)
not_in_zhvi = airbnb_set.difference(zhvi_set)
print("Neighborhoods in Airbnb data not in ZHVI data:", not_in_zhvi)

# Neighborhoods present in ZHVI but not in Airbnb (computed once)
not_in_airbnb = zhvi_set.difference(airbnb_set)
print("Neighborhoods in ZHVI data not in Airbnb data:", not_in_airbnb)

Neighborhoods in Airbnb data not in ZHVI data: {'Edgemere', 'Stuyvesant Town', 'Prospect-Lefferts Gardens', 'Edenwald', 'Randall Manor', 'Breezy Point', 'Concourse Village', 'Downtown Brooklyn', 'NoHo', 'Clason Point', 'Bayswater', 'Claremont Village', 'Civic Center', 'Schuylerville', 'Chinatown', 'Long Island City', 'Spuyten Duyvil', 'Co-op City', 'Mount Eden', 'Richmondtown', 'Howland Hook', 'Port Morris', 'Two Bridges', 'West Farms', 'University Heights', 'Westchester Square', 'Marble Hill', 'South Slope', 'Cypress Hills', 'Kips Bay', 'Battery Park City', "Bull's Head", 'Navy Yard', 'Mount Hope', 'Vinegar Hill', 'Bronxdale', 'Mariners Harbor', 'Little Neck', 'Hunts Point', 'Arverne', 'Throgs Neck', 'North Riverdale', 'Morrisania', 'Sea Gate', 'Rockaway Beach', 'Allerton', 'Nolita', 'Bay Terrace, Staten Island', 'Fieldston', 'Olinville', 'Holliswood', 'Theater District', 'Fort Hamilton', 'Fort Wadsworth', 'Ditmars Steinway', 'Chelsea, Staten Island', 'Columbia St', 'Unionport', 'East

#### Standardizing Neighborhood Names
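Standardize neighborhood names: fuzzy-match the names that appear in only one dataset (minimum score 95), apply the resulting mapping to the Airbnb data, and re-extract the unique neighborhoods from both datasets.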

In [4]:
rename_map = {}
def get_matches(query, choices, score_cutoff=95):
    """
    Look up the single best fuzzy match in ``choices`` for every name in ``query``.

    Each name is passed to ``process.extractOne``; names for which that call
    yields a falsy result (no candidate reached the cutoff) are left out.

    :param query: Iterable of names to match.
    :param choices: Collection of candidate names to match against.
    :param score_cutoff: Minimum score forwarded to ``process.extractOne``.
    :return: Dict mapping each matched query name to its best candidate.
    """
    best_by_name = {}
    for name in query:
        best = process.extractOne(name, choices, score_cutoff=score_cutoff)
        if not best:
            # Nothing scored high enough for this name; skip it
            continue
        # The first element of the result is the matched candidate
        best_by_name[name] = best[0]
    return best_by_name

# The matcher iterates over its query, so materialise both one-sided sets as lists
not_in_zhvi_list = list(not_in_zhvi)
not_in_airbnb_list = list(not_in_airbnb)

# Best ZHVI candidate for each Airbnb-only name, and
# best Airbnb candidate for each ZHVI-only name
matches_airbnb_to_zhvi = get_matches(not_in_zhvi_list, zhvi_neighbourhoods)
matches_zhvi_to_airbnb = get_matches(not_in_airbnb_list, airbnb_neighbourhoods)

# Fold both directions into the shared rename_map (Airbnb->ZHVI first, then ZHVI->Airbnb).
# NOTE(review): the keys of matches_zhvi_to_airbnb come from not_in_airbnb (ZHVI names
# absent from the Airbnb data), so that half of the map presumably never matches a value
# in the Airbnb column below -- confirm whether the ZHVI side was meant to be renamed.
for direction_matches in (matches_airbnb_to_zhvi, matches_zhvi_to_airbnb):
    rename_map.update(direction_matches)

# Rewrite the Airbnb neighbourhood names according to the mapping
renamed_neighbourhoods = airbnb_data['neighbourhood'].replace(rename_map)
airbnb_data['neighbourhood'] = renamed_neighbourhoods

# Refresh the unique-name collections (now as sets) after the renaming
airbnb_neighbourhoods = set(airbnb_data['neighbourhood'].unique())
zhvi_neighbourhoods = set(ny_zhvi_data['RegionName'].unique())

#### Common Neighborhoods
- Find the common neighborhoods between Airbnb and ZHVI datasets.
- Filter both datasets to include only the common neighborhoods.
- Merge the filtered Airbnb and ZHVI datasets based on the common neighborhood.

In [5]:
# Names that occur in both name sets
common_neighborhoods = airbnb_neighbourhoods & zhvi_neighbourhoods

# Airbnb listings located in one of the shared neighbourhoods
airbnb_in_common = airbnb_data['neighbourhood'].isin(common_neighborhoods)
filtered_airbnb_data = airbnb_data[airbnb_in_common]

# New York ZHVI rows whose region is one of the shared neighbourhoods
zhvi_in_common = ny_zhvi_data['RegionName'].isin(common_neighborhoods)
filtered_zhvi_data = ny_zhvi_data[zhvi_in_common]

# Join listings to ZHVI rows on the neighbourhood name (default inner join)
merged_data = filtered_airbnb_data.merge(
    filtered_zhvi_data,
    left_on='neighbourhood',
    right_on='RegionName',
)


### Yearly Analysis
- Parse the 'last review' column as dates, derive a 'last_review_year' column from it, and find the minimum and maximum years in that column.
- Define a valid year range (2019–2023, inclusive) and filter the data accordingly.

In [6]:
# Parse 'last review' as datetimes; errors='coerce' turns unparseable values into NaT
merged_data['last review'] = pd.to_datetime(merged_data['last review'], errors='coerce')

# Store the calendar year of the last review in its own column
merged_data['last_review_year'] = merged_data['last review'].dt.year

# Earliest and latest review year present in the data
review_years = merged_data['last_review_year']
min_year = review_years.min()
max_year = review_years.max()

# Inclusive range of years to keep
valid_year_range = (2019, 2023)
first_valid_year, last_valid_year = valid_year_range

# Rows whose review year lies inside the range (both endpoints included)
in_valid_range = merged_data['last_review_year'].between(first_valid_year, last_valid_year)
filtered_data = merged_data[in_valid_range]

In [7]:
import plotly.express as px

# Histogram over merged_data, i.e. every review year, not only the rows kept in filtered_data
fig = px.histogram(
    merged_data,
    x='last_review_year',
    title='Histogram of Reviews by Year',
    labels={'last_review_year': 'Year', 'count': 'Number of Reviews'},
)

# Axis titles and legend visibility
fig.update_layout(
    xaxis={'title': 'Year'},
    yaxis={'title': 'Number of Reviews'},
    showlegend=False,
)


### Yearly Average ZHVI Calculation
- Define a year range from 2019 to 2023 and generate column names representing the last day of each month within this range.
- Calculate the yearly average ZHVI for neighborhoods by grouping and computing the mean values for each year.

In [8]:
# First and last year (both inclusive) covered by the yearly ZHVI averages
start_year, end_year = 2019, 2023

# Function to get the last day of the month
def last_day_of_month(year, month):
    """
    Return the day number of the last day of the given month.

    Uses the standard library calendar rules, so February follows the full
    Gregorian leap-year rule: century years are leap years only when divisible
    by 400 (a plain ``year % 4`` test gets 1900 or 2100 wrong).

    :param year: Calendar year, e.g. 2020.
    :param month: Month number, 1 (January) through 12 (December).
    :return: 28, 29, 30 or 31.
    :raises ValueError: If ``month`` is outside 1..12 (raised by ``calendar.monthrange``).
    """
    # Local import keeps this cell self-contained
    import calendar

    # monthrange returns (weekday of the first day, number of days in the month)
    return calendar.monthrange(year, month)[1]

# For each year of interest, build the twelve month-end labels ("M/D/YYYY")
# and keep only those that exist as columns of merged_data
monthly_columns = {
    year: [
        label
        for label in (
            f'{month}/{last_day_of_month(year, month)}/{year}'
            for month in range(1, 13)
        )
        if label in merged_data.columns
    ]
    for year in range(start_year, end_year + 1)
}

# Per neighbourhood: mean of each monthly column, then the mean across that year's months.
# Years for which no column was found are skipped.
neighbourhood_groups = merged_data.groupby('neighbourhood')
yearly_means = {
    year: neighbourhood_groups[columns].mean().mean(axis=1)
    for year, columns in monthly_columns.items()
    if columns
}

# One column per year, indexed by neighbourhood
yearly_avg_zhvi = pd.DataFrame(yearly_means)

### Airbnb Listing Count and Data Combination
- Calculate the count of Airbnb listings for each neighborhood in the filtered data using the groupby method.
- Combine the Airbnb listing count data with the yearly average ZHVI data based on the 'neighbourhood' column.

In [9]:
# Number of rows of filtered_data per neighbourhood, as a two-column frame
listings_per_neighbourhood = filtered_data.groupby('neighbourhood').size()
airbnb_listing_count = listings_per_neighbourhood.reset_index(name='airbnb_listing_count')

# Attach the yearly average ZHVI columns, matching on 'neighbourhood'
combined_data = airbnb_listing_count.merge(yearly_avg_zhvi, on='neighbourhood')
combined_data

Unnamed: 0,neighbourhood,airbnb_listing_count,2019,2020,2021,2022,2023
0,Arden Heights,9,4.160248e+05,4.206415e+05,4.377858e+05,4.790372e+05,4.980696e+05
1,Arrochar,48,6.514846e+05,6.526108e+05,6.659444e+05,7.230428e+05,7.304496e+05
2,Astoria,1099,8.116429e+05,7.908011e+05,7.692332e+05,7.723108e+05,7.107342e+05
3,Bath Beach,41,8.154337e+05,7.718527e+05,7.584638e+05,7.792290e+05,7.546974e+05
4,Battery Park,38,1.043128e+06,9.989844e+05,9.620017e+05,1.044703e+06,1.027472e+06
...,...,...,...,...,...,...,...
165,Windsor Terrace,180,1.186847e+06,1.165920e+06,1.165856e+06,1.223001e+06,1.073213e+06
166,Woodhaven,145,6.185588e+05,6.423971e+05,6.620362e+05,6.977369e+05,6.928795e+05
167,Woodlawn,20,8.221766e+05,7.874717e+05,7.385032e+05,6.724403e+05,5.605188e+05
168,Woodrow,1,6.826850e+05,6.899717e+05,7.191588e+05,7.805972e+05,7.925477e+05


In [10]:
import geopandas as gpd

# Read the NYC shapefile from the local assets folder
nyc_geo_data = gpd.read_file('assets/nyc_geo_export.shp')

# Show the first rows of the geospatial table as plain text
print(nyc_geo_data.head())

# Join the geo rows to the combined Airbnb/ZHVI table: geo 'name' == 'neighbourhood'
merged_geo_data = nyc_geo_data.merge(
    right=combined_data,
    left_on='name',
    right_on='neighbourhood',
)
merged_geo_data

   annoangle    annoline1 annoline2 annoline3 borough         name  objectid  \
0        0.0    Wakefield      None      None   Bronx    Wakefield       1.0   
1        0.0        Co-op      City      None   Bronx   Co-op City       2.0   
2        0.0  Eastchester      None      None   Bronx  Eastchester       3.0   
3        0.0    Fieldston      None      None   Bronx    Fieldston       4.0   
4        0.0    Riverdale      None      None   Bronx    Riverdale       5.0   

   stacked                    geometry  
0      1.0  POINT (-73.84720 40.89471)  
1      2.0  POINT (-73.82994 40.87429)  
2      1.0  POINT (-73.82781 40.88756)  
3      1.0  POINT (-73.90564 40.89544)  
4      1.0  POINT (-73.91259 40.89083)  


Unnamed: 0,annoangle,annoline1,annoline2,annoline3,borough,name,objectid,stacked,geometry,neighbourhood,airbnb_listing_count,2019,2020,2021,2022,2023
0,0.0,Wakefield,,,Bronx,Wakefield,1.0,1.0,POINT (-73.84720 40.89471),Wakefield,135,477211.362925,505057.977775,549995.761633,5.770382e+05,5.696908e+05
1,0.0,Eastchester,,,Bronx,Eastchester,3.0,1.0,POINT (-73.82781 40.88756),Eastchester,22,447755.704525,483757.591817,509303.442900,5.350148e+05,5.378104e+05
2,0.0,Riverdale,,,Bronx,Riverdale,5.0,1.0,POINT (-73.91259 40.89083),Riverdale,9,379772.714608,389751.960275,420634.175967,4.231926e+05,3.761588e+05
3,0.0,Kingsbridge,,,Bronx,Kingsbridge,6.0,1.0,POINT (-73.90282 40.88169),Kingsbridge,107,386008.618792,382825.563817,402640.634275,3.786207e+05,3.086583e+05
4,0.0,Woodlawn,,,Bronx,Woodlawn,8.0,1.0,POINT (-73.86731 40.89827),Woodlawn,20,822176.614458,787471.702592,738503.219242,6.724403e+05,5.605188e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,0.0,Jamaica,Hills,,Queens,Jamaica Hills,266.0,2.0,POINT (-73.79646 40.71146),Jamaica Hills,14,763841.243575,758900.428733,765889.175967,8.094248e+05,8.065240e+05
157,0.0,Willowbrook,,,Staten Island,Willowbrook,288.0,1.0,POINT (-74.13208 40.60371),Willowbrook,3,562630.466342,564864.765033,577841.134817,6.274318e+05,6.358395e+05
158,0.0,Middle,Village,,Queens,Middle Village,293.0,2.0,POINT (-73.88114 40.71641),Middle Village,53,779191.097367,789792.278942,792410.380983,8.359975e+05,8.353228e+05
159,0.0,Prince's,Bay,,Staten Island,Prince's Bay,294.0,2.0,POINT (-74.20153 40.52626),Prince's Bay,3,708610.267808,716206.019975,736548.805942,8.002696e+05,8.112921e+05
