# Install packages

In [None]:
!pip3 install pandas
!pip3 install numpy
!pip3 install scikit-learn
!pip3 install plotly
!pip3 install shapely
!pip3 install geopandas

## How does the prevalence of Airbnb listings correlate with the spatial distribution of housing market trends across the diverse boroughs and neighbourhoods in New York City?

In [18]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import plotly.express as px
import plotly.graph_objects as go
from shapely.geometry import Point
import geopandas as gpd

# Load the project dataset. Later cells read the "id", "neighbourhood", "borough"
# and "geometry" columns, plus the price columns whose headers contain "/".
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, which avoids mixed-dtype inference on large files.
df = pd.read_csv("assets/project_data.csv", low_memory=False)

### Data Preparation 
- Transform data from separate price columns per date to a single "average_price" column with corresponding dates, making it easier to track trends and compare prices.
- Remove duplicate rows based on neighbourhood and date.

In [19]:
# Price columns are recognised by the "/" in their header (month/day/year dates).
price_columns = [header for header in df.columns if "/" in header]

# Wide -> long reshape, then keep the first row of every (neighbourhood, date)
# pair and parse the former column headers into timestamps.
melted_df = (
    pd.melt(
        df,
        id_vars=["neighbourhood", "borough", "geometry"],
        value_vars=price_columns,
        var_name="date",
        value_name="average_price",
    )
    .drop_duplicates(subset=["neighbourhood", "date"])
    .assign(date=lambda long_df: pd.to_datetime(long_df["date"], format="%m/%d/%Y"))
)

melted_df

Unnamed: 0,neighbourhood,borough,geometry,date,average_price
0,Kensington,Brooklyn,POINT (-73.98042110559481 40.64238195800357),2019-01-31,7.054118e+05
199,Midtown,Manhattan,POINT (-73.98166882730311 40.75469110270627),2019-01-31,1.731643e+06
1938,Clinton Hill,Brooklyn,POINT (-73.96784306216374 40.69322942188155),2019-01-31,1.020662e+06
2571,Murray Hill,Manhattan,POINT (-73.97833207924134 40.74830307725222),2019-01-31,1.007439e+06
3509,Upper West Side,Manhattan,POINT (-73.9770592363061 40.7876579985349),2019-01-31,1.475808e+06
...,...,...,...,...,...
2743372,Westerleigh,Staten Island,POINT (-74.13304143951713 40.62109047275413),2023-12-31,6.870352e+05
2743373,Glen Oaks,Queens,POINT (-73.71548118999152 40.74944079974336),2023-12-31,3.503167e+05
2743375,Gerritsen Beach,Brooklyn,POINT (-73.93010170691203 40.59084843390208),2023-12-31,6.344947e+05
2743378,Woodrow,Staten Island,POINT (-74.22135055371616 40.53745265012707),2023-12-31,7.921059e+05


### Calculating Price Trends within Neighborhoods
- Group data based on neighborhood, borough, and geometry
- Trend Calculation
    - Sorts data chronologically for each group.
    - Applies linear regression to model price changes over time.
    - Extracts the slope as the 'price trend'.
- Normalization applied to scale price trends for relative comparison

In [20]:
# One group per (neighbourhood, borough, geometry) combination
grouped_df = melted_df.groupby(["neighbourhood", "borough", "geometry"])
trends = []

# Fit a straight line through each group's price history and keep its slope
for (neighbourhood, borough, geometry), history in grouped_df:
    # Regression input must be in chronological order
    history = history.sort_values("date")

    # Regressor: whole days elapsed since the group's first observation,
    # shaped (n_samples, 1) as scikit-learn expects
    first_date = history["date"].min()
    elapsed_days = (history["date"] - first_date).dt.days.to_numpy().reshape(-1, 1)
    observed_prices = history["average_price"].to_numpy()

    # The fitted slope is the price change per day, i.e. the 'price trend'
    fitted = LinearRegression().fit(elapsed_days, observed_prices)
    trends.append((neighbourhood, borough, geometry, fitted.coef_[0]))

# Collect the per-group slopes into a DataFrame
trends_df = pd.DataFrame(trends, columns=["neighbourhood", "borough", "geometry", "price_trend"])

# Min-max scale the slopes to [0, 1] so neighbourhoods can be compared relatively
lowest_trend = trends_df["price_trend"].min()
highest_trend = trends_df["price_trend"].max()
trends_df["normalized_price_trend"] = (trends_df["price_trend"] - lowest_trend) / (
    highest_trend - lowest_trend
)
trends_df

Unnamed: 0,neighbourhood,borough,geometry,price_trend,normalized_price_trend
0,Arden Heights,Staten Island,POINT (-74.185886745839 40.549285822783254),60.568368,0.828564
1,Arrochar,Staten Island,POINT (-74.06712363225581 40.596312571276776),62.589755,0.832133
2,Astoria,Queens,POINT (-73.91565374304241 40.768508593354966),-61.206646,0.613548
3,Bath Beach,Brooklyn,POINT (-73.99875221443526 40.599518702822415),-32.476624,0.664276
4,Bay Ridge,Brooklyn,POINT (-74.03062069353821 40.62580106501069),-134.348024,0.484403
...,...,...,...,...,...
147,Windsor Terrace,Brooklyn,POINT (-73.98007340430179 40.65694583575108),-46.517918,0.639483
148,Woodhaven,Queens,POINT (-73.85811046554328 40.689886879157925),55.334887,0.819324
149,Woodlawn,Bronx,POINT (-73.86731496814183 40.898272612138086),-175.933390,0.410976
150,Woodrow,Staten Island,POINT (-74.22135055371616 40.53745265012707),84.505881,0.870830


### Interactive Plotly Visualization: Borough and Neighborhood Trends
- Calculates borough averages and trends using linear regression.
- Prepares neighborhood data and computes trend lines.
- Dropdown menu enables toggling visibility of borough and neighborhood trends.
- Allows comparison of average prices and trends for specific neighborhoods.

In [21]:
def _linear_trend(dates, prices):
    """Return the least-squares trend line evaluated at every input date.

    Parameters
    ----------
    dates : pandas.Series
        Datetime values; converted to whole days elapsed since ``dates.min()``.
    prices : pandas.Series
        Prices aligned row-by-row with ``dates``.

    Returns
    -------
    numpy.ndarray
        One fitted price per input row, in input order.
    """
    elapsed_days = np.array((dates - dates.min()).dt.days).reshape(-1, 1)
    model = LinearRegression().fit(elapsed_days, prices.values)
    # predict() applies both the fitted slope AND the fitted intercept, so the
    # line is the real regression line rather than one anchored to a data point.
    return model.predict(elapsed_days)


# Create a Plotly figure
fig = go.Figure()

# Owner of every trace, appended in the same order as the traces are added to
# `fig`: None for the borough overview, otherwise the neighbourhood name.
# The dropdown buttons derive their visibility masks from this list, so they
# stay correct even if a neighbourhood ends up with fewer than three traces.
trace_owners = []

# Generate a sorted list of unique neighborhoods and boroughs from the melted DataFrame
neighborhoods = sorted(melted_df["neighbourhood"].unique())
boroughs = melted_df["borough"].unique()

# Mean price per borough and date
borough_averages = melted_df.groupby(["borough", "date"])["average_price"].mean().reset_index()

# borough name -> (dates, fitted trend values), reused in the neighbourhood views
borough_trends = {}

# Borough overview: average price series plus its regression line
for borough in boroughs:
    df_borough_avg = borough_averages[borough_averages["borough"] == borough]
    dates = df_borough_avg["date"]
    prices = df_borough_avg["average_price"]

    trend_line_borough = _linear_trend(dates, prices)
    borough_trends[borough] = (dates, trend_line_borough)

    fig.add_trace(
        go.Scatter(x=dates, y=prices, mode="lines+markers", name=f"{borough} Average", visible=True)
    )
    fig.add_trace(
        go.Scatter(
            x=dates, y=trend_line_borough, mode="lines", name=f"{borough} Trend", visible=True
        )
    )
    trace_owners += [None, None]

# Neighbourhood views: raw points, the neighbourhood's own regression line and,
# for context, the regression line of the borough it belongs to
for neighborhood in neighborhoods:
    # Chronological order, matching how the per-neighbourhood trends are computed
    df_filtered = melted_df[melted_df["neighbourhood"] == neighborhood].sort_values("date")
    borough_name = df_filtered["borough"].iloc[0]

    dates = df_filtered["date"]
    prices = df_filtered["average_price"]

    # Neighbourhood data points (hidden until selected in the dropdown)
    fig.add_trace(go.Scatter(x=dates, y=prices, mode="markers", name=neighborhood, visible=False))
    trace_owners.append(neighborhood)

    # Neighbourhood regression line, fitted the same way as the borough lines.
    # Previously the intercept was forced through the first observation, which
    # shifted the drawn line away from the actual least-squares fit.
    fig.add_trace(
        go.Scatter(
            x=dates,
            y=_linear_trend(dates, prices),
            mode="lines",
            name=f"{neighborhood} Trend",
            visible=False,
        )
    )
    trace_owners.append(neighborhood)

    # Borough regression line for comparison, when one was computed
    borough_trend_data = borough_trends.get(borough_name)
    if borough_trend_data is not None:
        borough_dates, borough_fit = borough_trend_data
        fig.add_trace(
            go.Scatter(
                x=borough_dates,
                y=borough_fit,
                mode="lines",
                name=f"{borough_name} Trend",
                visible=False,
            )
        )
        trace_owners.append(neighborhood)

# Dropdown: first entry shows the borough overview, then one entry per neighbourhood
buttons = [
    dict(
        label="Borough Averages",
        method="update",
        args=[
            {"visible": [owner is None for owner in trace_owners]},
            {"title": "Borough Averages and Trends"},
        ],
    )
]

for neighborhood in neighborhoods:
    buttons.append(
        dict(
            label=neighborhood,
            method="update",
            args=[
                # Show exactly the traces that were added for this neighbourhood
                {"visible": [owner == neighborhood for owner in trace_owners]},
                {"title": f"Average Price and Trend for {neighborhood}"},
            ],
        )
    )

# Update layout for dropdown
fig.update_layout(
    updatemenus=[dict(active=0, buttons=buttons, x=0.0, xanchor="left", y=1.1, yanchor="top")],
    title="Select a Neighborhood or View Borough Averages and Trends",
)

# Show the figure
fig.show()

### Interactive Plotly Visualization: Airbnb Listings by Neighborhood and Borough
- Group Airbnb listings by neighborhood, borough, and geometry, computing the count of listings.
- Generate a scatter plot with neighborhoods on the x-axis and listing counts on the y-axis.
- Points are colored by borough and sized based on the number of listings.

In [22]:
# Number of listings (non-null "id" values) per neighbourhood/borough/geometry
airbnb_listings_count = (
    df.groupby(["neighbourhood", "borough", "geometry"])["id"]
    .count()
    .rename("listing_count")
    .reset_index()
)

# Tooltip fields and axis labels for the scatter plot
hover_fields = {"borough": True, "listing_count": True}
axis_labels = {"neighbourhood": "Neighborhood", "listing_count": "Listing Count"}

# One marker per neighbourhood: coloured by borough, sized by listing count
fig = px.scatter(
    airbnb_listings_count,
    x="neighbourhood",
    y="listing_count",
    color="borough",
    size="listing_count",
    hover_name="neighbourhood",
    hover_data=hover_fields,
    title="Airbnb Listings by Neighborhood and Borough",
    labels=axis_labels,
)

# Axis/legend titles
fig.update_layout(xaxis_title="Neighborhood", yaxis_title="Listing Count", legend_title="Borough")

# Slightly transparent markers with a thin black outline
marker_style = dict(opacity=0.7, line=dict(width=1, color="Black"))
fig.update_traces(marker=marker_style)

# Show the figure
fig.show()

### GeoDataFrame Creation: Merging and Geospatial Conversion
- Merge trends data (trends_df) and Airbnb listings count data (airbnb_listings_count) based on neighborhood, borough, and geometry.
- Geospatial Conversion
    - Define a function (convert_to_point) to convert geometry strings to Point objects.
    - Apply the conversion function to the 'geometry' column of the merged DataFrame.

In [23]:
# Attach listing counts to every trend row; the left join keeps neighbourhoods
# that have a price trend but no matching listing-count row.
analysis_df = trends_df.merge(
    airbnb_listings_count,
    how="left",
    on=["neighbourhood", "borough", "geometry"],
)


# Convert 'geometry' column to Point
def convert_to_point(geometry_str):
    """Build a shapely ``Point`` from a ``"POINT (x y)"`` text value.

    The ``"POINT ("`` prefix and any ``")"`` are removed, the remainder is split
    on whitespace, and the first two tokens are parsed as floats for x and y.
    """
    bare_coordinates = geometry_str.replace("POINT (", "").replace(")", "")
    tokens = bare_coordinates.split()
    x_value = float(tokens[0])
    y_value = float(tokens[1])
    return Point(x_value, y_value)


# Replace the text geometries with shapely Point objects
analysis_df["geometry"] = analysis_df["geometry"].map(convert_to_point)

# Build the GeoDataFrame and declare its coordinate reference system in one
# step; EPSG:4326 is a common geographic (longitude/latitude) system.
gdf = gpd.GeoDataFrame(analysis_df, geometry="geometry", crs="EPSG:4326")

gdf

Unnamed: 0,neighbourhood,borough,geometry,price_trend,normalized_price_trend,listing_count
0,Arden Heights,Staten Island,POINT (-74.18589 40.54929),60.568368,0.828564,9
1,Arrochar,Staten Island,POINT (-74.06712 40.59631),62.589755,0.832133,47
2,Astoria,Queens,POINT (-73.91565 40.76851),-61.206646,0.613548,1094
3,Bath Beach,Brooklyn,POINT (-73.99875 40.59952),-32.476624,0.664276,40
4,Bay Ridge,Brooklyn,POINT (-74.03062 40.62580),-134.348024,0.484403,203
...,...,...,...,...,...,...
147,Windsor Terrace,Brooklyn,POINT (-73.98007 40.65695),-46.517918,0.639483,180
148,Woodhaven,Queens,POINT (-73.85811 40.68989),55.334887,0.819324,143
149,Woodlawn,Bronx,POINT (-73.86731 40.89827),-175.933390,0.410976,20
150,Woodrow,Staten Island,POINT (-74.22135 40.53745),84.505881,0.870830,1


### Interactive Map Visualization: NYC Airbnb Listings and Housing Price Trends
- Plot each neighbourhood as a circle on a map, with the size of the circle representing the number of Airbnb listings in that neighbourhood.
- Color the circles based on the normalized price trend of the neighborhood.
- Display neighborhood and borough names, normalized price trends, and listing counts upon hover.
- Add a color bar to indicate the range of normalized price trends. 

In [24]:
# Columns exposed to the hover tooltip.
# NOTE(review): the hover template below addresses these by position through
# customdata[0..3], so keep this order in sync with the template.
hover_columns = {
    "neighbourhood": True,
    "borough": True,
    "normalized_price_trend": ":.2f",
    "listing_count": True,
}

# One line of tooltip text per hover column
hover_template = "<br>".join(
    [
        "Neighbourhood: %{customdata[0]}",
        "Borough: %{customdata[1]}",
        "Normalized Price Trend: %{customdata[2]:.2f}",
        "Listing Count: %{customdata[3]}",
    ]
)

# One circle per row of gdf: position from the geometry, colour from the
# normalized price trend, size from the listing count
fig = px.scatter_mapbox(
    gdf,
    lat=gdf.geometry.y,
    lon=gdf.geometry.x,
    color="normalized_price_trend",
    size="listing_count",
    size_max=25,  # upper bound on marker size so the circles stay readable
    color_continuous_scale=px.colors.cyclical.IceFire,
    color_continuous_midpoint=0.5,  # centre of the colour scale (middle of the normalized range)
    hover_name="neighbourhood",
    hover_data=hover_columns,
    mapbox_style="carto-positron",
    zoom=10,
    title="NYC Airbnb Listings and Housing Price Trends",
)

# Colour bar title
fig.update_layout(coloraxis_colorbar={"title": "Normalized Price Trend"})

# Replace the default tooltip with the template defined above
fig.update_traces(hovertemplate=hover_template)

fig.show()

### Correlation Analysis: Price Trend vs. Listing Count
- Calculate the correlation coefficient between the price trend and listing count for NYC as a whole and for each borough.
- Visualize correlation coefficients.

In [25]:
# The two variables whose Pearson correlation is reported
correlation_columns = ["price_trend", "listing_count"]

# Correlation within each borough (off-diagonal entry of the 2x2 matrix)
correlation_by_borough = {
    borough: data[correlation_columns].corr().iloc[0, 1]
    for borough, data in gdf.groupby("borough")
}

# Correlation over all neighbourhoods, listed alongside the boroughs
total_correlation = gdf[correlation_columns].corr().iloc[0, 1]
correlation_by_borough["New York City"] = total_correlation

# Tabulate the coefficients, rounded to four decimals
correlation_df = pd.DataFrame(
    {
        "Region": list(correlation_by_borough.keys()),
        "Correlation": list(correlation_by_borough.values()),
    }
)
correlation_df["Correlation"] = correlation_df["Correlation"].round(4)

# Highest correlation first, so the bars read from positive to negative
correlation_df_sorted = correlation_df.sort_values(by="Correlation", ascending=False)

# Bar chart of the coefficients, coloured by their value
fig = px.bar(
    correlation_df_sorted,
    x="Region",
    y="Correlation",
    color="Correlation",
    color_continuous_scale="RdBu_r",
    labels={"Correlation": "Correlation Coefficient", "Region": "Borough"},
)

fig.update_layout(
    title="Correlation Coefficients by Borough",
    xaxis_title="Borough",
    yaxis_title="Correlation Coefficient",
)

fig.show()

### Conclusion
New York City: Considering all boroughs together, the correlation coefficient indicates a weak negative correlation (r = -0.221353) between the price trend and listing count for New York City. This means that neighbourhoods with a higher price trend tend, slightly, to have fewer listings, and vice versa.

Brooklyn: The correlation coefficient between price trend and listing count in Brooklyn indicates a weak positive correlation (r = 0.143765). This suggests that as the price trend increases, there is a slight tendency for the listing count to also increase in Brooklyn.

Bronx: Similar to Brooklyn, the correlation coefficient for the Bronx indicates a weak positive correlation (r = 0.123113) between price trend and listing count. As the price trend increases, there is a slight tendency for the listing count to increase in the Bronx.

Manhattan: The correlation coefficient for Manhattan suggests that there is almost no correlation (r = -0.002640) between the price trend and listing count. This means that changes in the price trend do not seem to be associated with changes in the listing count in Manhattan.

Staten Island: The correlation coefficient for Staten Island suggests a weak negative correlation (r = -0.281329) between the price trend and listing count. Neighbourhoods with a higher price trend tend to have somewhat fewer listings in Staten Island.

Queens: As in Staten Island, the correlation for Queens is negative, but it is stronger: the coefficient indicates a moderate negative correlation (r = -0.433780) between the price trend and listing count. Neighbourhoods with a higher price trend tend to have fewer listings in Queens.