In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# The cleaned Airbnb dataset is stored on Google Drive, so the drive has to be
# mounted into the Colab filesystem before the CSV can be read.
from google.colab import drive

drive.mount('/content/drive')

# Read the cleaned listings file into the notebook-wide `airbnb` frame.
airbnb = pd.read_csv(
    '/content/drive/My Drive/airbnb.csv',
    low_memory=False,
)

Mounted at /content/drive


In [None]:
# Sanity check: count the nulls in every column, then display only the
# columns that actually contain at least one.
missing_values = airbnb.isna().sum()
missing_values.loc[missing_values.gt(0)]

Unnamed: 0,0


No column has any missing values, so the dataset is ready to use.

In [None]:
!pip install dash

Collecting dash
  Downloading dash-2.18.2-py3-none-any.whl.metadata (10 kB)
Collecting Werkzeug<3.1 (from dash)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading dash-2.18.2-py3-none-any.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Downloadi

In [None]:
import plotly.express as px
import pandas as pd


# Ensure the columns used for colouring and hover text exist.
# NOTE: when a column is absent it is filled with a stand-in derived from
# another column; these are demonstration values, not real measurements.
if 'location_review_score' not in airbnb.columns:
    # Mock 'location_review_score' column for demonstration if not available
    airbnb['location_review_score'] = (airbnb['reviews_per_month'] * 2).fillna(0).astype(float)

if 'accommodates' not in airbnb.columns:
    # Mock 'accommodates' column for demonstration if not available
    airbnb['accommodates'] = (airbnb['availability_365'] / 30).astype(int)

# Plot from a filtered view so the shared `airbnb` frame is left untouched.
# Rows without coordinates or without a marker size cannot be drawn
# (NOTE(review): plotly is expected to reject NaN marker sizes - the dataset
# is described as pre-cleaned, so this should normally drop nothing).
listings_for_map = airbnb.dropna(subset=['latitude', 'longitude', 'reviews_per_month'])

# Draw the listings on a Mapbox basemap. A plain cartesian `px.scatter`
# ignores the mapbox style/zoom/center settings, so the map variant is used
# to make those settings take effect.
fig = px.scatter_mapbox(
    listings_for_map,
    lat='latitude',
    lon='longitude',
    size='reviews_per_month',
    color='location_review_score',
    hover_data={
        'name': True,
        'price': True,
        'room_type': True,
        'neighbourhood': True,
        'location_review_score': ':.2f',
        'accommodates': True,
        'latitude': False,   # Already conveyed by the marker position
        'longitude': False   # Already conveyed by the marker position
    },
    title="Airbnb Listings in NYC with Review Sentiments",
    labels={
        'longitude': 'Longitude',
        'latitude': 'Latitude',
        'reviews_per_month': 'Reviews Per Month',
        'location_review_score': 'Location Review Score'
    },
    zoom=10,
    center={"lat": 40.7128, "lon": -74.0060},  # Centre the map on New York City
    mapbox_style="carto-positron"
)

# Trim the margins, leaving room at the top for the title
fig.update_layout(margin={"r": 0, "t": 50, "l": 0, "b": 0})

# Make overlapping markers easier to tell apart
fig.update_traces(marker=dict(opacity=0.7), selector=dict(mode='markers'))

# Add a usage hint above the chart
fig.update_layout(
    annotations=[
        dict(
            xref='paper', yref='paper',
            x=0.5, y=1.1,  # Place above the chart
            showarrow=False,
            text="Hover over points to view detailed listing information",
            font=dict(size=14)
        )
    ]
)

# Render explicitly, as the other plotting cells in this notebook do
fig.show()

The interactive plot above shows the location review score of each listing.

In [None]:
import dash
from dash import dcc, html
import plotly.express as px
import pandas as pd

# Host-level analysis: one row per host with the number of listings, the
# mean listing price and the mean reviews-per-month across that host's listings.
host_listings = airbnb.groupby('host_id').agg(
    total_listings=('id', 'count'),
    avg_price=('price', 'mean'),
    avg_reviews=('reviews_per_month', 'mean')
).reset_index()

# Identify single vs multi-listing hosts (vectorized instead of a row-wise
# apply): exactly one listing -> 'Single Listing', anything else -> 'Multi-Listing'.
host_listings['host_type'] = host_listings['total_listings'].eq(1).map(
    {True: 'Single Listing', False: 'Multi-Listing'}
)

# Top 10 hosts with most listings
top_hosts = host_listings.sort_values(by='total_listings', ascending=False).head(10)

# Initialize Dash app
app = dash.Dash(__name__)

# App layout: a heading followed by three tabs, each holding pre-built
# Plotly Express figures (no callbacks are registered).
app.layout = html.Div([
    html.H1("Host Activity and Listing Analysis"),
    dcc.Tabs([
        # Tab 1: Listings per host (log-scaled counts on the y-axis)
        dcc.Tab(label='Host Listing Distribution', children=[
            dcc.Graph(
                id='host-distribution',
                figure=px.histogram(
                    host_listings,
                    x='total_listings',
                    nbins=50,
                    title="Distribution of Total Listings per Host",
                    labels={'total_listings': 'Number of Listings'},
                    log_y=True
                )
            )
        ]),
        # Tab 2: Top 10 Hosts
        dcc.Tab(label='Top Hosts', children=[
            dcc.Graph(
                id='top-hosts',
                figure=px.bar(
                    top_hosts,
                    x='host_id',
                    y='total_listings',
                    title="Top 10 Hosts with the Most Listings",
                    labels={'host_id': 'Host ID', 'total_listings': 'Number of Listings'},
                    text='total_listings'
                )
            )
        ]),
        # Tab 3: Single vs Multi-Listing Hosts (price and review box plots)
        dcc.Tab(label='Single vs Multi-Listing Hosts', children=[
            dcc.Graph(
                id='host-type-comparison',
                figure=px.box(
                    host_listings,
                    x='host_type',
                    y='avg_price',
                    title="Average Price: Single vs Multi-Listing Hosts",
                    labels={'host_type': 'Host Type', 'avg_price': 'Average Price'}
                )
            ),
            dcc.Graph(
                id='reviews-comparison',
                figure=px.box(
                    host_listings,
                    x='host_type',
                    y='avg_reviews',
                    title="Average Reviews: Single vs Multi-Listing Hosts",
                    labels={'host_type': 'Host Type', 'avg_reviews': 'Average Reviews Per Month'}
                )
            )
        ])
    ])
])

# Run the app. `app.run` is used instead of the legacy `app.run_server`
# alias, which newer Dash releases no longer accept.
if __name__ == '__main__':
    app.run(debug=True)


<IPython.core.display.Javascript object>

Summary of Insights
1. Host Landscape:
   - The majority of Airbnb hosts in NYC are small-scale operators with one or two listings.
   - However, a small number of commercial hosts dominate the market, managing 50+ listings, and some manage over 300.
2. Pricing Patterns:
   - Listings managed by multi-listing hosts tend to include luxury or high-priced properties, particularly the outliers above $5,000.
   - Despite this, most listings fall into a similar price range regardless of host type.
3. Market Disparity:
   - There is a clear divide between individual hosts (1-2 listings) and commercial operators, which could affect market dynamics and competition.


In [None]:
import pandas as pd
import plotly.express as px


# Prepare the data for the heatmap in a single chain:
#   1. coerce reviews_per_month to numeric FIRST, so unparseable values become
#      NaN before the null filter runs (coercing after the filter would leave
#      those NaNs in the frame);
#   2. drop rows lacking coordinates or a review rate;
#   3. define demand as reviews_per_month (higher reviews indicate high demand).
# Using assign() returns new frames, avoiding assignment onto a dropna() result.
airbnb = (
    airbnb
    .assign(reviews_per_month=lambda frame: pd.to_numeric(frame['reviews_per_month'], errors='coerce'))
    .dropna(subset=['latitude', 'longitude', 'reviews_per_month'])
    .assign(demand=lambda frame: frame['reviews_per_month'])
)

# Create a density heatmap to show areas with high demand
fig = px.density_mapbox(
    airbnb,
    lat='latitude',
    lon='longitude',
    z='demand',
    radius=10,
    center=dict(lat=40.7128, lon=-74.0060),  # Center map to New York City
    zoom=10,
    mapbox_style="carto-positron",
    title="High-Demand Areas for Airbnb Listings in NYC (Based on Reviews Per Month)"
)

# Show the heatmap
fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# Rows missing either the grouping key or the demand measure cannot be
# aggregated, so they are removed first.
airbnb = airbnb.dropna(subset=['neighbourhood_group', 'reviews_per_month'])

# Total demand (sum of reviews_per_month) per neighbourhood_group,
# ordered from the highest total to the lowest.
high_demand_city = (
    airbnb
    .groupby('neighbourhood_group')['reviews_per_month']
    .sum()
    .reset_index()
    .sort_values(by='reviews_per_month', ascending=False)
)

# Text summary of the leading groups
print("Top High-Demand Cities/Neighborhoods:")
print(high_demand_city.head())

# Bar chart of the same totals, with bars coloured by their value
fig = px.bar(
    high_demand_city,
    x='neighbourhood_group',
    y='reviews_per_month',
    color='reviews_per_month',
    title='High-Demand Cities/Neighborhoods Based on Reviews Per Month',
    labels={
        'reviews_per_month': 'Total Reviews Per Month',
        'neighbourhood_group': 'City/Neighborhood',
    },
)

fig.show()


Top High-Demand Cities/Neighborhoods:
  neighbourhood_group  reviews_per_month
1            Brooklyn            9796.01
2           Manhattan            9217.72
3              Queens            5506.99
0               Bronx            1163.97
4       Staten Island             427.00


In [None]:
import pandas as pd
import plotly.express as px


# Drop missing values in the grouping key and the demand measure
airbnb = airbnb.dropna(subset=['neighbourhood', 'reviews_per_month'])

# Total reviews_per_month per neighbourhood (used as the demand measure),
# sorted from the highest total to the lowest.
high_demand_cities = (
    airbnb
    .groupby('neighbourhood')['reviews_per_month']
    .sum()
    .reset_index()
    .sort_values(by='reviews_per_month', ascending=False)
)

# Display the top high-demand cities
print("Top High-Demand Cities:")
print(high_demand_cities.head())

# Visualize the results using Plotly
fig = px.bar(
    high_demand_cities.head(10),  # Show top 10 cities
    x='neighbourhood',
    y='reviews_per_month',
    title='Top High-Demand Cities Based on Reviews Per Month',
    # Label keys must match the plotted column names; the x column is
    # 'neighbourhood', so that is the key that relabels the x-axis.
    labels={'reviews_per_month': 'Total Reviews Per Month', 'neighbourhood': 'City'},
    color='reviews_per_month'
)

fig.show()


Top High-Demand Cities:
          neighbourhood  reviews_per_month
13   Bedford-Stuyvesant            2190.50
95               Harlem            1278.97
128             Midtown            1190.10
96       Hell's Kitchen            1049.63
214        Williamsburg             986.62


Based on total reviews per month, Bedford-Stuyvesant is the most popular neighbourhood, followed by Harlem, while East Flatbush is the least popular.