# Analysis of Airbnb (New York City)


The dataset contains detailed information on Airbnb listings, reviews, and neighbourhoods in New York City. It includes listing-related attributes such as geographical coordinates, room types, pricing, host details, and booking specifics. Additionally, it captures neighbourhood classifications and a record of guest reviews (the listing reviewed and the date of each review). With many entries spread across the city's neighbourhood groups and neighbourhoods, this dataset offers a comprehensive view of the New York City Airbnb market, providing insights into hospitality trends, guest preferences, and regional accommodation specifics.


# DATASET ATTRIBUTES:

# LISTINGS.CSV

id                               : Listing ID  
name                             : Listing Name  
host_id                          : Host ID  
host_name                        : Host Name  
neighbourhood_group              : Neighbourhood Group (Name of the borough the listing belongs to, e.g. Manhattan or Brooklyn)  
neighbourhood                    : Neighbourhood (Name of the specific neighbourhood within that borough)  
latitude                         : Latitude (Coordinates of the place)  
longitude                        : Longitude (Coordinates of the place)  
room_type                        : Room Type  
price                            : Price per Night  
minimum_nights                   : Minimum Nights Stayed  
number_of_reviews                : Number of Reviews Given  
last_review                      : Date of the Last Review  
reviews_per_month                : Reviews Per Month  
calculated_host_listings_count   : Count of Host Listings  
availability_365                 : Availability of the Room (Number of days available in a year)  
number_of_reviews_ltm            : Number of Reviews Last Twelve Months  
license                          : License of the listing  

# NEIGHBOURHOODS.CSV


neighbourhood_group    : Neighbourhood Group  
neighbourhood          : Neighbourhood  

# REVIEWS.CSV

listing_id         : Listing ID  
date               : Date of Review  

# IMPORTING LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


# LOADING THE DATA SET

In [None]:
# Listings table, read from the current working directory
df1 = pd.read_csv("listings.csv")

In [None]:
# Neighbourhood lookup table, read from the current working directory
df2 = pd.read_csv("neighbourhoods.csv")

In [None]:
# Reviews table, read from the current working directory
df3 = pd.read_csv("reviews.csv")

# DATA PREPROCESSING

# LISTINGS.CSV

In [None]:
# Display the listings DataFrame
df1

In [None]:
# Frequency of each distinct calculated_host_listings_count value
df1['calculated_host_listings_count'].value_counts()

In [None]:
# (rows, columns) of the listings table
df1.shape

In [None]:
# Column dtypes and non-null counts
df1.info()

In [None]:
# Per column: is at least one value missing?
df1.isnull().any()

In [None]:
# Per column: how many values are missing?
df1.isnull().sum()

In [None]:
# dropping the columns

In [None]:
# Remove the last_review and license columns from df1 in place
df1.drop(['last_review', 'license'], axis=1, inplace=True)

In [None]:
# Replace missing reviews_per_month values with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: the in-place form operates on
# an intermediate object, which pandas warns about and which does not update
# df1 when Copy-on-Write is enabled.
df1['reviews_per_month'] = df1['reviews_per_month'].fillna(0)

In [None]:
# Re-count the missing values per column after the drop / fill steps above
df1.isnull().sum()

In [None]:
# Impute missing prices with the mean price of the listings that share the
# same 'neighbourhood_group' and 'room_type'.
group_keys = ['neighbourhood_group', 'room_type']
price_by_group = df1.groupby(group_keys)['price']

# Per-group mean price, indexed by (neighbourhood_group, room_type)
avg_prices = price_by_group.mean()

# transform('mean') broadcasts each group's mean back onto its own rows, so
# the fill is a single vectorised operation instead of a Python lambda that
# is evaluated once per row. Rows whose group has no known price (or whose
# group key is missing) keep a NaN price.
df1['price'] = df1['price'].fillna(price_by_group.transform('mean'))

In [None]:
# Label missing listing / host names as "Unknown".
# Assigning the filled column back (rather than fillna(inplace=True) on the
# selected column) guarantees df1 itself is updated; the in-place form acts
# on an intermediate object and is a no-op under pandas Copy-on-Write.
for text_column in ('name', 'host_name'):
    df1[text_column] = df1[text_column].fillna("Unknown")

In [None]:
# Display the listings DataFrame after the cleaning steps
df1

In [None]:
# Per column: number of values still missing
df1.isnull().sum()

# neighbourhoods.csv

In [None]:
# Display the neighbourhoods DataFrame
df2

In [None]:
# (rows, columns) of the neighbourhoods table
df2.shape

In [None]:
# Column dtypes and non-null counts
df2.info()

In [None]:
# Per column: is at least one value missing?
df2.isnull().any()

In [None]:
# Per column: how many values are missing?
df2.isnull().sum()

# Reviews.csv


In [None]:
# Display the reviews DataFrame
df3

In [None]:
# Per column: is at least one value missing?
df3.isnull().any()

# Merging

In [None]:
# Left join: every listing row is kept and matched to the neighbourhood
# table on the (neighbourhood_group, neighbourhood) pair.
merge_keys = ['neighbourhood_group', 'neighbourhood']
df4 = df1.merge(df2, how='left', on=merge_keys)
df4



In [None]:
# Per column of the merged frame: is at least one value missing?
df4.isnull().any()

# DATA ANALYSIS USING DESCRIPTIVE STATISTICS

# Summary Statistics

In [None]:
# Summary statistics for the merged frame; the identifier and coordinate
# columns are left out of the summary.
df4.drop(columns=['id','host_id','latitude','longitude']).describe()

# DATA VISUALIZATION

# Map Visualizations

In [None]:
#pip install pandas numpy shapely fiona descartes


In [None]:
#pip install geopandas


# Choropleth Map

# How does the average price vary across different neighbourhood groups in New York City?

In [None]:
import folium
import pandas as pd

# Map centre as [latitude, longitude] and the GeoJSON file that supplies the
# boundary shapes drawn by the choropleth.
map_center = [40.7128, -74.0060]
geo_json_path = 'neighbourhoods.geojson'

# Mean listing price per neighbourhood group, as a two-column DataFrame
avg_price_by_group = (
    df4.groupby('neighbourhood_group')['price']
    .mean()
    .reset_index()
)

ny_map = folium.Map(location=map_center, zoom_start=10)

# Shade each GeoJSON feature by the mean price of the neighbourhood group
# named in its properties.
layer_title = "Average Price of Airbnb Listings"
price_layer = folium.Choropleth(
    geo_data=geo_json_path,
    data=avg_price_by_group,
    columns=["neighbourhood_group", "price"],
    key_on="feature.properties.neighbourhood_group",
    fill_color='YlOrRd',
    fill_opacity=0.9,
    line_opacity=0.5,
    name=layer_title,
    legend_name=layer_title,
)
price_layer.add_to(ny_map)

# Last expression of the cell: renders the map in the notebook
ny_map



In [None]:
# Display the per-neighbourhood-group mean prices used by the choropleth
avg_price_by_group

# Bubble Map


# Can we visualize the distribution of listings based on their room type and price?

In [None]:
import plotly.express as px

df = df4[['latitude', 'longitude', 'room_type', 'price']]

# Room type -> (marker colour, title shown when only that room type is
# visible). The dict order is also the trace / legend order requested below.
room_type_styles = {
    "Entire home/apt": ("blue", "Entire Home/Apt Listings(Blue Color)"),
    "Private room": ("green", "Private Room Listings(Green Color)"),
    "Shared room": ("red", "Shared Room Listings(Red Color)"),
    "Hotel room": ("brown", "Hotel Room Listings(Brown Color)"),
}

# Create the bubble map using Plotly Express.
# Colours are bound to room types by name (color_discrete_map) and the trace
# order is pinned (category_orders); without this, Plotly Express assigns
# colours and trace positions in order of first appearance in the data, so
# the colour named in each title could belong to a different room type.
fig = px.scatter_mapbox(
    df,
    lat="latitude",
    lon="longitude",
    color="room_type",
    size="price",
    color_discrete_map={
        room_type: colour for room_type, (colour, _) in room_type_styles.items()
    },
    category_orders={"room_type": list(room_type_styles)},
    size_max=15,  # Maximum bubble size
    hover_name="room_type",
    hover_data={"latitude": False, "longitude": False, "price": ":.2f"},
    mapbox_style="carto-positron",  # Map style
    zoom=10,  # Initial zoom level
    title="Distribution of Listings by Room Type and Price",
)

# Build the visibility masks from the traces that actually exist, so each
# button toggles the trace carrying its own room type regardless of how many
# room types are present in the data.
trace_names = [trace.name for trace in fig.data]
room_type_buttons = [
    {
        "label": "All",
        "method": "update",
        "args": [{"visible": [True] * len(trace_names)}, {"title": "All Listings"}],
    }
]
room_type_buttons += [
    {
        "label": room_type,
        "method": "update",
        "args": [
            {"visible": [name == room_type for name in trace_names]},
            {"title": title},
        ],
    }
    for room_type, (_, title) in room_type_styles.items()
]

# Update layout
fig.update_layout(
    margin=dict(l=0, r=0, t=50, b=0),  # Adjust margins
    legend_title="Room Type",  # Legend title
    showlegend=True,  # Show legend
    updatemenus=[{"buttons": room_type_buttons}],
)

# Show the bubble map
fig.show()


# INTERACTIVE VISUALIZATION

# Interactive Aggregation Visualization

# Are there any noticeable differences in the average price or room type distribution between Manhattan, Brooklyn, Queens, and other boroughs?

In [None]:
import pandas as pd
import plotly.graph_objs as go
import dash
from dash import dcc, html, Input, Output


df = df4

# Dropdown choices as a plain Python list (component properties are sent to
# the browser as JSON); missing group names are not offered as a choice.
neighbourhood_groups = df['neighbourhood_group'].dropna().unique().tolist()

# Initialize Dash app
app = dash.Dash(__name__)

# Define layout: a multi-select dropdown and a container the callback fills
app.layout = html.Div([
    html.Label('Select Neighbourhood Group:'),
    dcc.Dropdown(
        id='neighbourhood-dropdown',
        options=[{'label': group, 'value': group} for group in neighbourhood_groups],
        value=neighbourhood_groups,  # Set default value to all groups
        multi=True  # Allow multiple selections
    ),
    html.Div(id='charts-container')
])

# Define callback for updating charts
@app.callback(
    Output('charts-container', 'children'),
    [Input('neighbourhood-dropdown', 'value')]
)
def update_charts(selected_neighbourhoods):
    """Build the charts for the neighbourhood groups chosen in the dropdown.

    For every selected group, in selection order, two graphs are produced:
    a bar chart of the mean price per room type and a pie chart of the
    number of listings per room type.

    Args:
        selected_neighbourhoods: Values currently selected in
            'neighbourhood-dropdown'. An empty or missing selection is
            treated as "nothing selected".

    Returns:
        A list of dcc.Graph components (two per selected group); an empty
        list when nothing is selected.
    """
    charts = []
    # NOTE(review): a cleared multi-select is expected to arrive as [] but
    # may be None; `or []` keeps the callback from raising in either case.
    for neighbourhood in selected_neighbourhoods or []:
        # Filter data for selected neighbourhood
        filtered_df = df[df['neighbourhood_group'] == neighbourhood]

        # Create bar chart for average price by room type
        avg_price_by_room = filtered_df.groupby('room_type')['price'].mean().reset_index()
        bar_chart = dcc.Graph(
            figure={
                'data': [go.Bar(x=avg_price_by_room['room_type'], y=avg_price_by_room['price'])],
                'layout': {
                    'title': f'Average Price by Room Type in {neighbourhood}',
                    'xaxis': {'title': 'Room Type'},
                    'yaxis': {'title': 'Average Price'}
                }
            }
        )
        charts.append(bar_chart)

        # Create pie chart for distribution of room types
        room_type_distribution = filtered_df['room_type'].value_counts()
        pie_chart = dcc.Graph(
            figure={
                'data': [go.Pie(labels=room_type_distribution.index, values=room_type_distribution.values)],
                'layout': {
                    'title': f'Distribution of Room Types in {neighbourhood}'
                }
            }
        )
        charts.append(pie_chart)

    return charts

# Run the app
if __name__ == '__main__':
    # app.run is the current entry point; run_server is its deprecated
    # predecessor and is only used when app.run is not available.
    run_app = getattr(app, 'run', None) or app.run_server
    run_app(debug=True)

# Interactive Map Visualization
## Can we add interactivity to allow users to filter neighborhood groups by price ranges?


In [None]:
import plotly.graph_objects as go

df = df4

# Define colors for different neighbourhood groups
neighborhood_colors = {
    "Manhattan": "rgb(255, 127, 14)",  # Orange
    "Queens": "rgb(44, 160, 44)",       # Green
    "Brooklyn": "rgb(31, 119, 180)",    # Blue
    "Bronx": "rgb(214, 39, 40)",        # Red
    "Staten Island": "rgb(148, 103, 189)"  # Purple
}

# Define price ranges as half-open intervals [min, max).
# "All" must stay first: it is the range shown initially and the first
# dropdown entry.
price_ranges = {
    "All": (0, float('inf')),
    "Below 100": (0, 100),
    "100-200": (100, 200),
    "200-500": (200, 500),
    "500-1000": (500, 1000),
    "1000-5000": (1000, 5000),
    "Above 5000": (5000, float('inf'))
}

# Hover text for every listing, built once with vectorised string
# concatenation and then sliced per trace (instead of re-running a row-wise
# apply for each of the 35 traces).
hover_text = (
    "Neighbourhood: " + df['neighbourhood_group'].astype(str)
    + "<br>Price: $" + df['price'].astype(str)
    + "<br>Listing Name: " + df['name'].astype(str)
)

# Create initial map figure
fig = go.Figure()

# Price-range label of each trace, in the same order as fig.data; used to
# build exact visibility masks for the dropdown.
trace_ranges = []

# One trace per (neighbourhood group, price range) pair
for neighborhood, color in neighborhood_colors.items():
    in_group = df['neighbourhood_group'] == neighborhood
    for price_range, (min_price, max_price) in price_ranges.items():
        if price_range == "All":
            # Every listing of the group, including any without a price
            selected = in_group
        else:
            selected = in_group & (df['price'] >= min_price) & (df['price'] < max_price)
        subset = df[selected]
        fig.add_trace(go.Scattermapbox(
            lat=subset['latitude'],
            lon=subset['longitude'],
            mode='markers',
            marker=dict(
                size=8,
                color=color,
                opacity=0.7
            ),
            visible=(price_range == "All"),  # Only the "All" traces start visible
            name=f"{neighborhood} - {price_range}",
            hoverinfo='text',
            text=hover_text[selected]
        ))
        trace_ranges.append(price_range)

# Define layout
fig.update_layout(
    mapbox=dict(
        style="carto-positron",
        zoom=10,
        center=dict(lat=40.7128, lon=-74.0060)
    ),
    margin=dict(l=50, r=0, t=100, b=0),
    title="Distribution of Listings by Neighbourhood Group w.r.t Price Ranges"
)

# Create dropdown menu for price ranges: each button shows exactly the traces
# recorded with its label (exact match rather than a substring test on the
# trace name).
buttons = [
    dict(label=price_range,
         method="update",
         args=[{"visible": [label == price_range for label in trace_ranges]},
               {}]
         )
    for price_range in price_ranges
]
# Add dropdown menu to update the visibility of traces
fig.update_layout(
    updatemenus=[
        dict(
            buttons=buttons,
            direction="down",
            showactive=True,
            x=0.65,
            xanchor="right",
            y=1.08,
            yanchor="top"
        ),
    ]
)
# Show the interactive map
fig.show()


# Aggregation visualisation
## BOX PLOT
### How does the distribution of prices vary among different room types?

In [None]:
#df4.groupby('room_type')['price'].describe()

In [None]:
# Box plot of listing price for each room type, drawn on an explicit Axes
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df4, x='room_type', y='price', ax=box_ax)
box_ax.set_title('Distribution of Prices by Room Type')
box_ax.set_xlabel('Room Type')
box_ax.set_ylabel('Price ($)')
# Prices above 1000 are cut off from view
box_ax.set_ylim(0, 1000)
plt.show()


# SCATTER PLOT
## Is there any correlation between the price of listings and the number of reviews they receive?

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Fixed colour for each room type
room_type_colors = dict([
    ('Entire home/apt', 'blue'),
    ('Hotel room', 'green'),
    ('Private room', 'red'),
    ('Shared room', 'purple'),
])

# Scatter of price against review count, one point per listing, coloured by
# room type and drawn on an explicit Axes.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df4,
    x='price',
    y='number_of_reviews',
    hue='room_type',
    palette=room_type_colors,
    ax=scatter_ax,
)
scatter_ax.set_xlabel('Price ($)')
scatter_ax.set_ylabel('Number of Reviews')
scatter_ax.set_title('Continuous Scatter Plot of Price vs Number of Reviews')
scatter_ax.legend(title='Room Type', loc='upper right')
# Axis limits focus the view on a typical price range and a reasonable
# number of reviews, for readability.
scatter_ax.set_xlim(0, 2000)
scatter_ax.set_ylim(0, 1000)
plt.show()
