In [72]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Overview

In [73]:
# Load the NYC Airbnb 2019 listings CSV from the Kaggle input directory
csv_path = '../input/new-york-city-airbnb-open-data/AB_NYC_2019.csv'
df = pd.read_csv(csv_path)

In [74]:
# Dimensions of the raw dataset as (rows, columns)
df.shape

In [75]:
# Column labels of the raw dataset
df.columns

In [76]:
# Preview the first 50 rows of the raw dataset
df.head(50)

In [77]:
# Per-column dtype and non-null count summary
df.info()

In [78]:
# Number of missing values in each column
df.isnull().sum()

# Data Wrangling

In [79]:
# Work on an independent copy so the raw frame `df` stays untouched
df_airbnb = df.copy(deep=True)

In [80]:
# Missing values in 'name' and 'host_name' are not needed for the analysis,
# so replace them with the placeholder 0.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not modify the parent frame under copy-on-write.
# 'reviews_per_month' is dropped altogether, so filling its nulls first was
# redundant. errors='ignore' keeps the cell safe to re-run after the column
# has already been removed.
df_airbnb = (
    df_airbnb
    .fillna({'name': 0, 'host_name': 0})
    .drop(columns='reviews_per_month', errors='ignore')
)

In [81]:
# Count the listings whose price is 0.
# A boolean mask is summed instead of using groupby(...).size()[0], which
# raises KeyError when no listing has a price of 0.
(df_airbnb['price'] == 0).sum()

In [82]:
# Isolate minimum_nights to look for outliers
nights = df_airbnb[['minimum_nights']]

# Show the 50 largest values, highest first
nights_descending = nights.sort_values('minimum_nights', ascending=False)
nights_descending.head(50)

In [83]:
# Cap minimum_nights at 365: any larger value is replaced by 365
df_airbnb['minimum_nights'] = df_airbnb['minimum_nights'].clip(upper=365)

In [84]:
# Summary statistics of the cleaned frame (describe() defaults)
df_airbnb.describe()

# Visualization

## number of reviews vs. neighborhood

In [85]:
# Keep only the columns used by the visualisations below
analysis_columns = [
    'host_id', 'neighbourhood_group', 'neighbourhood',
    'latitude', 'longitude', 'room_type',
    'price', 'number_of_reviews', 'availability_365',
]
df_airbnb1 = df_airbnb[analysis_columns]

In [86]:
# Total number of reviews per neighbourhood group.
# df_airbnb1 is grouped by its own column label rather than by a Series taken
# from df_airbnb: the cross-frame key only worked because both frames share
# the same index.
numb_reviews = df_airbnb1.groupby('neighbourhood_group')['number_of_reviews']
numb_reviews.sum()

In [87]:
# Review totals per neighbourhood group as a one-column frame, largest first
review_totals = numb_reviews.sum()
reviews = pd.DataFrame(review_totals).sort_values(by='number_of_reviews', ascending=False)

In [88]:
# Show the review totals per neighbourhood group, largest first
reviews

In [89]:
# Start a fresh figure for the bar chart
fig = plt.figure()

# Horizontal bar chart of the review totals, sorted in ascending order
sorted_totals = reviews['number_of_reviews'].sort_values()
reviews_barh = sorted_totals.plot.barh(fontsize=10)

# Apply the title and both axis labels in a single call
props = dict(
    title='Sum of reviews of each neighbor',
    xlabel='reviews',
    ylabel='location',
)
reviews_barh.set(**props)

In [90]:
# Price distribution (count, mean, quartiles, ...) per neighbourhood group.
# df_airbnb1 is grouped by its own column label rather than by a Series taken
# from df_airbnb, so the result does not depend on the two frames sharing
# an index.
neighbor_prices = df_airbnb1.groupby('neighbourhood_group')['price']
neighbor_prices.describe()

### Conclusion
Brooklyn has the most reviews, followed by Manhattan. Manhattan has the highest average price, followed by Brooklyn and the Bronx.

## Room type vs. Avg price

In [91]:
# Number of listings for each room type
df_airbnb1['room_type'].value_counts()

In [92]:
# Average price for each room type.
# df_airbnb1 is grouped by its own column label rather than by a Series taken
# from df_airbnb, so the result does not depend on the two frames sharing
# an index.
group_prices = df_airbnb1.groupby('room_type')['price']
group_prices.mean()

In [93]:
# Total number of reviews for each room type.
# df_airbnb1 is grouped by its own column label rather than by a Series taken
# from df_airbnb, so the result does not depend on the two frames sharing
# an index.
popular_room = df_airbnb1.groupby('room_type')['number_of_reviews']
popular_room.sum()

In [94]:
# Total reviews broken down by neighbourhood group (rows) and room type (columns)
room_type_reviews = df_airbnb1.pivot_table(
    values='number_of_reviews',
    index='neighbourhood_group',
    columns='room_type',
    aggfunc='sum',
)
room_type_reviews

In [95]:
# Grouped bar chart of review totals per neighbourhood group and room type
reviews_ax = room_type_reviews.plot.bar(fontsize=10, rot=0)

# Label the axes object returned by the plot call
reviews_ax.set_title('Sum of reviews for each room type in different neighbourhood group', fontsize=10)
reviews_ax.set_ylabel('Sum_of_reviews')

In [96]:
# Mean price per (neighbourhood group, room type), with room types as columns
price_by_group_and_type = df_airbnb1.groupby(['neighbourhood_group', 'room_type'])['price']
neigh_avg_price = price_by_group_and_type.mean().unstack()
neigh_avg_price

In [97]:
# Grouped bar chart of the average price per neighbourhood group and room type
price_ax = neigh_avg_price.plot.bar(fontsize=10, rot=0)

# The title previously read "Sum of reviews ..." (copied from the review
# chart); this figure shows average prices, so the title now says so.
price_ax.set_title('Average price for each room type in different neighbourhood group', fontsize=10)
price_ax.set_ylabel('Avg_price')

### Conclusion
1. Entire home/apt is the most popular room type in Brooklyn and Manhattan, and it has the highest average price;
2. In the Bronx and Queens, private rooms are relatively popular, and their price is in the middle of the range;
3. Manhattan has the highest prices, and entire home/apt listings help push Manhattan's average price up;
4. Entire home/apt has the highest prices regardless of neighbourhood group.

## Feature of popular housing

In [102]:
# The ten listings with the most reviews
by_review_count = df_airbnb.sort_values('number_of_reviews', ascending=False)
popular_housing = by_review_count.head(10)
popular_housing

In [103]:
# Cross-tabulate the top listings: neighbourhood group (rows) by room type (columns)
popular_housing_type = pd.crosstab(
    index=popular_housing['neighbourhood_group'],
    columns=popular_housing['room_type'],
)
popular_housing_type

In [104]:
# Mean price of the ten most-reviewed listings
popular_housing['price'].mean()

### Conclusion
1. Manhattan and Queens each have 4 popular listings whose room type is private room.
2. The average price of the popular listings is $65.4, which suits the needs of tourists.