In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Project Declaration
* This project focuses on housing prices in Beijing, including second-hand housing prices from Lianjia (one of the most famous Chinese online housing platforms) and short-term rental prices from Airbnb.
* The purpose of this project is to analyze each dataframe individually, as well as to find potential relationships between the two.
* Question 1: Is there any relationship between the data from Airbnb and Lianjia?
* Question 2: What are some potential independent variables for those prices in Beijing?
* *Some background info: the Lianjia dataset is from 2016, and prices are much higher now. Estimated exchange rate: 1 USD ≈ 6.5 RMB.*

# 1. Data profiles and data cleaning

In [None]:
# Import the plotting libraries and read the two CSV files used for the
# analysis and cleaning below.

import plotly.express as px
import matplotlib.pyplot as plt

# Source files: Airbnb listings and Lianjia housing records.
AIRBNB_CSV = r'../input/beijing/listings-2.csv'
HOUSING_CSV = r'../input/lianjia/new.csv'

airbnb = pd.read_csv(AIRBNB_CSV)
housing = pd.read_csv(HOUSING_CSV)

In [None]:
# Clean both dataframes by removing columns that will not be used.

AIRBNB_UNUSED = ['id', 'neighbourhood_group', 'last_review']
HOUSING_UNUSED = [
    'url',
    'id',
    'Cid',
    'tradeTime',
    'ladderRatio',
    'communityAverage',
    'buildingType',
    'constructionTime',
    'renovationCondition',
]

# Both drops modify the dataframes in place.
airbnb.drop(columns=AIRBNB_UNUSED, inplace=True)
housing.drop(columns=HOUSING_UNUSED, inplace=True)

In [None]:
# Remove some outliers from both dataframes for better visualization.

# Airbnb rows whose 'price' is above 10000.
overpriced = airbnb.loc[airbnb['price'] > 10000].index
airbnb.drop(index=overpriced, inplace=True)

# Housing rows whose 'square' is above 750.
oversized = housing.loc[housing['square'] > 750].index
housing.drop(index=oversized, inplace=True)

# 2. Analysis

In [None]:
# Overview of the Airbnb dataframe: print its columns and draw its histograms.
# Observation: the area within latitude 39.75-40.1 and longitude 116.25-116.7
# is clearly the most popular place in the Airbnb market.

airbnb_columns = airbnb.columns
print(airbnb_columns)

hist1 = airbnb.hist(figsize=(15, 12))

In [None]:
# Overview of the housing dataframe: print its columns and draw its histograms.
# Observations: the popular latitude/longitude range stays the same for the
# housing market. The mode price per m^2 is about 40K, and most of the houses
# are between 0 and 200 m^2. Most of them are over 5 years, and have convenient
# public transportation options (subway).

housing_columns = housing.columns
print(housing_columns)

hist2 = housing.hist(figsize=(15, 12))

Other than the histograms above, I use scatter plots as my main data visualization method. I've tried 3D plots, but the dataframes are too big and my computer can't handle real-time interaction with the plot.

In [None]:
# For further analysis, the districts were plotted on a map to find their
# corresponding names. That plot is not shown here because the laptop became
# very slow with too many plots open at once. The district numbers are
# replaced with their names below.

# NOTE(review): codes 3 and 4 both map to 'Daxing' — confirm this is intended.
# Any district code missing from this table becomes NaN after .map().
map_dict = {
    1: "Dongcheng",
    2: 'Fengtai',
    3: 'Daxing',
    4: 'Daxing',
    5: 'Fangshan',
    6: 'Changping',
    7: 'Chaoyang',
    8: 'Haidian',
    9: 'Shijingshan',
    10: 'Xicheng',
    11: 'Tongzhou',
    12: 'Shunyi',
}

district_codes = housing["district"]
housing["District"] = district_codes.map(map_dict)

# The numeric column is no longer needed once the names are in place.
housing.drop(columns='district', inplace=True)

Let's plot prices on the map!

In [None]:
# Plot the houses for rent (Airbnb) on the map; color represents price.

scatter1 = px.scatter_mapbox(
    airbnb,
    lat="latitude",
    lon="longitude",
    hover_name='name',
    hover_data=["neighbourhood", "number_of_reviews", "reviews_per_month"],
    color="price",
    zoom=8,
    height=600,
    width=800,
    opacity=0.2,
)
# Set the "open-street-map" map style and zero out all four figure margins.
scatter1.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
scatter1.show()

In [None]:
# Plot the houses for sale (Lianjia) on the map; color represents price.

scatter2 = px.scatter_mapbox(
    housing,
    lat="Lat",
    lon="Lng",
    hover_name='totalPrice',
    hover_data=["square", "followers", "floor", "District"],
    color="price",
    zoom=8,
    height=600,
    width=800,
    opacity=0.1,
)
# Set the "open-street-map" map style and zero out all four figure margins.
scatter2.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
scatter2.show()

There does not seem to be a strong relationship between the prices on the two maps above. Airbnb listings are more focused on tourist attractions and are probably more expensive if the house itself is fancier, whereas the price of a house for sale depends entirely on where it is relative to the city center.
* *But is that all?*

In [None]:
# Plot the houses for rent (Airbnb) on the map; color represents the room type.
scatter3 = px.scatter_mapbox(
    airbnb,
    lat="latitude",
    lon="longitude",
    color="room_type",
    zoom=8,
    height=600,
    width=800,
    opacity=0.2,
)
# Set the "open-street-map" map style and zero out all four figure margins.
scatter3.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
scatter3.show()

* As we can see here, although prices are pretty much the same, the closer Airbnb listings are to the center, the more shared rooms or private rooms are offered. In other words, for the same price, you can only get a smaller room on Airbnb in downtown Beijing. If you are a house owner, you can earn more if your house is downtown (because you can host more people!).
* That answers our question. The short-term rental market in Beijing is clearly related to the housing sales market.

* Now let's look at some other interesting visualizations. Because there are so many data points, we use some violin plots here for a clearer view.
* First, let's look at some relationships among district, size, and price.

In [None]:
# Plot housing price per district; bigger circles mean larger houses.

scatter4 = px.scatter(
    housing,
    x='District',
    y='price',
    size='square',
    size_max=60,
    opacity=0.1,
)
scatter4.show()

By looking at the scatter plot above, we can tell which districts are old city centers and which districts are relatively new. Districts that seem "thinner" and "taller" are old downtowns, such as Dongcheng and Xicheng. Old downtowns are always crowded. Chaoyang and Haidian have some bigger houses but are also expensive. Areas like Changping and Daxing hold big houses with a cheaper price per square meter; these are newly developed areas that might contain some houses instead of apartments. Finally, Fangshan and Shunyi are not yet developed.
* Below is a clearer view for quantity and prices in each district.

In [None]:
# Violin plot (with an inner box plot) of housing price for each district;
# every housing column is included in the hover data.
violin1 = px.violin(
    housing,
    x='District',
    y='price',
    box=True,
    hover_data=housing.columns,
)
violin1.show()

Now let's take a final look at data from Airbnb. 

In [None]:
# Violin plot (with an inner box plot) of reviews per month for each Airbnb
# neighbourhood, used to see which districts are most popular.
violin2 = px.violin(
    airbnb,
    x='neighbourhood',
    y='reviews_per_month',
    box=True,
)
violin2.show()

Some of the labels are in Chinese without English translations, but I can read Chinese, so it's fine. Here is the translation (left to right):
* Chaoyang, Haidian, Shijingshan, Huairou, Shunyi, Changping, Tongzhou, Fengtai, Xicheng, Daxing, Dongcheng, Miyun, Yanqing, Fangshan, Mentougou, Pinggu
It seems that Chaoyang, Dongcheng, Xicheng and Fengtai are the most popular among tourists. This is reasonable, because Haidian is basically a CBD and doesn't have many tourist attractions. Fengtai, however, has a train station and historical attractions such as the WWII museum and monument.

# Conclusions / Directions for future work
* The result was at first surprising to me, because housing and renting don't seem to have much relation in terms of price. But after digging further into it, I found deeper correlations between the two, such as rental types versus housing prices, and relationships among location, price, and tourist attractions. Some places are inconvenient to live in but popular for tourism, and vice versa.
* For future directions, I want to look into different cities to see if the pattern holds. I'm also curious about the comparison between housing prices in 2016 (which is this dataset) and prices in 2021. I also want to find a better way to visualize my data, because plotting only half of it already slows my computer down. It now takes about 10 seconds to display whatever words I type.