In [29]:
import pandas as pd
import numpy as np
import os
import geopandas as gpd
from shapely import wkt
import re

## Code book for the data

- 'building_type': The classification of a particular building.
- 'village': The region that it belongs to.
- 'district': administrative division that the Community belongs to.
- 'floor_level': The level on which a particular room or apartment is, within a building.
- 'new_lng': the longitude coordinates.
- 'new_lat': the latitude coordinates.
- 'year': time id.
- 'floor_ratio': The ratio of the floor area to the total plot area.
- 'green_ratio': The ratio of the green space to the total plot area.
- 'nego_times': The number of times a negotiation was held.
- 'lead_times': The time it takes before a deal is made.
- 'total_building': The total number of buildings in an area.
- 'total_resident': The total number of residents in an area.
- 'watching_people': The number of people watching a listing.
- 'watched_times': The number of times a listing is watched.
- 'striker_price': The initial asking price.
- 'striker_price_pers': The asking price per square foot.
- 'end_price': The final agreed price.
- 'end_price_pers': The final agreed price per square foot.
- 'area': The area of a property.
- 'nego_period': The period over which negotiations took place.
- 'bedroom': The number of bedrooms in a property.
- 'living_room': The number of living rooms in a property.
- 'kitchen': The number of kitchens in a property.
- 'toilet': The number of toilets in a property.
- 'total_floor_number': The number of floors in a building.
- 'elevator_ratio': The ratio of elevators to the total number of floors.
- 'house_age': The age of the house.
- 'income': Lianjia's income in the given district.
- 'number': Lianjia's number in the given district.
- 'super': referring to proximity to supermarkets (measured by number within given distance).
- 'sub': referring to proximity to subway stations.
- 'hotel': referring to proximity to hotels
- 'kind': referring to proximity to kindergartens
- 'prim': referring to primary schools.
- 'mid': referring to middle schools.
- 'shop_mall': referring to shopping mall.
- 'west_food': referring to the availability of western food nearby.
- 'park': referring to parks.
- 'museum': Distance to the nearest museum.
- 'ktv': referring to KTV and some entertainment venues.
- 'jiadian': referring to electronic shops.
- 'old': referring to old care systems.
- 'other': other real estate brokerages within 1km.
- 'other_5': other real estate brokerages within 0.5km.
- 'lianjia': lianjia's number within 1km.
- 'lianjia_5': lianjia's number within 0.5km.
- 'beke': beke's number within 1km.
- 'beke_5': beke's number within 0.5km.
- 'geometry': geometry information.
- 'light': night time lights.
- 'pop': population density.
- 'pm25': Air quality measure.
- 'region': city name.
- 'id': unique id.
- 'business_area': business area.
- 'index_right': unique index id 
- 'num': transaction number within 1km
- 'prft': lianjia's income within 1km
- 'price': housing price within 1km

Now we extract the number of transactions within each 1km geometry and map it to the community-level data.

## The code below merges the 1km data into the district-level data

You do not need to execute it, because the merged file has already been produced.

In [2]:
# Load the 1km table (without its 'index' column) and the district table.
data = pd.read_csv('cleaned_1km.csv').drop(columns='index')
df = pd.read_csv('cleaned_district.csv')

# Create GeoDataFrames: the 'geometry' columns hold WKT text, which is parsed
# into geometry objects; the plain DataFrames above are left untouched.
df_copy = gpd.GeoDataFrame(df.copy(), geometry=df['geometry'].apply(wkt.loads))
data_copy = gpd.GeoDataFrame(data.copy(), geometry=data['geometry'].apply(wkt.loads))

# `des` keeps one row per distinct geometry string (the first occurrence).
data_unique_geometry = data.drop_duplicates(subset=['geometry'], keep='first')
des = gpd.GeoDataFrame(data_unique_geometry.copy(),
                       geometry=data_unique_geometry['geometry'].apply(wkt.loads))



import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [3]:
# Declare the same coordinate reference system (EPSG:4326) on all three
# GeoDataFrames so they can be spatially joined with each other.
df_copy, data_copy, des = (
    frame.set_crs('epsg:4326') for frame in (df_copy, data_copy, des)
)

In [7]:
# Replace 'price' by price divided by 'area' (intended by the author to be the
# average price per square meter).
# NOTE: this overwrites 'price', so running the cell twice divides twice.
data_copy['price'] = data_copy['price'].div(data_copy['area'])

In [8]:
# sjoin refuses inputs that already contain its own bookkeeping columns, so
# remove any 'index_left' / 'index_right' left over from an earlier join.
# errors='ignore' makes this a no-op when the columns are absent.
stale_join_columns = ['index_left', 'index_right']
df_copy.drop(columns=stale_join_columns, errors='ignore', inplace=True)
des.drop(columns=stale_join_columns, errors='ignore', inplace=True)

# For every row of df_copy, attach the row(s) of des whose geometry it lies
# within; rows without a match are kept (how="left").
# `predicate` is the current name of the deprecated `op` argument.
joined_gdf = gpd.sjoin(df_copy, des, how="left", predicate='within')


  if await self.run_code(code, result, async_=asy):


In [9]:
def _id_year_key(ids, years):
    """Return element-wise '<id>_<year>' strings built from two Series."""
    return ids.astype(str) + '_' + years.astype(str)


# id_unique is the unique identification of the 1km data object; combined with
# the year it keys the 1km values that are joined onto the spatial-join result.
data_copy['unique_key'] = _id_year_key(data_copy['id_unique'], data_copy['year'])
joined_gdf['unique_key'] = _id_year_key(joined_gdf['id_unique'], joined_gdf['year_left'])
# NOTE(review): if id_unique is numeric and the left join left some rows
# unmatched, the column may be float in joined_gdf ('12.0') but integer in
# data_copy ('12'), in which case the keys would not match -- confirm dtypes.

data_relevant = data_copy[['unique_key', 'prft', 'num', 'price']]

# Bring prft / num / price over, then discard the temporary key.
joined_gdf = (
    joined_gdf
    .merge(data_relevant, on='unique_key', how='left')
    .drop(columns=['unique_key'])
)

In [10]:
# Key every row as "<id>_<year>" on both frames. In joined_gdf the columns
# that came from df_copy carry the '_left' suffix.
for frame, id_col, year_col in ((df_copy, 'id', 'year'),
                                (joined_gdf, 'id_left', 'year_left')):
    frame['unique_key'] = frame[id_col].astype(str) + '_' + frame[year_col].astype(str)

In [11]:
# Give the 1km values region-level names, then keep only the key plus those
# three columns for the merge back onto df_copy.
region_column_names = {'prft': 'region_income', 'num': 'region_num', 'price': 'region_price'}
joined_gdf = joined_gdf.rename(columns=region_column_names)
data_relevant = joined_gdf[['unique_key', *region_column_names.values()]]

In [12]:
# Left-merge the region_* columns onto df_copy by unique_key, keeping every
# df_copy row.
# NOTE(review): if a unique_key occurs more than once in data_relevant, the
# matching df_copy rows are duplicated -- confirm the key is unique there.
df_copy = df_copy.merge(data_relevant, on='unique_key', how='left')

In [14]:
# Remove the temporary merge key and the 'num', 'prft', 'price' columns from
# df_copy; the region_* columns merged in earlier stay.
df_copy.drop(columns=['unique_key', 'num', 'prft', 'price'], inplace=True)

In [15]:
# Save the merged district table to CSV without writing the row index.
df_copy.to_csv('cleaned_district_Jan.csv', index = False)

In [None]:
# Load the merged district table stored in cleaned_district_Jan.csv.
data = pd.read_csv('cleaned_district_Jan.csv')

In [None]:
# Order rows by id and year so that a shift within one id steps back exactly
# one observation in time.
data = data.sort_values(by=['id', 'year'])

# Previous observation's lianjia_5 for the same id. The first observation of
# an id has no predecessor, so its own value is used, which makes its 'entry'
# below equal to 0.
data['lag_lianjia'] = (
    data.groupby('id')['lianjia_5'].shift(1).fillna(data['lianjia_5'])
)

# entry = 1 when lianjia_5 is larger than at the previous observation, else 0.
data['entry'] = (data['lianjia_5'] > data['lag_lianjia']).astype(int)

In [None]:
# Number of leads and lags of 'entry' to build.
n = 3
entry_by_id = data.groupby('id')['entry']

# post{k}: 'entry' from k observations earlier within the same id.
# pre{k}:  'entry' from k observations later within the same id.
# Missing values at the start or end of an id are set to 0. All post columns
# are created before the pre columns, so the column order is post1..n, pre1..n.
for prefix, direction in (('post', 1), ('pre', -1)):
    for step in range(1, n + 1):
        data[f'{prefix}{step}'] = entry_by_id.shift(direction * step).fillna(0)

In [None]:
# Save `data` to cleaned_district_Jan_2.csv.
# NOTE(review): index=None is presumably meant to omit the row index, like
# index=False -- confirm.
data.to_csv('cleaned_district_Jan_2.csv', index = None)

## now we construct the RD design dataset

The RD design dataset is built as follows:

First, we list the CSV files located in the `lianjia_beke` directory under the working path. Then we map these files to the communities and, for each Lianjia store, extract its nearest community or its two nearest communities. Finally, the RD analysis itself is conducted in the Stata file.

In [2]:
# Load the table stored in cleaned_district_Jan_2.csv.
data = pd.read_csv('cleaned_district_Jan_2.csv')

In [3]:

# Directory that holds the store CSV files.
directory_path = "lianjia_beke"
# Names of all entries (files and directories) directly inside it.
filenames = os.listdir(directory_path)

In [4]:
# Maps the lowercase romanized city name to the Chinese city name used as the
# prefix of the store CSV file names.
city_name_mapping = dict(
    beijing='北京市',
    chengdu='成都市',
    chongqing='重庆市',
    guangzhou='广州市',
    hangzhou='杭州市',
    nanjing='南京市',
    shanghai='上海市',
    shenzhen='深圳市',
    tianjin='天津市',
    wuhan='武汉市',
    xian='西安市',
)

In [5]:
def map_to_chinese_csv(row):
    """Return the store CSV file name for one row.

    The name is the Chinese city name looked up from ``row['region']`` in
    ``city_name_mapping``, followed by the last two characters of
    ``str(row['year'])`` and the ``.csv`` extension.

    Raises:
        KeyError: if ``row['region']`` is not a key of ``city_name_mapping``.
    """
    two_digit_year = str(row['year'])[-2:]
    city_in_chinese = city_name_mapping[row['region']]
    return '{}{}.csv'.format(city_in_chinese, two_digit_year)

In [6]:
# For each row, compute the name of the store CSV file that matches the row's
# region and year.
data['chinese_csv'] = data.apply(map_to_chinese_csv, axis=1)

In [7]:
# Split the table into one GeoDataFrame per store CSV name, in order of first
# appearance. Each piece gets its WKT 'geometry' text parsed and is tagged
# with EPSG:4326.
dataframes_list = []
for csv_name in data['chinese_csv'].unique():
    city_year_rows = data[data['chinese_csv'] == csv_name]
    city_year_gdf = gpd.GeoDataFrame(
        city_year_rows, geometry=city_year_rows['geometry'].apply(wkt.loads)
    )
    dataframes_list.append(city_year_gdf.set_crs('epsg:4326'))

## NOTE

The code below finds the nearest communities for each Lianjia store. The pure-Python version uses a nested for loop and takes a long time to run, so it has been optimized with C++.

For faster computation, I recommend using the C++ version in the second block.

Procedure to build it (adjust the paths to match your computer):



```
pip install pybind11

python3-config --cflags

pybind11-config --includes

cd the/file/path/RealEstateBrokerage/

g++ -O3 -shared -std=c++11 -fPIC -I/usr/include/python3.8 -I/home/xuyuan/.local/lib/python3.8/site-packages/pybind11/include -o nearest_community_cpp.so nearest_community_cpp.cpp
```

the running time is less than 25 seconds

```
for i in range(0, len(dataframes_list)):
    communities_gdf = dataframes_list[i]
    df = pd.read_csv('lianjia_beke/' + communities_gdf['chinese_csv'].unique()[0])
    store_locations_gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['gpsx'], df['gpsy']))
    # store_locations_gdf['nearest_community_index'] = -1
    
    num_nearest_communities = 2
    
    nearest_community_indices_list = []
    
    for store_index, store_row in store_locations_gdf.iterrows():
        store_location = store_row['geometry']
        
        # Calculate distances to all communities and store them in a Series
        distances = communities_gdf.geometry.apply(lambda x: store_location.distance(x))
        
        # Sort the distances and select the indices of the nearest communities
        nearest_community_indices = distances.argsort()[:num_nearest_communities].tolist()
        
        # Append the list of nearest community indices to the list
        nearest_community_indices_list.append(nearest_community_indices)
        
    # Assign the list to the 'nearest_community_indices' column
    store_locations_gdf['nearest_community_indices'] = nearest_community_indices_list
    
    store_locations_gdf.to_csv('nearest_community/' + communities_gdf['chinese_csv'].unique()[0], index = False)
```

In [9]:
import nearest_community_cpp

# Output folder for the per-file results; create it on first use.
directory_path = "nearest_community"
if not os.path.exists(directory_path):
    os.makedirs(directory_path)

num_nearest_communities = 2 # you may change this number whatever you want

for communities_gdf in dataframes_list:
    # Every frame in dataframes_list holds a single store CSV name.
    csv_name = communities_gdf['chinese_csv'].unique()[0]

    # Plain [x, y] lists for the extension module; this takes the first
    # coordinate of each geometry (assumes point geometries -- TODO confirm).
    community_locations_cpp = [list(geom.coords[0]) for geom in communities_gdf['geometry']]

    df = pd.read_csv('lianjia_beke/' + csv_name)
    store_locations_gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['gpsx'], df['gpsy']))
    store_locations_cpp = [list(geom.coords[0]) for geom in store_locations_gdf['geometry']]

    # The extension returns a pair (indices, distances) with one entry per
    # store; the indices presumably are positions in community_locations_cpp.
    nearest_community_indices_list, nearest_community_distances_list = \
        nearest_community_cpp.find_nearest_communities(
            store_locations_cpp, community_locations_cpp, num_nearest_communities)

    # Store both results next to the store rows and write them out under the
    # same file name as the input.
    store_locations_gdf['nearest_community_indices'] = nearest_community_indices_list
    store_locations_gdf['nearest_community_distances'] = nearest_community_distances_list

    store_locations_gdf.to_csv('nearest_community/' + csv_name, index = False)

Now we merge the results back into our original dataframe. Note that the stored indices are ordered as follows:

1. The index of the nearest community to the store location.
2. The index of the second nearest community to the store location.
3. If there are more communities to consider (i.e., num_nearest_communities is greater than 2), the indices of the subsequent nearest communities will follow.

In [12]:
# Preview the nearest-community output written for the first frame.
communities_gdf = dataframes_list[0]
first_csv_name = communities_gdf['chinese_csv'].unique()[0]
df = pd.read_csv('nearest_community/' + first_csv_name)
df.head(5)

Unnamed: 0,name,type,gpsx,gpsy,lianjia2,geometry,nearest_community_indices,nearest_community_distances
0,链家(潘家园店),生活服务;中介机构;中介机构|购物服务;购物相关场所;购物相关场所,116.456334,39.875065,链家(潘家园店),POINT (116.456333588003 39.8750648736051),"[3016, 870]","[0.017490703063189963, 0.05503946806844407]"
1,链家(团结湖店),生活服务;中介机构;中介机构|购物服务;购物相关场所;购物相关场所,116.461314,39.927974,链家(团结湖店),POINT (116.461313947982 39.9279739298663),"[1591, 1578]","[0.19038161099272874, 0.23991664301711862]"
2,链家(团结湖路),生活服务;中介机构;中介机构|购物服务;购物相关场所;购物相关场所,116.461018,39.926613,链家(团结湖路),POINT (116.461018145355 39.9266132293729),"[1591, 1594]","[0.08929879133855328, 0.14133020776835956]"
3,链家(金星园店),生活服务;中介机构;中介机构|购物服务;购物相关场所;购物相关场所,116.449045,39.966118,链家(金星园店),POINT (116.449045305641 39.9661181413891),"[4814, 2183]","[0.023880360645829844, 0.13137326907335756]"
4,链家(花家地西里店),生活服务;中介机构;中介机构|购物服务;购物相关场所;购物相关场所,116.451689,39.987442,链家(花家地西里店),POINT (116.451689294448 39.987442214481696),"[519, 4217]","[0.22282636588868104, 0.26696901573145504]"


In [32]:
def _nearest_flags(index_strings, n_communities):
    """Build the two 0/1 flag arrays for one frame of communities.

    Parameters
    ----------
    index_strings : iterable
        One entry per store, e.g. "[3016, 870]": row positions of the nearest
        communities, nearest first.
    n_communities : int
        Number of rows in the community frame the positions refer to.

    Returns
    -------
    (nearest_1, nearest_2) : tuple of int64 arrays of length n_communities
        nearest_1[p] is 1 if the community at position p is the nearest one
        for some store; nearest_2[p] is 1 if it is among the two nearest.
    """
    nearest_1 = np.zeros(n_communities, dtype='int64')
    nearest_2 = np.zeros(n_communities, dtype='int64')
    for index_string in index_strings:
        # str() keeps a missing value (NaN) from breaking the regex; it simply
        # yields no positions.
        positions = [int(num) for num in re.findall(r'\d+', str(index_string))]
        if len(positions) > 0 and positions[0] < n_communities:
            nearest_1[positions[0]] = 1
            nearest_2[positions[0]] = 1
        if len(positions) > 1 and positions[1] < n_communities:
            nearest_2[positions[1]] = 1
    return nearest_1, nearest_2


marked_frames = []
for i in range(0, len(dataframes_list)):
    communities_gdf = pd.DataFrame(dataframes_list[i])

    df = pd.read_csv('nearest_community/' + communities_gdf['chinese_csv'].unique()[0])

    # The saved indices are row POSITIONS within this frame (they were computed
    # from a plain list of coordinates), while the frame still carries the row
    # labels of the full table. The flags are therefore built by position and
    # assigned as arrays, instead of being looked up by label with .loc.
    nearest_1, nearest_2 = _nearest_flags(df['nearest_community_indices'], len(communities_gdf))
    communities_gdf['nearest_index_1'] = nearest_1
    communities_gdf['nearest_index_2'] = nearest_2

    marked_frames.append(communities_gdf)

# Concatenate once. Later frames come first, matching the row order produced
# by prepending each frame in turn.
revised_dataframe = pd.concat(marked_frames[::-1]) if marked_frames else pd.DataFrame()

In [36]:
# Save the table with the nearest_index_1 / nearest_index_2 flags to CSV
# without writing the row index.
revised_dataframe.to_csv('cleaned_district_Jan_3.csv', index = False)