In [10]:
import scipy
import networkx as nx
import pybind11
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import geopandas as gpd
import shapely 
import os
import sys
from shapely import wkt
from shapely.geometry import Point
import random
from geopy.distance import geodesic
import re

In [11]:
# Folder that holds the network_formulation module, and folder that holds the
# cleaned data files read with relative paths further down.
network_module_dir = '/home/xuyuan/Desktop/2024 summer/real estate paper/writing/RealEstateBrokerage/network_estimation'
cleaned_data_dir = '/home/xuyuan/Desktop/2024 summer/real estate paper/oritignal cleaning/RealEstateBrokerage'

# Step into the module folder for the import (presumably so the module is
# resolved from the working directory), then switch to the data folder.
os.chdir(network_module_dir)
import network_formulation
os.chdir(cleaned_data_dir)

In [12]:
# Load the dataset and replace the numeric city codes with city names.
data = pd.read_stata('template.dta')

# City names listed in code order: code k maps to the k-th entry, starting at 1.
city_names = [
    '北京市', '成都市', '重庆市', '广州市', '杭州市',
    '南京市', '上海市', '深圳市', '天津市', '武汉市',
]
codebook = dict(enumerate(city_names, start=1))

# Series.map leaves NaN wherever a code has no entry in the codebook.
data['city_id'] = data['city_id'].map(codebook)
print(len(data))

217200


In [13]:
# Quick check: count the rows for Beijing ('北京市') in the year 2021.
is_beijing = data['city_id'] == '北京市'
is_2021 = data['year'] == 2021
map_data = data[is_beijing & is_2021]
print(len(map_data))

0


In [14]:
def construct_graph(df_network, df_edges):
    """Build an undirected graph of stores and communities.

    Parameters
    ----------
    df_network : pandas.DataFrame
        Store-to-community links. The columns ``store_id`` and
        ``community_id`` are read; any ``effect`` column is not used because
        these links always get weight 1.
    df_edges : pandas.DataFrame
        Store-to-store links. The columns ``store_id_1``, ``store_id_2`` and
        ``effect`` are read; ``effect`` becomes the edge weight.

    Returns
    -------
    networkx.Graph
        Nodes are labelled ``'Store <id>'`` (color red, shape 'o') and
        ``'Community <id>'`` (color blue, shape 's'). Store-community edges
        are gray/dotted with weight 1; store-store edges are black/solid with
        weight ``effect``.
    """
    G = nx.Graph()

    # Every store mentioned in either table becomes a node.
    store_ids = set(df_network['store_id']).union(df_edges['store_id_1']).union(df_edges['store_id_2'])
    for store_id in store_ids:
        G.add_node(f'Store {store_id}', color='red', shape='o')

    # Add nodes for communities
    for community_id in set(df_network['community_id']):
        G.add_node(f'Community {community_id}', color='blue', shape='s')

    # Walk the id columns directly instead of using iterrows(): iterrows()
    # packs each row into a single Series, so integer ids that sit next to a
    # float ``effect`` column come back as floats. The edge endpoints would
    # then be labelled 'Store 3.0' and would not match the 'Store 3' nodes
    # created above, silently adding attribute-less duplicate nodes.
    for store_id, community_id in zip(df_network['store_id'], df_network['community_id']):
        # Store-community links all carry the same weight of 1.
        G.add_edge(f'Store {store_id}', f'Community {community_id}', weight=1, color='gray', style='dotted')

    # Add edges based on df_edges
    for store_id_1, store_id_2, effect in zip(df_edges['store_id_1'], df_edges['store_id_2'], df_edges['effect']):
        G.add_edge(f'Store {store_id_1}', f'Store {store_id_2}', weight=effect, color='black', style='solid')

    return G

In [15]:
def calculate_summary_statistics(centrality_dict):
    """Summarise the values of a node -> score mapping.

    Parameters
    ----------
    centrality_dict : dict
        Mapping whose values are numeric scores; the keys are ignored.

    Returns
    -------
    dict
        The keys ``'max'``, ``'mean'`` and ``'median'`` of the values. For an
        empty mapping all three are NaN.
    """
    values = list(centrality_dict.values())
    if not values:
        # max() raises ValueError on an empty sequence while np.mean and
        # np.median yield NaN (with a warning); report NaN for all three so
        # an empty graph gives a uniform "missing" row instead of a crash.
        nan = float('nan')
        return {'max': nan, 'mean': nan, 'median': nan}
    return {
        'max': max(values),
        'mean': np.mean(values),
        'median': np.median(values)
    }

In [16]:
# One summary row of network measures per (city, year) combination.
results = []

# Distance threshold handed to network_formation (meters, going by the name);
# this is what we find in the RD design. Constant across iterations.
within_distance_meters = 410.0

for city in data['city_id'].unique():
    city_data = data[data['city_id'] == city]
    # Iterate over the years observed for THIS city. Counting the city's
    # years but looking them up in the global year list would process the
    # wrong years whenever a city does not cover exactly the global list.
    for year in city_data['year'].unique():
        print('now working on {} in year {}'.format(city, year))
        map_data = city_data[city_data['year'] == year]
        # The 'geometry' column holds WKT text; parse it into shapely objects.
        map_data = gpd.GeoDataFrame(map_data, geometry = map_data.geometry.apply(wkt.loads))
        # Store coordinates come from a per-city/per-year CSV whose name uses
        # the year minus 2000.
        df = pd.read_csv("classifying brokerages/processed/{}_{}.csv".format(year - 2000, city))
        map_data['longitude'] = map_data['geometry'].apply(lambda point: point.x)
        map_data['latitude'] = map_data['geometry'].apply(lambda point: point.y)

        effects = map_data['number'].values
        stores = df[['gpsx', 'gpsy']].values.tolist()
        communities = map_data[['longitude', 'latitude']].values.tolist()

        # Perform network formation
        network, edges = network_formulation.network_formation(stores, communities, effects, within_distance_meters)

        df_edges = pd.DataFrame(edges, columns=["store_id_1", "store_id_2", "effect"])
        # network[x] is iterated as (community, effect) pairs for store x;
        # flatten it into one row per store-community link.
        df_network = pd.DataFrame(
            [
                (x, comm, effect)
                for x, comm_effects in enumerate(network)
                for comm, effect in comm_effects
            ],
            columns=["store_id", "community_id", "effect"],
        )

        G = construct_graph(df_network, df_edges)

        # Per-node clustering is not part of the summary row; it stays
        # available for inspection after the loop.
        local_clustering = nx.clustering(G, weight='weight')
        average_clustering = nx.average_clustering(G, weight='weight')
        global_clustering = nx.transitivity(G)

        # NOTE(review): the path-based measures below read 'weight' as a
        # distance, so a larger effect means "further apart" -- confirm that
        # this is the intended interpretation.
        degree_centrality = nx.degree_centrality(G)
        betweenness_centrality = nx.betweenness_centrality(G, weight='weight')
        closeness_centrality = nx.closeness_centrality(G, distance='weight')

        # Collapse each per-node measure into max / mean / median.
        degree_summary = calculate_summary_statistics(degree_centrality)
        betweenness_summary = calculate_summary_statistics(betweenness_centrality)
        closeness_summary = calculate_summary_statistics(closeness_centrality)

        pagerank = nx.pagerank(G, weight='weight')
        pagerank_summary = calculate_summary_statistics(pagerank)

        connectivity = nx.node_connectivity(G)

        result = {
            'city_id': city,
            'year': year,
            'average_clustering': average_clustering,
            'global_clustering': global_clustering,
            'degree_centrality_max': degree_summary['max'],
            'degree_centrality_mean': degree_summary['mean'],
            'degree_centrality_median': degree_summary['median'],
            'betweenness_centrality_max': betweenness_summary['max'],
            'betweenness_centrality_mean': betweenness_summary['mean'],
            'betweenness_centrality_median': betweenness_summary['median'],
            'closeness_centrality_max': closeness_summary['max'],
            'closeness_centrality_mean': closeness_summary['mean'],
            'closeness_centrality_median': closeness_summary['median'],
            'pagerank_max': pagerank_summary['max'],
            'pagerank_mean': pagerank_summary['mean'],
            'pagerank_median': pagerank_summary['median'],
            'connectivity': connectivity
        }

        results.append(result)

now working on 北京市 in year 2016


In [None]:
# Turn the list of per-city, per-year result dicts into a single table
results_df = pd.DataFrame(results)

# Write the table to a CSV file (without the row index) if needed
output_csv = 'network_measures_summary.csv'
results_df.to_csv(output_csv, index=False)