In [1]:
import networkx as nx
import pybind11
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import geopandas as gpd
import shapely 
import os
import sys
from shapely import wkt
from shapely.geometry import Point
import random
from geopy.distance import geodesic
import re




In [2]:
# Directory that holds the `network_formulation` module, and the project
# directory against which the relative paths used later are resolved.
NETWORK_MODULE_DIR = '/home/xuyuan/Desktop/2024 summer/real estate paper/writing/RealEstateBrokerage/network_estimation'
PROJECT_DIR = '/home/xuyuan/Desktop/2024 summer/real estate paper/oritignal cleaning/RealEstateBrokerage'

# Import the module from inside its own directory, then switch the working
# directory to the project folder for the rest of the notebook.
os.chdir(NETWORK_MODULE_DIR)
import network_formulation
os.chdir(PROJECT_DIR)

In [14]:
# City names listed in code order: numeric city_id code k (1-based) maps to
# the k-th name.
city_names = [
    '北京市',
    '成都市',
    '重庆市',
    '广州市',
    '杭州市',
    '南京市',
    '上海市',
    '深圳市',
    '天津市',
    '武汉市',
]
codebook = dict(enumerate(city_names, start=1))

# Load the panel and replace the numeric city codes by the city names.
data = pd.read_stata('template.dta')
data['city_id'] = data['city_id'].map(codebook)
print(len(data))

217200


In [15]:
# Remove the columns this notebook regenerates, if an earlier run left them in
# the data; labels that are absent are skipped instead of raising.
data.drop(columns=['influence', 'community'], inplace=True, errors='ignore')

In [16]:
def calculate_influential_effects(df_network, df_edges):
    """Accumulate, for every community, the influence reaching it through stores.

    Each community starts from the stores it is directly linked to in
    ``df_network``; every such link counts with weight 1.  Influence then
    spreads breadth-first along the undirected store-to-store links of
    ``df_edges``: a store reached over a link gets the effect of the store it
    was discovered from multiplied by that link's ``effect``.  Each store is
    counted once per community, through the first path that reaches it, and
    the community's influence is the sum of the effects of all reached stores.

    Parameters
    ----------
    df_network : pandas.DataFrame
        One row per (store, community) link, with integer-valued columns
        ``store_id`` and ``community_id``.
    df_edges : pandas.DataFrame
        One row per store-to-store link, with integer-valued columns
        ``store_id_1`` and ``store_id_2`` and a numeric ``effect`` column.
        If a pair appears more than once, the last ``effect`` is used.

    Returns
    -------
    dict
        Maps the label ``'Community <id>.0'`` to the accumulated influence,
        ordered by first appearance of the community in ``df_network``.
    """
    from collections import deque

    # Ids are read column-wise and cast to int.  Row-wise access (iterrows)
    # upcasts integer ids to floats whenever the frame also holds a float
    # column, so the same store could end up with different labels in the two
    # frames and the links would silently fail to connect.
    direct_stores = {}
    for store, community in zip(df_network['store_id'], df_network['community_id']):
        # Inner dict doubles as an insertion-ordered set; the value is the
        # weight of the community-store link.
        direct_stores.setdefault(int(community), {})[int(store)] = 1

    # Undirected store-to-store adjacency: store -> {neighbor: link effect}.
    store_links = {}
    for first, second, effect in zip(
        df_edges['store_id_1'], df_edges['store_id_2'], df_edges['effect']
    ):
        first, second = int(first), int(second)
        store_links.setdefault(first, {})[second] = effect
        store_links.setdefault(second, {})[first] = effect

    community_influence = {}
    for community, seeds in direct_stores.items():
        # Breadth-first search seeded with the directly linked stores.
        visited = set(seeds)
        queue = deque(seeds.items())  # (store, cumulative effect)
        total = 0
        while queue:
            current_store, current_effect = queue.popleft()
            for neighbor, edge_weight in store_links.get(current_store, {}).items():
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append((neighbor, current_effect * edge_weight))
            total += current_effect
        community_influence[f'Community {community}.0'] = total

    return community_influence

In [17]:
# Distance threshold (in metres) handed to network_formation;
# this is what we find in the RD design.
within_distance_meters = 410.0

# One merged frame per (city, year) slice; concatenated once after the loop.
slice_results = []
for city in data['city_id'].unique():
    city_data = data[data['city_id'] == city]
    # Loop over the years observed for this city, so every slice that exists
    # is processed even when cities do not cover the same set of years.
    for year in city_data['year'].unique():
        map_data = city_data[city_data['year'] == year]
        map_data = gpd.GeoDataFrame(map_data, geometry=map_data['geometry'].apply(wkt.loads))
        df = pd.read_csv("classifying brokerages/processed/{}_{}.csv".format(year - 2000, city))
        map_data['longitude'] = map_data['geometry'].apply(lambda point: point.x)
        map_data['latitude'] = map_data['geometry'].apply(lambda point: point.y)

        effects = map_data['number'].values
        stores = df[['gpsx', 'gpsy']].values.tolist()
        communities = map_data[['longitude', 'latitude']].values.tolist()

        # Perform network formation
        network, edges = network_formulation.network_formation(stores, communities, effects, within_distance_meters)

        df_edges = pd.DataFrame(edges, columns=["store_id_1", "store_id_2", "effect"])

        # `network` is indexed by store position; each entry lists the
        # (community, effect) pairs attached to that store.
        df_network = pd.DataFrame(
            [
                (store_idx, comm, effect)
                for store_idx, comm_effects in enumerate(network)
                for comm, effect in comm_effects
            ],
            columns=["store_id", "community_id", "effect"],
        )

        # Calculate the influential effect for each community
        community_influence = calculate_influential_effects(df_network, df_edges)

        community_influence_df = pd.DataFrame(list(community_influence.items()), columns=['community', 'influence'])
        # Recover the integer community id from the 'Community <id>...' label.
        community_influence_df['community_id'] = community_influence_df['community'].str.extract(r'(\d+)').astype(int)

        # Community ids are positions in `communities`, i.e. row positions of
        # this slice, so the slice is re-indexed from 0 before merging.
        map_data = map_data.reset_index(drop=True)
        map_data['community_id'] = map_data.index
        merged_data = pd.merge(map_data, community_influence_df, on='community_id', how='left')

        # Communities that received no influence entry get an influence of 0.
        merged_data['influence'] = merged_data['influence'].fillna(0)

        slice_results.append(merged_data)

combined_result = pd.concat(slice_results, ignore_index=True)

In [18]:
# Missing values per column of the combined result.
print(combined_result.isna().sum())

building_type        0
village              0
district             0
floor_level          0
new_lng              0
                 ...  
longitude            0
latitude             0
community_id         0
community        74466
influence            0
Length: 118, dtype: int64


In [19]:
# Row count of the combined result, for comparison with the loaded data.
print(combined_result.shape[0])

217200


In [21]:
# Keep only the merge keys and the computed influence for export.
export_columns = ['id', 'year', 'influence']
combined_result = combined_result[export_columns]

## NOTE

This is the exported network-effect data, which should be merged back into the original files.

It should be usable once the analysis part is done.

In [22]:
# Write the influence table to CSV without the index column.
# NOTE(review): the Stata snippet below imports "to_merged_with_network.csv",
# not this filename; confirm the file is renamed before the merge.
export_path = 'combined_result-with-network.csv'
combined_result.to_csv(export_path, index=False, encoding='utf-8')

// Merge the exported network-effect result back into the Stata files.
// NOTE(review): the notebook writes 'combined_result-with-network.csv', but
// this script imports "to_merged_with_network.csv"; confirm the file is
// renamed first, or point the import at the exported name.

import delimited "to_merged_with_network.csv", clear
* Sort the imported CSV file by id and year
sort id year

* Keep a .dta copy of the CSV; it is the `using` file of both merges below
save "temp_csv.dta", replace

use "template.dta", clear
	
sort id year

* Match on (id, year) and overwrite template.dta with the merged data.
* NOTE(review): merge adds a _merge variable that is saved with the file;
* re-running this script on the saved file would presumably stop because
* _merge already exists. Confirm, then drop it or use the nogenerate option.
merge 1:1 id year using "temp_csv.dta"
save "template.dta", replace

// Stata code to merge the same CSV data into individual.dta

use "individual.dta", clear

* The same _merge caveat applies to the saved individual.dta
merge n:1 id year using "temp_csv.dta"
save "individual.dta", replace