<a href="https://colab.research.google.com/github/sayarghoshroy/LoClus/blob/main/cluster_neighborhoods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
import math
import tqdm
import json
import pandas as pd

In [2]:
# Raise the interpreter's recursion limit far above the default so that
# deeply recursive code in this notebook does not hit RecursionError.
sys.setrecursionlimit(10 ** 7)

In [3]:
%%capture
# Mount Google Drive at /content/drive (Colab only).
# The %%capture cell magic silences this cell's output and has to stay on the
# first line of the cell.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# Folder holding the input JSON files (relative to the working directory).
path = 'drive/MyDrive/place2crash_data/'

# The inputs are only ever read, so they are opened read-only ('r').
# 'r+' would additionally demand write permission on the data files.
with open(path + 'train_data.json', 'r') as f:
  train_data = json.load(f)

with open(path + 'test_data.json', 'r') as f:
  test_data = json.load(f)

# Feature name -> column ID mapping.
with open(path + 'head_to_col_id.json', 'r') as f:
  mapping = json.load(f)

# Train and test records are pooled into a single list.
raw_data = train_data + test_data
# Not utilizing price labels

In [5]:
# Uncomment to view the Feature name to ID mapping:
# mapping

In [6]:
# Raw data format:
# every record is a flat list; the positions read further down are
# 0 (id), 3 (neighborhood), 4 (latitude), 5 (longitude) and 6 (room type).
print(raw_data[0])

[30913224, 'Cozy and Sunny Room Williamsburg, Luxury Building', 'Brooklyn', 'Williamsburg', 40.70959, -73.94652, 'Private room', 80, 3, 2, 0.31, 0]


In [7]:
# Convert every raw record (a flat list) into a dict that keeps only the
# fields used for clustering.
data = []

for item in raw_data:
    point = {
        'id': item[0],
        'representative_point': {'latitude': float(item[4]),
                                 'longitude': float(item[5])},
        'neighborhood': item[3],
        'room_type': item[6],
    }
    data.append(point)

# Number of converted records
size = len(data)

In [8]:
# Sanity check: show the first converted record under a heading
print('First Datapoint:', data[0], sep = '\n')

First Datapoint:
{'id': 30913224, 'representative_point': {'latitude': 40.70959, 'longitude': -73.94652}, 'neighborhood': 'Williamsburg', 'room_type': 'Private room'}


In [9]:
# How many records were converted
print('Number of Points: {}'.format(len(data)))

Number of Points: 48895


In [10]:
# Group the datapoints by the value they hold for `argument`

argument = 'neighborhood'

category_dict = {}
valid_datapoints = 0

for item in data:
    # Datapoints lacking the field are left out of every category
    if argument in item:
        category_dict.setdefault(item[argument], []).append(item)
        valid_datapoints += 1

print('Number of datapoints having {} = {}'.format(argument, valid_datapoints))

Number of datapoints having neighborhood = 48895


In [11]:
# Per-category datapoint counts, largest category first
print('Number of categories: {}'.format(len(category_dict)))

stats = [[key, len(members)] for key, members in category_dict.items()]

df = (pd.DataFrame(stats, columns = [argument, 'count'])
        .sort_values(by = 'count', ascending = False)
        .reset_index(drop = True))
display(df)
df.to_csv(path + 'results/' + argument + '-wise-stats.csv', index = False)

Number of categories: 221


Unnamed: 0,neighborhood,count
0,Williamsburg,3920
1,Bedford-Stuyvesant,3714
2,Harlem,2658
3,Bushwick,2465
4,Upper West Side,1971
...,...,...
216,Richmondtown,1
217,Willowbrook,1
218,Rossville,1
219,New Dorp,1


In [12]:
def comp_distance(A, B, radius = 6.3781 * 1e6):
    """Great-circle (haversine) distance between two points.

    Parameters:
        A, B: sequences whose first two entries are latitude and longitude
            in degrees.
        radius: sphere radius; the result is expressed in the same unit.
            Defaults to the Earth radius in meters used by this notebook.

    Returns:
        The distance along the sphere between A and B (meters by default).
    """
    lat_A = math.radians(A[0])
    lat_B = math.radians(B[0])
    long_A = math.radians(A[1])
    long_B = math.radians(B[1])

    diff_lat = lat_B - lat_A
    diff_long = long_B - long_A

    step_A = math.sin(diff_lat / 2) ** 2
    step_B = math.cos(lat_A) * math.cos(lat_B) * math.sin(diff_long / 2) ** 2

    # For (near-)antipodal points rounding can push the sum marginally above 1,
    # which would make asin() raise ValueError; clamp it to the valid domain.
    haversine = min(1.0, step_A + step_B)

    comp = 2 * math.asin(math.sqrt(haversine))

    # Central angle times radius gives the arc length
    return radius * comp

def get_point(point_struct):
    """Return [latitude, longitude] taken from point_struct.

    Any failure while reading the two keys (missing key, wrong type, ...)
    yields the placeholder [0, 0] instead of raising.
    """
    try:
        return [point_struct['latitude'], point_struct['longitude']]
    except Exception:
        return [0, 0]

def get_distance(point_struct_A, point_struct_B):
    """Distance between two point structs, as computed by comp_distance.

    Each struct is first turned into a [latitude, longitude] pair with
    get_point.
    """
    coords_A = get_point(point_struct_A)
    coords_B = get_point(point_struct_B)
    return comp_distance(coords_A, coords_B)

def get_point_distance(item_A, item_B):
    """Distance between the 'representative_point' entries of two items."""
    return get_distance(item_A['representative_point'],
                        item_B['representative_point'])

# Smoke tests: one fixed coordinate pair and the first two datapoints
print('comp_distance([53.32, -1.73], [53.321, -1.69]): {} meters.'.format(
    comp_distance([53.32, -1.73], [53.321, -1.69])))

print('get_point_distance(data[0], data[1]): {} meters.'.format(
    get_point_distance(data[0], data[1])))

comp_distance([53.32, -1.73], [53.321, -1.69]): 2662.128538636821 meters.
get_point_distance(data[0], data[1]): 9487.843744689368 meters.


In [13]:
# To run clustering for a particular neighborhood.
# The lookup below only makes sense when category_dict was grouped by
# neighborhood, so check that explicitly instead of evaluating a bare
# comparison whose result is thrown away.
assert argument == 'neighborhood', "category_dict must be grouped by 'neighborhood'"

# Enter neighborhood
neighborhood_name = 'Harlem'

# Datapoints belonging to the chosen neighborhood
items = category_dict[neighborhood_name]

In [14]:
# To run for all neighborhoods
# neighborhood = 'all'
# items = data

In [15]:
# Two items become neighbours in the adjacency lists when the distance
# between them is <= this value.
threshold = 50
# Set a threshold distance in meters

In [16]:
# Optional check: compute every ordered pairwise distance among `items`

# Mark True for testing:
check_distances = False

if check_distances:

    print('Neighborhood: {}'.format(neighborhood_name), flush = True)

    distances = []
    count = 0

    for item_A in tqdm.tqdm(items):
        for item_B in items:
            # An item is never compared with itself
            if item_A['id'] != item_B['id']:
                distances.append(get_point_distance(item_A, item_B))
                # A bool adds as 0 or 1
                count += distances[-1] <= threshold

    print('Total number of distances: {}'.format(len(distances)), flush = True)
    print('Number of pairwise distances <= {} meters : {}'.format(threshold, count), flush = True)

In [17]:
# Preparing global data structures

# item ID -> item record
map_items = {}
# item ID -> list of IDs of the items within `threshold` of it
adj_items = {}

def initialize():
    """Reset both global lookup tables (map_items and adj_items) to empty dicts."""
    # Both names must be declared global; otherwise the assignment below
    # would only bind a throw-away local and leave the global untouched.
    global map_items, adj_items
    map_items = {}
    adj_items = {}
    return
    
def build_map():
    """Index the global `items` by their 'id' inside the global map_items.

    When several items share an id, the last one wins.
    """
    global items, map_items
    map_items.update((entry['id'], entry) for entry in items)
    return

def build_adj():
    """Fill the global adj_items with the neighbours of every item.

    Every key of map_items gets an (initially empty) adjacency list. Two items
    with different ids are neighbours when get_point_distance() between them is
    <= the global `threshold`. The distance is symmetric, so each unordered
    pair is measured once and the link is recorded in both directions; with
    unique ids each list ends up in `items` order.
    """
    global items, adj_items

    for key in map_items.keys():
        adj_items[key] = []

    total = len(items)

    for index_A, item_A in enumerate(tqdm.tqdm(items)):
        id_A = item_A['id']

        # Only look at items after item_A: earlier pairs were already handled
        for index_B in range(index_A + 1, total):
            item_B = items[index_B]
            id_B = item_B['id']

            if id_A == id_B:
                continue

            if get_point_distance(item_A, item_B) <= threshold:
                # Creating the adjacency lists (both directions at once)
                adj_items[id_A].append(id_B)
                adj_items[id_B].append(id_A)
    return

# Uncomment for testing:
# initialize()
# build_map()
# build_adj()

In [18]:
# Spot check: print the first item that has at least four neighbours
check = True

if check:
    for key in adj_items.keys():
        if len(adj_items[key]) >= 4:
            print('{}: '.format(key))
            print(adj_items[key])
            break

In [19]:
# item ID -> length of its adjacency list
degree_items = {}
# item ID -> ID of the cluster it was assigned to
cluster_items = {}
# cluster ID -> number of items carrying that ID
cluster_to_size_map = {}

def compute_degrees():
    """Record, for every key of adj_items, the length of its adjacency list."""
    global adj_items, degree_items
    degree_items.update((key, len(neighbours)) for key, neighbours in adj_items.items())
    return
        
# ID handed to the items of the cluster currently being traversed
cluster_ID = 0

def dft(item_ID):
    """Depth-first traversal that labels a whole connected component.

    Starting at item_ID, every item reachable through adj_items that has no
    entry in cluster_items yet is stored there with the current cluster_ID.
    Nothing happens when item_ID is already labelled.

    The traversal keeps its own stack instead of recursing, so a very large
    component cannot exhaust the call stack; items are visited in the same
    order a recursive traversal would use.
    """
    global adj_items, cluster_items, cluster_ID

    if item_ID in cluster_items:
        return

    cluster_items[item_ID] = cluster_ID

    # One partially consumed neighbour iterator per item on the current path
    pending = [iter(adj_items[item_ID])]

    while pending:
        for link in pending[-1]:
            if link not in cluster_items:
                cluster_items[link] = cluster_ID
                # Descend into the newly labelled item before its siblings
                pending.append(iter(adj_items[link]))
                break
        else:
            # All neighbours of the item on top are handled: backtrack
            pending.pop()

    return
    
def connected_components():
    """Give every key of map_items a cluster ID via dft().

    cluster_ID restarts at 0 and is increased by one before each traversal
    that starts from an item without an entry in cluster_items.
    """
    global map_items, adj_items, cluster_items, cluster_ID

    # Local switch for printing the number of clusters found
    verbose = False

    cluster_ID = 0

    for node in map_items:
        if node not in cluster_items:
            cluster_ID += 1
            dft(node)

    if verbose:
        print('Number of Clusters = ' + str(cluster_ID))
    return

def map_cluster_to_size():
    """Count, in cluster_to_size_map, how many items carry each cluster ID.

    Counts are added on top of whatever cluster_to_size_map already holds.
    """
    global cluster_items, cluster_to_size_map

    for label in cluster_items.values():
        cluster_to_size_map[label] = cluster_to_size_map.get(label, 0) + 1

    return

# Uncomment for testing:
# compute_degrees()
# connected_components()
# map_cluster_to_size()

In [20]:
# Rows collected by store(), one per item, across all processed neighborhoods
all_neighborhood_info = []
# When True, the results are also written out as JSON files
save_jsons = True

def store():
    """Persist the clustering results of the current neighborhood.

    Every record in map_items is annotated in place with its 'degree' and
    'cluster'. One row per item is appended to all_neighborhood_info, and the
    rows are written as a sorted CSV under path + 'results/'. When save_jsons
    is set, map_items is dumped next to it as JSON.
    """
    global degree_items, cluster_items, map_items, save_jsons, cluster_to_size_map
    global neighborhood_name, threshold, all_neighborhood_info

    # Local switch for showing the table inline
    show_table = False
    item_info = []

    for key, record in map_items.items():
        record['degree'] = degree_items[key]
        record['cluster'] = cluster_items[key]

        label = record['cluster']
        location = record['representative_point']

        item_info.append([key, label, record['degree'],
                          location['latitude'],
                          location['longitude'],
                          record['neighborhood'], cluster_to_size_map[label],
                          str(str(label) + '_' + record['neighborhood'])])

    all_neighborhood_info += item_info

    # Common prefix of both output files
    file_stem = '-'.join([path + 'results/' + argument, neighborhood_name, str(threshold)])

    if save_jsons:
        with open(file_stem + '-map_cluster_info.json', 'w+') as f:
            json.dump(map_items, f)

    columns = ['item ID', 'Cluster ID', 'Degree', 'latitude', 'longitude',
               'Neighborhood', 'Cluster_Size', 'ID_Neighborhood']
    df = (pd.DataFrame(item_info, columns = columns)
            .sort_values(['Neighborhood', 'Cluster_Size', 'Cluster ID', 'Degree'],
                         ascending = (True, False, True, False))
            .reset_index(drop = True))

    if show_table:
        display(df)

    df.to_csv(file_stem + '-cluster_info.csv', index = False)
    return
    
# Uncomment for testing:
# store()

In [21]:
# Cluster every neighborhood in turn; store() collects the rows here
all_neighborhood_info = []

# At most this many neighborhoods are visited ...
hood_count = int(1e3)
# ... and neighborhoods with more datapoints than this are skipped
consider_limit = int(1e4)

for serial, neighborhood_name in enumerate(category_dict.keys()):
    # serial is 0-based: stop once hood_count neighborhoods have been visited
    if serial >= hood_count:
        break

    items = category_dict[neighborhood_name]

    if len(items) > consider_limit:
        continue

    tqdm.tqdm.write('Neighborhood: ' + neighborhood_name)

    # Start from empty lookup tables for this neighborhood
    map_items = {}
    adj_items = {}

    initialize()
    build_map()
    build_adj()

    # Per-neighborhood clustering state also starts empty
    degree_items = {}
    cluster_items = {}
    cluster_to_size_map = {}

    compute_degrees()
    connected_components()
    map_cluster_to_size()

    store()
    tqdm.tqdm.write('')

Neighborhood: Williamsburg


100%|██████████| 3920/3920 [00:52<00:00, 74.94it/s]



Neighborhood: Upper West Side


100%|██████████| 1971/1971 [00:11<00:00, 170.28it/s]



Neighborhood: Clinton Hill


100%|██████████| 572/572 [00:00<00:00, 642.71it/s]



Neighborhood: Harlem


100%|██████████| 2658/2658 [00:22<00:00, 119.46it/s]



Neighborhood: Bedford-Stuyvesant


100%|██████████| 3714/3714 [00:45<00:00, 81.01it/s]



Neighborhood: Long Island City


100%|██████████| 537/537 [00:00<00:00, 675.16it/s]



Neighborhood: Rosedale


100%|██████████| 59/59 [00:00<00:00, 4754.35it/s]



Neighborhood: Cypress Hills


100%|██████████| 135/135 [00:00<00:00, 2108.83it/s]



Neighborhood: Nolita


100%|██████████| 253/253 [00:00<00:00, 1492.92it/s]



Neighborhood: Midtown


100%|██████████| 1545/1545 [00:07<00:00, 218.94it/s]



Neighborhood: Elmhurst


100%|██████████| 237/237 [00:00<00:00, 1591.72it/s]



Neighborhood: Bushwick


100%|██████████| 2465/2465 [00:18<00:00, 130.25it/s]



Neighborhood: Hell's Kitchen


100%|██████████| 1958/1958 [00:11<00:00, 169.01it/s]



Neighborhood: St. George


100%|██████████| 48/48 [00:00<00:00, 6441.01it/s]



Neighborhood: Arverne


100%|██████████| 77/77 [00:00<00:00, 4612.35it/s]



Neighborhood: Park Slope


100%|██████████| 506/506 [00:00<00:00, 723.48it/s]



Neighborhood: Crown Heights


100%|██████████| 1564/1564 [00:07<00:00, 216.65it/s]



Neighborhood: East Village


100%|██████████| 1853/1853 [00:10<00:00, 179.62it/s]



Neighborhood: Washington Heights


100%|██████████| 899/899 [00:02<00:00, 391.53it/s]



Neighborhood: Brownsville


100%|██████████| 61/61 [00:00<00:00, 5270.42it/s]



Neighborhood: Morris Heights


100%|██████████| 17/17 [00:00<00:00, 11971.65it/s]



Neighborhood: Greenpoint


100%|██████████| 1115/1115 [00:04<00:00, 271.75it/s]



Neighborhood: Chelsea


100%|██████████| 1113/1113 [00:03<00:00, 302.63it/s]



Neighborhood: Financial District


100%|██████████| 744/744 [00:01<00:00, 475.25it/s]



Neighborhood: Upper East Side


100%|██████████| 1798/1798 [00:13<00:00, 136.63it/s]



Neighborhood: SoHo


100%|██████████| 358/358 [00:00<00:00, 1018.37it/s]



Neighborhood: Parkchester


100%|██████████| 39/39 [00:00<00:00, 5035.64it/s]



Neighborhood: Theater District


100%|██████████| 288/288 [00:00<00:00, 1223.63it/s]



Neighborhood: Kensington


100%|██████████| 175/175 [00:00<00:00, 1936.29it/s]



Neighborhood: Prospect Heights


100%|██████████| 357/357 [00:00<00:00, 1003.95it/s]



Neighborhood: Windsor Terrace


100%|██████████| 157/157 [00:00<00:00, 2403.49it/s]



Neighborhood: Astoria


100%|██████████| 900/900 [00:02<00:00, 385.19it/s]



Neighborhood: Sunset Park


100%|██████████| 390/390 [00:00<00:00, 929.31it/s]



Neighborhood: Ridgewood


100%|██████████| 423/423 [00:00<00:00, 836.90it/s]



Neighborhood: Jackson Heights


100%|██████████| 186/186 [00:00<00:00, 1956.74it/s]



Neighborhood: East Harlem


100%|██████████| 1117/1117 [00:03<00:00, 312.02it/s]



Neighborhood: Fresh Meadows


100%|██████████| 32/32 [00:00<00:00, 7075.64it/s]



Neighborhood: North Riverdale


100%|██████████| 10/10 [00:00<00:00, 17303.23it/s]



Neighborhood: Woodside


100%|██████████| 235/235 [00:00<00:00, 1510.87it/s]



Neighborhood: West Village


100%|██████████| 768/768 [00:01<00:00, 463.78it/s]



Neighborhood: Ditmars Steinway


100%|██████████| 309/309 [00:00<00:00, 1156.57it/s]



Neighborhood: East Flatbush


100%|██████████| 500/500 [00:00<00:00, 696.04it/s]



Neighborhood: Brighton Beach


100%|██████████| 75/75 [00:00<00:00, 3563.72it/s]



Neighborhood: East New York


100%|██████████| 218/218 [00:00<00:00, 1461.88it/s]



Neighborhood: Midwood


100%|██████████| 109/109 [00:00<00:00, 3724.14it/s]



Neighborhood: Flushing


100%|██████████| 426/426 [00:00<00:00, 829.25it/s]



Neighborhood: Corona


100%|██████████| 64/64 [00:00<00:00, 4235.07it/s]



Neighborhood: Brooklyn Heights


100%|██████████| 154/154 [00:00<00:00, 2581.81it/s]



Neighborhood: Clifton


100%|██████████| 15/15 [00:00<00:00, 11733.41it/s]



Neighborhood: Pelham Bay


100%|██████████| 17/17 [00:00<00:00, 9535.06it/s]



Neighborhood: South Slope


100%|██████████| 284/284 [00:00<00:00, 1284.79it/s]



Neighborhood: Two Bridges


100%|██████████| 72/72 [00:00<00:00, 4524.94it/s]



Neighborhood: Highbridge


100%|██████████| 27/27 [00:00<00:00, 2856.58it/s]



Neighborhood: Bay Ridge


100%|██████████| 141/141 [00:00<00:00, 2633.82it/s]



Neighborhood: Fort Greene


100%|██████████| 489/489 [00:00<00:00, 713.51it/s]



Neighborhood: Chinatown


100%|██████████| 368/368 [00:00<00:00, 956.58it/s]



Neighborhood: Queens Village


100%|██████████| 60/60 [00:00<00:00, 4276.41it/s]



Neighborhood: Kips Bay


100%|██████████| 470/470 [00:00<00:00, 748.36it/s]



Neighborhood: Lower East Side


100%|██████████| 911/911 [00:02<00:00, 383.29it/s]



Neighborhood: Morningside Heights


100%|██████████| 346/346 [00:00<00:00, 1067.69it/s]



Neighborhood: Jamaica


100%|██████████| 231/231 [00:00<00:00, 1580.85it/s]



Neighborhood: Gramercy


100%|██████████| 338/338 [00:00<00:00, 1040.89it/s]



Neighborhood: Boerum Hill


100%|██████████| 177/177 [00:00<00:00, 2052.60it/s]



Neighborhood: Sheepshead Bay


100%|██████████| 164/164 [00:00<00:00, 2238.10it/s]



Neighborhood: Murray Hill


100%|██████████| 485/485 [00:00<00:00, 763.48it/s]



Neighborhood: Grymes Hill


100%|██████████| 7/7 [00:00<00:00, 19891.69it/s]



Neighborhood: Middle Village


100%|██████████| 31/31 [00:00<00:00, 6432.03it/s]



Neighborhood: Sunnyside


100%|██████████| 363/363 [00:00<00:00, 952.08it/s]



Neighborhood: Little Italy


100%|██████████| 121/121 [00:00<00:00, 3166.30it/s]



Neighborhood: Ozone Park


100%|██████████| 62/62 [00:00<00:00, 4546.59it/s]



Neighborhood: Riverdale


100%|██████████| 11/11 [00:00<00:00, 12901.94it/s]



Neighborhood: Prospect-Lefferts Gardens


100%|██████████| 535/535 [00:00<00:00, 660.75it/s]



Neighborhood: Unionport


100%|██████████| 7/7 [00:00<00:00, 17783.24it/s]



Neighborhood: Tribeca


100%|██████████| 177/177 [00:00<00:00, 2110.50it/s]



Neighborhood: Inwood


100%|██████████| 252/252 [00:00<00:00, 1446.33it/s]



Neighborhood: Throgs Neck


100%|██████████| 24/24 [00:00<00:00, 5875.40it/s]



Neighborhood: Red Hook


100%|██████████| 79/79 [00:00<00:00, 3166.66it/s]



Neighborhood: Columbia St


100%|██████████| 42/42 [00:00<00:00, 7883.68it/s]



Neighborhood: East Elmhurst


100%|██████████| 185/185 [00:00<00:00, 2112.79it/s]



Neighborhood: Maspeth


100%|██████████| 110/110 [00:00<00:00, 2732.11it/s]



Neighborhood: Tompkinsville


100%|██████████| 42/42 [00:00<00:00, 5333.20it/s]



Neighborhood: Borough Park


100%|██████████| 136/136 [00:00<00:00, 2432.54it/s]



Neighborhood: Dongan Hills


100%|██████████| 7/7 [00:00<00:00, 19354.07it/s]



Neighborhood: Gowanus


100%|██████████| 247/247 [00:00<00:00, 1477.63it/s]



Neighborhood: Far Rockaway


100%|██████████| 29/29 [00:00<00:00, 9139.29it/s]



Neighborhood: Canarsie


100%|██████████| 147/147 [00:00<00:00, 2403.60it/s]



Neighborhood: Flatbush


100%|██████████| 621/621 [00:01<00:00, 547.99it/s]



Neighborhood: Bensonhurst


100%|██████████| 75/75 [00:00<00:00, 4481.80it/s]



Neighborhood: Port Richmond


100%|██████████| 9/9 [00:00<00:00, 19152.07it/s]



Neighborhood: Stuyvesant Town


100%|██████████| 37/37 [00:00<00:00, 4323.78it/s]



Neighborhood: NoHo


100%|██████████| 78/78 [00:00<00:00, 5013.27it/s]



Neighborhood: Jamaica Hills


100%|██████████| 8/8 [00:00<00:00, 19273.08it/s]



Neighborhood: Forest Hills


100%|██████████| 144/144 [00:00<00:00, 2551.11it/s]



Neighborhood: Fort Hamilton


100%|██████████| 55/55 [00:00<00:00, 6131.05it/s]



Neighborhood: Civic Center


100%|██████████| 52/52 [00:00<00:00, 4062.58it/s]



Neighborhood: Edenwald


100%|██████████| 13/13 [00:00<00:00, 15129.29it/s]



Neighborhood: Carroll Gardens


100%|██████████| 233/233 [00:00<00:00, 1508.01it/s]



Neighborhood: Morrisania


100%|██████████| 18/18 [00:00<00:00, 10132.53it/s]



Neighborhood: Springfield Gardens


100%|██████████| 85/85 [00:00<00:00, 3630.99it/s]



Neighborhood: New Springville


100%|██████████| 8/8 [00:00<00:00, 19611.01it/s]



Neighborhood: Flatiron District


100%|██████████| 80/80 [00:00<00:00, 4112.57it/s]



Neighborhood: Cobble Hill


100%|██████████| 99/99 [00:00<00:00, 3603.01it/s]



Neighborhood: Soundview


100%|██████████| 15/15 [00:00<00:00, 6297.75it/s]



Neighborhood: Greenwich Village


100%|██████████| 392/392 [00:00<00:00, 867.36it/s]



Neighborhood: Briarwood


100%|██████████| 56/56 [00:00<00:00, 4775.07it/s]



Neighborhood: Woodlawn


100%|██████████| 11/11 [00:00<00:00, 15277.27it/s]



Neighborhood: Pelham Gardens


100%|██████████| 28/28 [00:00<00:00, 7781.12it/s]



Neighborhood: Howard Beach


100%|██████████| 20/20 [00:00<00:00, 3737.07it/s]



Neighborhood: Laurelton


100%|██████████| 18/18 [00:00<00:00, 11717.75it/s]



Neighborhood: Fordham


100%|██████████| 63/63 [00:00<00:00, 4162.13it/s]



Neighborhood: Woodhaven


100%|██████████| 88/88 [00:00<00:00, 3628.04it/s]



Neighborhood: Eastchester


100%|██████████| 13/13 [00:00<00:00, 15141.89it/s]



Neighborhood: Downtown Brooklyn


100%|██████████| 83/83 [00:00<00:00, 2690.26it/s]



Neighborhood: Richmond Hill


100%|██████████| 94/94 [00:00<00:00, 3154.55it/s]



Neighborhood: Rockaway Beach


100%|██████████| 56/56 [00:00<00:00, 4558.50it/s]



Neighborhood: Mount Hope


100%|██████████| 20/20 [00:00<00:00, 10468.75it/s]



Neighborhood: St. Albans


100%|██████████| 76/76 [00:00<00:00, 2765.47it/s]



Neighborhood: Hunts Point


100%|██████████| 18/18 [00:00<00:00, 10774.58it/s]



Neighborhood: Glendale


100%|██████████| 54/54 [00:00<00:00, 4392.96it/s]



Neighborhood: Concord


100%|██████████| 26/26 [00:00<00:00, 6041.66it/s]



Neighborhood: Norwood


100%|██████████| 31/31 [00:00<00:00, 7719.73it/s]



Neighborhood: Mill Basin


100%|██████████| 4/4 [00:00<00:00, 19737.90it/s]



Neighborhood: Rego Park


100%|██████████| 106/106 [00:00<00:00, 2666.07it/s]



Neighborhood: Bayswater


100%|██████████| 17/17 [00:00<00:00, 4381.42it/s]



Neighborhood: West Brighton


100%|██████████| 18/18 [00:00<00:00, 11275.01it/s]



Neighborhood: DUMBO


100%|██████████| 36/36 [00:00<00:00, 5193.29it/s]



Neighborhood: Belmont


100%|██████████| 24/24 [00:00<00:00, 6768.65it/s]



Neighborhood: East Morrisania


100%|██████████| 10/10 [00:00<00:00, 16644.06it/s]



Neighborhood: Oakwood


100%|██████████| 5/5 [00:00<00:00, 19691.57it/s]



Neighborhood: Shore Acres


100%|██████████| 7/7 [00:00<00:00, 20705.31it/s]



Neighborhood: Schuylerville


100%|██████████| 13/13 [00:00<00:00, 15499.13it/s]



Neighborhood: Port Morris


100%|██████████| 46/46 [00:00<00:00, 7776.94it/s]



Neighborhood: Fieldston


100%|██████████| 12/12 [00:00<00:00, 16018.98it/s]



Neighborhood: Longwood


100%|██████████| 62/62 [00:00<00:00, 4266.00it/s]



Neighborhood: Wakefield


100%|██████████| 50/50 [00:00<00:00, 4595.79it/s]



Neighborhood: Claremont Village


100%|██████████| 28/28 [00:00<00:00, 3363.61it/s]



Neighborhood: Bath Beach


100%|██████████| 17/17 [00:00<00:00, 12646.89it/s]



Neighborhood: Jamaica Estates


100%|██████████| 19/19 [00:00<00:00, 4123.98it/s]



Neighborhood: Kew Gardens


100%|██████████| 32/32 [00:00<00:00, 6682.82it/s]



Neighborhood: Kingsbridge


100%|██████████| 70/70 [00:00<00:00, 3866.53it/s]



Neighborhood: Flatlands


100%|██████████| 83/83 [00:00<00:00, 2144.46it/s]



Neighborhood: Bergen Beach


100%|██████████| 10/10 [00:00<00:00, 18315.74it/s]



Neighborhood: Allerton


100%|██████████| 42/42 [00:00<00:00, 2323.93it/s]



Neighborhood: Vinegar Hill


100%|██████████| 34/34 [00:00<00:00, 5418.38it/s]



Neighborhood: Williamsbridge


100%|██████████| 40/40 [00:00<00:00, 6686.01it/s]



Neighborhood: South Beach


100%|██████████| 8/8 [00:00<00:00, 19000.24it/s]



Neighborhood: Stapleton


100%|██████████| 27/27 [00:00<00:00, 8431.08it/s]



Neighborhood: Gravesend


100%|██████████| 68/68 [00:00<00:00, 3676.04it/s]



Neighborhood: University Heights


100%|██████████| 21/21 [00:00<00:00, 15873.20it/s]



Neighborhood: Battery Park City


100%|██████████| 70/70 [00:00<00:00, 2397.02it/s]



Neighborhood: Mott Haven


100%|██████████| 60/60 [00:00<00:00, 4243.39it/s]



Neighborhood: Van Nest


100%|██████████| 11/11 [00:00<00:00, 18724.57it/s]



Neighborhood: Kew Gardens Hills


100%|██████████| 26/26 [00:00<00:00, 7351.48it/s]



Neighborhood: Bayside


100%|██████████| 39/39 [00:00<00:00, 3798.48it/s]



Neighborhood: Bronxdale


100%|██████████| 19/19 [00:00<00:00, 10949.68it/s]



Neighborhood: Great Kills


100%|██████████| 10/10 [00:00<00:00, 18428.40it/s]



Neighborhood: Olinville


100%|██████████| 4/4 [00:00<00:00, 21592.30it/s]



Neighborhood: Randall Manor


100%|██████████| 19/19 [00:00<00:00, 8381.55it/s]



Neighborhood: Roosevelt Island


100%|██████████| 77/77 [00:00<00:00, 3119.10it/s]



Neighborhood: Little Neck


100%|██████████| 5/5 [00:00<00:00, 21013.55it/s]



Neighborhood: Concourse


100%|██████████| 50/50 [00:00<00:00, 4168.79it/s]



Neighborhood: Grant City


100%|██████████| 6/6 [00:00<00:00, 11066.77it/s]



Neighborhood: Concourse Village


100%|██████████| 32/32 [00:00<00:00, 7887.74it/s]



Neighborhood: Tottenville


100%|██████████| 7/7 [00:00<00:00, 23118.21it/s]



Neighborhood: Todt Hill


100%|██████████| 4/4 [00:00<00:00, 19395.63it/s]



Neighborhood: South Ozone Park


100%|██████████| 40/40 [00:00<00:00, 8480.62it/s]



Neighborhood: Lighthouse Hill


100%|██████████| 2/2 [00:00<00:00, 11444.21it/s]



Neighborhood: Tremont


100%|██████████| 11/11 [00:00<00:00, 15141.89it/s]



Neighborhood: City Island


100%|██████████| 18/18 [00:00<00:00, 8255.60it/s]



Neighborhood: Belle Harbor


100%|██████████| 8/8 [00:00<00:00, 19173.96it/s]



Neighborhood: Sea Gate


100%|██████████| 7/7 [00:00<00:00, 22327.09it/s]



Neighborhood: College Point


100%|██████████| 19/19 [00:00<00:00, 10321.43it/s]



Neighborhood: Coney Island


100%|██████████| 17/17 [00:00<00:00, 8405.42it/s]



Neighborhood: Bellerose


100%|██████████| 14/14 [00:00<00:00, 15157.53it/s]



Neighborhood: Midland Beach


100%|██████████| 6/6 [00:00<00:00, 22529.83it/s]



Neighborhood: Melrose


100%|██████████| 10/10 [00:00<00:00, 7447.27it/s]



Neighborhood: Cambria Heights


100%|██████████| 26/26 [00:00<00:00, 7593.09it/s]



Neighborhood: Clason Point


100%|██████████| 21/21 [00:00<00:00, 3684.14it/s]



Neighborhood: Dyker Heights


100%|██████████| 12/12 [00:00<00:00, 16121.60it/s]



Neighborhood: Edgemere


100%|██████████| 11/11 [00:00<00:00, 13193.41it/s]



Neighborhood: Arrochar


100%|██████████| 21/21 [00:00<00:00, 5649.08it/s]



Neighborhood: Baychester


100%|██████████| 7/7 [00:00<00:00, 20192.66it/s]



Neighborhood: Castle Hill


100%|██████████| 9/9 [00:00<00:00, 18139.71it/s]



Neighborhood: Mount Eden


100%|██████████| 6/6 [00:00<00:00, 17213.29it/s]



Neighborhood: Rosebank


100%|██████████| 7/7 [00:00<00:00, 22009.09it/s]



Neighborhood: Silver Lake


100%|██████████| 2/2 [00:00<00:00, 12671.61it/s]



Neighborhood: Morris Park


100%|██████████| 15/15 [00:00<00:00, 11082.36it/s]



Neighborhood: Whitestone


100%|██████████| 11/11 [00:00<00:00, 13830.14it/s]



Neighborhood: Westchester Square


100%|██████████| 10/10 [00:00<00:00, 14634.70it/s]



Neighborhood: Eltingville


100%|██████████| 3/3 [00:00<00:00, 13162.04it/s]



Neighborhood: Huguenot


100%|██████████| 3/3 [00:00<00:00, 16131.94it/s]



Neighborhood: Marble Hill


100%|██████████| 12/12 [00:00<00:00, 12751.87it/s]



Neighborhood: Navy Yard


100%|██████████| 14/14 [00:00<00:00, 14112.05it/s]



Neighborhood: Rossville


100%|██████████| 1/1 [00:00<00:00, 7854.50it/s]



Neighborhood: Bull's Head


100%|██████████| 6/6 [00:00<00:00, 16644.06it/s]



Neighborhood: Castleton Corners


100%|██████████| 4/4 [00:00<00:00, 19239.93it/s]



Neighborhood: Holliswood


100%|██████████| 4/4 [00:00<00:00, 17385.72it/s]



Neighborhood: Arden Heights


100%|██████████| 4/4 [00:00<00:00, 18216.30it/s]



Neighborhood: Hollis


100%|██████████| 14/14 [00:00<00:00, 15579.80it/s]



Neighborhood: Emerson Hill


100%|██████████| 5/5 [00:00<00:00, 21799.92it/s]



Neighborhood: Willowbrook


100%|██████████| 1/1 [00:00<00:00, 8665.92it/s]



Neighborhood: Bay Terrace


100%|██████████| 6/6 [00:00<00:00, 22982.49it/s]



Neighborhood: Manhattan Beach


100%|██████████| 8/8 [00:00<00:00, 16139.70it/s]



Neighborhood: West Farms


100%|██████████| 2/2 [00:00<00:00, 13797.05it/s]



Neighborhood: New Dorp Beach


100%|██████████| 5/5 [00:00<00:00, 16953.53it/s]



Neighborhood: Breezy Point


100%|██████████| 3/3 [00:00<00:00, 14908.66it/s]



Neighborhood: Douglaston


100%|██████████| 8/8 [00:00<00:00, 19206.89it/s]



Neighborhood: Neponsit


100%|██████████| 3/3 [00:00<00:00, 3267.44it/s]



Neighborhood: Mariners Harbor


100%|██████████| 8/8 [00:00<00:00, 13236.46it/s]



Neighborhood: Westerleigh


100%|██████████| 2/2 [00:00<00:00, 13315.25it/s]



Neighborhood: New Brighton


100%|██████████| 5/5 [00:00<00:00, 24188.60it/s]



Neighborhood: Woodrow


100%|██████████| 1/1 [00:00<00:00, 9177.91it/s]



Neighborhood: Prince's Bay


100%|██████████| 4/4 [00:00<00:00, 19463.13it/s]



Neighborhood: Graniteville


100%|██████████| 3/3 [00:00<00:00, 16822.07it/s]



Neighborhood: Richmondtown


100%|██████████| 1/1 [00:00<00:00, 9731.56it/s]



Neighborhood: Co-op City


100%|██████████| 2/2 [00:00<00:00, 10645.44it/s]



Neighborhood: Bay Terrace, Staten Island


100%|██████████| 2/2 [00:00<00:00, 10192.72it/s]



Neighborhood: New Dorp


100%|██████████| 1/1 [00:00<00:00, 8490.49it/s]



Neighborhood: Spuyten Duyvil


100%|██████████| 4/4 [00:00<00:00, 19328.59it/s]



Neighborhood: Fort Wadsworth


100%|██████████| 1/1 [00:00<00:00, 6831.11it/s]



Neighborhood: Howland Hook


100%|██████████| 2/2 [00:00<00:00, 13189.64it/s]





In [22]:
# All neighborhood details in one file

if save_jsons:
    # map_items only holds the neighborhood processed last, so it cannot be
    # dumped as the combined result. store() writes 'degree' and 'cluster'
    # into the very dicts that `data` contains, so every processed item can be
    # collected from there.
    all_map_items = {item['id']: item for item in data if 'cluster' in item}

    with open(path + 'results/' + 'all_neighborhood_wise' + '-' + str(threshold) + '-map_cluster_info.json', 'w+') as f:
        json.dump(all_map_items, f)

# One row per item, largest clusters first
df = pd.DataFrame(all_neighborhood_info, columns = ['item ID', 'Cluster ID', 'Degree', 'latitude', 'longitude', 'Neighborhood', 'Cluster_Size', 'ID_Neighborhood'])
df.sort_values(['Cluster_Size', 'Cluster ID', 'Degree', 'ID_Neighborhood'], ascending = (False, False, False, True), inplace = True)
df.reset_index(drop = True, inplace = True)

display(df)

df.to_csv(path + 'results/' + 'all_neighborhood_wise' + '-' + str(threshold) + '-cluster_info.csv', index = False)

Unnamed: 0,item ID,Cluster ID,Degree,latitude,longitude,Neighborhood,Cluster_Size,ID_Neighborhood
0,31994556,1,22,40.71815,-73.95834,Williamsburg,3456,1_Williamsburg
1,34181609,1,22,40.71808,-73.95839,Williamsburg,3456,1_Williamsburg
2,17415890,1,22,40.71813,-73.95822,Williamsburg,3456,1_Williamsburg
3,11960466,1,22,40.71214,-73.95837,Williamsburg,3456,1_Williamsburg
4,32191213,1,21,40.71211,-73.96276,Williamsburg,3456,1_Williamsburg
...,...,...,...,...,...,...,...,...
48890,35496842,1,0,40.88125,-73.84922,Williamsbridge,1,1_Williamsbridge
48891,27088022,1,0,40.59886,-74.13217,Willowbrook,1,1_Willowbrook
48892,23105516,1,0,40.89870,-73.86389,Woodlawn,1,1_Woodlawn
48893,1798271,1,0,40.53884,-74.19826,Woodrow,1,1_Woodrow


In [23]:
# Done