In [2]:
# table of types clustered by predicted ratings
import pandas as pd
import numpy as np
import pickle
from k_means_constrained import KMeansConstrained

In [3]:
# Input locations: the base CSV and the directory holding per-job model results.
base_file = '/share/garg/311_data/sb2377/clean_codebase/three_year_base.csv'
results_dir = '/share/garg/311_data/sb2377/results'

# User-specified arguments.
# Agency acronym -> display name, used to fill the 'Agency Name' column.
agency_replacements = {
    'NYPD': 'New York City Police Department',
    'DSNY': 'Sanitation',
    'DOHMH': 'Health and Mental Hygiene',
    'DOT': 'Transportation',
    'TLC': 'Taxi and Limousine Commission',
    'EDC': 'Economic Development Corporation',
    'DHS': 'Homeless Services',
    'DPR': 'Parks and Recreation',
    'DOB': 'Buildings',
    'DEP': 'Environmental Protection',
    'DCA': 'Consumer Affairs',
    'HPD': 'Housing Preservation and Development',
    'OSE': 'Mayor\'s Office of Special Enforcement',
    'DFTA': 'Department for the Aging',
    'OTI': 'Office of Technology and Innovation',
    'DOE': 'Education',
    'NYC311-PRD': 'Environmental Protection',
    'DCWP': 'Consumer and Worker Protection',
}
# Shallow copy of the lookup table (the source above is already a dict).
agency_dict = agency_replacements.copy()

# Saved run to read; both values are interpolated into the results file path.
# `epoch` is deliberately a string.
job_id, epoch = 3000, '59'

# Constrained k-means settings: cluster count and per-cluster size bounds.
num_clusters = 8
size_min, size_max = 8, 30

# Name of the column that stores each type's cluster id.
cluster_label = 'cluster'

In [4]:
# Read the base CSV into memory; the per-type table is derived from it below.
base_df = pd.read_csv(filepath_or_buffer=base_file)

In [5]:
# cluster ratings by type
# One row per distinct (type index, type/agency key, agency, complaint type).
type_columns = ['type_idxs', 'typeagency', 'Agency', 'Complaint Type']
clustered_types_df = base_df[type_columns].drop_duplicates()

# Expand agency acronyms to full names; values that are not keys of
# agency_dict are left unchanged by Series.replace.
clustered_types_df['Agency Name'] = clustered_types_df['Agency'].replace(agency_dict)

# get ratings
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only point this at result files produced by trusted training jobs.
with open('{}/job{}/epoch={}_test.pkl'.format(results_dir, job_id, epoch), 'rb') as file:
    # The pickle holds a 10-tuple; only the two embedding entries are used here.
    _, _, _, node_embedding, type_embedding, _, _, _, _, _ = pickle.load(file)

# Predicted ratings are embedding dot products, transposed so that row i
# belongs to row i of type_embedding.
# Assumes both embeddings are 2-D (items x dim) arrays -- TODO confirm.
pred_ratings = np.dot(node_embedding, type_embedding.T).T

# cluster ratings
kmeans = KMeansConstrained(
    n_clusters=num_clusters,
    size_min=size_min,
    size_max=size_max,
    random_state=0,
)
type_labels = kmeans.fit_predict(pred_ratings)

# Attach labels through the type index instead of by row position: the row
# order after drop_duplicates() follows first appearance in base_df and is not
# guaranteed to match the row order of type_embedding.
# Assumes 'type_idxs' is the 0-based row index into type_embedding -- confirm
# against the training code.
type_rows = clustered_types_df['type_idxs'].to_numpy(dtype=int)
if type_rows.min() < 0 or type_rows.max() >= len(type_labels):
    raise ValueError(
        'type_idxs spans {}..{} but pred_ratings has only {} rows'.format(
            type_rows.min(), type_rows.max(), len(type_labels)
        )
    )
clustered_types_df[cluster_label] = type_labels[type_rows]

In [8]:
# Relabel clusters so that ids ascend with cluster size (0 = smallest).

# Number of rows per current cluster id, then ordered smallest first.
# NOTE(review): the order among equal-sized clusters is whatever the default
# sort yields; this cell does not set an explicit tie-break.
cluster_counts = clustered_types_df[cluster_label].value_counts()
cluster_counts = cluster_counts.sort_values()

# Old cluster id -> new id, the new id being the rank in the size ordering.
cluster_order_mapping = dict(zip(cluster_counts.index, range(len(cluster_counts))))

# Overwrite the label column in place with the size-ranked ids.
relabelled = clustered_types_df[cluster_label].map(cluster_order_mapping)
clustered_types_df[cluster_label] = relabelled

# Separate copy of the table with rows ordered by the new cluster id.
clustered_types_df_sorted = clustered_types_df.sort_values(by=cluster_label)

In [10]:
# cluster 0: its complaint types, ordered by agency and then complaint type
(
    clustered_types_df
    .loc[clustered_types_df[cluster_label] == 0, ['Complaint Type', 'Agency']]
    .sort_values(by=['Agency', 'Complaint Type'])
)

Unnamed: 0,Complaint Type,Agency
19,Street Light Condition,DOT
55,Missed Collection,DSNY
1,HEAT/HOT WATER,HPD
14,UNSANITARY CONDITION,HPD
0,Blocked Driveway,NYPD
4,Illegal Parking,NYPD
3,Noise - Residential,NYPD
5,Noise - Street/Sidewalk,NYPD


In [11]:
# cluster 1: its complaint types, ordered by agency and then complaint type
(
    clustered_types_df
    .loc[clustered_types_df[cluster_label] == 1, ['Complaint Type', 'Agency']]
    .sort_values(by=['Agency', 'Complaint Type'])
)

Unnamed: 0,Complaint Type,Agency
40,APPLIANCE,HPD
46,DOOR/WINDOW,HPD
12,ELECTRIC,HPD
8,FLOORING/STAIRS,HPD
13,GENERAL,HPD
15,PAINT/PLASTER,HPD
11,PLUMBING,HPD
9,WATER LEAK,HPD


In [12]:
# cluster 2: its complaint types, ordered by agency and then complaint type
(
    clustered_types_df
    .loc[clustered_types_df[cluster_label] == 2, ['Complaint Type', 'Agency']]
    .sort_values(by=['Agency', 'Complaint Type'])
)

Unnamed: 0,Complaint Type,Agency
47,Noise,DEP
16,Water System,DEP
20,General Construction/Plumbing,DOB
36,Derelict Vehicles,DSNY
61,Dirty Condition,DSNY
56,Illegal Dumping,DSNY
28,Abandoned Vehicle,NYPD
30,Noise - Commercial,NYPD
37,Noise - Vehicle,NYPD


In [14]:
# cluster 3: its complaint types, ordered by agency and then complaint type
(
    clustered_types_df
    .loc[clustered_types_df[cluster_label] == 3, ['Complaint Type', 'Agency']]
    .sort_values(by=['Agency', 'Complaint Type'])
)

Unnamed: 0,Complaint Type,Agency
48,Sewer,DEP
7,Building/Use,DOB
33,Rodent,DOHMH
39,Sidewalk Condition,DOT
2,Damaged Tree,DPR
6,Dead/Dying Tree,DPR
18,New Tree Request,DPR
34,Overgrown Tree/Branches,DPR
64,Root/Sewer/Sidewalk Condition,DPR
52,Dead Animal,DSNY


In [15]:
# cluster 4: its complaint types, ordered by agency and then complaint type
(
    clustered_types_df
    .loc[clustered_types_df[cluster_label] == 4, ['Complaint Type', 'Agency']]
    .sort_values(by=['Agency', 'Complaint Type'])
)

Unnamed: 0,Complaint Type,Agency
22,Consumer Complaint,DCA
43,Encampment,DHS
58,Homeless Person Assistance,DHS
90,Elevator,DOB
53,Outdoor Dining,DOT
21,Traffic Signal Condition,DOT
44,Encampment,NYPD
49,Non-Emergency Police Matter,NYPD
97,Panhandling,NYPD
96,For Hire Vehicle Complaint,TLC


In [16]:
# cluster 5: its complaint types, ordered by agency and then complaint type
(
    clustered_types_df
    .loc[clustered_types_df[cluster_label] == 5, ['Complaint Type', 'Agency']]
    .sort_values(by=['Agency', 'Complaint Type'])
)

Unnamed: 0,Complaint Type,Agency
136,Consumer Complaint,DCWP
57,Air Quality,DEP
17,Lead,DEP
42,Water Conservation,DEP
76,Boilers,DOB
60,Emergency Response Team (ERT),DOB
29,Plumbing,DOB
86,Real Time Enforcement,DOB
45,Special Projects Inspection Team (SPIT),DOB
138,Food Establishment,DOHMH


In [17]:
# cluster 6: its complaint types, ordered by agency and then complaint type
(
    clustered_types_df
    .loc[clustered_types_df[cluster_label] == 6, ['Complaint Type', 'Agency']]
    .sort_values(by=['Agency', 'Complaint Type'])
)

Unnamed: 0,Complaint Type,Agency
114,Asbestos,DEP
71,Hazardous Materials,DEP
92,Water Quality,DEP
104,BEST/Site Safety,DOB
75,Electrical,DOB
119,School Maintenance,DOE
111,Construction Lead Dust,DOHMH
103,Face Covering Violation,DOHMH
35,Non-Residential Heat,DOHMH
105,Standing Water,DOHMH


In [18]:
# cluster 7: its complaint types, ordered by agency and then complaint type
(
    clustered_types_df
    .loc[clustered_types_df[cluster_label] == 7, ['Complaint Type', 'Agency']]
    .sort_values(by=['Agency', 'Complaint Type'])
)

Unnamed: 0,Complaint Type,Agency
79,Industrial Waste,DEP
108,AHV Inspection Unit,DOB
99,Investigations and Discipline (IAD),DOB
27,Scaffold Safety,DOB
127,Asbestos,DOHMH
122,Beach/Pool/Sauna Complaint,DOHMH
87,Day Care,DOHMH
112,Drinking Water,DOHMH
101,Harboring Bees/Wasps,DOHMH
121,Illegal Animal Kept as Pet,DOHMH
