In [1]:
import pandas as pd
# import numpy as np
# import statsmodels.api as sm
# import matplotlib.pyplot as plt
# import torch

In [3]:
covar_path = '/share/garg/311_data/sb2377/tract_demographics.csv'
base_data_path = '/share/garg/311_data/sb2377/two_year_base_tract.csv'
reporting_data_path = '/share/garg/311_data/sb2377/two_year_tract_reports_full.h5'
street_rating_path = '/share/garg/311_data/sb2377/processed_streets_two_year.h5'
park_rating_path = '/share/garg/311_data/sb2377/processed_parks_two_year.h5'
rodent_rating_path = '/share/garg/311_data/sb2377/processed_rodents_two_year.h5'
restaurant_rating_path = '/share/garg/311_data/sb2377/processed_restaurant_two_year.h5'
dcwp_rating_path = '/share/garg/311_data/sb2377/processed_dcwp_two_year.h5'
save_path = '/share/garg/311_data/sb2377/two_year_full.h5'

In [4]:
cols_to_save = ['GEOID', 
                'report_week', 
                'report_week_of_year',
                'normalized_rating', 
                'finegrained_reported', 
                'real_rating_observed', 
                'typeagency',
                'type_idxs',
                'node_idxs',
                'rating_week',
                'rating_week_of_year',
                'finegrained_id'
               ]

In [5]:
covariates_arr = pd.read_csv(covar_path)
base_df = pd.read_csv(base_data_path)
reporting_df = pd.read_hdf(reporting_data_path, 'df')
street_df = pd.read_hdf(street_rating_path, 'df')
park_df = pd.read_hdf(park_rating_path, 'df')
rodent_df = pd.read_hdf(rodent_rating_path, 'df')
restaurant_df = pd.read_hdf(restaurant_rating_path, 'df')
dcwp_df = pd.read_hdf(dcwp_rating_path, 'df')

In [6]:
# get type indices
type_df = base_df[['typeagency', 'type_idxs']].drop_duplicates()
street_idx = type_df[type_df['typeagency'] == 'StreetConditionDOT']['type_idxs'].iloc[0]
park_idx = type_df[type_df['typeagency'] == 'MaintenanceorFacilityDPR']['type_idxs'].iloc[0]
rodent_idx = type_df[type_df['typeagency'] == 'RodentDOHMH']['type_idxs'].iloc[0]
restaurant_idx = type_df[type_df['typeagency'] == 'FoodDOHMH']['type_idxs'].iloc[0]
dcwp_idx = type_df[type_df['typeagency'] == 'ConsumerComplaintDCWP']['type_idxs'].iloc[0]

In [7]:
# set null values in reporting data for rating specific columns
reporting_df['normalized_rating'] = 0
reporting_df['finegrained_reported'] = reporting_df['reported']
reporting_df['real_rating_observed'] = 0

In [10]:
street_df.columns

Index(['finegrained_id', 'report_week', 'reported', 'finegrained_reported',
       'score', 'rating_week', 'rating_week_of_year', 'real_rating_observed',
       'report_week_of_year', 'GEOID', 'Unnamed: 0', 'log_population_density',
       'log_population', 'log_income_median', 'education_bachelors_pct',
       'race_white_nh_pct', 'age_median', 'households_renteroccupied_pct',
       'normalized_log_population_density', 'normalized_log_population',
       'normalized_log_income_median', 'normalized_education_bachelors_pct',
       'normalized_race_white_nh_pct', 'normalized_age_median',
       'normalized_households_renteroccupied_pct', 'typeagency', 'type_idxs',
       'rating', 'normalized_rating'],
      dtype='object')

In [11]:
# for each complaint type with observed ratings, combine data with and without ratings
dfs = [street_df, park_df, rodent_df, restaurant_df, dcwp_df]
indices = [street_idx, park_idx, rodent_idx, restaurant_idx, dcwp_idx]
full_dfs = []

for i in range(len(dfs)):
    print(i)
    type_df = dfs[i]
    type_idx = indices[i]
    
    # get data with ratings
    # only include data with valid geoids
    valid_geoid = set(base_df['GEOID'].unique())
    type_rating_df = type_df[type_df['GEOID'].isin(valid_geoid)]

    # add node_idxs
    type_rating_df = pd.merge(type_rating_df, base_df[['node_idxs', 'GEOID']].drop_duplicates(), on='GEOID')
    type_rating_df = type_rating_df[cols_to_save]

    # get data without ratings
    type_no_rating_df = reporting_df[reporting_df['type_idxs'] == type_idx]
    all_geoids = set(type_no_rating_df['GEOID'].unique())
    rating_geoids = set(type_rating_df['GEOID'].unique())
    no_rating_geoids = all_geoids.difference(rating_geoids)
    type_no_rating_df = type_no_rating_df[type_no_rating_df['GEOID'].isin(no_rating_geoids)]
    type_no_rating_df['rating_week'] = -1
    type_no_rating_df['rating_week_of_year'] = -1
    type_no_rating_df['finegrained_id'] = -1
    type_no_rating_df = type_no_rating_df[cols_to_save]

    # combine
    full_type_df = pd.concat([type_rating_df, type_no_rating_df])
    full_dfs.append(full_type_df)

0
1
2
3
4


In [12]:
# get data for types with observed ratings
rating_observed_df = pd.concat(full_dfs)
rating_observed_df['rating_observed'] = 1

# get data for types with unobserved ratings
all_types = set(base_df['type_idxs'].unique())
rating_observed_types = set(rating_observed_df['type_idxs'].unique())
rating_unobserved_types = all_types.difference(rating_observed_types)
rating_unobserved_df = reporting_df[reporting_df['type_idxs'].isin(rating_unobserved_types)]
rating_unobserved_df['rating_week'] = -1
rating_unobserved_df['rating_week_of_year'] = -1
rating_unobserved_df['finegrained_id'] = -1
rating_unobserved_df = rating_unobserved_df[cols_to_save]
rating_unobserved_df['rating_observed'] = 0

# combine
full_df = pd.concat([rating_observed_df, rating_unobserved_df])

# get covariates
full_df = pd.merge(full_df, covariates_arr, on='GEOID')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_unobserved_df['rating_week'] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_unobserved_df['rating_week_of_year'] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_unobserved_df['finegrained_id'] = -1


In [None]:
full_df.to_hdf(save_path, key='df', mode='w')

In [22]:
full_df.columns

Index(['GEOID', 'report_week', 'report_week_of_year', 'normalized_rating',
       'finegrained_reported', 'real_rating_observed', 'typeagency',
       'type_idxs', 'node_idxs', 'rating_week', 'rating_week_of_year',
       'finegrained_id', 'rating_observed', 'log_population_density',
       'log_population', 'log_income_median', 'education_bachelors_pct',
       'race_white_nh_pct', 'age_median', 'households_renteroccupied_pct',
       'normalized_log_population_density', 'normalized_log_population',
       'normalized_log_income_median', 'normalized_education_bachelors_pct',
       'normalized_race_white_nh_pct', 'normalized_age_median',
       'normalized_households_renteroccupied_pct'],
      dtype='object')