In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import vaex

from sklearn.impute import KNNImputer

In [11]:
# EPSG code passed as the CRS when the sample points are built further down.
# NOTE(review): 31983 is presumably SIRGAS 2000 / UTM zone 23S - confirm.
epsg = 31983

# Both data folders live under the same project checkout.
project_root = '/home/yoshraf/projects/master-analysis-inequality-mobility/inequality-mobility/'
# Raw external inputs (IPTU HDF5 files, block layer).
path = project_root + 'data/01_raw/external/'
# Intermediate artefacts of the mode-choice notebooks (classified blocks).
path2 = project_root + 'notebooks/final/mode-choice/data/'

In [3]:
def agg_type(gdf):
    '''
    Collapse taxpayer-level records into one row per block.

    The reduction runs in two stages, so each building unit contributes a
    single averaged value to its block regardless of how many taxpayer rows
    it has:

    Taxpayer -> Building Unit (Street + Number) -> Block

    Parameters
    ----------
    gdf : vaex DataFrame
        Must expose the columns 'sq', 'NOME DE LOGRADOURO DO IMOVEL',
        'NUMERO DO IMOVEL', 'VALOR DO M2 DE CONSTRUCAO' and
        'VALOR DO M2 DO TERRENO'. A 'unit' column is added to it in place.

    Returns
    -------
    pandas.DataFrame
        Result of the block-level group-by (grouped on 'sq') with the mean
        of the unit-level means for both value columns.

    The shape of the input and of each aggregation stage is printed.
    '''
    value_columns = ('VALOR DO M2 DE CONSTRUCAO', 'VALOR DO M2 DO TERRENO')

    # Building-unit id: street name immediately followed by the house number.
    # NOTE(review): there is no separator between the two parts, so distinct
    # (street, number) pairs can produce the same id (e.g. 'A1'+'23' and
    # 'A12'+'3'); they are only kept apart if they sit in different blocks.
    street_name = gdf['NOME DE LOGRADOURO DO IMOVEL']
    house_number = gdf['NUMERO DO IMOVEL'].astype(str)
    gdf['unit'] = street_name + house_number

    # Size of the taxpayer-level input
    print(gdf.shape)

    # Stage 1: average the values inside every (block, building unit) pair
    per_unit = gdf.groupby(['sq', 'unit'], progress=True).agg(
        {column: 'mean' for column in value_columns}
    )
    print(per_unit.shape)

    # Stage 2: average the unit-level means inside every block
    per_block = per_unit.groupby(['sq']).agg(
        {column: 'mean' for column in value_columns}
    )
    print(per_block.shape)

    return per_block.to_pandas_df()

In [4]:
# Open one IPTU file per year (both bounds inclusive) and concatenate them
# into a single dataframe
date_ini, date_end = 1995, 2022
dfs = [
    vaex.open(f'{path}IPTU-HDF5/IPTU_{year}/IPTU_{year}.hdf5')
    for year in range(date_ini, date_end + 1)
]
df_iptu = vaex.concat(dfs)

# Block id 'sq': characters 0-2 of the taxpayer number ('setor') followed by
# characters 3-5 ('quadra')
taxpayer_number = df_iptu['NUMERO DO CONTRIBUINTE']
df_iptu['setor'] = taxpayer_number.str.slice(0, 3)
df_iptu['quadra'] = taxpayer_number.str.slice(3, 6)
df_iptu['sq'] = df_iptu['setor'] + df_iptu['quadra']

# Standardize the string format of the property-use column
df_iptu['TIPO DE USO DO IMOVEL'] = df_iptu['TIPO DE USO DO IMOVEL'].str.capitalize()

In [5]:
# Aggregate the concatenated IPTU records to block level. agg_type prints the
# shape of the input and of each aggregation stage, and adds a 'unit' column
# to df_iptu as a side effect.
df_results = agg_type(df_iptu)

(82322059, 33)
groupby [########################################] 100.00% elapsed time  :    13.93s =  0.2m =  0.0h 
groupby [########################################] 100.00% elapsed time  :    26.73s =  0.4m =  0.0h
 (4502687, 4)
(46141, 3)


In [6]:
# Load the 'quadras' layer from the raw external data folder; its 'sq' and
# 'geometry' columns are what the following merge uses.
gdf_quadras = gpd.read_file(f'{path}quadras.gpkg')

In [7]:
# Attach the aggregated IPTU values to the block geometries. The join is an
# inner join on 'sq', so only blocks present on both sides survive.
# NOTE: df_results is overwritten with the merged frame, so re-running this
# cell without re-running the aggregation merges the merged result again.
df_results = pd.merge(
    left=gdf_quadras[['sq', 'geometry']],
    right=df_results,
    how='inner',
    on=['sq'],
)

# Natural log of the mean land value per square metre
land_value_m2 = df_results['VALOR DO M2 DO TERRENO']
df_results['LOG VALOR DO M2 DO TERRENO'] = np.log(land_value_m2)

In [8]:
# Column holding the land value per square metre, and its log-transformed
# counterpart
col = 'VALOR DO M2 DO TERRENO'
log_col = f'LOG {col}'

In [9]:
# Share of blocks (by log land value) that goes into group 2
cutoff = 0.175

# Despite its name, q1 is the (1 - cutoff) quantile, i.e. the threshold that
# separates the top `cutoff` share of blocks from the rest
log_land_value = df_results[log_col]
q1 = log_land_value.quantile(1 - cutoff)


def _group_for(log_value):
    """Return 1 for values at or below the threshold q1, otherwise 2.

    A NaN compares as False against q1 and therefore also ends up in group 2.
    """
    if log_value <= q1:
        return 1
    return 2


df_results['groups'] = log_land_value.map(_group_for).astype('category')

Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  return np.percentile(values, q, axis=1, interpolation=interpolation)


In [12]:
# Human-readable label for each group code. The keys are strings, so the
# 'groups' field is expected to come back from the shapefile as text
# (NOTE(review): confirm; codes outside this mapping become NaN).
group_labels = {'1': 'Peripheral Group', '2': 'Central Group'}

# Blocks with their group classification, read from the mode-choice data folder
df_quadras_results = gpd.read_file(f"{path2}quadras_classificadas.shp")
df_quadras_results['ds_groups'] = df_quadras_results['groups'].map(group_labels)

In [13]:
# Person id plus household coordinates from the 'df_sample' dataset.
# NOTE(review): `catalog` is not defined in the visible cells - presumably the
# Kedro data catalog injected into the notebook session; confirm.
sample_columns = [
    'identifica_pessoa',
    'coordenada_x_do_domicilio',
    'coordenada_y_do_domicilio',
]
X_sample = catalog.load('df_sample').reset_index()[sample_columns]

2025-09-13 21:08:11,059 - kedro.io.data_catalog - INFO - Loading data from `df_sample` (ParquetDataSet)...


In [14]:
# Turn the household coordinates into point geometries, using `epsg` as CRS
household_points = gpd.points_from_xy(
    X_sample['coordenada_x_do_domicilio'],
    X_sample['coordenada_y_do_domicilio'],
)
gdf_sample = gpd.GeoDataFrame(X_sample, geometry=household_points, crs=epsg)

# Left spatial join: every sample point is kept and receives the 'sq' and
# 'groups' of the block it intersects (NaN when it falls in no block).
# 'intersects' is sjoin's default spatial test, so it is not passed
# explicitly: the former `op=` keyword is deprecated in favour of
# `predicate=` and omitting it keeps the cell working on both old and new
# geopandas releases without a warning.
gdf_aux = gpd.sjoin(
    gdf_sample,
    df_quadras_results[['sq', 'geometry', 'groups']],
    how="left",
)

# Plain coordinate columns, used as features for the nearest-neighbour
# imputation of missing groups
gdf_aux['x'] = gdf_aux.geometry.x
gdf_aux['y'] = gdf_aux.geometry.y

  if (await self.run_code(code, result,  async_=asy)):


In [16]:
# Fill missing 'groups' values with the group of the single nearest row
# (n_neighbors=1), so every imputed value is an existing group code.
imputer = KNNImputer(n_neighbors=1)
imputed = imputer.fit_transform(gdf_aux[['x', 'y', 'groups']])

# Write the imputed column back by position. Wrapping the result in a new
# DataFrame and assigning the Series would align on index labels against a
# fresh 0..n-1 index, which misplaces values (or fails) whenever gdf_aux's
# index is not exactly 0..n-1, e.g. duplicated labels after a left spatial
# join that matched one point to several blocks.
gdf_aux['groups'] = imputed[:, 2]

In [17]:
# Share of sample rows in each group after imputation
gdf_aux['groups'].value_counts(normalize=True)

1.0    0.614
2.0    0.386
Name: groups, dtype: float64

In [18]:
# Persist the person -> group assignment. The output path is relative, so the
# file lands under the notebook's current working directory.
person_groups = gdf_aux[['identifica_pessoa', 'groups']]
person_groups.to_parquet('data/df_region_shap.parquet')