In [None]:
root_dir = !git rev-parse --show-toplevel
import os; os.chdir(root_dir[0])

In [None]:
import requests
import geojson
import shapely
from collections import namedtuple
from shapely.geometry import shape, Point
import geopandas as gpd
import pandas as pd
import sqlalchemy

In [None]:
# Sanity check: list what is available under data/ (shell command via IPython).
!ls data/

# Get shape data

## Get udh shapes

In [None]:
import glob
import os

# Salvador's shapefile is known to be somewhat broken, so it is excluded.
BROKEN_REGION_FILE = 'RM_Salvador_UDH_4_region.shp'

# Filter by file name instead of list.remove(): remove() raises ValueError when
# the file is absent (or when glob returns a different path separator), which
# would abort the notebook for no good reason. glob order is kept as-is.
regions = [
    path
    for path in glob.glob('data/preprocessed/udh/*shp')
    if os.path.basename(path) != BROKEN_REGION_FILE
]
regions;

In [None]:
# Expected columns after harmonisation:
# ['UDH_ATLAS', 'REGIONAL', 'CD_GEOCODM', 'NM_MUNICIP', 'geometry']
import re

# Read every regional shapefile, renaming 'RM' to 'REGIONAL' so all files share
# one column name, and concatenate them in a single call. Growing a frame with
# pd.concat inside the loop re-copies all previous rows on every iteration and
# seeds the result with an empty frame.
region_frames = [
    gpd.read_file(path).rename(columns={'RM': 'REGIONAL'})
    for path in regions
]
udh_geo = pd.concat(region_frames, sort=False)

# Strip every ASCII letter from the UDH code and cast what is left to int,
# then use it as the index.
udh_geo['UDH_ATLAS'] = udh_geo.UDH_ATLAS.apply(lambda x: int(re.sub(r'[a-zA-Z]', '', x)))
udh_geo = udh_geo.set_index('UDH_ATLAS')

In [None]:
# Quick overview of the combined UDH geometries: (rows, columns) and column names.
print(udh_geo.shape)
udh_geo.columns

In [None]:
# Peek at the first rows of the combined UDH geometries.
udh_geo.head()

In [None]:
# Visual check of the UDH shapes; the trailing ';' hides the returned object's repr.
udh_geo.plot();

## Get municipality shapes

In [None]:
import rarfile

# Unpack the municipality archive next to itself, inside data/br/.
path_to_file = 'data/br/'
filename = 'Municipios_Brasil.rar'
archive_path = path_to_file + filename

with rarfile.RarFile(archive_path, 'r') as rar:
    rar.extractall(path_to_file)

In [None]:
# Load the municipality polygons and expose the IBGE code under the same
# column name ('CD_GEOCODM') used by the UDH shapes, stored as int.
municipality_geo = (
    gpd.read_file('data/br/MUNICIPIOS_polígonos.shp')
    .rename(columns={'COD_IBGE': 'CD_GEOCODM'})
)
municipality_geo['CD_GEOCODM'] = municipality_geo['CD_GEOCODM'].astype(int)
municipality_geo.head()

In [None]:
# Visual check of the municipality polygons; ';' hides the returned object's repr.
municipality_geo.plot();

# Get scalar data

## Get municipality data

In [None]:
# Load the municipality-level sheet ('MUN 91-00-10') of the Atlas 2013 workbook.
# NOTE(review): the sheet name suggests it holds the 1991/2000/2010 rows — confirm.
municipality_data = pd.read_excel('data/preprocessed/atlas2013_dadosbrutos_pt.xlsx', sheet_name='MUN 91-00-10')
municipality_data.head()

In [None]:
# Keep only the rows where ANO == 2010 and index them by 'Codmun7'.
# Not idempotent: re-running fails because 'Codmun7' is no longer a column.
municipality_data = municipality_data.query('ANO == 2010').set_index('Codmun7')

## Get udh data

In [None]:
udh_files = glob.glob('data/preprocessed/udh/*.parquet')

# The columns of the first file act as the reference schema; any later file
# carrying columns outside that schema is reported (but still loaded).
columns = set()
# Collect the ANO == 2010 slice of each file and concatenate once at the end:
# pd.concat inside the loop re-copies all previous rows on every iteration.
frames_2010 = []
for parquet_path in udh_files:
    frame = pd.read_parquet(parquet_path)
    if not columns:
        columns.update(frame.columns)
    else:
        extra_columns = set(frame.columns) - columns
        if extra_columns:
            print(f'error, too many columns: {extra_columns}')
    frames_2010.append(frame.query('ANO == 2010'))

# pd.concat rejects an empty list, so fall back to an empty frame when no
# parquet file was found (the result the incremental version produced).
udh_data = pd.concat(frames_2010, sort=False) if frames_2010 else pd.DataFrame()


In [None]:
# Remove exact duplicate rows and rows without a 'Cod_ID', then index the
# table by 'UDH_Atlas' cast to int.
udh_data = (
    udh_data
    .drop_duplicates()
    .dropna(subset=['Cod_ID'])
    .set_index('UDH_Atlas')
)
udh_data.index = udh_data.index.astype(int)
udh_data.shape

In [None]:
# Count missing values per column and show the five columns with the most gaps.
# A named variable is used instead of `_`: IPython keeps the previous cell's
# output in `_`, and assigning to it by hand clobbers that mechanism.
na_counts = udh_data.isna().sum()
na_counts.sort_values(ascending=False).head().to_dict()

# Join and make final dataframes

## Join municipalities

In [None]:
# Attach the scalar data to each polygon: municipality_geo's 'CD_GEOCODM'
# column is matched against municipality_data's index. Columns of
# municipality_data whose names collide get the '_DUPLICATE' suffix.
municipalities = municipality_geo.join(municipality_data, on='CD_GEOCODM', rsuffix='_DUPLICATE')

In [None]:
# Rows whose 'UF_DUPLICATE' (a column coming from municipality_data) is NA
# found no match in the join above.
print('Couldnt join these lines: ')
municipalities[municipalities.UF_DUPLICATE.isna()]

In [None]:
# Discard the polygons that found no match in the join (NA in 'UF_DUPLICATE').
municipalities = municipalities.dropna(subset=['UF_DUPLICATE'])
print('Dropped unjoined lines!')

In [None]:
# Choropleth of one joined column as a visual check that the join worked.
municipalities.plot(column='T_CRIFUNDIN_TODOS', legend=True);

## Join UDH

In [None]:
# Show both index dtypes before joining on the index. Only the last expression
# of a cell is displayed, so they are returned together as a tuple
# (udh_geo first, udh_data second).
udh_geo.index.dtype, udh_data.index.dtype

In [None]:
# Index-on-index join of the UDH geometries with the UDH scalar data;
# colliding column names from udh_data get the '_DUPLICATE' suffix.
udh = udh_geo.join(udh_data, rsuffix='_DUPLICATE')
udh.head()

### Treat duplicated data

In [None]:
from feature_names import feature_columns

In [None]:
# Proof that deduplicating with keep='last' loses nothing: among rows sharing
# the same geometry and municipality name, every occurrence except the last
# one has only NA values in each feature column.

duplicate_keys = ["geometry", "NM_MUNICIP"]
is_earlier_duplicate = udh.duplicated(subset=duplicate_keys, keep='last')
duplicated_stuff = udh[is_earlier_duplicate].isna()
all_na_by_column = {col: all(duplicated_stuff[col]) for col in feature_columns}
assert all(all_na_by_column.values())

In [None]:
# Keep only the last row of each (geometry, municipality name) group, then
# list the rows that still have no 'I_ESCOLARIDADE' value.
dedup_keys = ["geometry", "NM_MUNICIP"]
udh = udh.drop_duplicates(subset=dedup_keys, keep='last')
udh.loc[udh["I_ESCOLARIDADE"].isna()]

# Export data

In [None]:
%%script echo false
# NOTE(review): the `%%script echo false` cell magic hands this cell's text to
# `echo false` instead of running it as Python, so this export appears to be
# intentionally disabled — confirm before relying on it.
import tempfile
import shutil

# Write `udh` into a temporary directory and pack that directory as udh.zip.
with tempfile.TemporaryDirectory() as d:
    udh.to_file(d)
    shutil.make_archive('udh', 'zip', d)
    
# Same procedure for `municipalities`, producing municipalities.zip.
with tempfile.TemporaryDirectory() as d:
    municipalities.to_file(d)
    shutil.make_archive('municipalities', 'zip', d)

In [None]:
# Create the export directory from Python rather than via the shell's
# `mkdir -p`, so the cell does not depend on a POSIX shell being available.
# exist_ok=True keeps the "no error if it already exists" behaviour.
os.makedirs('data/export', exist_ok=True)

# Persist both final frames as pickles.
municipalities.to_pickle('data/export/municipality.pickle')
udh.to_pickle('data/export/udh.pickle')

In [None]:
# Report the on-disk size of each exported file (shell command via IPython).
!du -sh data/export/*

In [None]:
from zipfile import ZIP_DEFLATED, ZipFile

# Bundle both pickles into one archive, stored under their bare file names.
# ZipFile defaults to ZIP_STORED (no compression at all); ZIP_DEFLATED actually
# shrinks the payload and is read back transparently by zipfile.
with ZipFile('data/export/data.zip', 'w', compression=ZIP_DEFLATED) as zip_obj:
    zip_obj.write('data/export/udh.pickle', 'udh.pickle')
    zip_obj.write('data/export/municipality.pickle', 'municipality.pickle')

# Deploy data

In [None]:
from os import replace

# Move the freshly built archive into the package's data directory.
zip_source = 'data/export/data.zip'
zip_target = 'geo_data_br/data/data.zip'
replace(zip_source, zip_target)

# Nice things

In [None]:
# Example lookup: the 'HOMEM30A34' value of the first UDH row.
udh.iloc[0]['HOMEM30A34']

In [None]:
# Example choropleth: municipalities coloured by 'HOMEM30A34', with a legend.
municipalities.plot('HOMEM30A34', legend=True);

# Gutter

In [None]:
# Scratch lookup of the UDH rows for one municipality. `udh_clean` is not
# defined anywhere in the visible notebook, so a fresh run raised NameError;
# the deduplicated `udh` frame is used instead.
udh[udh["NM_MUNICIP"] == "SÃO JOSÉ DOS CAMPOS"]