In [21]:
import requests 
import json
import os
import zipfile
import re
from snowflake.snowpark.session import Session
import boto3
from lat_lon_parser import parse
import csv
import gzip

In [2]:
def get_secret(secret_name, region_name="us-east-1"):
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name)
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)
    get_secret_value_response = json.loads(get_secret_value_response['SecretString'])
    return get_secret_value_response

In [None]:
creds = get_secret("wysde")

In [4]:
connection_parameters = {
    "account": creds["SNOWFLAKE_ACCOUNT"],
    "user": creds["SNOWFLAKE_USERNAME"],
    "password": creds["SNOWFLAKE_PASSWORD"],
    "warehouse": creds["SNOWFLAKE_WAREHOUSE"],
    "role": creds["SNOWFLAKE_ROLE"],
    "database": "sparsh",
    "schema": "public"
}

In [33]:
BASE_DATA_PATH = "./data/inegi"

## Download

In [22]:
def cleanEmpty(row):
    for i,v in enumerate(row):
        if not v or v == '*':
            row[i] = 'NULL'
        elif v == 'N/D':
            row[i] = 0
        else:
            if str(v):
                row[i] = str(v)
            else:
                if v.isnumeric():
                    row[i] = int(v)
                    print(row[i])
                if float(v):
                    row[i] = float(v)
                    print(row[i])
    return row

In [23]:
def openCSV(filename:str):
    newInegi = []
    with open(filename, newline='', encoding='utf8') as file:
        reader = csv.reader(file, delimiter=',')
        for row in reader:
            if not row[0].isnumeric():
                continue
            #mun = 0 (totales de entidad) or loc = 0 (totales de entidad) 
            if int(row[4]) == 0 or int(row[4]) == 9998 or int(row[4]) == 9999:
                continue
            else:
                newRow = [row[0],
                row[2], 
                row[3], 
                row[4], 
                row[5], 
                'NULL' if not row[6] else parse(row[6]), 
                'NULL' if not row[7] else parse(row[7]), 
                row[8],
                row[9],
                row[10], 
                row[11], 
                row[130], 
                row[185], 
                row[186], 
                row[187], 
                row[214], 
                row[215], 
                row[237], 
                row[238],
                row[182]
                ]
                newInegi.append(cleanEmpty(newRow))
    return newInegi

In [47]:
def crearCSV(data):
    csv.register_dialect('pipe', delimiter='|')
    with open(os.path.join(BASE_DATA_PATH, 'csv/inegi.csv'), 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f,dialect="pipe")
        for row in data:
            writer.writerow(row)
    f.close()
    print("CSV creado!")

In [25]:
def validateDatatypeJson(dato, tipo,i, key,row):    
    try:
        if tipo == 'int':
            if dato == 'NULL':
                return 0
            else:
                return int(dato)
        if tipo == 'float':
            if dato == 'NULL':
                return 0.0
            else:
                return float(dato)
        if tipo == 'str':
            if dato == 'NULL':
                return 'NULL'
            else:
                return str(dato)
        else:
            return 'NULL'
    except ValueError as e:
        if key == 'ALTITUD':
            return 0.0
        else:
            return str('******')
        pass

In [26]:
def leerData4Json(archivo:str):
    datos = []
    tempDic = {}
    #Schema de tipos datos
    schema = ['int', 'int', 'str' ,'int','str','float','float','int','int','int','int','int','int','int','int','int','int','int','int','int']
    keys = ['ENTIDAD','MUN','NOM_MUN','LOC','NOM_LOC','LONGITUD','LATITUD','ALTITUD','POBTOT','POBFEM','POBMAS','PCON_DISC','GRAPROES','GRAPROES_F','GRAPROES_M','PSINDER','PDER_SS','VIVTOT','TVIVHAB','VPH_INTER']
    with open(archivo, newline='', encoding='utf8') as file:
        reader = csv.reader(file, delimiter='|', quotechar=',')
        for i,row in enumerate(reader):
            for idx in range((len(row))):
                tempDic[keys[idx]] = validateDatatypeJson(row[idx],schema[idx], i, keys[idx],row)
            datos.append(tempDic)   
            tempDic = {}
    return datos

In [27]:
def crearJson(json_filename:str, csv_data):
    with open(json_filename,'w', encoding='utf-8') as jsonfile:
        for row in csv_data:
            jsonfile.write(json.dumps(row) + '\n')
    jsonfile.close()
    print("JSON " + json_filename + " creado!")

In [28]:
def compressFile(filename):
    with open(filename, 'rb') as f_in:
        with gzip.open(filename + '.gz', 'wb') as f_out:
            f_out.writelines(f_in)

In [29]:
def fullJson(csv_data):
    jsonArray = []
    filename = 'inegi.json'
    for row in csv_data:
        jsonArray.append(row)
    crearJson(filename, jsonArray)
    compressFile(filename)
    print("JSON Completo!")

In [50]:
def splitJson(csv_data, parts:int):
    jsondir = ''
    extractDir = os.path.join(BASE_DATA_PATH, 'json')
    if not os.path.exists(extractDir):
        os.mkdir(extractDir)
        jsondir = extractDir + '/'
    else:
        jsondir = extractDir + '/'

    idx = 1
    idx_filename = 1
    jsonArray = []
    tregs = len(csv_data)
    rlimits = []
    dranges = []
    print("total registros: " + str(tregs))
    limit = round(tregs // parts) 
    print('limite: ' + str(limit))
    diff = tregs - limit*parts
    print('diferencia: ' + str(diff))
    for i in range(0,parts+1):
        rlimits.append(i*limit)
    
    s = 1
    a = 0

    for i in rlimits:
        if s < len(rlimits): 
            temprange = range(rlimits[a],rlimits[s])
            dranges.append(temprange)
            #json
            filename = jsondir + 'inegi'+ str(idx_filename) + '.json'
            crearJson(filename, csv_data[temprange.start:temprange.stop])
            compressFile(filename)
            idx_filename = idx_filename + 1
            a = s
            s += 1
        elif s == len(rlimits) and diff > 0:
            #json
            filename = jsondir + 'inegi'+ str(idx_filename) + '.json'
            crearJson(filename, csv_data[-diff:])
            compressFile(filename)
            idx_filename = idx_filename + 1
    
    print("JSON particionado!")  

In [31]:
def totalHabitantesCSV(filename:str):
    total = 0
    with open(filename, newline='',encoding='utf8') as file:
        reader = csv.reader(file, delimiter='|')
        for row in reader:
            if row[8].isnumeric():
                total = total + int(row[8])
    file.close()
    return total

In [32]:
def totalHabitantesJson(filename):
    total = 0
    with open(filename, 'r', newline='') as file:
        for row in file:
            datos = json.loads(row)
            #print(datos['POBTOT'])
            total = total + datos['POBTOT']
    file.close()
    return total

In [35]:
url_download = 'https://www.inegi.org.mx/contenidos/programas/ccpv/2020/datosabiertos/iter/iter_00_cpv2020_csv.zip'

In [None]:
def inegiDownloadFile(url:str) -> bool:
    try:
        zipfilename = url.split('/')[-1]
        os.makedirs(BASE_DATA_PATH)
        data_path = os.path.join(BASE_DATA_PATH, zipfilename)
        if not os.path.isfile(data_path):
            r = requests.get(url,allow_redirects=True)
            if r.status_code == 200:
                open(data_path, 'wb').write(r.content)
                return True    
            else:
                return False
        else:
            return False        
    except requests.RequestException as err:
        print(err)
        return None

In [None]:
if inegiDownloadFile(url_download):
    print('Descargado')
else:
    print('Ya descargado')  

In [41]:
def unzipDatos(zipfilename:str):
    if os.path.isfile(zipfilename):
        print('Archivo en directorio...OK')
        extractDir = os.path.join(BASE_DATA_PATH, 'csv')
        if not os.path.exists(extractDir):
            os.mkdir(extractDir)
        with zipfile.ZipFile(zipfilename, 'r') as zipa:
            zipa.extractall(extractDir)
        print('Unzipped')
    else:
        print('Archivo debe ser descargado antes')

In [42]:
unzipDatos(os.path.join(BASE_DATA_PATH, 'iter_00_cpv2020_csv.zip'))

Archivo en directorio...OK
Unzipped


In [43]:
def buscarArchivo(filename:str):
    dirname = os.path.join(BASE_DATA_PATH, "csv/iter_00_cpv2020/conjunto_de_datos/")
    for ruta in os.listdir(dirname):
        if os.path.isfile(os.path.join(dirname, filename)):
            return os.path.join(dirname,ruta)
        else:
            print('Sin archivo')

In [48]:
rutaCSV = buscarArchivo('conjunto_de_datos_iter_00CSV20.csv')
newInegi = openCSV(rutaCSV)
crearCSV(newInegi)
newInegi.clear()

CSV creado!


In [51]:
leerCSV = leerData4Json(os.path.join(BASE_DATA_PATH, 'csv/inegi.csv'))
splitJson(leerCSV, 7)
leerCSV.clear()

total registros: 189432
limite: 27061
diferencia: 5
JSON ./data/inegi/json/inegi1.json creado!
JSON ./data/inegi/json/inegi2.json creado!
JSON ./data/inegi/json/inegi3.json creado!
JSON ./data/inegi/json/inegi4.json creado!
JSON ./data/inegi/json/inegi5.json creado!
JSON ./data/inegi/json/inegi6.json creado!
JSON ./data/inegi/json/inegi7.json creado!
JSON ./data/inegi/json/inegi8.json creado!
JSON particionado!


## Snowpark

In [9]:
sesion = Session.builder.configs(connection_parameters).create()
if sesion != None:
    print("Conectado")
    print(sesion.sql("select current_warehouse(), current_database(), current_role()").collect()) 
else:
    print("Error de conexión")

Conectado
[Row(CURRENT_WAREHOUSE()='COMPUTE_WH', CURRENT_DATABASE()='SPARSH', CURRENT_ROLE()='ACCOUNTADMIN')]


In [10]:
sesion.use_role(connection_parameters['role'])
sesion.sql("grant all privileges on database "+connection_parameters['database']+ " to role "+connection_parameters['role'] +";").collect()
sesion.use_database(connection_parameters['database'])
sesion.use_schema(connection_parameters['schema'])

In [12]:
#warehouse
sesion.sql("grant usage on warehouse "+connection_parameters['warehouse']+" to role "+connection_parameters['role'] +";").collect()
sesion.use_warehouse(connection_parameters['warehouse'])

#Schema
sesion.sql("grant all privileges on schema "+connection_parameters['schema']+" to role "+connection_parameters['role'] +";").collect()
sesion.sql("grant create stage on schema " +connection_parameters['schema']+" to role "+connection_parameters['role'] +";").collect()

[Row(status='Statement executed successfully.')]

In [13]:
sesion.sql("create or replace table inegi_raw (v VARIANT);").collect()
sesion.sql("grant select on all tables in schema "+connection_parameters['schema']+" to role "+connection_parameters['role'] +";").collect()
sesion.sql("grant select on all views in schema "+connection_parameters['schema']+" to role "+connection_parameters['role'] +";").collect()

[Row(status='Statement executed successfully. 0 objects affected.')]

In [14]:
def ingesta_setup() -> dict:
    env = {
        'account' : connection_parameters['account'],
        'snowstage' : 'inegi'
    }
    return env

In [15]:
env = ingesta_setup()
sesion.sql('CREATE STAGE IF NOT EXISTS '+ env['snowstage']).collect()

[Row(status='Stage area INEGI successfully created.')]

In [16]:
def solo_archivos(ruta) -> list:
    ingesta_files = []
    for file in os.listdir(ruta):
        # search given pattern in the line 
        if re.search("\.json.gz$", file):
            ingesta_files.append(os.path.join(ruta,file))
    return ingesta_files

In [53]:
archivos_dir = os.path.join(BASE_DATA_PATH, 'json')
archivos = solo_archivos(archivos_dir)
archivos

['./data/inegi/json/inegi3.json.gz',
 './data/inegi/json/inegi2.json.gz',
 './data/inegi/json/inegi1.json.gz',
 './data/inegi/json/inegi8.json.gz',
 './data/inegi/json/inegi4.json.gz',
 './data/inegi/json/inegi5.json.gz',
 './data/inegi/json/inegi7.json.gz',
 './data/inegi/json/inegi6.json.gz']

In [55]:
for file in archivos:
    put_result = sesion.file.put('file://' + file , '@' + env['snowstage'])
    print(put_result[0].status)
    
file = os.path.join(os.getcwd(),'src/entidad.py') 
put_result = sesion.file.put('file://' + file , '@' + env['snowstage'], auto_compress= False, overwrite=True)
print(put_result[0].status)

UPLOADED
UPLOADED
UPLOADED
UPLOADED
UPLOADED
UPLOADED
UPLOADED
UPLOADED
UPLOADED


In [56]:
print("Transformando...")
sesion.sql("create or replace file format json type = json;").collect()
sesion.sql("copy into inegi_raw from @" + env['snowstage'] + " file_format = json pattern = '.*inegi[1-8].json.gz';").collect()
print("Transformación Completado")

Transformando...
Transformación Completado


In [57]:
sesion.close()
print("Sesión terminada")

Sesión terminada


## Data Modeling

In [58]:
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import udf
from snowflake.snowpark.types import StringType
from snowflake.snowpark.types import IntegerType

In [59]:
sesion = Session.builder.configs(connection_parameters).create()
if sesion != None:
    print("Conectado")
    print(sesion.sql("select current_warehouse(), current_database(), current_role()").collect()) 
else:
    print("Error de conexión")

Conectado
[Row(CURRENT_WAREHOUSE()='COMPUTE_WH', CURRENT_DATABASE()='SPARSH', CURRENT_ROLE()='ACCOUNTADMIN')]


In [60]:
query = "create or replace  view INEGI_DATA as select " + \
 "v:ENTIDAD::int as entidad," + \
 "v:MUN::int as municipio," + \
 "v:NOM_MUN::string as nom_municipio," + \
 "v:LOC::string as localidad," + \
 "v:NOM_LOC::string as nom_localidad," + \
 "v:LONGITUD::float as longitud," + \
 "v:LATITUD::float as latitud," + \
 "v:ALTITUD::int as altitud," + \
 "v:POBTOT::int as pob_total," + \
 "v:POBFEM::int as pob_fem," + \
 "v:POBMAS::int as pob_masc," + \
 "v:PCON_DISC::int as pob_discapacidad," + \
 "v:GRAPROES::int as pob_escolaridad," + \
 "v:GRAPROES_F::int as pob_esco_fem," + \
 "v:GRAPROES_M::int as pob_esco_masc," + \
 "v:PSINDER::int as pob_sssalud," + \
 "v:PDER_SS::int as pob_cssalud," + \
 "v:VIVTOT::int as total_vivienda," + \
 "v:TVIVHAB::int total_habitada," + \
 "v:VPH_INTER::int as hab_internet " + \
 "from INEGI_RAW;"
sesion.sql(query).collect()

[Row(status='View INEGI_DATA successfully created.')]

In [61]:
# UDF declaration
entidad_udf = sesion.udf.register_from_file(file_path='@inegi/entidad.py',func_name='nom_entidad',return_type=StringType(),input_types=[IntegerType()],is_permanent=True, name="nom_entidad",stage_location="@inegi")

In [62]:
# Vista con totales por entidad aplicando UDF para convertir número de entidad x nombre entidad
viewquery = "create or replace view INEGI_MAPA as " + \
"with poblacion_lat as ( select latitud,longitud, nom_entidad(entidad) as nom_entidad," + \
"ROW_NUMBER() OVER(PARTITION BY nom_entidad ORDER BY nom_entidad DESC) AS row_number from INEGI_DATA)," + \
"poblacion_t as (select sum(pob_total) as poblacion_total,nom_entidad(entidad) as nom_entidad from " + \
"INEGI_DATA group by entidad order by poblacion_total desc)" + \
" select pl.nom_entidad,pt.poblacion_total, pl.latitud, pl.longitud" + \
" from poblacion_lat pl left join poblacion_t pt on pl.nom_entidad = pt.nom_entidad" + \
" where row_number = 1;"
sesion.sql(viewquery).collect()

[Row(status='View INEGI_MAPA successfully created.')]

In [63]:
#Validar la vista solo con totales por entidad
df_entidad = sesion.table("INEGI_MAPA")
df_entidad.show()

---------------------------------------------------------------------------------------
|"NOM_ENTIDAD"         |"POBLACION_TOTAL"  |"LATITUD"           |"LONGITUD"           |
---------------------------------------------------------------------------------------
|OAXACA                |4132148            |18.095924166666663  |-96.58116027777776   |
|PUEBLA                |6583278            |19.995293333333333  |-97.84910722222222   |
|SAN LUIS POTOSI       |2822255            |22.238184166666667  |-99.21526083333332   |
|TLAXCALA              |1342977            |19.539680833333335  |-98.08591972222222   |
|COAHUILA DE ZARAGOZA  |3146771            |27.844765           |-103.72405305555556  |
|TAMAULIPAS            |3527735            |23.775704444444443  |-98.41690472222224   |
|NAYARIT               |1235456            |20.78537194444445   |-105.46803611111112  |
|HIDALGO               |3082841            |20.29848944444445   |-99.41953666666669   |
|BAJA CALIFORNIA SUR   |798447  

In [64]:
sesion.close()