In [1]:
import json
import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import (udf, col)

from google.cloud import storage

In [68]:
# Notebook configuration.
# BUCKET: Cloud Storage bucket used for the auxiliary JSON tables and the
#         landing-zone CSV input.
# YEAR:   census year to process; it is interpolated into the data paths.
BUCKET, YEAR = "rjr-portal-da-transparencia", 2020

In [69]:
def load_json(name):
    """Download and decode an auxiliary JSON table from Cloud Storage.

    Args:
        name: Base name of the object (without the ``.json`` suffix) stored
            under the ``aux_tables/`` prefix of ``BUCKET``.

    Returns:
        The decoded JSON document, whatever ``json.loads`` yields for it.
    """
    client = storage.Client()
    object_path = f'aux_tables/{name}.json'
    payload = client.get_bucket(BUCKET).blob(object_path).download_as_string()
    return json.loads(payload)

def mapping(df, map_, column, type_return):
    """Replace the values of ``column`` by looking them up in ``map_``.

    Args:
        df: DataFrame holding ``column``.
        map_: Dictionary keyed by the *string* form of the source values.
        column: Name of the column to translate; it is overwritten under the
            same name.
        type_return: Spark data type class (not an instance) of the mapped
            values; it is instantiated here and passed to ``udf``.

    Returns:
        DataFrame with ``column`` replaced by the looked-up values. Values
        whose string form is not a key of ``map_`` become ``None``.
    """
    def lookup(key):
        # Keys are stringified before the lookup, so 1 and "1" hit the same entry.
        return map_.get(str(key))

    translate = udf(lookup, type_return())
    return df.withColumn(column, translate(col(column)))

def string_to_date(df, column, year):
    """Parse a string column of ``df`` into a Spark ``DateType`` column.

    Args:
        df: DataFrame holding ``column``.
        column: Name of the string column to convert; it is overwritten under
            the same name.
        year: Year of the data set. Years after 2014 are parsed with the
            ``%d/%m/%Y`` pattern, earlier ones with ``%d%b%Y:%H:%M:%S``.

    Returns:
        DataFrame with ``column`` replaced by the parsed dates. Values that
        are not strings (e.g. nulls) become ``None``.

    Raises:
        ValueError: raised by ``strptime`` when the UDF is evaluated on a
            string that does not match the selected pattern.
    """
    if year > 2014:
        pattern = '%d/%m/%Y'
    else:
        # %b matches the locale's abbreviated month name.
        pattern = "%d%b%Y:%H:%M:%S"

    def parse(value):
        if not isinstance(value, str):
            return None
        # This file does `import datetime` (the module), so the class has to
        # be qualified: `datetime.strptime` does not exist on the module.
        # `.date()` drops the time part so the result matches DateType.
        return datetime.datetime.strptime(value, pattern).date()

    map_func = udf(parse, DateType())
    return df.withColumn(column, map_func(col(column)))

In [54]:
# Create (or reuse) the Spark session for this notebook under the app name
# "censo", then turn on eager evaluation in the REPL.
session_builder = SparkSession.builder.appName("censo")
spark = session_builder.getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [76]:
# Load the schema description for the "turmas" table and build the StructType.
schema = load_json("turmas_schema")
# NOTE(review): the stored document is presumably JSON-encoded twice in some
# cases, so the first decode can yield a string that needs a second decode.
# Checking for that explicitly (instead of a bare `except:`) lets a genuinely
# malformed schema surface its own error from `StructType.fromJson`.
if isinstance(schema, str):
    schema = json.loads(schema)
schema = StructType.fromJson(schema)

# Read the pipe-delimited CSV from the landing zone with the explicit schema.
df = (
    spark.read
    .options(header=True, delimiter="|", encoding="utf8")
    .schema(schema=schema)
    .csv(f"gs://{BUCKET}/landing_zone/censo-escolar/{YEAR}/turmas.csv")
)

In [77]:
# Per-column lookup tables, keyed by column name.
maps = load_json("turmas_maps")

# Candidate columns are those whose name starts with "TP" or "CO".
string_columns = [name for name in df.columns if name.startswith(("TP", "CO"))]

# Translate only the candidates that have a lookup table; the rest are left as-is.
for column in string_columns:
    if column not in maps:
        continue
    df = mapping(df, maps[column], column, StringType)

In [78]:
# Lookup used by `mapping`, which stringifies each value before the lookup.
# Any value other than "0"/"1" ends up as None.
mapping_bool = {"0": False, "1": True}

# Every column whose name starts with "IN" is converted to a boolean column.
bool_columns = [name for name in df.columns if name.startswith("IN")]

for column in bool_columns:
    df = mapping(df, mapping_bool, column, BooleanType)

In [79]:
# Columns removed from the output table.
drops = [
    # "CO_" columns
    "CO_REGIAO", "CO_MESORREGIAO", "CO_MICRORREGIAO",
    "CO_UF", "CO_MUNICIPIO", "CO_DISTRITO",
    # "TP_" columns
    "TP_DEPENDENCIA", "TP_LOCALIZACAO",
    "TP_CATEGORIA_ESCOLA_PRIVADA", "TP_CONVENIO_PODER_PUBLICO",
    "TP_REGULAMENTACAO", "TP_LOCALIZACAO_DIFERENCIADA",
]
df = df.drop(*drops)

In [42]:
# Persist the transformed table to the processing zone as snappy-compressed
# parquet. The path is built from BUCKET (rather than repeating the bucket
# name) so the output always lands in the same bucket the input was read from.
# No save mode is passed, so Spark's default applies.
output_path = f"gs://{BUCKET}/processing_zone/censo-escolar/{YEAR}/turmas.parquet"
df.write.parquet(output_path, compression="snappy")

                                                                                