In [137]:
import json
import pickle
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import (
    col, udf, to_timestamp)

from google.cloud import storage


In [290]:
def load_json(filename, bucket_name="rjr-portal-da-transparencia"):
    """Download and parse a JSON auxiliary table from Google Cloud Storage.

    The object is read from ``aux_tables/{filename}.json`` inside the bucket.

    Args:
        filename: Object name without the ``aux_tables/`` prefix and without
            the ``.json`` extension.
        bucket_name: Name of the bucket to read from. Defaults to the bucket
            this notebook has always used, so existing calls are unaffected.

    Returns:
        The decoded JSON document, exactly as produced by ``json.loads``.
    """
    bucket = storage.Client().get_bucket(bucket_name)
    blob = bucket.blob(f'aux_tables/{filename}.json')
    # NOTE(review): ``download_as_string`` is marked deprecated in recent
    # google-cloud-storage releases in favour of ``download_as_bytes``; it is
    # kept here because the installed client version is not visible - confirm
    # before switching.
    return json.loads(blob.download_as_string())


def mapping(df, map_, column, type_return):
    """Replace the values of one column by looking them up in a dictionary.

    Every value is converted with ``str()`` before the lookup, so only the
    string keys of ``map_`` can ever match. A value with no entry in
    ``map_`` yields ``None`` (the result of ``dict.get``).

    Args:
        df: DataFrame to transform.
        map_: Dictionary used for the lookup.
        column: Name of the column to overwrite with the looked-up values.
        type_return: Spark data type *class* (not an instance); it is
            instantiated here to declare the UDF's return type.

    Returns:
        The DataFrame with ``column`` replaced by the looked-up values.
    """
    def lookup(key):
        return map_.get(str(key))

    lookup_udf = udf(lookup, type_return())
    return df.withColumn(column, lookup_udf(col(column)))

def string_to_date(df, column, fmt='%d/%m/%Y'):
    """Parse a string column into a date column.

    Args:
        df: DataFrame to transform.
        column: Name of the column to overwrite with the parsed dates.
        fmt: ``datetime.strptime`` format of the input strings. Defaults to
            day/month/year, the format this notebook has always used.

    Returns:
        The DataFrame with ``column`` replaced by a ``DateType`` column.
        Values that are not strings, and blank strings, become ``None``.

    Raises:
        ValueError: Raised by ``strptime`` inside the UDF (and surfaced by
            Spark when the column is evaluated) if a non-blank string does
            not match ``fmt``.
    """
    def parse(value):
        # Non-strings (including nulls) and blank strings carry no date.
        if not isinstance(value, str) or not value.strip():
            return None
        # The UDF is declared as DateType, so hand back a ``date`` rather
        # than the ``datetime`` that ``strptime`` produces.
        return datetime.strptime(value, fmt).date()

    parse_udf = udf(parse, DateType())
    return df.withColumn(column, parse_udf(col(column)))

In [270]:
# Create the Spark session for this notebook (or reuse one that is already
# running) under the application name "censo".
spark = (
    SparkSession.builder
    .appName("censo")
    .getOrCreate()
)

# Turn on eager evaluation for the REPL/notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [324]:
# Fetch the column schema stored as JSON with the auxiliary tables and build
# a Spark StructType from it. ``load_json`` already returns a decoded object,
# so it is passed to ``fromJson`` directly instead of being serialised and
# parsed a second time.
schema_json = load_json("escola_schema")
schema = StructType.fromJson(schema_json)

# Read the pipe-delimited CSV using the explicit schema built above.
df = (
    spark.read
    .options(header=True, delimiter="|", encoding="utf8")
    .schema(schema)
    .csv("gs://rjr-portal-da-transparencia/landing_zone/censo-escolar/2012/ESCOLAS.csv")
)

In [326]:
# Decode coded columns (names starting with "TP" or "CO") using the lookup
# tables stored in the "maps" auxiliary file.
maps = load_json("maps")
string_columns = [
    name for name in df.columns
    if name.startswith(("TP", "CO"))
]

for column in string_columns:
    if column not in maps:
        # No lookup table for this column: report it and leave it untouched.
        print(f"Map not found: {column}")
        continue
    df = mapping(df, maps[column], column, StringType)

Map not found: CO_ENTIDADE
Map not found: CO_ORGAO_REGIONAL
Map not found: CO_DISTRITO
Map not found: CO_ESCOLA_SEDE_VINCULADA
Map not found: CO_IES_OFERTANTE
Map not found: CO_LINGUA_INDIGENA_1
Map not found: CO_LINGUA_INDIGENA_2
Map not found: CO_LINGUA_INDIGENA_3


In [327]:
# Lookup table that turns 0/1 flags into booleans. ``mapping`` converts each
# value with ``str()`` before the lookup, so the string keys are the ones
# that actually match.
mapping_bool = {0: False, 1: True, "0": False, "1": True}

# Columns whose names start with "IN" or "ID" are treated as flags.
# NOTE(review): any value other than 0/1 in these columns has no entry in
# ``mapping_bool`` and therefore becomes null - confirm every IN*/ID* column
# is really 0/1-coded.
boolean_columns = [
    name for name in df.columns
    if name.startswith(("IN", "ID"))
]

for column in boolean_columns:
    df = mapping(df, mapping_bool, column, BooleanType)

In [328]:
# Convert the two school-year boundary columns from strings to dates,
# start first and then end.
for date_column in ("DT_ANO_LETIVO_INICIO", "DT_ANO_LETIVO_TERMINO"):
    df = string_to_date(df, date_column)