In [2]:
import pandas as pd
import os
from datetime import timedelta
from configparser import ConfigParser

In [72]:
config = ConfigParser()
config.read('./../desafio/scripts/config.ini')
diretorio_arquivos = os.path.expanduser(f"{config['GERAL']['BASE_DIR']}{config['EDGE']['EDGE_BASE_DIR']}") 
destino_hqls = os.path.expanduser(f"{config['GERAL']['BASE_DIR']}{config['GERAL']['HQL_DIR']}") 

In [17]:
files = os.listdir(diretorio_arquivos)
files

['CLIENTES.csv', 'ENDERECO.csv', 'DIVISAO.csv', 'VENDAS.csv', 'REGIAO.csv']

In [22]:
metadados = []
for file in files:
    data = pd.read_csv(f"{diretorio_arquivos}/{file}", sep=';')
    campos_tabela = [f"{column.lower().replace(' ', '_')} string" for column in list(data.columns)]
    campos_tabela.insert(0, f"id_{file.split('.')[0].lower()} string")
    metadados.append(
        {'tabela': file.split('.')[0].lower(), 
         'campos': campos_tabela})

metadados[:1]

[{'tabela': 'clientes',
  'campos': ['id_clientes string',
   'address_number string',
   'business_family string',
   'business_unit string',
   'customer string',
   'customerkey string',
   'customer_type string',
   'division string',
   'line_of_business string',
   'phone string',
   'region_code string',
   'regional_sales_mgr string',
   'search_type string']}]

In [82]:
# set dll

DB_EXT=config['HIVE']['DB_EXT'].lower()
DB_STG=config['HIVE']['DB_STG'].lower()
HDFS_BASE_DIR=config['HDFS']['HDFS_BASE_DIR']
PARTICAO=(datetime.today() - timedelta(1)).strftime('%Y%m%d')

for tbl in metadados:
    campos = ",\n\t".join(tbl['campos'])
    create=f"""-- TABELA {tbl['tabela']} on Hive

    CREATE DATABASE IF NOT EXISTS {DB_EXT}; 
    CREATE DATABASE IF NOT EXISTS {DB_STG};

    DROP TABLE {DB_EXT}.tbl_{tbl['tabela']};

    CREATE EXTERNAL TABLE IF NOT EXISTS {DB_EXT}.tbl_{tbl['tabela']} (
        {campos}
    )
    COMMENT "Tabela de {tbl['tabela']}"
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY "|"
    STORED AS TEXTFILE
    location  "{HDFS_BASE_DIR}/raw"
    TBLPROPERTIES ("skip.header.line.count"="1");

    SELECT count(*) as _records FROM {DB_EXT}.tbl_{tbl['tabela']} LIMIT 10;

    -- Tabela {tbl['tabela']} particionada

    DROP TABLE {DB_STG}.tbl_{tbl['tabela']};

    CREATE TABLE IF NOT EXISTS {DB_STG}.tbl_{tbl['tabela']} (
        {campos}
    )
    PARTITIONED BY (DT_FOTO STRING)
    ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
    STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
    TBLPROPERTIES ('orc.compress'='SNAPPY');

    SET hive.exec.dynamic.partition=true;
    SET hive.exec.dynamic.partition.mode=nonstrict;

    INSERT OVERWRITE TABLE 
        {DB_STG}.tbl_{tbl['tabela']}
    PARTITION(DT_FOTO)
    SELECT
        {campos},
        '{PARTICAO}' as DT_FOTO
    FROM {DB_EXT}.tbl_{tbl['tabela']};

    SELECT count(*) as _records FROM {DB_STG}.tbl_{tbl['tabela']} LIMIT 10;

    -- --------------------------------------------------------------
    --  FIM DO SCRIPT {tbl['tabela']}
    -- --------------------------------------------------------------

    """
    destino = f"{destino_hqls}/create_table_{tbl['tabela']}.hql"
    
    file_hql = open(destino, 'w')
    file_hql.write(create)
    file_hql.close()


In [3]:
# endereco
endereco = pd.read_csv('./../raw/ENDERECO.csv', sep=';')
endereco.shape

(455, 9)

In [4]:
endereco.head()

Unnamed: 0,Address Number,City,Country,Customer Address 1,Customer Address 2,Customer Address 3,Customer Address 4,State,Zip Code
0,10000000,Akron,US,PO Box 6258,,,,OH,44312
1,10000453,,UK,,,,,,
2,10000455,Huntington Beach,US,7392 Count Circle,,,,CA,92647
3,10000456,Edmonton,CA,8151 Wagner Road,,,,AB,T6E 4N6
4,10000458,Saginaw,US,PO Box 840,,,,MI,48606


In [15]:
# endereco.City.unique()
[row2 for row2 in [row.strip() for row in list(endereco['Customer Address 1'].unique())] if row2 != '']


['PO Box 6258',
 '7392 Count Circle',
 '8151 Wagner Road',
 'PO Box 840',
 '709 Rivergate Parkway',
 '1391 Gay Lussac',
 '6311 North Lindbergh Boulevard',
 '3213 Orange Grove Avenue',
 'Attention:  Accounts Payable',
 'Al Thalia Street',
 '8000 Haskell Avenue',
 '3290 Monier Circle #100',
 'PO Box 3265',
 '5745 N Tryon Street',
 '8120 E Doyle Springs Road',
 'Attention: Accounts Payable',
 'Box 5516',
 '1550 Aereli Avenue',
 '1594 Nicholson Street',
 '50 Mystic Avenue',
 'Spicer Clark - Hurth Off Highway Div',
 'PO Box 266',
 'Attention: Doris Nielson',
 'Kadaka Road 72 a',
 'PO Box 99',
 '1200 North Ellis Street',
 '4760 NW 165th Street',
 '7223 Girard Road',
 '12000 Balls Ford Road',
 '2755 Kirila Boulevard',
 'PO Box 2484',
 '4747 N Channel Ave',
 '3F Shinwoo Building',
 'PO Box 20505',
 '9555 Airline Highway',
 'Vehicle Operations',
 '601 East Gowen Road',
 'Paseo Colon Calles 34-36',
 '7225 NW 25th Street',
 'PO Box 5364',
 '1801 NW 82nd Ave',
 'AV 6 DE DICIEMBRE',
 '1023 Kansas A

In [67]:

import re

def clean_empty(_txt):
    rs = re.search("\s{2,}", _txt)
    if isinstance(rs, re.Match):
        print(rs.group())
        if rs.span()[0] == 0:
            return "Não informado"
        _txt = _txt.replace(rs.group(), ' ')

    return _txt

assert clean_empty("   ") == "Não informado"
assert clean_empty("Luiza") == "Luiza"
assert clean_empty("Luiza Batista") == "Luiza Batista"
assert clean_empty("Luiza  Batista") == "Luiza Batista"
# assert clean_empty("Luiza Batista  ") == "Luiza Batista"

   
  


In [None]:
{'actual_delivery_date': 'datetime64[ns]',
 'customerkey': 'object',
 'datekey': 'datetime64[ns]',
 'discount_amount': 'float',
 'invoice_date': 'datetime64[ns]',
 'invoice_number': 'object',
 'item_class': 'object',
 'item_number': 'object',
 'item': 'object',
 'line_number': 'object',
 'list_price': 'float',
 'order_number': 'object',
 'promised_delivery_date': 'datetime64[ns]',
 'sales_amount': 'float',
 'sales_amount_based_on_list_price': 'float',
 'sales_cost_amount': 'float',
 'sales_margin_amount': 'float',
 'sales_price': 'float',
 'sales_quantity': 'float',
 'sales_rep': 'float',
 'u_m': 'object',
 'dt_foto': 'datetime64[ns]'}


In [73]:
import math
math.trunc(0) == 0

True

In [66]:
e = endereco.City.map(clean_empty)
e[:10]

                         
                         
                         
                         
                         
                         
                         
                         
                         
                         
                         
                         
                         
                         
                         
                         


0               Akron
1       Não informado
2    Huntington Beach
3            Edmonton
4             Saginaw
5      Goodlettsville
6        Boucherville
7           Hazelwood
8     North Highlands
9            Montreal
Name: City, dtype: object