In [115]:
import duckdb
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

from duckdb.typing import *
from duckdb import FunctionExpression

## Quality assessment

In [116]:
# Root folders: raw files are read from `input_root_dir`, cleaned files are
# meant to go under `output_root_dir`.
input_root_dir = '../data/'
output_root_dir = '../clean_data/'

# Only the 2023 sub-directory is processed for now.
input_files_dirs = [os.path.join(input_root_dir, year) for year in ('2023',)]
output_files_dirs = [os.path.join(output_root_dir, year) for year in ('2023',)]

# To process every sub-directory found under the input root instead:
# input_files_dirs = [os.path.join(input_root_dir,x) for x in os.listdir(input_root_dir)]
# output_files_dirs = [os.path.join(output_root_dir,x) for x in os.listdir(input_root_dir)]

In [117]:
# Labels for the metadata fields.
# NOTE(review): presumably these describe the header lines at the top of each
# raw file -- confirm against the loader that consumes them.
metadata_headers = [
    'region',
    'federative_unit',
    'state',
    'code',
    'latitude',
    'longitude',
    'altitude',
    'foundation_date',
]

# Labels for the data columns, in file order. They are handed to
# `pd.read_csv` as both `names` and `usecols`.
column_names = [
    'date',
    'time',
    'total_precipitation',
    'avg_atmospheric_pressure',
    'max_atmospheric_pressure',
    'min_atmospheric_pressure',
    'global_radiation',
    'avg_air_temperature',
    'dew_point',
    'max_temperature',
    'min_temperature',
    'max_dew_point',
    'min_dew_point',
    'max_relative_air_humidity',
    'min_relative_air_humidity',
    'relative_air_humidity',
    'wind_direction',
    'max_wind_gust',
    'wind_speed',
]

# Unused alternatives kept for reference:
# column_types = [VARCHAR, VARCHAR] + [DOUBLE] * 17
# column_names = ['column00', 'column01', ..., 'column18']


In [118]:
# Load a single raw file to inspect its quality.
#  - `skiprows=9` drops the leading non-data lines, so the column labels come
#    from `names` (NOTE(review): presumably the metadata block plus the
#    original header row -- confirm against a raw file).
#  - `usecols` is required because every row ends with a trailing `;`, which
#    would otherwise create an extra column.
#  - -9999 is read as a missing value.
df = pd.read_csv(
    '../data/2000/INMET_CO_DF_A001_BRASILIA_07-05-2000_A_31-12-2000.CSV',
    sep=';',
    decimal=',',
    encoding='ISO-8859-1',
    skiprows=9,
    names=column_names,
    usecols=column_names,
    na_values=-9999,
)

# Display only: the de-duplicated frame is shown as the cell output but is not
# assigned back, so `df` itself is left unchanged.
# NOTE(review): assign the result if the de-duplication is meant to persist.
df.drop_duplicates(subset=['date', 'time'])

Unnamed: 0,date,time,total_precipitation,avg_atmospheric_pressure,max_atmospheric_pressure,min_atmospheric_pressure,global_radiation,avg_air_temperature,dew_point,max_temperature,min_temperature,max_dew_point,min_dew_point,max_relative_air_humidity,min_relative_air_humidity,relative_air_humidity,wind_direction,max_wind_gust,wind_speed
0,2000-05-07,00:00,,,,,,,,,,,,,,,,,
1,2000-05-07,01:00,,,,,,,,,,,,,,,,,
2,2000-05-07,02:00,,,,,,,,,,,,,,,,,
3,2000-05-07,03:00,,,,,,,,,,,,,,,,,
4,2000-05-07,04:00,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5731,2000-12-31,19:00,0.0,883.7,884.2,883.7,2828.0,25.8,16.9,27.4,25.4,19.2,16.7,67.0,53.0,58.0,271.0,9.0,5.1
5732,2000-12-31,20:00,0.0,883.9,883.9,883.6,2105.0,24.4,17.9,26.2,24.4,18.9,17.0,69.0,59.0,67.0,310.0,8.4,4.0
5733,2000-12-31,21:00,0.0,883.7,884.0,883.6,674.0,23.7,16.5,24.7,23.7,18.0,16.5,68.0,62.0,64.0,264.0,7.1,2.7
5734,2000-12-31,22:00,0.0,884.7,884.7,883.7,22.0,22.7,16.7,23.7,22.7,16.7,15.9,69.0,62.0,69.0,287.0,4.2,1.9


In [119]:
def parse_time(time: str) -> str:
    """Convert an ``HHMM UTC`` time stamp to ``HH:MM``.

    Parameters
    ----------
    time : str
        Raw time value. Non-string values (e.g. ``None`` or a float NaN) are
        tolerated and returned unchanged.

    Returns
    -------
    str
        ``HH:MM`` when ``time`` is four digits forming a valid 24-hour time
        (hours 00-23, minutes 00-59) followed by `` UTC``; otherwise the
        input exactly as received.
    """
    # Anything that is not text cannot match the pattern; pass it through
    # like every other unrecognised value instead of raising TypeError.
    if not isinstance(time, str):
        return time

    # Raw string literal: `\d` inside a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    if re.match(r'^([01][\d]|2[0-3])([0-5][\d]) UTC$', time) is not None:
        return time[:2] + ':' + time[2:4]

    return time


def parse_date(date: str) -> str:
    """Normalise slash-separated dates to a dash-separated, year-first form.

    Two layouts are recognised:

    * ``NNNN/NN/NN`` -- the slashes are replaced by dashes.
    * ``NN/NN/NN`` -- the first and last fields are swapped and the (former)
      last field is prefixed with ``20``, i.e. ``AA/BB/CC`` becomes
      ``20CC-BB-AA``. The century is hard-coded to 20xx.

    Parameters
    ----------
    date : str
        Raw date value. Non-string values (e.g. ``None`` or a float NaN) are
        tolerated and returned unchanged.

    Returns
    -------
    str
        The normalised date, or the input exactly as received when it matches
        neither layout.
    """
    # Anything that is not text cannot match a pattern; pass it through like
    # every other unrecognised value instead of raising TypeError.
    if not isinstance(date, str):
        return date

    # Raw string literals: `\d` inside a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    if re.match(r'^[\d]{4}/[\d]{2}/[\d]{2}$', date):
        return date.replace('/', '-')

    if re.match(r'^[\d]{2}/[\d]{2}/[\d]{2}$', date):
        # Unpack into fresh names rather than rebinding the `str` parameter
        # to a list.
        first, middle, last = date.split('/')
        return '-'.join(['20' + last, middle, first])

    return date


In [120]:
# Expose the Python helpers to DuckDB SQL as scalar functions taking one
# VARCHAR and returning a VARCHAR.
# NotImplementedException is swallowed on purpose (presumably it is what
# DuckDB raises when the name is already registered, so the cell can be
# re-run).
# NOTE(review): on such a re-run the previously registered implementation
# stays active, even if the Python function has been edited since.
for udf_name, udf in (('parse_time', parse_time), ('parse_date', parse_date)):
    try:
        duckdb.create_function(udf_name, udf, [VARCHAR], VARCHAR)
    except duckdb.NotImplementedException:
        pass

In [121]:
# Preview the rows in which every column other than date/time is NULL, with
# the time normalised by the `parse_time` UDF. `COLUMNS(...)` in a WHERE
# clause expands to one condition per column, all of which must hold.
# The adjacent literals concatenate to a single SQL string.
duckdb.sql(
    "SELECT parse_time(time) AS time,COLUMNS(* EXCLUDE time) "
    "FROM df "
    "WHERE COLUMNS(* EXCLUDE (date,time)) IS NULL "
    "ORDER BY date, time"
)

┌─────────┬────────────┬─────────────────────┬───┬──────────────────────┬────────────────┬───────────────┬────────────┐
│  time   │    date    │ total_precipitation │ … │ relative_air_humid…  │ wind_direction │ max_wind_gust │ wind_speed │
│ varchar │  varchar   │       double        │   │        double        │     double     │    double     │   double   │
├─────────┼────────────┼─────────────────────┼───┼──────────────────────┼────────────────┼───────────────┼────────────┤
│ 00:00   │ 2000-05-07 │                NULL │ … │                 NULL │           NULL │          NULL │       NULL │
│ 01:00   │ 2000-05-07 │                NULL │ … │                 NULL │           NULL │          NULL │       NULL │
│ 02:00   │ 2000-05-07 │                NULL │ … │                 NULL │           NULL │          NULL │       NULL │
│ 03:00   │ 2000-05-07 │                NULL │ … │                 NULL │           NULL │          NULL │       NULL │
│ 04:00   │ 2000-05-07 │                

In [122]:
# Rebind `df` to the rows that are NOT entirely NULL outside date/time: take
# all rows and subtract (EXCEPT) the all-NULL ones, ordered by date and time.
# Note that EXCEPT is a set operation, so fully identical rows are collapsed.
# The adjacent literals concatenate to a single SQL string.
df = (
    duckdb.sql(
        "SELECT * FROM df "
        "EXCEPT "
        "SELECT * FROM df WHERE COLUMNS(* EXCLUDE (date,time)) IS NULL "
        "ORDER BY date, time"
    )
    .to_df()
)

In [123]:
# Normalise date/time through the UDFs, cast the three relative-humidity
# columns to UTINYINT and the wind direction to USMALLINT, and move those
# columns (plus wind_speed) to the end of the projection.
typed_relation = duckdb.sql("""
           SELECT parse_date(date) AS date,
            parse_time(time) AS time,
           COLUMNS(* EXCLUDE (wind_direction,max_relative_air_humidity,min_relative_air_humidity,relative_air_humidity,time,date,wind_speed)),
           max_relative_air_humidity::UTINYINT AS max_relative_air_humidity,
           min_relative_air_humidity::UTINYINT AS min_relative_air_humidity,
           relative_air_humidity::UTINYINT AS relative_air_humidity,
           wind_direction::USMALLINT AS wind_direction,
           wind_speed
           FROM df""")

# Write the typed result to disk. `to_csv` on a DuckDB relation returns None,
# so its return value must not be assigned to `df` (the original did, which
# replaced the DataFrame with None).
# NOTE(review): 'bla.csv' looks like a scratch file name -- confirm the
# intended output location.
typed_relation.to_csv('bla.csv')

# Keep `df` usable for later cells by binding it to the materialised result.
df = typed_relation.to_df()