In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Timedelta, Timestamp
from collections import namedtuple

from IPython.core.debugger import set_trace

In [26]:
df_estacao = pd.read_csv("DATA/dados_estacoes_5min.csv",
        parse_dates=["DATE"],
        index_col=["DATE"])

df_radar = pd.read_csv("DATA/dados_radar_semHorarioVerao.csv",
                      parse_dates=['DATE'],
                      index_col=["DATE"])

df_estacao.sort_index(inplace=True)
df_radar.sort_index(inplace=True)

In [27]:
df_est = df_estacao[df_estacao > 0].dropna(how="all",axis=0).dropna(how="all",axis=1)
indices = df_est.index
eventos = []

evento = []
last_indice = indices[0]
evento.append(last_indice)
for indice in indices[1:]:
    evento.append(indice)
    # Eventos com um gap maior que 10 min são eventos distintos
    if indice - last_indice > Timedelta("10min"):
        # eventos maiores que 10 min
        if len(evento) > 1:
            eventos.append(evento)
        evento = []
    last_indice = indice
print(len(eventos))
    

1685


In [28]:
# Evento aleátorio para tests
evento = eventos[3]
evento

[Timestamp('2016-01-01 15:50:00'),
 Timestamp('2016-01-01 16:00:00'),
 Timestamp('2016-01-01 16:10:00'),
 Timestamp('2016-01-01 16:20:00'),
 Timestamp('2016-01-01 16:30:00'),
 Timestamp('2016-01-01 16:40:00'),
 Timestamp('2016-01-01 16:50:00'),
 Timestamp('2016-01-01 19:00:00')]

In [29]:
df_est.loc[evento].dropna(axis=1, how="all")

Unnamed: 0_level_0,MB_GAM1,MB_PRO1,MB_PRO2,MB_SEG1
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-01 15:50:00,,,,15.5
2016-01-01 16:00:00,,3.0,,2.5
2016-01-01 16:10:00,,9.75,6.75,
2016-01-01 16:20:00,,5.5,5.75,
2016-01-01 16:30:00,,1.0,4.75,
2016-01-01 16:40:00,0.25,,,0.25
2016-01-01 16:50:00,0.5,,,
2016-01-01 19:00:00,,1.75,,


In [30]:
df_evento = df_est.loc[evento].dropna(axis=1, how="all")
evento_estacao = df_evento.MB_PRO1.dropna()
evento_estacao

DATE
2016-01-01 16:00:00    3.00
2016-01-01 16:10:00    9.75
2016-01-01 16:20:00    5.50
2016-01-01 16:30:00    1.00
2016-01-01 19:00:00    1.75
Name: MB_PRO1, dtype: float64

In [31]:
evento_estacao.sum()

21.0

In [32]:
evento_estacao.index

DatetimeIndex(['2016-01-01 16:00:00', '2016-01-01 16:10:00',
               '2016-01-01 16:20:00', '2016-01-01 16:30:00',
               '2016-01-01 19:00:00'],
              dtype='datetime64[ns]', name='DATE', freq=None)

In [33]:
def plot_event(radar,station,name):
    plt.figure()
    plt.title(name)
    station.plot(label="STATION")
    radar.plot(label="RADAR")
    plt.legend()
    
    file_name = "IMAGENS/eventos/" + str(name)+"jpg"
    plt.savefig(file_name)
    

É necessario ainda separar os eventos por estação, como é possivel ver acima os eventos começam em horarios distintos por estaçoes.


In [40]:
df = pd.DataFrame(columns=["start", "station","duration","Pluviometro","RADAR"])
stations = df_est.columns
for station in stations:
    df_station = df_est[station]
    df_station_nonull = df_station[df_station > 0].dropna()
    indices = df_station_nonull.index
    dict_evento = {}
    evento = []
    last_indice = indices[0]
    for indice in indices[1:]:
        evento.append(last_indice)
        #set_trace()
        # Eventos com um gap maior que 10 min são eventos distintos
        if indice - last_indice > Timedelta("10min"):
            # eventos maiores que 10 min

            dict_evento["start"] = evento[0]
            dict_evento["station"] = station
            dict_evento["duration"] = (evento[-1] - evento[0]) + Timedelta("10min")
            dict_evento["Pluviometro"] = df_station.loc[evento].sum()
            try:
                dict_evento["RADAR"] = df_radar[station].loc[evento].sum()
                name = station + str(evento[0])
                #plot_event(df_radar[station].loc[evento],df_station.loc[evento],name)
            except KeyError:
                dict_evento["RADAR"] = np.nan

            df = df.append(dict_evento, ignore_index=True)
            evento = []
        last_indice = indice

    

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [41]:
df.head()

Unnamed: 0,start,station,duration,Pluviometro,RADAR
0,2016-10-07 14:40:00,MB_ANHA1,00:10:00,0.75,
1,2016-10-13 08:30:00,MB_ANHA1,00:10:00,0.25,0.214636
2,2016-10-14 07:20:00,MB_ANHA1,00:10:00,0.25,0.594438
3,2016-10-14 08:10:00,MB_ANHA1,00:50:00,3.25,1.56986
4,2016-10-14 09:10:00,MB_ANHA1,01:40:00,5.0,2.367961


In [42]:
df["ERROR"] = (df.Pluviometro - df.RADAR)

In [43]:
df.head()

Unnamed: 0,start,station,duration,Pluviometro,RADAR,ERROR
0,2016-10-07 14:40:00,MB_ANHA1,00:10:00,0.75,,
1,2016-10-13 08:30:00,MB_ANHA1,00:10:00,0.25,0.214636,0.035364
2,2016-10-14 07:20:00,MB_ANHA1,00:10:00,0.25,0.594438,-0.344438
3,2016-10-14 08:10:00,MB_ANHA1,00:50:00,3.25,1.56986,1.68014
4,2016-10-14 09:10:00,MB_ANHA1,01:40:00,5.0,2.367961,2.632039


In [44]:
df.groupby("station").describe()["ERROR"]

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MB_ANHA1,227.0,1.373968,7.510792,-43.993928,0.0625,0.25,1.5,73.831623
MB_ANHA3,192.0,-0.541608,4.812046,-53.711793,-0.082,0.0625,0.224768,10.498964
MB_ANHA4,258.0,-0.116645,2.964486,-23.562928,-0.004432,0.0625,0.1875,11.306447
MB_ANHA5,405.0,0.200299,3.393859,-39.607525,0.0625,0.0625,0.375,12.410372
MB_ANHA6,29.0,0.192325,0.371388,-0.406759,0.0625,0.0625,0.1875,1.680681
MB_ANHA7,372.0,-0.989119,3.813844,-30.720814,-0.441556,0.018925,0.0625,7.126998
MB_ANHA7.1,0.0,,,,,,,
MB_ANHA8,86.0,0.385156,3.815237,-26.513181,0.027925,0.0625,0.534553,13.25
MB_ANHA8.1,0.0,,,,,,,
MB_BAL1,206.0,0.078793,3.983515,-50.502213,-0.011609,0.0625,0.241399,8.377209


In [45]:
df.to_csv("DATA/eventos.csv",index=False)