In [1]:
import pandas as pd
import os

pasta = r"G:\Meu Drive\Estudos\Mestrado\Github\masters\bases_originais\Clima"

dfs = []

for arquivo in os.listdir(pasta):
    if arquivo.endswith('.CSV') and 'JOAO PESSOA' in arquivo.upper():
        caminho_completo = os.path.join(pasta, arquivo)

        df = pd.read_csv(
            caminho_completo,
            encoding='latin-1',
            skiprows=8,
            sep=';'
        )

        # garante que existem pelo menos 8 colunas
        if df.shape[1] < 8:
            continue

        # mantém 1ª, 2ª, 3ª e 8ª colunas
        df = df.iloc[:, [0, 1, 2, 7]]

        # renomeia as duas primeiras
        df.columns = ['DATA', 'HORA', df.columns[2], df.columns[3]]

        dfs.append(df)

df_final = pd.concat(dfs, ignore_index=True)


In [2]:
import numpy as np

df_final['TEMPERATURA'] = (
    df_final['TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)']
    .replace(['-9999', -999, 9999, 999.9], np.nan)
)

df_final['PRECIPITAÇÃO'] = (
    df_final['PRECIPITAÇÃO TOTAL, HORÁRIO (mm)']
    .replace(['-9999', -999, 9999, 999.9], np.nan)
)

In [3]:
df_final = df_final[['DATA',	'HORA', 'TEMPERATURA', 'PRECIPITAÇÃO']]
df_final

Unnamed: 0,DATA,HORA,TEMPERATURA,PRECIPITAÇÃO
0,2020/01/01,0000 UTC,266,0
1,2020/01/01,0100 UTC,267,0
2,2020/01/01,0200 UTC,266,0
3,2020/01/01,0300 UTC,265,0
4,2020/01/01,0400 UTC,262,0
...,...,...,...,...
61363,2019/12/31,1900 UTC,276,0
61364,2019/12/31,2000 UTC,27,0
61365,2019/12/31,2100 UTC,265,0
61366,2019/12/31,2200 UTC,265,0


In [4]:
import pandas as pd
import numpy as np

df = df_final.copy()
# df = df[~df['DATA'].astype(str).str.contains('2018|2019|2020|2021', na=False)]

# DATA como date
df['DATA'] = pd.to_datetime(df['DATA'], errors='coerce').dt.date

# numéricas
colunas_numericas = ['TEMPERATURA', 'PRECIPITAÇÃO']
df[colunas_numericas] = df[colunas_numericas].apply(
    pd.to_numeric, errors='coerce'
)

df_diario = (
    df
    .groupby('DATA', as_index=False)
    .agg(
        temp_max=('TEMPERATURA', 'max'),
        temp_min=('TEMPERATURA', 'min'),
        temp_media=('TEMPERATURA', 'mean'),
        precipitacao_total=('PRECIPITAÇÃO', 'sum')
    )
)
df_diario = df_diario.sort_values('DATA')

In [5]:
df_diario.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2192 entries, 0 to 2191
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   DATA                2192 non-null   object 
 1   temp_max            1675 non-null   float64
 2   temp_min            1675 non-null   float64
 3   temp_media          1675 non-null   float64
 4   precipitacao_total  2192 non-null   float64
dtypes: float64(4), object(1)
memory usage: 85.8+ KB


In [7]:
df_diario

Unnamed: 0,DATA,temp_max,temp_min,temp_media,precipitacao_total
0,2019-01-01,27.0,25.0,26.000000,0.0
1,2019-01-02,29.0,29.0,29.000000,0.0
2,2019-01-03,28.0,26.0,27.000000,0.0
3,2019-01-04,27.0,26.0,26.333333,0.0
4,2019-01-05,29.0,27.0,27.666667,0.0
...,...,...,...,...,...
2187,2024-12-27,27.0,25.0,26.000000,2.0
2188,2024-12-28,,,,0.0
2189,2024-12-29,31.0,26.0,28.200000,0.0
2190,2024-12-30,30.0,25.0,27.666667,0.0


In [8]:
df = df_diario

In [9]:
df['DATA'] = pd.to_datetime(df['DATA'])

df['dia'] = df['DATA'].dt.day
df['mes'] = df['DATA'].dt.month

media_diaria = (
    df
    .groupby(['mes', 'dia'], as_index=False)
    .mean(numeric_only=True)
)
media_diaria 

Unnamed: 0,mes,dia,temp_max,temp_min,temp_media,precipitacao_total
0,1,1,28.666667,26.000000,27.286111,0.333333
1,1,2,29.000000,28.000000,28.500000,0.000000
2,1,3,29.200000,27.000000,28.116667,0.000000
3,1,4,29.200000,28.000000,28.566667,0.166667
4,1,5,29.666667,25.666667,27.388889,0.666667
...,...,...,...,...,...,...
361,12,27,29.000000,26.400000,27.600000,0.333333
362,12,28,27.600000,25.800000,26.550000,0.666667
363,12,29,29.750000,27.500000,28.591667,0.333333
364,12,30,29.000000,26.000000,27.444444,0.000000


In [10]:
media_clima = media_diaria.rename(columns={
    'temp_max': 'temp_max_anos',
    'temp_min': 'temp_min_anos',
    'temp_media': 'temp_media_anos',
    'precipitacao_total': 'precipitacao_anos'
})


In [12]:
df = df.merge(
    media_clima[
        ['mes', 'dia', 
         'temp_max_anos', 
         'temp_min_anos', 
         'temp_media_anos', 
         'precipitacao_anos']
    ],
    on=['mes', 'dia'],
    how='left'
)
df

Unnamed: 0,DATA,temp_max,temp_min,temp_media,precipitacao_total,dia,mes,temp_max_anos,temp_min_anos,temp_media_anos,precipitacao_anos
0,2019-01-01,27.0,25.0,26.000000,0.0,1,1,28.666667,26.000000,27.286111,0.333333
1,2019-01-02,29.0,29.0,29.000000,0.0,2,1,29.000000,28.000000,28.500000,0.000000
2,2019-01-03,28.0,26.0,27.000000,0.0,3,1,29.200000,27.000000,28.116667,0.000000
3,2019-01-04,27.0,26.0,26.333333,0.0,4,1,29.200000,28.000000,28.566667,0.166667
4,2019-01-05,29.0,27.0,27.666667,0.0,5,1,29.666667,25.666667,27.388889,0.666667
...,...,...,...,...,...,...,...,...,...,...,...
2187,2024-12-27,27.0,25.0,26.000000,2.0,27,12,29.000000,26.400000,27.600000,0.333333
2188,2024-12-28,,,,0.0,28,12,27.600000,25.800000,26.550000,0.666667
2189,2024-12-29,31.0,26.0,28.200000,0.0,29,12,29.750000,27.500000,28.591667,0.333333
2190,2024-12-30,30.0,25.0,27.666667,0.0,30,12,29.000000,26.000000,27.444444,0.000000


In [14]:
df = df[~df['DATA'].astype(str).str.contains('2018|2019|2020|2021', na=False)]


df.to_csv(r"G:\Meu Drive\Estudos\Mestrado\Github\masters\bases_tratadas\weather_dataset_2022-2024.csv", index=False, sep=';')
