In [None]:
import pandas as pd
import os

pasta = r"G:\Meu Drive\Estudos\Mestrado\Github\masters\bases_originais\Clima"

dfs = []

for arquivo in os.listdir(pasta):
    if arquivo.endswith('.CSV') and 'JOAO PESSOA' in arquivo.upper():
        caminho_completo = os.path.join(pasta, arquivo)

        df = pd.read_csv(
            caminho_completo,
            encoding='latin-1',
            skiprows=8,
            sep=';'
        )

        # garante que existem pelo menos 8 colunas
        if df.shape[1] < 8:
            continue

        # mantém 1ª, 2ª, 3ª e 8ª colunas
        df = df.iloc[:, [0, 1, 2, 7]]

        # renomeia as duas primeiras
        df.columns = ['DATA', 'HORA', df.columns[2], df.columns[3]]

        dfs.append(df)

df_final = pd.concat(dfs, ignore_index=True)


In [None]:
import numpy as np

df_final['TEMPERATURA'] = (
    df_final['TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)']
    .replace(['-9999', -999, 9999, 999.9], np.nan)
)

df_final['PRECIPITAÇÃO'] = (
    df_final['PRECIPITAÇÃO TOTAL, HORÁRIO (mm)']
    .replace(['-9999', -999, 9999, 999.9], np.nan)
)

In [None]:
df_final = df_final[['DATA',	'HORA', 'TEMPERATURA', 'PRECIPITAÇÃO']]
df_final

Unnamed: 0,DATA,HORA,TEMPERATURA,PRECIPITAÇÃO
0,2020/01/01,0000 UTC,266,0
1,2020/01/01,0100 UTC,267,0
2,2020/01/01,0200 UTC,266,0
3,2020/01/01,0300 UTC,265,0
4,2020/01/01,0400 UTC,262,0
...,...,...,...,...
61363,2019/12/31,1900 UTC,276,0
61364,2019/12/31,2000 UTC,27,0
61365,2019/12/31,2100 UTC,265,0
61366,2019/12/31,2200 UTC,265,0


In [None]:
import pandas as pd
import numpy as np

df = df_final.copy()
# df = df[~df['DATA'].astype(str).str.contains('2018|2019|2020|2021', na=False)]
df = df[~df['DATA'].astype(str).str.contains('2024', na=False)]

# DATA como date
df['DATA'] = pd.to_datetime(df['DATA'], errors='coerce').dt.date
df_final['DATA'] = pd.to_datetime(df_final['DATA'], errors='coerce').dt.date

df_base_dias = df_final.drop_duplicates(subset='DATA', ignore_index=True)

# numéricas
colunas_numericas = ['TEMPERATURA', 'PRECIPITAÇÃO']
df[colunas_numericas] = df[colunas_numericas].apply(
    pd.to_numeric, errors='coerce'
)

df_diario = (
    df
    .groupby('DATA', as_index=False)
    .agg(
        temp_max=('TEMPERATURA', 'max'),
        temp_min=('TEMPERATURA', 'min'),
        temp_media=('TEMPERATURA', 'mean'),
        precipitacao_total=('PRECIPITAÇÃO', 'sum')
    )
)
df_diario = df_diario.sort_values('DATA')

In [None]:
df_diario.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1826 entries, 0 to 1825
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   DATA                1826 non-null   object 
 1   temp_max            1346 non-null   float64
 2   temp_min            1346 non-null   float64
 3   temp_media          1346 non-null   float64
 4   precipitacao_total  1826 non-null   float64
dtypes: float64(4), object(1)
memory usage: 71.5+ KB


In [None]:
df_diario

Unnamed: 0,DATA,temp_max,temp_min,temp_media,precipitacao_total
0,2019-01-01,27.0,25.0,26.000000,0.0
1,2019-01-02,29.0,29.0,29.000000,0.0
2,2019-01-03,28.0,26.0,27.000000,0.0
3,2019-01-04,27.0,26.0,26.333333,0.0
4,2019-01-05,29.0,27.0,27.666667,0.0
...,...,...,...,...,...
1821,2023-12-27,27.0,27.0,27.000000,0.0
1822,2023-12-28,26.0,26.0,26.000000,0.0
1823,2023-12-29,31.0,28.0,29.666667,0.0
1824,2023-12-30,30.0,24.0,26.800000,0.0


In [None]:
df = df_diario

In [None]:
df['DATA'] = pd.to_datetime(df['DATA'])

df['dia'] = df['DATA'].dt.day
df['mes'] = df['DATA'].dt.month

media_diaria = (
    df
    .groupby(['mes', 'dia'], as_index=False)
    .mean(numeric_only=True)
)
media_diaria 

Unnamed: 0,mes,dia,temp_max,temp_min,temp_media,precipitacao_total
0,1,1,28.800000,25.60,27.143333,0.2
1,1,2,29.000000,28.00,28.500000,0.0
2,1,3,29.000000,27.25,28.020833,0.0
3,1,4,28.750000,27.25,27.958333,0.2
4,1,5,29.400000,25.60,27.133333,0.8
...,...,...,...,...,...,...
361,12,27,29.500000,26.75,28.000000,0.0
362,12,28,27.600000,25.80,26.550000,0.8
363,12,29,29.333333,28.00,28.722222,0.4
364,12,30,28.800000,26.20,27.400000,0.0


In [None]:
media_clima = media_diaria.rename(columns={
    'temp_max': 'temp_max_anos',
    'temp_min': 'temp_min_anos',
    'temp_media': 'temp_media_anos',
    'precipitacao_total': 'precipitacao_anos'
})


In [None]:
df_base_dias['DATA'] = pd.to_datetime(df_base_dias['DATA'])

df_base_dias['dia'] = df_base_dias['DATA'].dt.day
df_base_dias['mes'] = df_base_dias['DATA'].dt.month

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_base_dias['DATA'] = pd.to_datetime(df_base_dias['DATA'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_base_dias['dia'] = df_base_dias['DATA'].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_base_dias['mes'] = df_base_dias['DATA'].dt.month


In [None]:
df = df_base_dias.merge(
    media_clima[
        ['mes', 'dia', 
         'temp_max_anos', 
         'temp_min_anos', 
         'temp_media_anos', 
         'precipitacao_anos']
    ],
    on=['mes', 'dia'],
    how='left'
)
df

Unnamed: 0,DATA,HORA,TEMPERATURA,PRECIPITAÇÃO,dia,mes,temp_max_anos,temp_min_anos,temp_media_anos,precipitacao_anos
0,2020-01-01,0000 UTC,266,0,1.0,1.0,28.800000,25.60,27.143333,0.2
1,2020-01-02,0000 UTC,268,0,2.0,1.0,29.000000,28.00,28.500000,0.0
2,2020-01-03,0000 UTC,266,0,3.0,1.0,29.000000,27.25,28.020833,0.0
3,2020-01-04,0000 UTC,267,0,4.0,1.0,28.750000,27.25,27.958333,0.2
4,2020-01-05,0000 UTC,265,0,5.0,1.0,29.400000,25.60,27.133333,0.8
...,...,...,...,...,...,...,...,...,...,...
2188,2019-12-27,0000 UTC,268,0,27.0,12.0,29.500000,26.75,28.000000,0.0
2189,2019-12-28,0000 UTC,266,0,28.0,12.0,27.600000,25.80,26.550000,0.8
2190,2019-12-29,0000 UTC,266,0,29.0,12.0,29.333333,28.00,28.722222,0.4
2191,2019-12-30,0000 UTC,27,0,30.0,12.0,28.800000,26.20,27.400000,0.0


In [None]:
df = df.drop(columns=['HORA', 'TEMPERATURA', 'PRECIPITAÇÃO', 'dia', 'mes'])
df = df.dropna()

In [None]:
df.sort_values('DATA')

Unnamed: 0,DATA,temp_max_anos,temp_min_anos,temp_media_anos,precipitacao_anos
1828,2019-01-01,28.800000,25.60,27.143333,0.2
1829,2019-01-02,29.000000,28.00,28.500000,0.0
1830,2019-01-03,29.000000,27.25,28.020833,0.0
1831,2019-01-04,28.750000,27.25,27.958333,0.2
1832,2019-01-05,29.400000,25.60,27.133333,0.8
...,...,...,...,...,...
1822,2024-12-27,29.500000,26.75,28.000000,0.0
1823,2024-12-28,27.600000,25.80,26.550000,0.8
1824,2024-12-29,29.333333,28.00,28.722222,0.4
1825,2024-12-30,28.800000,26.20,27.400000,0.0


In [None]:
df.info()

In [None]:
# df = df[~df['DATA'].astype(str).str.contains('2018|2019|2020|2021', na=False)]


df.to_csv(r"G:\Meu Drive\Estudos\Mestrado\Github\masters\bases_tratadas\weather_dataset_2022-2024.csv", index=False, sep=';')
