In [7]:
import os
import pandas as pd
txt_file_path = '/root/data/rrr/integrated_weather_dataset/scripts/download/logs_precipitation/left_timestamps.txt'
with open(txt_file_path, 'r') as file:
    timestamps = file.read().splitlines()

directory = '/root/data/rrr/integrated_weather_dataset/data/raw/Precipitation/data/'

for timestamp in timestamps:
    filename = f'{timestamp}'
    filepath = os.path.join(directory, filename)
    
    if os.path.isfile(filepath):
        filesize = os.path.getsize(filepath)
        print(f'{filename}: {filesize} bytes')
    else:
        print(f'{filename}')


2005-01-01-000000.nc: 65241 bytes
2005-07-02-120000.nc: 59947 bytes
2007-03-23-030000.nc: 63573 bytes
2007-03-24-073000.nc: 61813 bytes
2007-03-24-093000.nc: 61787 bytes
2007-05-01-143000.nc: 60650 bytes
2007-09-22-190000.nc: 62106 bytes
2008-07-31-210000.nc: 58959 bytes
2008-08-12-203000.nc: 59144 bytes
2009-09-21-080000.nc: 59160 bytes
2010-10-19-203000.nc: 62724 bytes
2010-10-19-230000.nc: 62647 bytes
2010-10-20-000000.nc: 62445 bytes
2010-10-20-003000.nc: 62686 bytes
2010-10-20-010000.nc: 62656 bytes
2010-10-20-013000.nc: 62772 bytes
2010-10-20-020000.nc: 62849 bytes
2010-10-20-023000.nc: 63009 bytes
2010-10-20-030000.nc: 63089 bytes
2010-10-20-033000.nc: 62851 bytes
2010-10-20-040000.nc: 62633 bytes
2010-10-20-043000.nc: 62829 bytes
2010-10-20-050000.nc: 62870 bytes
2010-10-20-053000.nc: 63357 bytes
2010-10-20-060000.nc: 63309 bytes
2010-10-20-063000.nc: 63386 bytes
2010-10-20-070000.nc: 62870 bytes
2010-10-20-073000.nc: 62854 bytes
2010-10-20-080000.nc: 63437 bytes
2010-10-20-083

In [10]:
import os
import pandas as pd
import numpy as np
import h5py
from datetime import datetime

MIN_LON = -120
MAX_LON = -115
MIN_LAT = 31.5
MAX_LAT = 38

def process_hdf5_file(file_path, output_path):
    file_name = file_path.split("/")[-1] 
    timestamp_str = file_name.split(".")[0]
    timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d-%H%M%S")
    hdf = h5py.File(file_path, 'r')
    lon = hdf['lon'][:]
    lat = hdf['lat'][:]
    precipitation = hdf['precipitation'][0]

    lon_indices = np.where((lon >= MIN_LON) & (lon <= MAX_LON))[0]
    lat_indices = np.where((lat >= MIN_LAT) & (lat <= MAX_LAT))[0]

    precipitation_filtered = precipitation[np.ix_(lon_indices, lat_indices)]
    
    data_list = []
    for i, lon_idx in enumerate(lon_indices):
        for j, lat_idx in enumerate(lat_indices):
            data_list.append({
                'Timestamp': str(timestamp),
                'Longitude': lon[lon_idx],
                'Latitude': lat[lat_idx],
                'Precipitation': precipitation_filtered[i, j]
            })
    
    df = pd.DataFrame(data_list)
    
    output_csv = os.path.join(output_path, f'{timestamp.year}.csv')
    
    if os.path.exists(output_csv):
        print(f"Appending to {output_csv}")
    else:
        print(f"Creating new file: {output_csv}")
    
    df.to_csv(output_csv, mode='a', header=not os.path.exists(output_csv), index=False)
    print(f"Finished processing file: {file_name}")

def process_files_from_txt(txt_file_path, directory, output_path):
    with open(txt_file_path, 'r') as file:
        timestamps = file.read().splitlines()

    for timestamp in timestamps:
        file_name = f'{timestamp}'
        file_path = os.path.join(directory, file_name)

        if os.path.exists(file_path):
            process_hdf5_file(file_path, output_path)
        else:
            print(f"File {file_name} not found in the directory.")

output_path = '/root/data/rrr/integrated_weather_dataset/data/raw/Precipitation/rem_data'

process_files_from_txt(txt_file_path, directory, output_path)


Creating new file: /root/data/rrr/integrated_weather_dataset/data/raw/Precipitation/rem_data/2005.csv
Finished processing file: 2005-01-01-000000.nc
Appending to /root/data/rrr/integrated_weather_dataset/data/raw/Precipitation/rem_data/2005.csv
Finished processing file: 2005-07-02-120000.nc
Creating new file: /root/data/rrr/integrated_weather_dataset/data/raw/Precipitation/rem_data/2007.csv
Finished processing file: 2007-03-23-030000.nc
Appending to /root/data/rrr/integrated_weather_dataset/data/raw/Precipitation/rem_data/2007.csv
Finished processing file: 2007-03-24-073000.nc
Appending to /root/data/rrr/integrated_weather_dataset/data/raw/Precipitation/rem_data/2007.csv
Finished processing file: 2007-03-24-093000.nc
Appending to /root/data/rrr/integrated_weather_dataset/data/raw/Precipitation/rem_data/2007.csv
Finished processing file: 2007-05-01-143000.nc
Appending to /root/data/rrr/integrated_weather_dataset/data/raw/Precipitation/rem_data/2007.csv
Finished processing file: 2007-09-

In [1]:
import pandas as pd
import os
rem_data_dir = '/root/data/rrr/integrated_weather_dataset/data/processed/Precipitation/rem_data'
main_data_dir = '/root/data/rrr/integrated_weather_dataset/data/processed/Precipitation'
mainn_data_dir = '/root/data/rrr/integrated_weather_dataset/data/processed/Precipitation/final_data'
for year in range(2006, 2024):
    rem_data_file = os.path.join(rem_data_dir, f"{year}.csv")
    main_data_file = os.path.join(main_data_dir, f"{year}.csv")
    mainn_data_file = os.path.join(mainn_data_dir, f"{year}.csv")
    if os.path.exists(rem_data_file) and os.path.exists(main_data_file):
        rem_data = pd.read_csv(rem_data_file)
        main_data = pd.read_csv(main_data_file)
        
        merged_data = pd.concat([main_data, rem_data], axis=0)
        merged_data = merged_data.sort_values(by=['Timestamp','Longitude','Latitude'], ascending=[True, True, True])
        merged_data = merged_data.drop_duplicates(subset=['Timestamp', 'Longitude', 'Latitude'], keep='first')
        merged_data.to_csv(mainn_data_file, index=False)

        print(f"Processed {year}.csv and saved the merged data.")
    else:
        if not os.path.exists(rem_data_file):
            print(f"Missing rem_data file for year {year}. Skipping.")
        if not os.path.exists(main_data_file):
            print(f"Missing main data file for year {year}. Skipping.")


Missing rem_data file for year 2006. Skipping.
Processed 2007.csv and saved the merged data.
Processed 2008.csv and saved the merged data.
Processed 2009.csv and saved the merged data.
Processed 2010.csv and saved the merged data.
Missing rem_data file for year 2011. Skipping.
Processed 2012.csv and saved the merged data.
Missing rem_data file for year 2013. Skipping.
Missing rem_data file for year 2014. Skipping.
Missing rem_data file for year 2015. Skipping.
Missing rem_data file for year 2016. Skipping.
Processed 2017.csv and saved the merged data.
Missing rem_data file for year 2018. Skipping.
Missing rem_data file for year 2019. Skipping.
Missing rem_data file for year 2020. Skipping.
Missing rem_data file for year 2021. Skipping.
Missing rem_data file for year 2022. Skipping.
Processed 2023.csv and saved the merged data.


In [28]:
start_time = "2005-01-01 00:00:00"
end_time = "2005-12-31 23:30:00"
timestamps = pd.date_range(start=start_time, end=end_time, freq="30min")
print(timestamps)
missing_timestamps = df_tempp['Timestamp'].unique() - timestamps
if not missing_timestamps:
    print("All timestamps are present.")
else:
    print(f"Missing timestamps: {missing_timestamps}")


  timestamps = pd.date_range(start=start_time, end=end_time, freq="30T")


DatetimeIndex(['2005-01-01 00:00:00', '2005-01-01 00:30:00',
               '2005-01-01 01:00:00', '2005-01-01 01:30:00',
               '2005-01-01 02:00:00', '2005-01-01 02:30:00',
               '2005-01-01 03:00:00', '2005-01-01 03:30:00',
               '2005-01-01 04:00:00', '2005-01-01 04:30:00',
               ...
               '2005-12-31 19:00:00', '2005-12-31 19:30:00',
               '2005-12-31 20:00:00', '2005-12-31 20:30:00',
               '2005-12-31 21:00:00', '2005-12-31 21:30:00',
               '2005-12-31 22:00:00', '2005-12-31 22:30:00',
               '2005-12-31 23:00:00', '2005-12-31 23:30:00'],
              dtype='datetime64[ns]', length=17520, freq='30min')


ValueError: The truth value of a TimedeltaIndex is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [19]:
main_data[main_data['Timestamp']=='2005-08-28 16:00:00']

Unnamed: 0,Timestamp,Longitude,Latitude,Precipitation
37384750,2005-08-28 16:00:00,-119.950000,31.550000,0.0
37384751,2005-08-28 16:00:00,-119.950000,31.650000,0.0
37384752,2005-08-28 16:00:00,-119.950000,31.750000,0.0
37384753,2005-08-28 16:00:00,-119.950000,31.849998,0.0
37384754,2005-08-28 16:00:00,-119.950000,31.949999,0.0
...,...,...,...,...
37387995,2005-08-28 16:00:00,-115.049995,37.550000,0.0
37387996,2005-08-28 16:00:00,-115.049995,37.649998,0.0
37387997,2005-08-28 16:00:00,-115.049995,37.750000,0.0
37387998,2005-08-28 16:00:00,-115.049995,37.850000,0.0


In [20]:
rem_data[rem_data['Timestamp']=='2005-08-28 16:00:00']

Unnamed: 0,Timestamp,Longitude,Latitude,Precipitation


In [22]:
df_tempp[df_tempp['Timestamp']=='2005-08-28 16:00:00']

Unnamed: 0,Timestamp,Longitude,Latitude,Precipitation
37392875,2005-08-28 16:00:00,-119.950000,31.550000,0.0
37392874,2005-08-28 16:00:00,-119.950000,31.650000,0.0
37391250,2005-08-28 16:00:00,-119.950000,31.750000,0.0
37392337,2005-08-28 16:00:00,-119.950000,31.849998,0.0
37392336,2005-08-28 16:00:00,-119.950000,31.949999,0.0
...,...,...,...,...
37393412,2005-08-28 16:00:00,-115.049995,37.550000,0.0
37393411,2005-08-28 16:00:00,-115.049995,37.649998,0.0
37393410,2005-08-28 16:00:00,-115.049995,37.750000,0.0
37393409,2005-08-28 16:00:00,-115.049995,37.850000,0.0


In [29]:
df_tempp['Timestamp'] = pd.to_datetime(df_tempp['Timestamp'])
df_tempp['Timestamp'].unique()

<DatetimeArray>
['2005-01-01 00:00:00', '2005-01-01 00:30:00', '2005-01-01 01:00:00',
 '2005-01-01 01:30:00', '2005-01-01 02:00:00', '2005-01-01 02:30:00',
 '2005-01-01 03:00:00', '2005-01-01 03:30:00', '2005-01-01 04:00:00',
 '2005-01-01 04:30:00',
 ...
 '2005-12-31 19:00:00', '2005-12-31 19:30:00', '2005-12-31 20:00:00',
 '2005-12-31 20:30:00', '2005-12-31 21:00:00', '2005-12-31 21:30:00',
 '2005-12-31 22:00:00', '2005-12-31 22:30:00', '2005-12-31 23:00:00',
 '2005-12-31 23:30:00']
Length: 17520, dtype: datetime64[ns]

In [30]:
timestamps

DatetimeIndex(['2005-01-01 00:00:00', '2005-01-01 00:30:00',
               '2005-01-01 01:00:00', '2005-01-01 01:30:00',
               '2005-01-01 02:00:00', '2005-01-01 02:30:00',
               '2005-01-01 03:00:00', '2005-01-01 03:30:00',
               '2005-01-01 04:00:00', '2005-01-01 04:30:00',
               ...
               '2005-12-31 19:00:00', '2005-12-31 19:30:00',
               '2005-12-31 20:00:00', '2005-12-31 20:30:00',
               '2005-12-31 21:00:00', '2005-12-31 21:30:00',
               '2005-12-31 22:00:00', '2005-12-31 22:30:00',
               '2005-12-31 23:00:00', '2005-12-31 23:30:00'],
              dtype='datetime64[ns]', length=17520, freq='30min')

In [3]:
start_time = "2005-01-01 00:00:00"
end_time = "2005-12-31 23:30:00"
timestamps = pd.date_range(start=start_time, end=end_time, freq="30min")
are_equal = timestamps.equals(df_tempp['Timestamp'].unique())


NameError: name 'df_tempp' is not defined