In [None]:
import netCDF4
import pandas as pd
import geopandas as gpd
from datetime import datetime, timedelta
import numpy as np
import os
import glob

# Boundary shapefile handed to process_single_nc_file by main(). Built with
# os.path.join so the separator matches the host OS instead of being
# hard-coded to the Windows backslash (same value as before on Windows).
shapefile_path = os.path.join("Data", "gadm41_VNM_0.shp")

In [None]:
# Maps the unit word of a "<unit> since <origin>" string to the matching
# datetime.timedelta keyword argument.
_TIMEDELTA_KWARG_BY_UNIT = {
    **dict.fromkeys(('ms', 'msec', 'msecs', 'millisecond', 'milliseconds'), 'milliseconds'),
    **dict.fromkeys(('s', 'sec', 'secs', 'second', 'seconds'), 'seconds'),
    **dict.fromkeys(('min', 'mins', 'minute', 'minutes'), 'minutes'),
    **dict.fromkeys(('h', 'hr', 'hrs', 'hour', 'hours'), 'hours'),
    **dict.fromkeys(('d', 'day', 'days'), 'days'),
}

# Origin spellings accepted after " since "; the first entry is the only one
# the previous implementation understood.
_TIME_ORIGIN_FORMATS = (
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%d',
)


def _parse_time_units(time_units):
    """Split a "<unit> since <origin>" string into (timedelta keyword, origin datetime).

    Raises:
        ValueError: if the string has no " since " separator, the unit word is
            unknown, or the origin matches none of the accepted formats.
    """
    unit_text, separator, origin_text = time_units.partition(' since ')
    if not separator:
        raise ValueError(f"unsupported time units: {time_units!r}")

    # Look at the unit word only, so text in the origin part can never be
    # mistaken for a unit and singular forms ("hour") are recognised.
    unit_key = unit_text.strip().lower()
    if unit_key not in _TIMEDELTA_KWARG_BY_UNIT:
        raise ValueError(f"unsupported time unit: {unit_text!r}")

    origin_text = origin_text.strip()
    for origin_format in _TIME_ORIGIN_FORMATS:
        try:
            time_origin = datetime.strptime(origin_text, origin_format)
        except ValueError:
            continue
        return _TIMEDELTA_KWARG_BY_UNIT[unit_key], time_origin
    raise ValueError(f"unsupported time origin: {origin_text!r}")


def _convert_times(raw_times, time_units):
    """Turn raw time offsets into datetimes, keeping the raw values if that fails.

    Returns a list with one entry per element of ``raw_times``: datetime
    objects on success, otherwise the raw values (``raw_times.tolist()``).
    """
    if 'since' not in time_units:
        print("Giữ giá trị thời gian thô.")
        return raw_times.tolist()
    try:
        delta_kwarg, time_origin = _parse_time_units(time_units)
        return [time_origin + timedelta(**{delta_kwarg: float(value)}) for value in raw_times]
    except Exception as e_time:
        # Best effort by design: a time column of raw numbers is preferred
        # over losing the whole file.
        print(f"Lỗi: {e_time}. Giữ giá trị thời gian thô.")
        return raw_times.tolist()


def process_single_nc_file(file_path, output_folder, shapefile_path):
    """Extract the soundings of one NetCDF file that lie inside a boundary and write them to CSV.

    Rows are kept when ``xco2_quality_flag == 0`` and the (longitude, latitude)
    point is within a polygon of the shapefile. The CSV has the columns
    latitude, longitude, time, date, xco2 and xco2_quality_flag. Nothing is
    written (and nothing is printed) when no row survives the filters.

    The output name is ``raw_data_YY-MM-DD.csv`` taken from the first row of the
    ``date`` variable when that variable is at least 2-D with three or more
    columns; otherwise ``<input base name>_processed_quality_filtered.csv``.

    Args:
        file_path: Path of the NetCDF file to read.
        output_folder: Directory that receives the CSV (created if missing).
        shapefile_path: Path of the boundary shapefile; it is reprojected to
            EPSG:4326 when its CRS differs.

    Returns:
        None.

    Raises:
        KeyError: if any of the required variables is absent from the file.
    """
    required_variables = [
        'latitude', 'longitude', 'time', 'date', 'xco2_quality_flag', 'xco2'
    ]

    # Pull everything into memory first so the file handle is released before
    # the comparatively slow spatial join and CSV write.
    with netCDF4.Dataset(file_path, 'r') as nc_file:
        missing_variables = [name for name in required_variables if name not in nc_file.variables]
        if missing_variables:
            raise KeyError(f"{file_path}: missing required variables {missing_variables}")

        lat_flat = nc_file.variables['latitude'][:].flatten()
        lon_flat = nc_file.variables['longitude'][:].flatten()
        time_var = nc_file.variables['time']
        time_units = getattr(time_var, 'units', 'seconds since 1970-01-01 00:00:00')
        flat_time_data = time_var[:].flatten()
        date_data = nc_file.variables['date'][:]
        xco2_quality_flag_flat = nc_file.variables['xco2_quality_flag'][:].flatten()
        xco2_flat = nc_file.variables['xco2'][:].flatten()

    # ndim is tested first so a 0-d 'date' variable cannot raise on shape[0].
    if date_data.ndim >= 2 and date_data.shape[0] > 0 and date_data.shape[1] >= 3:
        year_fn, month_fn, day_fn = int(date_data[0, 0]), int(date_data[0, 1]), int(date_data[0, 2])
        date_for_filename = f"{str(year_fn)[-2:]}-{month_fn:02d}-{day_fn:02d}"
        output_csv_name = f"raw_data_{date_for_filename}.csv"
    else:
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        output_csv_name = f"{base_name}_processed_quality_filtered.csv"
    output_csv_path = os.path.join(output_folder, output_csv_name)

    points_gdf = gpd.GeoDataFrame(
        geometry=gpd.points_from_xy(lon_flat, lat_flat),
        crs="EPSG:4326"
    )
    points_gdf['original_index'] = np.arange(len(lat_flat))
    points_gdf['xco2_quality_flag'] = xco2_quality_flag_flat

    # Apply the cheap flag filter before the spatial join: the surviving rows
    # are the same either way, but the join then sees fewer points.
    good_quality_gdf = points_gdf[points_gdf['xco2_quality_flag'] == 0]
    if good_quality_gdf.empty:
        return

    boundary_gdf = gpd.read_file(shapefile_path)
    if boundary_gdf.crs != points_gdf.crs:
        boundary_gdf = boundary_gdf.to_crs(points_gdf.crs)

    selected_gdf = gpd.sjoin(good_quality_gdf, boundary_gdf, how="inner", predicate="within")
    if selected_gdf.empty:
        return

    # A point matching several boundary features would appear once per match;
    # keep each sounding once, in join order.
    filtered_indices = pd.unique(selected_gdf['original_index'].values)

    # Date strings and timestamps are built for the selected rows only.
    has_per_row_dates = (
        date_data.ndim == 2
        and date_data.shape[0] == len(lat_flat)
        and date_data.shape[1] >= 3
    )
    if has_per_row_dates:
        dates_str_for_column = [
            f"{int(date_data[i, 0])}-{int(date_data[i, 1]):02d}-{int(date_data[i, 2]):02d}"
            for i in filtered_indices
        ]
    else:
        dates_str_for_column = [None] * len(filtered_indices)

    df = pd.DataFrame({
        'latitude': lat_flat[filtered_indices],
        'longitude': lon_flat[filtered_indices],
        'time': _convert_times(flat_time_data[filtered_indices], time_units),
        'date': dates_str_for_column,
        'xco2': xco2_flat[filtered_indices],
        'xco2_quality_flag': xco2_quality_flag_flat[filtered_indices],
    })

    os.makedirs(output_folder, exist_ok=True)
    df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')

In [None]:
def main(input_folder=os.path.join("Data", "GHG"),
         output_folder=os.path.join("Output", "2020", "raw_data")):
    """Run process_single_nc_file on every ``*.nc4`` file in ``input_folder``.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.nc4`` files.
        output_folder: Directory handed to process_single_nc_file for its CSV
            output; created if it does not exist.

    The defaults are the same locations as before, but assembled with
    os.path.join so they also resolve on systems whose separator is not a
    backslash. The boundary file is the module-level ``shapefile_path``.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order is reproducible across runs and platforms.
    nc_files_to_process = sorted(glob.glob(os.path.join(input_folder, "*.nc4")))
    if not nc_files_to_process:
        # Do not report success when there was nothing to process.
        print(f"Không tìm thấy file .nc4 nào trong thư mục: {input_folder}")
        return

    for nc_file_path in nc_files_to_process:
        process_single_nc_file(nc_file_path, output_folder, shapefile_path)
    print("Hoàn tất xử lý tất cả các file.")

# Start the batch only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()