In [None]:
import time
import pandas as pd
import os
import numpy as np
import netCDF4 as nc
from watchdog.observers import Observer
#  Observer是用来监控文件系统事件的主要工具
from watchdog.events import FileSystemEventHandler
# FileSystemEventHandler基类用于处理文件系统事件

In [None]:
# 处理 CSV 文件的函数
def process_csv(file_path):
    """Load a glider CTD CSV file and discard out-of-range samples.

    A row is kept only when all of the following hold:

    * ``-2.5 < CTD温度(℃) < 40.0``
    * ``0 < CTD电导率(S/m) < 6``
    * ``0 < CTD盐度(S/m) < 41.0``
    * ``CTD深度(m) > 0``

    Rows with a missing value in any of these columns fail the comparison
    and are dropped as well.

    Args:
        file_path: Path of a GB2312-encoded CSV file.

    Returns:
        A ``(TIME, df)`` tuple. ``TIME`` is the ``数据时间`` value of the last
        row that survived filtering (in file order). ``df`` is the filtered
        table with its first column removed, duplicate rows dropped, and rows
        sorted by ascending ``CTD深度(m)``.

    Raises:
        KeyError: If one of the required columns is missing.
        IndexError: If no row survives the filters.
    """
    df = pd.read_csv(file_path, encoding='gb2312')

    temperature = df['CTD温度(℃)']
    conductivity = df['CTD电导率(S/m)']
    salinity = df['CTD盐度(S/m)']
    depth = df['CTD深度(m)']

    # Both bounds of a range must hold at the same time, so they are combined
    # with `&`. Joining them with `|` would accept every finite value.
    in_range = (
        (temperature > -2.5) & (temperature < 40.0)
        & (conductivity > 0) & (conductivity < 6)
        & (salinity > 0) & (salinity < 41.0)
        & (depth > 0)
    )
    df = df[in_range]

    # Taken before the first column is dropped and before sorting, so this is
    # the last valid sample in file order.
    TIME = df['数据时间'].iloc[-1]

    # Drop the first column (the original comment identifies it as the time
    # column), remove duplicate rows, and order the profile by depth.
    df = df.drop(df.columns[0], axis=1)
    df = df.drop_duplicates()
    df = df.sort_values(by='CTD深度(m)')
    return TIME, df
    

In [None]:
def get_deviceID(file_path):
    """Return the last two characters of the folder containing *file_path*.

    For a file stored in a folder such as ``.../Glider05/`` this yields
    ``'05'``. The result is a string and is not checked to be numeric.

    Args:
        file_path: Path of a file; its parent directory name carries the ID.

    Returns:
        The final two characters of the parent directory name (fewer if the
        name is shorter, ``''`` if the path has no directory part).
    """
    # Directory portion of the path, i.e. everything before the file name.
    parent_dir = os.path.dirname(file_path)
    # Last '/'-separated component of that directory (the folder name itself).
    folder_name = parent_dir.rsplit('/', 1)[-1]
    return folder_name[-2:]

In [None]:
# NETCDF3格式生成的文件只有2kb，而默认的不使用 'NETCDF3_CLASSIC'的情况下有11kb，所以使用NETCDF3格式
# NETCDF3格式不能使用长度大于1的字符串，而 len(time)=19，所以需要把字符串拆分存储为字符数组
# 而netCDF4 库在底层利用了 NumPy 来处理数据格式，所以期望数据是在一定的 NumPy 数据类型格式下
# 所以 np.array(list(time), dtype='S1')，在list函数将其转为python列表后，又需要转为numpy数组
# 至于最后的 dtype='S1'，'S1' 表示单个字节的字符串类型，因为 time 拆分后是19个单个字节的字符串

def nc_generate(df,TIME,DeviceID,new_path):
    """Write one cleaned glider profile to a NETCDF3_CLASSIC file.

    The output file is ``<new_path>/Glider<DeviceID>-Profile<profile>.nc``,
    where ``<profile>`` is the last value of the ``工作剖面序号`` column.

    Args:
        df: Profile table. The columns ``工作剖面序号``, ``纬度(°)``,
            ``经度(°)``, ``CTD深度(m)``, ``CTD温度(℃)``, ``CTD电导率(S/m)``,
            ``CTD盐度(S/m)``, ``CTD密度(kg/m3)`` and ``CTD声速(m/s)`` are read.
        TIME: Timestamp text. It is stored one character per element along
            the ``Time`` dimension.
        DeviceID: Device number as text. It is used verbatim in the file name
            and converted with ``int()`` for the ``DeviceID`` variable.
        new_path: Directory that receives the ``.nc`` file.

    Raises:
        ValueError: If ``DeviceID`` cannot be converted to an integer.
        KeyError: If a required column is missing from ``df``.
        IndexError: If ``df`` has no rows.
    """
    # Everything is read from the DataFrame before the file is opened, so a
    # bad input does not leave an empty .nc file behind.
    device_number = int(DeviceID)
    profile_id = df['工作剖面序号'].iloc[-1]
    latitude = df['纬度(°)'].iloc[-1]
    longitude = df['经度(°)'].iloc[-1]

    # The timestamp is written as an array of single characters ('S1'),
    # one element per character of TIME.
    time_chars = np.array(list(TIME), dtype='S1')

    # (NetCDF variable name, source column, units) for each per-depth series,
    # listed in the order the variables are created in the file.
    profile_fields = (
        ('Depth', 'CTD深度(m)', "meters_vertical"),
        ('Temperature', 'CTD温度(℃)', "celsius"),
        ('Conductivity', 'CTD电导率(S/m)', "S/m"),
        ('Salinity', 'CTD盐度(S/m)', "psu"),
        ('Density', 'CTD密度(kg/m3)', "kg/m3"),
        ('Sound', 'CTD声速(m/s)', "m/s"),
    )
    profile_data = {name: df[column] for name, column, _ in profile_fields}
    depth_len = df.shape[0]  # number of samples along the Depth dimension

    out_file = os.path.join(new_path, f'Glider{DeviceID}-Profile{profile_id}.nc')

    # The context manager closes the dataset even when a write fails.
    with nc.Dataset(out_file, 'w', format='NETCDF3_CLASSIC') as dataset:
        # Dimensions.
        dataset.createDimension('Time', len(time_chars))
        dataset.createDimension('Latitude', 1)
        dataset.createDimension('Longitude', 1)
        dataset.createDimension('DeviceID', 1)
        dataset.createDimension('ProfileID', 1)
        dataset.createDimension('Depth', depth_len)

        # Variables. 'S1' is a single character, 'f4' a 32-bit float and
        # 'i4' a 32-bit integer. The dimension argument must be a tuple,
        # hence the trailing comma in ('Latitude',).
        time_var = dataset.createVariable('Time', 'S1', ('Time',))
        lat_var = dataset.createVariable('Latitude', 'f4', ('Latitude',))
        lon_var = dataset.createVariable('Longitude', 'f4', ('Longitude',))
        device_var = dataset.createVariable('DeviceID', 'i4', ('DeviceID',))
        profile_var = dataset.createVariable('ProfileID', 'i4', ('ProfileID',))
        profile_vars = {
            name: dataset.createVariable(name, 'f4', ('Depth',))
            for name, _, _ in profile_fields
        }

        # Attributes.
        lat_var.units = "degrees_north"
        lon_var.units = "degrees_east"
        for name, _, units in profile_fields:
            profile_vars[name].units = units

        # Data. Assigning through var[:] copies values into the file-backed
        # variable; a plain `var = data` would only rebind the Python name.
        lat_var[:] = latitude
        lon_var[:] = longitude
        time_var[:] = time_chars
        for name, values in profile_data.items():
            profile_vars[name][:] = values
        profile_var[:] = profile_id
        device_var[:] = device_number

In [None]:
# file_path = 'C:/Users/lenovo/Desktop/java/Glider05/工作剖面号83.csv'
# new_path = 'C:/Users/lenovo/Desktop/java'
# TIME,df = process_csv(file_path)
# deviceID = get_deviceID(file_path)
# nc_generate(df,TIME,deviceID,new_path)


In [None]:
# # 读取文件并剔除异常数据
# df = pd.read_csv('C:/Users/lenovo/Desktop/java/工作剖面号83.csv',encoding='gb2312')

# index_temp = (df['CTD温度(℃)'] < 40.0) |(df['CTD温度(℃)'] > -2.5)
# df = df[index_temp]

# index_cond = (df['CTD电导率(S/m)'] < 6) |(df['CTD电导率(S/m)'] > 0)
# df = df[index_cond]

# index_salt = (df['CTD盐度(S/m)'] < 41.0) |(df['CTD盐度(S/m)'] > 0)
# df = df[index_salt]

# index_pre = df['CTD深度(m)'] > 0
# df = df[index_pre]

# TIME = df['数据时间'].iloc[-1] # 留着下面time备用

# # df去除时间列，去除重复行，保持深度从小到大排序
# df = df.drop(df.columns[0], axis=1)
# df = df.drop_duplicates()
# df = df.sort_values(by='CTD深度(m)')

In [None]:
# # 给定的文件路径
# file_path = 'E:/Data/C盘-桌面/2023年所有海试任务/2023年12月海试任务/Glider15/原始数据/岸基数据/2023-12-14/任务帧数据/'
# # 使用os.path.split分割路径，得到目录和文件名
# directory, filename = os.path.split(file_path)
# # 分割目录名以获取'Glider'后面的部分
# glider_part = directory.split('/')[-5]  # 获取倒数第二个元素



In [None]:
# # deviceID = int(glider_part[-2:])
# profileID = df['工作剖面序号'].iloc[-1]
# lat = df['纬度(°)'].iloc[-1]
# lon = df['经度(°)'].iloc[-1]
# depth = df['CTD深度(m)']
# time = TIME
# temp = df['CTD温度(℃)']
# cond = df['CTD电导率(S/m)']
# salt = df['CTD盐度(S/m)']
# density = df['CTD密度(kg/m3)']
# sound = df['CTD声速(m/s)']

In [83]:
# Load the GB2312-encoded profile CSV used by the export cell that follows.
df = pd.read_csv(
    r'C:\Users\lenovo\Desktop\java\Glider05\工作剖面号83.csv',
    encoding='gb2312',
)
# Replace the text timestamps in '数据时间' with parsed datetime values.
df = df.assign(**{'数据时间': pd.to_datetime(df['数据时间'])})

In [86]:
# One-off export of the DataFrame `df` (loaded in the previous cell) to a
# NETCDF3_CLASSIC file with descriptive variable names and global attributes.
import datetime

DeviceID = '003'
deviceID = int(DeviceID)
profileID = df['工作剖面序号'].iloc[-1]
lat = round(df['纬度(°)'].iloc[-1], 4)
lon = round(df['经度(°)'].iloc[-1], 4)
depth = round(df['CTD深度(m)'], 3)
TIME = str(df['数据时间'].iloc[0])  # the surfacing time is taken from the first row
temp = round(df['CTD温度(℃)'], 4)
cond = round(df['CTD电导率(S/m)'], 4)
salt = round(df['CTD盐度(S/m)'], 4)
density = round(df['CTD密度(kg/m3)'], 4)
sound = round(df['CTD声速(m/s)'], 4)

# The timestamp is stored as an array of single characters ('S1').
# It is deliberately NOT bound to the name `time`: that would replace the
# imported `time` module for every cell that runs afterwards.
time_chars = np.array(list(TIME), dtype='S1')

depth_len = df.shape[0]  # number of rows, used as the length of 'num_node'

# Alternative naming scheme kept from the original cell for reference:
# f'Glider_HY-BX_{DeviceID}_Pro{profileID}_105.21E_15.11N_202303270530_Q.nc'
filename = 'Glider'+ DeviceID + '-Profile' + str(profileID)
new_path = r'C:\Users\lenovo\Desktop\java\Glider05'
nc_format = 'NETCDF3_CLASSIC'

# Timestamp for the `history` attribute: current UTC time, second resolution.
time_now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

# The dataset is opened exactly once, and the context manager closes it even
# if one of the writes below raises.
with nc.Dataset(new_path+'/'+f'{filename}.nc', 'w', format=nc_format) as dataset:
    # Dimensions. No DeviceID dimension/variable is written by this cell.
    dataset.createDimension('num_profile', 1)
    dataset.createDimension('num_time', len(time_chars))
    dataset.createDimension('num_longitude', 1)
    dataset.createDimension('num_latitude', 1)
    dataset.createDimension('num_node', depth_len)

    # Variables. 'i4' is a 32-bit integer, 'S1' a single character and 'f4'
    # a 32-bit float. The dimension argument must be a tuple, hence the
    # trailing comma in ('num_profile',).
    profileID_var = dataset.createVariable('profile_number', 'i4', ('num_profile',))
    time_var = dataset.createVariable('end_time', 'S1', ('num_time',))
    lon_var = dataset.createVariable('end_longitude', 'f4', ('num_longitude',))
    lat_var = dataset.createVariable('end_latitude', 'f4', ('num_latitude',))
    # NOTE(review): 'seawater_pressure' is declared in dbar but is filled from
    # the 'CTD深度(m)' column below - confirm this is intended.
    depth_var = dataset.createVariable('seawater_pressure', 'f4', ('num_node',))
    temp_var = dataset.createVariable('seawater_temperature', 'f4', ('num_node',))
    cond_var = dataset.createVariable('seawater_conductivity', 'f4', ('num_node',))
    salt_var = dataset.createVariable('seawater_salinity', 'f4', ('num_node',))
    density_var = dataset.createVariable('seawater_density', 'f4', ('num_node',))
    sound_var = dataset.createVariable('seawater_sound_velocity', 'f4', ('num_node',))

    # Variable attributes.
    profileID_var.long_name = "current profile number of glider"
    time_var.long_name = "the end UTC time of current profile observed by the glider"
    lon_var.long_name = 'the end longitude of current profile observed by the glider'
    lon_var.units = "degrees_east"
    lat_var.long_name = 'the end latitude of current profile observed by the glider'
    lat_var.units = "degrees_north"
    depth_var.long_name = "a profile of seawater pressure observed by the glider"
    depth_var.units = "dbar"
    temp_var.long_name = 'a profile of seawater temperature observed by the glider'
    temp_var.units = "Celsius"
    cond_var.long_name = "a profile of seawater conductivity observed by the glider"
    cond_var.units = "S/m"
    salt_var.long_name = "a profile of seawater salinity observed by the glider"
    salt_var.units = "PSU"
    density_var.long_name = "a profile of seawater density observed by the glider"
    density_var.units = "kg/m3"
    sound_var.long_name = "a profile of seawater sound velocity observed by the glider"
    sound_var.units = "m/s"

    # Data. Assigning through var[:] copies values into the file-backed
    # variable; a plain `var = data` would only rebind the Python name.
    lat_var[:] = lat
    lon_var[:] = lon
    depth_var[:] = depth
    time_var[:] = time_chars
    cond_var[:] = cond
    salt_var[:] = salt
    density_var[:] = density
    sound_var[:] = sound
    temp_var[:] = temp
    profileID_var[:] = profileID

    # Global attributes, written after the data as in the original cell.
    dataset.Glider_ReleaseTime = '2022-12-11 14:03:07 UTC'
    dataset.Glider_ReleaseLongitude = '111.8444°E'
    dataset.Glider_ReleaseLatitude = '17.4169°N'
    dataset.CTD_SN = '2408'
    dataset.sensor = 'temperature,SBECTD,±0.001℃;conductivity,SBECTD,±0.01psu;depth,SBECTD,±0.01m'
    dataset.information = ''
    dataset.source = 'Wei Ma, TianJin University, wei.ma@tju.edu.cn'
    dataset.Comment = 'quality control data'
    dataset.history = f'Created by Python at {time_now}'

In [None]:
# 文件夹监控处理器
class MyHandler(FileSystemEventHandler):
    """Convert every newly created ``.csv`` file in a folder to NetCDF.

    The generated ``.nc`` file is written into ``folder_path``, which the
    caller is expected to set to the monitored folder, so the CSV and its
    NetCDF counterpart end up side by side.
    """

    def __init__(self,folder_path):
        """Remember the folder that receives the generated ``.nc`` files."""
        super().__init__()
        self.folder_path=folder_path

    def on_created(self, event):
        """Handle a creation event; only ``.csv`` files are processed.

        Conversion errors are reported with ``print`` and then swallowed so
        that one bad file does not propagate an exception out of the
        callback into the watchdog thread that invoked it.
        """
        if event.is_directory:
            # Directories are ignored.
            return
        if event.event_type != 'created' or not event.src_path.endswith('.csv'):
            return

        print("New CSV file detected:", event.src_path)
        file_path = event.src_path    # path of the detected CSV file
        new_path = self.folder_path   # output folder for the .nc file

        # NOTE(review): the event may arrive before the producer has finished
        # writing the file - confirm how the CSV files are delivered.
        try:
            TIME,df = process_csv(file_path)
            deviceID = get_deviceID(file_path)
            nc_generate(df,TIME,deviceID,new_path)
        except Exception as exc:
            # Boundary of the event callback: report the failure and keep
            # watching for further files.
            print("Failed to convert", file_path, "to NetCDF:", repr(exc))

In [None]:
# 监控文件夹并处理 CSV 文件的函数
def watch_folder(folder_path):
    """Monitor *folder_path* for new CSV files until interrupted.

    A ``MyHandler`` is scheduled on a watchdog ``Observer`` for the folder
    itself (sub-folders are not watched). The call blocks until the user
    presses Ctrl+C or the observer thread stops on its own.

    Args:
        folder_path: Folder to monitor; also passed to ``MyHandler`` as the
            output folder for the generated ``.nc`` files.
    """
    event_handler = MyHandler(folder_path=folder_path)
    observer = Observer()
    # recursive=False: watch only the given folder, not its sub-folders.
    observer.schedule(event_handler, folder_path, recursive=False)
    # From here on the observer runs in a background thread.
    observer.start()
    try:
        print("Watching folder for new CSV files...")
        # The timeout only bounds how quickly Ctrl+C is noticed; it is not a
        # polling interval for the file system. Looping on is_alive() also
        # lets the function return if the observer thread has ended.
        while observer.is_alive():
            observer.join(timeout=1)
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to end monitoring, not an error.
        pass
    finally:
        # Always ask the observer to stop and wait for its thread to finish,
        # whichever way the loop above was left, so no background watcher
        # keeps running (e.g. inside a long-lived notebook kernel).
        observer.stop()
        observer.join()

In [None]:
# 主函数
if __name__ == "__main__":
    # Folder to monitor - replace with the folder you want to watch.
    folder_path = 'C:/Users/lenovo/Desktop/java/Glider05'
    watch_folder(folder_path=folder_path)