In [10]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

from sklearn.neighbors import KDTree

import os
import glob

from matplotlib.ticker import MultipleLocator
# import cartopy.crs as ccrs

from sklearn.metrics import mean_squared_error

In [18]:
# ver2

# Observation data to use
def calculate_rmse(actual, predicted):
    """Return the root-mean-squared error between *actual* and *predicted*.

    Both arguments are passed unchanged to ``mean_squared_error``; the square
    root of that value is returned.
    """
    return np.sqrt(mean_squared_error(actual, predicted))
# Helpers used by the observation/model matching script below.
def parse_obs_filename(path):
    """Return ``(road_name, date, direction)`` parsed from an observation CSV path.

    The file name is split on ``'_'`` and the fields are taken by position
    from the end: 4th-from-last is the road name, 2nd-from-last the date and
    the last one (without its extension) the direction.
    """
    fields = os.path.basename(path).split('_')
    return fields[-4], fields[-2], fields[-1].split('.')[0]


def model_day_dirs(model_name, year, month, day):
    """Return the model directories for the day before and the day of an observation.

    Model times are converted to KST, so one calendar day of data is spread
    over two daily directories. The previous day is computed with real date
    arithmetic so the first of a month maps to the last day of the previous
    month instead of "day 0".
    """
    obs_day = pd.Timestamp(year=int(year), month=int(month), day=int(day))
    return [
        f'./DATA/MODEL/seoul/{model_name}/{d.year}{d.month:02d}/{d.day}/'
        for d in (obs_day - pd.Timedelta(days=1), obs_day)
    ]


# Collect every observation CSV in the directory.
csv_files = glob.glob('./DATA/20230705_스시2_서울&시흥_1월관측_후처리자료/' + '*.csv')
# NOTE(review): the last four paths returned by glob are skipped. glob does
# not guarantee a sorted order, so confirm which files this should exclude.
csv_files = csv_files[:-4]
print("CSV 파일 목록:")

# A list (not a set) so the models are always processed in the same order.
models = ['jr', 'mg', 'org', 'ss', 'yc']

for file in csv_files:

    print(file)
    obs_df = pd.read_csv(file, low_memory=False)
    obs_df.columns = obs_df.columns.str.lower()

    road_name, date, road_dir_1 = parse_obs_filename(file)
    print(road_name, date, road_dir_1)

    data_obs_test = obs_df
    # Timestamps may differ between sensors, so the date parts are stored as
    # separate integer columns and those are used for matching.
    data_obs_test["timestamp"] = pd.to_datetime(data_obs_test["timestamp"])
    data_obs_test["year"] = data_obs_test["timestamp"].dt.year
    data_obs_test["month"] = data_obs_test["timestamp"].dt.month
    data_obs_test["day"] = data_obs_test["timestamp"].dt.day
    data_obs_test["hour"] = data_obs_test["timestamp"].dt.hour
    print('해당 관측 자료에 포함되는 일',data_obs_test['day'].unique())
    print('해당 관측 자료에 포함되는 시간',data_obs_test['hour'].unique())
    print("관측 자료 개수",len(data_obs_test))

    # Only the first distinct value of each date part is used to locate the
    # model files. NOTE(review): assumes one file covers a single day.
    day = data_obs_test['day'].unique()[0]
    year = data_obs_test['year'].unique()[0]
    month = data_obs_test['month'].unique()[0]

    for model_name in models:
        # Gather the rows for this road/direction from EVERY model CSV of
        # both day directories (previously only the last CSV per directory
        # was kept, and an empty directory reused stale data).
        model_frames = []
        for model_file_dir in model_day_dirs(model_name, year, month, day):
            model_csv_files = glob.glob(f'{model_file_dir}/*.csv')
            for csv_file in model_csv_files:
                model_road = pd.read_csv(csv_file)
                cond1 = model_road['road_name'] == road_name
                # na=False keeps the mask boolean when 'direction' is missing.
                cond2 = model_road['direction'].str.startswith(road_dir_1, na=False)
                model_frames.append(model_road[cond1 & cond2])

        site_df = pd.concat(model_frames) if model_frames else pd.DataFrame()
        if site_df.empty:
            # Nothing to match against; skip instead of failing on a missing column.
            print(f'No model data for {road_name} {road_dir_1} ({model_name}); skipping')
            continue

        site_df['date_time'] = pd.to_datetime(site_df['date_time'])
        site_df['hour'] = site_df['date_time'].dt.hour
        site_df['day'] = site_df['date_time'].dt.day

        # Work on copies so the loaded frames can be reused.
        model_data_df = site_df.copy()
        obs_date_df = data_obs_test.copy()

        print('모델 데이터의 컬럼 : ',model_data_df.columns)
        print('관측 데이터의 컬럼 : ',obs_date_df.columns)

        # Nearest model point for every observation point via a KD-tree built
        # on the raw (lon, lat) pairs.
        observ_line = obs_date_df[['longitude', 'latitude']].to_numpy(dtype=float)
        model_line = model_data_df[['lon', 'lat']].to_numpy(dtype=float)

        tree = KDTree(model_line)
        # One vectorized query for all points; k=1 keeps only the closest one.
        _, nearest_idx = tree.query(observ_line, k=1)
        matched_points = model_line[nearest_idx[:, 0]]

        df = pd.DataFrame({
            'longitude': observ_line[:, 0],
            'latitude': observ_line[:, 1],
            'lon': matched_points[:, 0],
            'lat': matched_points[:, 1],
        })

        # Attach the observation rows, then the model rows for the same
        # matched point, hour and day.
        total_df = pd.merge(df, data_obs_test, on=['longitude', 'latitude'])
        total_df = pd.merge(total_df, model_data_df, on=['lon', 'lat', 'hour', 'day'])
        total_df = total_df.drop_duplicates()

        # Some observation files name the surface temperature column
        # differently; normalise it (no-op when the column is absent).
        road_df = total_df.rename(
            columns={'road temperature100 [°c] cur': 'surface_temperature'}
        )
        if 'surface_temperature' not in road_df.columns:
            raise KeyError(
                f"{file}: neither 'road temperature100 [°c] cur' nor "
                "'surface_temperature' column is present"
            )

        # NOTE: despite its name, 'rmse' holds the per-row absolute error.
        road_df['rmse'] = (road_df['road_temp'] - road_df['surface_temperature']).abs()
        # Keep, for every model point, the first row after ordering by
        # timestamp and then by absolute error.
        road_df = road_df.sort_values(by=['timestamp', 'rmse'])
        road_df = road_df.drop_duplicates(subset=['lon', 'lat'])

        road_df.to_csv(f"C:/Users/user/Desktop/모델검증/DATA/test/{road_name}{date}{model_name}{road_dir_1}_abs.csv", index=False)
        del road_df

CSV 파일 목록:
./DATA/20230705_스시2_서울&시흥_1월관측_후처리자료\seoul_1_gangbyeon_vaisala_20230119_U.csv
gangbyeon 20230119 U
해당 관측 자료에 포함되는 일 [19]
해당 관측 자료에 포함되는 시간 [15]
관측 자료 개수 31977
모델 데이터의 컬럼 :  Index(['date_time', 'update_time', 'loc', 'lon', 'lat', 'seq', 'p_hour',
       'road_name', 'direction', 'altitude', 'link_id', 'road_temp',
       'road_hydro', 'road_ice', 'hour', 'day'],
      dtype='object')
관측 데이터의 컬럼 :  Index(['timestamp', 'longitude', 'latitude', 'relative_humidity',
       'surface_temperature', 'ice_layer_thickness', 'surface_state',
       'dew_point_temperature', 'water_layer_thickness',
       'frost_point_temperature', 'ambient_temperature', 'grip',
       'snow_layer_thickness', 'year', 'month', 'day', 'hour'],
      dtype='object')
모델 데이터의 컬럼 :  Index(['date_time', 'update_time', 'loc', 'lon', 'lat', 'seq', 'p_hour',
       'road_name', 'direction', 'altitude', 'link_id', 'road_temp',
       'road_hydro', 'road_ice', 'hour', 'day'],
      dtype='object')
관측 데이터의 컬럼 :  Inde

In [None]:
# RMSE between 'road_temp' and 'surface_temperature' for the result CSVs under ./DATA/test/
def cal_rmse_df(site, data_dir='./DATA/test/'):
    """Print per-file and pooled RMSE for the result CSVs whose name contains *site*.

    Every ``*.csv`` in *data_dir* whose file name contains *site* is loaded,
    its RMSE between ``road_temp`` and ``surface_temperature`` is printed, and
    finally the RMSE over all matching files pooled together is printed.

    Args:
        site: Substring that selects files by name (the caller passes model
            identifiers such as ``"jr"``).
        data_dir: Directory to scan; defaults to the original hard-coded
            ``./DATA/test/``.

    Returns:
        The pooled RMSE, or ``None`` when no file matches.
    """
    df_list = []
    # sorted() makes the printed order reproducible across platforms.
    for csv_file in sorted(glob.glob(os.path.join(data_dir, '*.csv'))):
        file_name = os.path.basename(csv_file)
        # Match on the file name only, so directory names cannot match by
        # accident, and skip non-matching files before reading them.
        if site not in file_name:
            continue

        df = pd.read_csv(csv_file)
        # Some files use a different name for the surface temperature column;
        # rename() leaves the frame unchanged when that column is absent.
        df = df.rename(
            columns={'road temperature100 [°c] cur': 'surface_temperature'}
        )
        df_rmse = np.sqrt(
            mean_squared_error(df['road_temp'], df['surface_temperature'])
        )
        print(file_name, end=" ")
        print(df_rmse)
        df_list.append(df)

    if not df_list:
        # pd.concat raises on an empty list; report and bail out instead.
        print(site, "no matching CSV files found")
        return None

    # Pool all matching files; ignore_index=True resets the row index.
    dfs = pd.concat(df_list, ignore_index=True)
    rmse = np.sqrt(mean_squared_error(dfs['road_temp'], dfs['surface_temperature']))
    print(site, rmse)
    return rmse
    
    
# Report the per-file and pooled RMSE for each model variant in turn.
models = [
    "jr",
    "mg",
    "org",
    "ss",
    "yc",
]
for model in models:
    cal_rmse_df(model)

dongbu20230119jrU_opt.csv 1.80888234899377
dongbu20230126jrD_opt.csv 4.513307081576804
dongbu20230127jrD_opt.csv 1.947272275204774
gangbyeon20230119jrU_opt.csv 3.413180172234023
naebu20230119jrU_opt.csv 5.66220066785081
olympic20230117jrU_opt.csv 2.7427475278374933
olympic20230118jrU_opt.csv 3.4415828388153327
olympic20230119jrU_opt.csv 3.286607196702438
olympic20230226jrD_opt.csv 2.938439652816712
olympic20230227jrD_opt.csv 2.380445381734406
seobu20230120jrD_opt.csv 3.119547956864004
jr 3.292690736078622
dongbu20230119mgU_opt.csv 3.161741901245304
dongbu20230126mgD_opt.csv 4.326242082922219
dongbu20230127mgD_opt.csv 1.9213552253200419
gangbyeon20230119mgU_opt.csv 5.889327038181449
naebu20230119mgU_opt.csv 8.151261580606985
olympic20230117mgU_opt.csv 2.806688688996743
olympic20230118mgU_opt.csv 3.229561749515979
olympic20230119mgU_opt.csv 3.9827264418528676
olympic20230226mgD_opt.csv 2.544561138474566
olympic20230227mgD_opt.csv 2.164477102673814
seobu20230120mgD_opt.csv 3.7833737712151