가장 가까운 AWS 세 곳을 정하고, 거리 가중 평균으로 관측소의 기상 정보를 추출해 저장

In [1]:
from collections import defaultdict
from haversine import haversine
import pandas as pd
import numpy as np
import os

In [2]:
# Load the two station metadata tables (PM stations first, then AWS
# stations). Later cells read the Location, Latitude and Longitude columns
# of both frames.
df_pm, df_aws = (
    pd.read_csv(meta_path)
    for meta_path in (
        "../dataset/META/pmmap.csv",
        "../dataset/META/awsmap.csv",
    )
)

In [3]:
# Map each PM station name to its three closest AWS stations, stored as
# (aws_station_name, distance_km) pairs ordered from nearest to farthest.
pm_dict = defaultdict(list)
for _, pm_row in df_pm.iterrows():
    pm_coord = (pm_row.Latitude, pm_row.Longitude)

    # Measure the great-circle distance from this PM station to every AWS
    # station and collect the pairs under the PM station's name.
    candidates = pm_dict[pm_row.Location]
    candidates.extend(
        (
            aws_row.Location,
            haversine(pm_coord, (aws_row.Latitude, aws_row.Longitude), unit='km'),
        )
        for _, aws_row in df_aws.iterrows()
    )

    # Stable sort by distance, then keep only the three nearest entries.
    candidates.sort(key=lambda pair: pair[1])
    pm_dict[pm_row.Location] = candidates[:3]

In [4]:
# Build one output CSV per PM station: the station's PM2.5 series (gaps
# linearly interpolated) plus five weather columns blended from the nearest
# AWS stations listed in pm_dict as (aws_station_name, distance_km) pairs.
OUT_DIR = "../dataset/CUSTOM_v0"
# Columns that can be blended with a plain weighted sum
# (temperature, wind speed, precipitation, humidity).
SCALAR_COLS = ["기온(°C)", "풍속(m/s)", "강수량(mm)", "습도(%)"]
# Wind direction is an angle and needs a circular mean instead.
WIND_DIR_COL = "풍향(deg)"
# Distance floor so an AWS station co-located with the PM station
# (distance 0) cannot cause a division by zero.
MIN_DIST_KM = 1e-6

# DataFrame.to_csv does not create missing directories.
os.makedirs(OUT_DIR, exist_ok=True)

for pm_name, aws_list in pm_dict.items():
    # Use a dedicated name: rebinding df_pm here would overwrite the station
    # metadata table and make the neighbour-search cell impossible to re-run.
    station_df = pd.read_csv(f"../dataset/TRAIN/{pm_name}.csv")
    station_df["PM2.5"] = station_df["PM2.5"].interpolate()
    n_rows = len(station_df)

    aws_dfs = []
    distances = []
    for aws_name, dist_km in aws_list:
        # Forward-fill gaps; .ffill() replaces the deprecated
        # fillna(method="ffill"). Leading NaNs (nothing to fill from) remain.
        aws_df = pd.read_csv(f"../dataset/TRAIN_AWS/{aws_name}.csv").ffill()
        # Rows are combined positionally, so the lengths must agree.
        # NOTE(review): this assumes the AWS and PM files cover the same
        # timestamps row for row -- confirm against the dataset.
        if len(aws_df) != n_rows:
            raise ValueError(
                f"AWS file {aws_name!r} has {len(aws_df)} rows but PM file "
                f"{pm_name!r} has {n_rows}"
            )
        aws_dfs.append(aws_df)
        distances.append(dist_km)

    # Inverse-distance weights: the closer a station, the larger its share.
    # (Using the raw distance as the weight would let the farthest station
    # dominate.) The weights are normalised to sum to 1 and are not rounded,
    # so the blend stays a true weighted average.
    inverse = 1.0 / np.maximum(np.asarray(distances, dtype=float), MIN_DIST_KM)
    weights = inverse / inverse.sum()

    blended = {col: np.zeros(n_rows) for col in SCALAR_COLS}
    sin_sum = np.zeros(n_rows)
    cos_sum = np.zeros(n_rows)
    for aws_df, weight in zip(aws_dfs, weights):
        for col in SCALAR_COLS:
            blended[col] += aws_df[col].to_numpy() * weight
        # Average wind direction as weighted unit vectors; a linear mean of
        # degrees is wrong across the 0/360 wrap (350 and 10 would give 180).
        theta = np.deg2rad(aws_df[WIND_DIR_COL].to_numpy())
        sin_sum += np.sin(theta) * weight
        cos_sum += np.cos(theta) * weight
    # arctan2 returns (-180, 180] degrees; map back onto the compass range.
    wind_dir = np.rad2deg(np.arctan2(sin_sum, cos_sum)) % 360.0

    # Assign in the established column order: temperature, wind direction,
    # wind speed, precipitation, humidity.
    station_df["기온(°C)"] = blended["기온(°C)"]
    station_df[WIND_DIR_COL] = wind_dir
    station_df["풍속(m/s)"] = blended["풍속(m/s)"]
    station_df["강수량(mm)"] = blended["강수량(mm)"]
    station_df["습도(%)"] = blended["습도(%)"]
    station_df.to_csv(f"{OUT_DIR}/{pm_name}.csv", index=False)