# 训练数据加载

In [5]:
import os
from datetime import datetime
from typing import Sequence, Any, Dict

import numpy as np
import pandas as pd
import xarray as xr
from dask.distributed import Client
from tqdm.auto import tqdm

# Worker‐local cache
_ds_abandon = None
_ds_feat    = None

def extract_single_point(
    lat: float,
    lon: float,
    year: int,
    p_area: float,
    capacity_m: float,
    unique_id: Any,
    country: str,
    abandon_pattern: str,
    feature_pattern: str
) -> Dict[str, Any]:
    """
    在每个 worker 进程中首次打开并缓存两个 Dataset，然后针对单点抽特征。
    """
    global _ds_abandon, _ds_feat

    # 第一次调用时打开并 chunk
    if _ds_abandon is None:
        ds = xr.open_mfdataset(
            abandon_pattern, combine='by_coords',
            engine='netcdf4', parallel=False
        )
        # 先加载 metadata，再 chunk
        tlen = ds.sizes['time']
        _ds_abandon = ds.chunk({'time': tlen, 'lat': 500, 'lon': 500})

    if _ds_feat is None:
        ds = xr.open_mfdataset(
            feature_pattern, combine='by_coords',
            engine='netcdf4', parallel=False
        )
        tlen = ds.sizes['time']
        _ds_feat = ds.chunk({'time': tlen, 'lat': 1000, 'lon': 1000})

    # 最近邻抽取
    env_pt     = _ds_feat.sel(time=str(year), method='nearest') \
                         .sel(lat=lat, lon=lon, method='nearest')
    abandon_pt = _ds_abandon.sel(lat=lat, lon=lon, method='nearest')

    feat = []
    # 环境/社会经济特征
    for var in _ds_feat.data_vars:
        arr = env_pt[var].load().values
        feat.append(float(arr) if arr.ndim == 0 else float(arr.flat[0]))

    # 撂荒属性
    for var in ("current_abandonment", "abandonment_year",
                "abandonment_duration", "recultivation"):
        if var in _ds_abandon.data_vars:
            arr = abandon_pt[var].load().values
            feat.append(float(arr))
        else:
            feat.append(np.nan)

    # landcover 序列 embedding
    if "landcover" in _ds_abandon.data_vars:
        lc = abandon_pt["landcover"].load().values  # shape (time,)
        seq = np.nan_to_num(lc, 0).astype(int)
        seq = np.clip(seq, 1, 9) - 1
        onehot = np.eye(9)[seq]
        feat.extend(onehot.mean(axis=0).tolist())

    return {
        'lat': lat, 'lon': lon, 'year': year,
        'unique_id': unique_id,
        'p_area': p_area, 'capacity_m': capacity_m,
        'country': country,
        **{f'f{i}': v for i, v in enumerate(feat)}
    }


def load_pv_sites(
    csv_path: str,
    years: Sequence[int] = (2018, 2020)
) -> pd.DataFrame:
    """
    加载并标准化 PV 站点数据，过滤指定年份。
    """
    df = pd.read_csv(csv_path)
    if df.empty:
        raise ValueError(f"CSV 文件为空: {csv_path}")

    # 经纬度列映射
    rename_map = {}
    for src in ('latitude','lat_deg','LAT','Lat'):
        if src in df.columns:
            rename_map[src] = 'lat'
    for src in ('longitude','lon_deg','LON','Lon'):
        if src in df.columns:
            rename_map[src] = 'lon'
    df = df.rename(columns=rename_map)

    # 类型强制
    df['lat']  = pd.to_numeric(df['lat'], errors='raise')
    df['lon']  = pd.to_numeric(df['lon'], errors='raise')
    df['year'] = pd.to_numeric(df['year'], downcast='integer', errors='raise')

    required = {'lat','lon','year','unique_id','p_area','capacity_m','country'}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"CSV 文件缺少必要列: {sorted(missing)}")

    df = df[df['year'].isin(years)]
    if df.empty:
        raise ValueError(f"没有符合年份 {years} 的记录")

    return df.reset_index(drop=True)

# 数据采样


## 特征提取通用函数

In [6]:
# 顶部已经定义好了：
# _nc_lock = threading.Lock()

def extract_all_features(
    ds_feat: xr.Dataset,
    ds_abandon: xr.Dataset,
    pv_df: pd.DataFrame,
    years: Sequence[int]
) -> pd.DataFrame:
    records = []

    for yr in tqdm(years, desc="处理年份"):
        sub = pv_df[pv_df.year == yr].reset_index(drop=True)
        lats = xr.DataArray(sub.lat.values, dims="point")
        lons = xr.DataArray(sub.lon.values, dims="point")

        ds_fy = ds_feat.sel(time=str(yr), method="nearest")

        # —— 环境特征
        env_dict = {}
        for var in tqdm(ds_feat.data_vars, desc=f"提取环境变量 {yr}"):
            da_sel = ds_fy[var].sel(lat=lats, lon=lons, method="nearest")

            # ← 关键修改：在 compute 之前加全局锁
            with _nc_lock:
                arr = da_sel.compute(scheduler="single-threaded")

            env_dict[var] = arr  # numpy array

        # —— 撂荒属性
        aband_dict = {}
        for var in ("current_abandonment","abandonment_year",
                    "abandonment_duration","recultivation"):
            if var in ds_abandon.data_vars:
                da_sel = ds_abandon[var].sel(lat=lats, lon=lons, method="nearest")
                with _nc_lock:
                    arr = da_sel.compute(scheduler="single-threaded")
                aband_dict[var] = arr
            else:
                aband_dict[var] = np.full(len(sub), np.nan)

        # —— landcover 序列 embedding
        if "landcover" in ds_abandon.data_vars:
            lc_sel = ds_abandon.landcover.sel(lat=lats, lon=lons, method="nearest")
            with _nc_lock:
                lc_arr = lc_sel.compute(scheduler="single-threaded")
            lc_vals = np.nan_to_num(lc_arr, nan=0).astype(int)
            lc_vals = np.clip(lc_vals, 1, 9) - 1
            onehot = np.eye(9)[lc_vals]
            mean_onehot = onehot.mean(axis=0)
            for i in range(9):
                aband_dict[f"lc_cls_{i}"] = mean_onehot[:, i]

        df_feat = pd.DataFrame({**env_dict, **aband_dict})
        df_out  = pd.concat([sub, df_feat], axis=1)
        records.append(df_out)

    return pd.concat(records, ignore_index=True)


## GMM负样本增强

In [7]:
import numpy as np
import pandas as pd
import xarray as xr
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

def get_candidate_negatives(ds_abandon: xr.Dataset,
                            coords_pos: pd.DataFrame) -> pd.DataFrame:
    """
    候选负样本：current_abandonment=1 且不在 coords_pos。
    返回 DataFrame ['lat','lon']。
    """
    mask = ds_abandon['current_abandonment'] == 1
    idx = np.column_stack(np.where(mask.values))
    cand = pd.DataFrame({
        'lat': ds_abandon['lat'].values[idx[:,0]],
        'lon': ds_abandon['lon'].values[idx[:,1]]
    })
    pos_set = set(zip(coords_pos['lat'].round(6), coords_pos['lon'].round(6)))
    cand = cand[~cand.apply(lambda r: (round(r.lat,6), round(r.lon,6)) in pos_set, axis=1)]
    return cand.reset_index(drop=True)

def sample_negative(ds_abandon: xr.Dataset,
                    ds_feature: xr.Dataset,
                    coords_pos: pd.DataFrame,
                    year: int = 2020,
                    sample_size: int = 100_000,
                    n_clusters: int = 10,
                    quantile: float = 0.7):
    """
    强负样本采样：
      1) 从候选负样本中随机抽 sample_size，
         用 extract_features(year) 提取特征并 GMM 聚类；
      2) 选与“正样本集”最远的簇；
      3) 对所有候选分批预测，筛出这些强负簇对应点；
      4) 提取它们的特征并返回 X_neg, y_neg, coords_neg, scaler, gmm。
    """
    cand = get_candidate_negatives(ds_abandon, coords_pos)
    samp = cand.sample(min(sample_size, len(cand)), random_state=0).reset_index(drop=True)
    feats = np.vstack([
        extract_features(r.lat, r.lon, year, ds_abandon, ds_feature)
        for _, r in samp.iterrows()
    ])
    scaler = StandardScaler().fit(feats)
    fs = scaler.transform(feats)
    gmm = GaussianMixture(n_components=n_clusters, random_state=0).fit(fs)

    # 近似正样本 pos_fs
    pos_fs = fs[:n_clusters]
    dists = np.array([np.min(np.linalg.norm(pos_fs - c, axis=1)) for c in gmm.means_])
    thr = np.quantile(dists, quantile)
    strong_cls = np.where(dists >= thr)[0]

    strong_list = []
    batch = 10_000
    for i in range(0, len(cand), batch):
        block = cand.iloc[i:i+batch]
        feats_blk = np.vstack([
            extract_features(r.lat, r.lon, year, ds_abandon, ds_feature)
            for _, r in block.iterrows()
        ])
        labels = gmm.predict(scaler.transform(feats_blk))
        strong_list.append(block.iloc[np.isin(labels, strong_cls)])
    strong_df = pd.concat(strong_list).drop_duplicates().reset_index(drop=True)

    X_neg = np.vstack([
        extract_features(r.lat, r.lon, year, ds_abandon, ds_feature)
        for _, r in strong_df.iterrows()
    ])
    y_neg = np.zeros(X_neg.shape[0], dtype=int)
    coords_neg = strong_df[['lat','lon']].reset_index(drop=True)

    return X_neg, y_neg, coords_neg, scaler, gmm

# 测试部分

In [1]:
import os
from datetime import datetime
from typing import Sequence
import numpy as np
import pandas as pd
import xarray as xr
from tqdm.auto import tqdm
from dask.diagnostics import ProgressBar
import threading
import glob

def load_datasets(abandon_pattern: str, feature_pattern: str):
    """
    打开 NetCDF，用 h5netcdf 替代 netcdf4，避免底层 HDF5 并发错误。
    """
    files_abandon = glob.glob(abandon_pattern)
    files_feature = glob.glob(feature_pattern)
    if not files_abandon or not files_feature:
        raise FileNotFoundError("找不到文件")

    # 用 h5netcdf 引擎打开
    ds_abandon = xr.open_mfdataset(
        files_abandon,
        combine='by_coords',
        engine='h5netcdf',
        parallel=False          # 还是用单线程模式
    )
    ds_feat = xr.open_mfdataset(
        files_feature,
        combine='by_coords',
        engine='h5netcdf',
        parallel=False
    )

    # 一次性 rechunk（保持你原先的尺寸）
    t_ab = ds_abandon.sizes['time']
    ds_abandon = ds_abandon.chunk({'time': t_ab, 'lat': 500, 'lon': 500})

    t_ft = ds_feat.sizes['time']
    ds_feat = ds_feat.chunk({'time': t_ft, 'lat': 1000, 'lon': 1000})

    return ds_abandon, ds_feat


def load_pv_sites(
    csv_path: str,
    years: Sequence[int] = (2018, 2020)
) -> pd.DataFrame:
    """
    加载并标准化 PV 站点数据，过滤指定年份。
    """
    df = pd.read_csv(csv_path)
    if df.empty:
        raise ValueError(f"CSV 文件为空: {csv_path}")

    # 经纬度列映射
    rename_map = {}
    for src in ('latitude', 'lat_deg', 'LAT', 'Lat'):
        if src in df.columns:
            rename_map[src] = 'lat'
    for src in ('longitude', 'lon_deg', 'LON', 'Lon'):
        if src in df.columns:
            rename_map[src] = 'lon'
    df = df.rename(columns=rename_map)

    # 强制类型转换
    df['lat'] = pd.to_numeric(df['lat'], errors='raise')
    df['lon'] = pd.to_numeric(df['lon'], errors='raise')
    df['year'] = pd.to_numeric(df['year'], downcast='integer', errors='raise')

    required = {'lat', 'lon', 'year', 'unique_id', 'p_area', 'capacity_m', 'country'}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"CSV 文件缺少必要列: {sorted(missing)}")

    df = df[df['year'].isin(years)]
    if df.empty:
        raise ValueError(f"没有符合年份 {years} 的记录")

    return df.reset_index(drop=True)


def extract_all_features(
    ds_feat: xr.Dataset,
    ds_abandon: xr.Dataset,
    pv_df: pd.DataFrame,
    years: Sequence[int]
) -> pd.DataFrame:
    """
    向量化地一次性对每个变量和年份 sel + load，
    并用 _io_lock 串行化所有底层 NetCDF4 读操作。
    """
    records = []

    for yr in years:
        sub = pv_df[pv_df.year == yr].reset_index(drop=True)
        lats = xr.DataArray(sub.lat.values, dims="point")
        lons = xr.DataArray(sub.lon.values, dims="point")

        ds_fy = ds_feat.sel(time=str(yr), method="nearest")

        # 环境变量
        env_dict = {}
        for var in tqdm(ds_feat.data_vars, desc=f"提取环境变量 {yr}"):
            da_sel = ds_fy[var].sel(lat=lats, lon=lons, method="nearest")
            arr = da_sel.load(scheduler="single-threaded").values
            env_dict[var] = arr

        # 撂荒属性
        aband_dict = {}
        for var in ("current_abandonment", "abandonment_year",
                    "abandonment_duration", "recultivation"):
            if var in ds_abandon.data_vars:
                da_sel = ds_abandon[var].sel(lat=lats, lon=lons, method="nearest")
                arr = da_sel.load(scheduler="single-threaded").values
                aband_dict[var] = arr
            else:
                aband_dict[var] = np.full(len(sub), np.nan)

        # landcover 序列 embedding
        if "landcover" in ds_abandon.data_vars:
            lc_sel = ds_abandon.landcover.sel(lat=lats, lon=lons, method="nearest")

            lc_arr = lc_sel.load(scheduler="single-threaded").values
            seq = np.nan_to_num(lc_arr, nan=0).astype(int)
            seq = np.clip(seq, 1, 9) - 1
            onehot = np.eye(9)[seq]
            mean_onehot = onehot.mean(axis=0)
            for i in range(9):
                aband_dict[f"lc_cls_{i}"] = mean_onehot[:, i]

        df_feat = pd.DataFrame({**env_dict, **aband_dict})
        df_out  = pd.concat([sub, df_feat], axis=1)
        records.append(df_out)

    return pd.concat(records, ignore_index=True)


def main(test_mode: bool = False, test_n: int = 500):
    PATHS = {
        'abandonment': "D:/xarray/abandonment_chunkall/*.nc",
        'feature':     "D:/xarray/aligned2/Feature_all/*.nc",
        'csv':         "aligned_for_training.csv",
        'test_output': "positive_samples_test_500.csv",
        'output':      "positive_samples_full_with_features.csv"
    }
    YEARS = [2018, 2020]

    print("开始处理...", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # 1. 打开并 rechunk
    ds_abandon, ds_feat = load_datasets(
        PATHS['abandonment'], PATHS['feature']
    )

    # 2. 读取 PV 站点并切片（测试模式）
    pv_df = load_pv_sites(PATHS['csv'], years=YEARS)
    if test_mode:
        pv_df = pv_df.iloc[:test_n].reset_index(drop=True)
        print(f"⚠️ 测试模式：仅前 {test_n} 条记录")

    # 3. 向量化抽取
    print("批量抽取特征 …")
    with ProgressBar():
        df_all = extract_all_features(ds_feat, ds_abandon, pv_df, YEARS)

    # 4. 去重 & 保存
    df_unique = (
        df_all
        .sort_values(['year','lat','lon'])
        .drop_duplicates(subset=['lat','lon'], keep='last')
        .reset_index(drop=True)
    )
    out_path = PATHS['test_output'] if test_mode else PATHS['output']
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
    df_unique.to_csv(out_path, index=False)

    print(f"完成，结果保存到: {out_path}")
    print("结束时间:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))


if __name__ == "__main__":
    main(test_mode=True)


开始处理... 2025-05-18 16:26:21


ValueError: unrecognized engine h5netcdf must be one of: ['netcdf4', 'scipy', 'rasterio', 'store']

# Process

## 1.1 Load

In [84]:

from function import *
abandon_2d_variable = [
    "current_abandonment",
    "recultivation", 
    "abandonment_duration",
    "abandonment_year"
]
fea_3d_variable = [
    'GDPpc',
    'GDPtot',
    'GURdist',
    'Population',
    'gdmp',
    'rsds',
    'tas',
    'wind'
]
fea_2d_variable = [
    'DEM',
    'Powerdist',
    'PrimaryRoad',
    'SecondaryRoad',
    'Slope',
    'TertiaryRoad'
]
PATHS = {
    'abandonment': "D:/xarray/abandonment_chunkall/*.nc",
    'feature':     "D:/xarray/aligned2/Feature_all/*.nc",
    'csv':         "aligned_for_training.csv",
    'test_output': "positive_samples_test_500.csv",
    'output':      "positive_samples_full_with_features.csv"
}

YEARS = [2018, 2020]
time=['2018-01-01','2020-01-01']
# 2. 读取 PV 站点并切片（测试模式）
# 2. 读取 PV 站点并切片（测试模式）
pv_df = load_pv_sites(PATHS['csv'], years=YEARS)
# Convert lon and lat columns to float32

pv_df['lon'] = pv_df['lon'].astype('float32')
pv_df['lat'] = pv_df['lat'].astype('float32')
# Rename 'year' to 'time' and convert to datetime64
pv_df = pv_df.rename(columns={'year': 'time'})
pv_df['time'] = pd.to_datetime(pv_df['time'], format='%Y')


# 1. 打开并 rechunk
ds_abandon, ds_feat = load_datasets(
    PATHS['abandonment'], PATHS['feature']
)

import xarray as xr

ds_merge=xr.merge([ds_abandon, ds_feat])
# Convert coordinates to float32 while preserving other variables
ds_merge = ds_merge.assign_coords({
    'lon': ds_merge.lon.astype('float32'),
    'lat': ds_merge.lat.astype('float32')
})

# For variables without time dimension, expand them to have same value for all times
for var in ds_merge.data_vars:
    if 'time' not in ds_merge[var].dims:
        # Expand the variable to have time dimension with same values
        ds_merge[var] = ds_merge[var].expand_dims(time=ds_merge.time)
ds_merge

  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
  result = blockwise(
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  value = value[(slice(None),) * axis + (subkey,)]
  result = blockwise(
  result = blockwise(
  result = blockwise(
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  value = value[(slice(None),) * axis + (subkey,)]
  result = blockwise(
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.s

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,43.87 MiB
Shape,"(21600, 43200)","(4600, 2500)"
Dask graph,2511 chunks in 2579 graph layers,2511 chunks in 2579 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.48 GiB 43.87 MiB Shape (21600, 43200) (4600, 2500) Dask graph 2511 chunks in 2579 graph layers Data type float32 numpy.ndarray",43200  21600,

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,43.87 MiB
Shape,"(21600, 43200)","(4600, 2500)"
Dask graph,2511 chunks in 2579 graph layers,2511 chunks in 2579 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,43.87 MiB
Shape,"(21600, 43200)","(4600, 2500)"
Dask graph,2511 chunks in 2579 graph layers,2511 chunks in 2579 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.48 GiB 43.87 MiB Shape (21600, 43200) (4600, 2500) Dask graph 2511 chunks in 2579 graph layers Data type float32 numpy.ndarray",43200  21600,

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,43.87 MiB
Shape,"(21600, 43200)","(4600, 2500)"
Dask graph,2511 chunks in 2579 graph layers,2511 chunks in 2579 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,43.87 MiB
Shape,"(21600, 43200)","(4600, 2500)"
Dask graph,2511 chunks in 2579 graph layers,2511 chunks in 2579 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.48 GiB 43.87 MiB Shape (21600, 43200) (4600, 2500) Dask graph 2511 chunks in 2579 graph layers Data type float32 numpy.ndarray",43200  21600,

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,43.87 MiB
Shape,"(21600, 43200)","(4600, 2500)"
Dask graph,2511 chunks in 2579 graph layers,2511 chunks in 2579 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,43.87 MiB
Shape,"(21600, 43200)","(4600, 2500)"
Dask graph,2511 chunks in 2579 graph layers,2511 chunks in 2579 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.48 GiB 43.87 MiB Shape (21600, 43200) (4600, 2500) Dask graph 2511 chunks in 2579 graph layers Data type float32 numpy.ndarray",43200  21600,

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,43.87 MiB
Shape,"(21600, 43200)","(4600, 2500)"
Dask graph,2511 chunks in 2579 graph layers,2511 chunks in 2579 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,597.38 MiB
Shape,"(31, 21600, 43200)","(16, 4350, 2250)"
Dask graph,19964 chunks in 2659 graph layers,19964 chunks in 2659 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 597.38 MiB Shape (31, 21600, 43200) (16, 4350, 2250) Dask graph 19964 chunks in 2659 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,597.38 MiB
Shape,"(31, 21600, 43200)","(16, 4350, 2250)"
Dask graph,19964 chunks in 2659 graph layers,19964 chunks in 2659 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,15.82 MiB
Shape,"(21600, 43200)","(1440, 2880)"
Dask graph,225 chunks in 3 graph layers,225 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.48 GiB 15.82 MiB Shape (21600, 43200) (1440, 2880) Dask graph 225 chunks in 3 graph layers Data type float32 numpy.ndarray",43200  21600,

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,15.82 MiB
Shape,"(21600, 43200)","(1440, 2880)"
Dask graph,225 chunks in 3 graph layers,225 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,1.84 GiB
Shape,"(31, 21600, 43200)","(26, 3086, 6172)"
Dask graph,147 chunks in 19 graph layers,147 chunks in 19 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 1.84 GiB Shape (31, 21600, 43200) (26, 3086, 6172) Dask graph 147 chunks in 19 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,1.84 GiB
Shape,"(31, 21600, 43200)","(26, 3086, 6172)"
Dask graph,147 chunks in 19 graph layers,147 chunks in 19 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,24.80 MiB
Shape,"(31, 21600, 43200)","(26, 500, 500)"
Dask graph,13950 chunks in 22 graph layers,13950 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 24.80 MiB Shape (31, 21600, 43200) (26, 500, 500) Dask graph 13950 chunks in 22 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,24.80 MiB
Shape,"(31, 21600, 43200)","(26, 500, 500)"
Dask graph,13950 chunks in 22 graph layers,13950 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,24.80 MiB
Shape,"(31, 21600, 43200)","(26, 500, 500)"
Dask graph,13950 chunks in 22 graph layers,13950 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 24.80 MiB Shape (31, 21600, 43200) (26, 500, 500) Dask graph 13950 chunks in 22 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,24.80 MiB
Shape,"(31, 21600, 43200)","(26, 500, 500)"
Dask graph,13950 chunks in 22 graph layers,13950 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,24.80 MiB
Shape,"(31, 21600, 43200)","(26, 500, 500)"
Dask graph,13950 chunks in 22 graph layers,13950 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 24.80 MiB Shape (31, 21600, 43200) (26, 500, 500) Dask graph 13950 chunks in 22 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,24.80 MiB
Shape,"(31, 21600, 43200)","(26, 500, 500)"
Dask graph,13950 chunks in 22 graph layers,13950 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,15.82 MiB
Shape,"(21600, 43200)","(1440, 2880)"
Dask graph,225 chunks in 3 graph layers,225 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.48 GiB 15.82 MiB Shape (21600, 43200) (1440, 2880) Dask graph 225 chunks in 3 graph layers Data type float32 numpy.ndarray",43200  21600,

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,15.82 MiB
Shape,"(21600, 43200)","(1440, 2880)"
Dask graph,225 chunks in 3 graph layers,225 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.95 GiB,14.71 MiB
Shape,"(21600, 43200)","(982, 1964)"
Dask graph,484 chunks in 3 graph layers,484 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 6.95 GiB 14.71 MiB Shape (21600, 43200) (982, 1964) Dask graph 484 chunks in 3 graph layers Data type float64 numpy.ndarray",43200  21600,

Unnamed: 0,Array,Chunk
Bytes,6.95 GiB,14.71 MiB
Shape,"(21600, 43200)","(982, 1964)"
Dask graph,484 chunks in 3 graph layers,484 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.95 GiB,14.71 MiB
Shape,"(21600, 43200)","(982, 1964)"
Dask graph,484 chunks in 3 graph layers,484 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 6.95 GiB 14.71 MiB Shape (21600, 43200) (982, 1964) Dask graph 484 chunks in 3 graph layers Data type float64 numpy.ndarray",43200  21600,

Unnamed: 0,Array,Chunk
Bytes,6.95 GiB,14.71 MiB
Shape,"(21600, 43200)","(982, 1964)"
Dask graph,484 chunks in 3 graph layers,484 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,15.82 MiB
Shape,"(21600, 43200)","(1440, 2880)"
Dask graph,225 chunks in 3 graph layers,225 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.48 GiB 15.82 MiB Shape (21600, 43200) (1440, 2880) Dask graph 225 chunks in 3 graph layers Data type float32 numpy.ndarray",43200  21600,

Unnamed: 0,Array,Chunk
Bytes,3.48 GiB,15.82 MiB
Shape,"(21600, 43200)","(1440, 2880)"
Dask graph,225 chunks in 3 graph layers,225 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.95 GiB,14.71 MiB
Shape,"(21600, 43200)","(982, 1964)"
Dask graph,484 chunks in 3 graph layers,484 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 6.95 GiB 14.71 MiB Shape (21600, 43200) (982, 1964) Dask graph 484 chunks in 3 graph layers Data type float64 numpy.ndarray",43200  21600,

Unnamed: 0,Array,Chunk
Bytes,6.95 GiB,14.71 MiB
Shape,"(21600, 43200)","(982, 1964)"
Dask graph,484 chunks in 3 graph layers,484 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,2.82 GiB
Shape,"(31, 21600, 43200)","(26, 2700, 5400)"
Dask graph,588 chunks in 24 graph layers,588 chunks in 24 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 215.52 GiB 2.82 GiB Shape (31, 21600, 43200) (26, 2700, 5400) Dask graph 588 chunks in 24 graph layers Data type float64 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,2.82 GiB
Shape,"(31, 21600, 43200)","(26, 2700, 5400)"
Dask graph,588 chunks in 24 graph layers,588 chunks in 24 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,1.84 GiB
Shape,"(31, 21600, 43200)","(26, 3086, 6172)"
Dask graph,147 chunks in 19 graph layers,147 chunks in 19 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 1.84 GiB Shape (31, 21600, 43200) (26, 3086, 6172) Dask graph 147 chunks in 19 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,1.84 GiB
Shape,"(31, 21600, 43200)","(26, 3086, 6172)"
Dask graph,147 chunks in 19 graph layers,147 chunks in 19 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,2.82 GiB
Shape,"(31, 21600, 43200)","(26, 2700, 5400)"
Dask graph,192 chunks in 19 graph layers,192 chunks in 19 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 215.52 GiB 2.82 GiB Shape (31, 21600, 43200) (26, 2700, 5400) Dask graph 192 chunks in 19 graph layers Data type float64 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,215.52 GiB,2.82 GiB
Shape,"(31, 21600, 43200)","(26, 2700, 5400)"
Dask graph,192 chunks in 19 graph layers,192 chunks in 19 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,24.80 MiB
Shape,"(31, 21600, 43200)","(26, 500, 500)"
Dask graph,13950 chunks in 22 graph layers,13950 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 107.76 GiB 24.80 MiB Shape (31, 21600, 43200) (26, 500, 500) Dask graph 13950 chunks in 22 graph layers Data type float32 numpy.ndarray",43200  21600  31,

Unnamed: 0,Array,Chunk
Bytes,107.76 GiB,24.80 MiB
Shape,"(31, 21600, 43200)","(26, 500, 500)"
Dask graph,13950 chunks in 22 graph layers,13950 chunks in 22 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## 1.2 ZZZ

In [70]:
print("Longitude:", pv_df[pv_df['unique_id'] == 19077]['lon'].values[0])
print("Latitude:", pv_df[pv_df['unique_id'] == 19077]['lat'].values[0])

Longitude: -177.92916666666852
Latitude: -29.245833333181395


In [78]:
# Find the nearest point in ds_merge to the given coordinates
target_lon = np.float64(-177.92916666666852)
target_lat = np.float64(-29.245833333181395)

# Calculate absolute differences
lon_diff = np.abs(ds_merge.lon.values - target_lon)
lat_diff = np.abs(ds_merge.lat.values - target_lat)

# Find indices of minimum differences
nearest_lon_idx = np.argmin(lon_diff)
nearest_lat_idx = np.argmin(lat_diff)
print(nearest_lon_idx, nearest_lat_idx)
# Get the actual coordinates
nearest_lon = np.float64(ds_merge.lon.values[nearest_lon_idx])
nearest_lat = np.float64(ds_merge.lat.values[nearest_lat_idx])

print(nearest_lon, nearest_lat)


248 7290
-177.92916666666665 -29.245833333333326


## 1.3 SDFD

In [130]:
from tqdm.auto import tqdm
import numpy as np

def process_chunk(df, ds_merge, step,stop=0):

    # Get coordinates that exist in both df_temp and pv_df
    common_lats = np.intersect1d(ds_merge.lat.values, df['lat'].unique())
    common_lons = np.intersect1d(ds_merge.lon.values, df['lon'].unique())

    # Select data from df_temp using only the common coordinates
    ds_merge = ds_merge.sel(
        lat=common_lats,
        lon=common_lons,
    )

    #step = 500
    total_lat = len(ds_merge.lat)
    total_lon = len(ds_merge.lon)
    merged_dfs = []
    
    # Calculate total iterations for progress bar
    total_iterations = (total_lat // step + (1 if total_lat % step else 0)) * \
                    (total_lon // step + (1 if total_lon % step else 0))

    # Create progress bar with Chinese description
    pbar = tqdm(total=total_iterations, desc="处理数据块")

    # Iterate through all latitude and longitude points in chunks
    for start_lat in range(0, total_lat, step):
        end_lat = min(start_lat + step, total_lat)
        for start_lon in range(0, total_lon, step):
            end_lon = min(start_lon + step, total_lon)
            #print(1)
            # Extract data from ds_merge for each time point
            df = ds_merge.isel(
                lat=slice(start_lat, end_lat),
                lon=slice(start_lon, end_lon)
            ).compute().to_dataframe()
            
            # Reset index to convert multi-index to columns
            df = df.reset_index()
            
            # Merge with pv_df based on lat/lon coordinates
            chunk_merged = pd.merge(df, pv_df, on=['lat','lon','time'], how='inner')
            merged_dfs.append(chunk_merged)
            
            # Update progress bar
            pbar.update(1)
            if stop==1:
                break
    # Close progress bar
    pbar.close()
    # Combine all chunks into final dataframe
    merged_df = pd.concat(merged_dfs, ignore_index=True)
    return merged_df




# Get all variables from ds_merge
all_vars = list(ds_merge.data_vars)
merged_dfs = []
# Iterate through each variable
for i, var in enumerate(all_vars):
    print(f"Processing variable {i+1} of {len(all_vars)}: {var}")
    # Check if variable has time dimension
    df_temp = ds_merge[var].sel(time=['2018-01-01','2020-01-01'])

    # Process the chunk and merge with pv_df
    merged_df = process_chunk(pv_df, df_temp, step=2500)

    merged_dfs.append(merged_df)
    # if i==2:
    #     break
# Concatenate all merged dataframes with single-column retention
final_merged_df = merged_dfs[0]
for df in tqdm(merged_dfs[1:], desc="合并数据"):

    final_merged_df = pd.merge(
        final_merged_df,
        df,
        on=["time", "lon", "lat"],
        how="inner",
        suffixes=("", "_drop")
    )
    # Drop duplicated columns with "_drop" suffix
    final_merged_df = final_merged_df.loc[:, ~final_merged_df.columns.str.endswith("_drop")]

#df_dem=process_chunk(pv_df, ds_merge.DEM.sel(time=['2018-01-01','2020-01-01']),step=2500,stop=1)

#ds_merge.gdmp.sel(time=['2018-01-01','2020-01-01'])
#df_gdmp=process_chunk(pv_df, ds_merge.gdmp.sel(time=['2018-01-01','2020-01-01']),step=2500,stop=1)

Processing variable 1 of 19: DEM


处理数据块:   0%|          | 0/21 [00:00<?, ?it/s]

Processing variable 2 of 19: wind


处理数据块:   0%|          | 0/21 [00:00<?, ?it/s]

合并数据:   0%|          | 0/1 [00:00<?, ?it/s]

In [131]:
final_merged_df

Unnamed: 0,time,lat,lon,spatial_ref,band,DEM,unique_id,p_area,capacity_m,country,wind
0,2018-01-01,-13.837500,-171.804169,0,1,54.3750,26510,87614.298290,7.850205,WSM,2.293710
1,2018-01-01,-13.829166,-171.995834,0,1,2.8125,26500,10696.657940,0.956408,WSM,2.770978
2,2018-01-01,8.245833,-81.987503,0,1,14.3125,30068,131374.081100,10.611416,PAN,2.391064
3,2018-01-01,8.362500,-82.345833,0,1,14.4375,22282,2439.926533,0.239662,PAN,2.284361
4,2018-01-01,8.370833,-82.345833,0,1,14.4375,22224,523100.190900,42.299616,PAN,2.510491
...,...,...,...,...,...,...,...,...,...,...,...
6382,2020-01-01,51.020832,-93.787498,0,1,359.8125,44,140.626685,0.000000,CAN,4.284453
6383,2020-01-01,51.654167,-128.129166,0,1,27.5625,10590,356.878383,0.000000,CAN,3.531718
6384,2020-01-01,51.937500,-122.970833,0,1,1084.2500,19490,61931.932830,3.874179,CAN,2.055982
6385,2020-01-01,66.045830,-154.254166,0,1,101.2500,19297,7950.458513,0.000000,USA,2.860010


In [104]:
#df_dem=pd.concat([df_dem.assign(time='2018-01-01'), df_dem.assign(time='2020-01-01')])

pd.merge(df_dem, df_gdmp, on=['lat', 'lon'], how='inner')
df_gdmp


Unnamed: 0,time,lat,lon,spatial_ref,band,gdmp,unique_id,p_area,capacity_m,country
0,2018-01-01,-13.837500,-171.804169,0,1,,26510,87614.298290,7.850205,WSM
1,2018-01-01,-13.829166,-171.995834,0,1,,26500,10696.657940,0.956408,WSM
2,2018-01-01,8.245833,-81.987503,0,1,137.873126,30068,131374.081100,10.611416,PAN
3,2018-01-01,8.362500,-82.345833,0,1,120.962997,22282,2439.926533,0.239662,PAN
4,2018-01-01,8.370833,-82.345833,0,1,143.121350,22224,523100.190900,42.299616,PAN
...,...,...,...,...,...,...,...,...,...,...
6382,2020-01-01,51.020832,-93.787498,0,1,,44,140.626685,0.000000,CAN
6383,2020-01-01,51.654167,-128.129166,0,1,,10590,356.878383,0.000000,CAN
6384,2020-01-01,51.937500,-122.970833,0,1,,19490,61931.932830,3.874179,CAN
6385,2020-01-01,66.045830,-154.254166,0,1,,19297,7950.458513,0.000000,USA


## 1.4 Save

In [63]:
 # Remove spatial_ref and band columns
final_merged_df = final_merged_df.drop(['spatial_ref', 'band'], axis=1)

# Save to CSV
final_merged_df.to_csv('training_embedding.csv', index=False)



In [81]:
# Calculate valid values and percentages for abandonment_year and Slope
valid_abandonment = final_merged_df['abandonment_year'].notna().sum()
valid_slope = final_merged_df['rsds'].notna().sum()
total_rows = len(final_merged_df)

print(f"Total rows: {total_rows}")
print(f"Valid abandonment_year values: {valid_abandonment} ({valid_abandonment/total_rows*100:.2f}%)")
print(f"Valid Slope values: {valid_slope} ({valid_slope/total_rows*100:.2f}%)")

# Display the dataframe
final_merged_df

Total rows: 3619418
Valid abandonment_year values: 8163 (0.23%)
Valid Slope values: 314552 (8.69%)


Unnamed: 0,lat,lon,abandonment_year,unique_id,p_area,capacity_m,country,year,abandonment_duration,recultivation,...,Population,Powerdist,PrimaryRoad,SecondaryRoad,Slope,TertiaryRoad,gdmp,rsds,tas,wind
0,-29.245832,-177.929169,,19077,385.150738,0.000000,NZL,2020,,,...,,,,,,,,,,
1,-22.645834,-152.795837,,34745,615.614888,0.000000,PYF,2020,,,...,,,,,,,,,,
2,-21.229166,-175.162506,,34743,39.668561,0.000000,TON,2020,,,...,,,,,,,,,,
3,-21.187500,-175.187500,,34741,30959.725410,2.160149,TON,2020,,,...,,,,,,,,,,
4,-21.154167,-175.179169,,34740,38427.397160,0.000000,TON,2020,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3619413,49.562500,127.912498,,51578,5655.069418,0.502084,CHN,2018,,,...,,,,,,,,,,3.667992
3619414,52.045834,124.695831,,60309,87886.281650,7.124299,CHN,2018,,,...,,,,,,,,,,2.130104
3619415,52.045834,124.695831,,60309,87886.281650,7.124299,CHN,2018,,,...,,,,,,,,,,2.196081
3619416,67.662498,134.654160,,19298,41369.184550,2.757651,RUS,2020,,,...,,,,,,,,,,2.249942


# Load

In [None]:
# float32, 