In [None]:
# Define the "seasonal year": January and February belong to the previous year.
def get_seasonal_year(date):
    """Return the seasonal year that ``date`` falls in.

    Dates from March onwards keep their calendar year; January and
    February are counted with the preceding calendar year.

    Args:
        date: Any object exposing ``year`` and ``month`` attributes.

    Returns:
        The seasonal year as a number.
    """
    if date.month < 3:
        return date.year - 1
    return date.year

# Tag every row with the seasonal year derived from its DATE value.
df['seasonal_year'] = df['DATE'].apply(get_seasonal_year)

# Define how calendar months map onto seasons.
def assign_season(date):
    """Return the season name for ``date`` based on its month.

    December-February is 'Winter', March-May is 'Spring', June-August is
    'Summer'; every other month value yields 'Fall'.

    Args:
        date: Any object exposing a ``month`` attribute.

    Returns:
        One of 'Winter', 'Spring', 'Summer' or 'Fall'.
    """
    month_to_season = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    # Anything not listed above falls through to 'Fall'.
    return month_to_season.get(date.month, 'Fall')

# Label every row with the season of its DATE value.
df['season'] = df['DATE'].apply(assign_season)

# Make sure the measurement columns are numeric; values that cannot be
# parsed are coerced to NaN.
for column in ('TMAX', 'TMIN', 'PRCP'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Compute per-season aggregate features for one group of rows.
def compute_seasonal_features(group):
    """Summarise TMAX, TMIN and PRCP for each season present in ``group``.

    For every season the rows are restricted to those where all three
    measurements are valid numbers; the result holds the mean TMAX, the
    mean TMIN and the total PRCP of those rows. A season without any
    usable row yields NaN for all three features.

    Args:
        group: DataFrame with ``season``, ``TMAX``, ``TMIN`` and ``PRCP``
            columns.

    Returns:
        pandas Series indexed by ``Tmax_<season>``, ``Tmin_<season>`` and
        ``Prcp_<season>`` for Winter, Spring, Summer and Fall.
    """
    measurement_cols = ['TMAX', 'TMIN', 'PRCP']
    seasonal_features = {}
    for season in ['Winter', 'Spring', 'Summer', 'Fall']:
        season_rows = group[group['season'] == season]

        # Build a fresh frame of coerced values instead of assigning into
        # the filtered slice, which would be a chained assignment
        # (SettingWithCopyWarning) on a temporary object.
        season_data = pd.DataFrame({
            col: pd.to_numeric(season_rows[col], errors='coerce')
            for col in measurement_cols
        })

        # Keep only the rows where every measurement is available.
        season_data = season_data.dropna(subset=measurement_cols)

        if season_data.empty:
            # No usable observations: report missing values, not zeros.
            seasonal_features[f'Tmax_{season}'] = np.nan
            seasonal_features[f'Tmin_{season}'] = np.nan
            seasonal_features[f'Prcp_{season}'] = np.nan
        else:
            seasonal_features[f'Tmax_{season}'] = season_data['TMAX'].mean()
            seasonal_features[f'Tmin_{season}'] = season_data['TMIN'].mean()
            seasonal_features[f'Prcp_{season}'] = season_data['PRCP'].sum()

    return pd.Series(seasonal_features)

# Compute the seasonal features for every (lat, long, seasonal_year) group
# and turn the group keys back into ordinary columns.
seasonal_features_df = df.groupby(['lat', 'long', 'seasonal_year']).apply(compute_seasonal_features).reset_index()

# Compute growing degree days (GDD) and rolling GDD features.
def calculate_gdd(df, base_temp=5):
    """Add growing-degree-day (GDD) columns to ``df`` in place and return it.

    Columns written:
        GDD: ``TMAX - base_temp`` floored at zero.
        GDD_cumsum: running GDD total within each
            (lat, long, seasonal_year) group.
        GDD_30d, GDD_60d: sum of GDD over the trailing 30 / 60 rows of each
            (lat, long) group; NaN until a full window of rows exists.
        GDD_rate_change: ``GDD_30d - GDD_60d``.

    The windows count rows, so the result is only meaningful when rows are
    ordered by date within each (lat, long) group -- the function does not
    sort them.

    Args:
        df: DataFrame with ``lat``, ``long``, ``seasonal_year`` and ``TMAX``
            columns.
        base_temp: Temperature subtracted from TMAX before flooring.

    Returns:
        The same ``df`` object, with the columns above added.
    """
    station_keys = ['lat', 'long']

    def _rolling_gdd_sum(window):
        # groupby(...).rolling(...) returns a result indexed by
        # (lat, long, original row label). Both group levels have to be
        # dropped; removing only the first one leaves a MultiIndex that
        # does not align with df's row index on assignment.
        rolled = df.groupby(station_keys)['GDD'].rolling(window).sum()
        return rolled.reset_index(level=[0, 1], drop=True)

    df['GDD'] = np.maximum(df['TMAX'] - base_temp, 0)
    df['GDD_cumsum'] = df.groupby(station_keys + ['seasonal_year'])['GDD'].cumsum()
    df['GDD_30d'] = _rolling_gdd_sum(30)
    df['GDD_60d'] = _rolling_gdd_sum(60)
    df['GDD_rate_change'] = df['GDD_30d'] - df['GDD_60d']
    return df

# Define the "seasonal year": a year that starts in March.
def get_seasonal_year(date):
    """Map ``date`` to its seasonal year.

    January and February are shifted back by one year; all other months
    keep the calendar year.

    Args:
        date: Any object exposing ``year`` and ``month`` attributes.

    Returns:
        The seasonal year as a number.
    """
    # Months before March still belong to the year that started last March.
    year_offset = 1 if date.month < 3 else 0
    return date.year - year_offset
# Tag every row with the seasonal year derived from its DATE value.
df['seasonal_year'] = df['DATE'].apply(get_seasonal_year)

# Define how calendar months map onto seasons.
def assign_season(date):
    """Return the season label for the month of ``date``.

    Args:
        date: Any object exposing a ``month`` attribute.

    Returns:
        'Winter' for months 12, 1, 2; 'Spring' for 3-5; 'Summer' for 6-8;
        'Fall' for any other month value.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    current_month = date.month
    for label, months in season_months:
        if current_month in months:
            return label
    # Every month not matched above is treated as fall.
    return 'Fall'

# Label every row with the season of its DATE value.
df['season'] = df['DATE'].apply(assign_season)

# Compute per-season aggregate features for one group of rows.
def compute_seasonal_features(group):
    """Summarise TMAX, TMIN and PRCP for each season present in ``group``.

    Args:
        group: DataFrame with ``season``, ``TMAX``, ``TMIN`` and ``PRCP``
            columns.

    Returns:
        pandas Series indexed by ``Tmax_<season>`` (mean TMAX),
        ``Tmin_<season>`` (mean TMIN) and ``Prcp_<season>`` (total PRCP)
        for Winter, Spring, Summer and Fall. A feature is NaN when the
        season has no non-missing value for that measurement.
    """
    seasonal_features = {}
    for season in ['Winter', 'Spring', 'Summer', 'Fall']:
        season_data = group[group['season'] == season]
        seasonal_features[f'Tmax_{season}'] = season_data['TMAX'].mean()
        seasonal_features[f'Tmin_{season}'] = season_data['TMIN'].mean()
        # min_count=1 makes the total NaN when there is nothing to add up.
        # A plain sum() would report 0.0 for a season without data, which
        # is indistinguishable from a genuinely dry season.
        seasonal_features[f'Prcp_{season}'] = season_data['PRCP'].sum(min_count=1)
    return pd.Series(seasonal_features)

# Compute the seasonal features for every (lat, long, seasonal_year) group
# and turn the group keys back into ordinary columns.
seasonal_features_df = df.groupby(['lat', 'long', 'seasonal_year']).apply(compute_seasonal_features).reset_index()

# Compute growing degree days (GDD) and rolling GDD features.
def calculate_gdd(df, base_temp=5):
    """Add growing-degree-day (GDD) columns to ``df`` in place and return it.

    Columns written:
        GDD: ``TMAX - base_temp`` floored at zero.
        GDD_cumsum: running GDD total within each
            (lat, long, seasonal_year) group.
        GDD_30d, GDD_60d, GDD_120d: sum of GDD over the trailing 30 / 60 /
            120 rows of each (lat, long) group; shorter windows are allowed
            at the start of a group (``min_periods=1``).
        GDD_rate_change_30_60: ``GDD_30d - GDD_60d``.
        GDD_rate_change_60_120: ``GDD_60d - GDD_120d``.

    The windows count rows, so the result is only meaningful when rows are
    ordered by date within each (lat, long) group -- the function does not
    sort them.

    Args:
        df: DataFrame with ``lat``, ``long``, ``seasonal_year`` and ``TMAX``
            columns.
        base_temp: Temperature subtracted from TMAX before flooring.

    Returns:
        The same ``df`` object, with the columns above added.
    """
    station_keys = ['lat', 'long']

    def _rolling_gdd_sum(window):
        # The grouped rolling result is indexed by (lat, long, original row
        # label); dropping both group levels restores the row labels so the
        # assignment lines up with df.
        rolled = df.groupby(station_keys)['GDD'].rolling(window, min_periods=1).sum()
        return rolled.reset_index(level=[0, 1], drop=True)

    # A day colder than the base temperature contributes zero degree days;
    # without the floor it would subtract from every cumulative and rolling
    # total below.
    df['GDD'] = (df['TMAX'] - base_temp).clip(lower=0)
    df['GDD_cumsum'] = df.groupby(station_keys + ['seasonal_year'])['GDD'].cumsum()

    df['GDD_30d'] = _rolling_gdd_sum(30)
    df['GDD_60d'] = _rolling_gdd_sum(60)
    df['GDD_120d'] = _rolling_gdd_sum(120)

    df['GDD_rate_change_30_60'] = df['GDD_30d'] - df['GDD_60d']
    df['GDD_rate_change_60_120'] = df['GDD_60d'] - df['GDD_120d']

    return df

# Add the growing-degree-day columns.
df = calculate_gdd(df)

# NOTE(review): every rolling window below counts rows within a (lat, long)
# group, not calendar days. This presumably relies on one row per day,
# already ordered by DATE within each group -- confirm against the loader.

# Rolling mean of the daily average temperature.
df['TAVG'] = (df['TMAX'] + df['TMIN']) / 2
df['TAVG_7d'] = df.groupby(['lat', 'long'])['TAVG'].rolling(7, min_periods=1).mean().reset_index(level=[0,1], drop=True)
df['TAVG_30d'] = df.groupby(['lat', 'long'])['TAVG'].rolling(30, min_periods=1).mean().reset_index(level=[0,1], drop=True)

# Precipitation features: running total per seasonal year plus trailing
# 7 / 30 / 60 / 120-row totals per (lat, long).
df['PRCP_cumsum'] = df.groupby(['lat', 'long', 'seasonal_year'])['PRCP'].cumsum()
df['PRCP_7d_cumsum'] = df.groupby(['lat', 'long'])['PRCP'].rolling(7, min_periods=1).sum().reset_index(level=[0,1], drop=True)
df['PRCP_30d_cumsum'] = df.groupby(['lat', 'long'])['PRCP'].rolling(30, min_periods=1).sum().reset_index(level=[0,1], drop=True)
df['PRCP_60d_cumsum'] = df.groupby(['lat', 'long'])['PRCP'].rolling(60, min_periods=1).sum().reset_index(level=[0,1], drop=True)
df['PRCP_120d_cumsum'] = df.groupby(['lat', 'long'])['PRCP'].rolling(120, min_periods=1).sum().reset_index(level=[0,1], drop=True)

# Number of rows in the trailing 30 with PRCP below 1 / above 5.
df['PRCP_dry_days'] = df.groupby(['lat', 'long'])['PRCP'].transform(lambda x: (x < 1).rolling(30, min_periods=1).sum())
df['PRCP_rainy_days'] = df.groupby(['lat', 'long'])['PRCP'].transform(lambda x: (x > 5).rolling(30, min_periods=1).sum())

# Frost rows (TMIN < 0) and heat rows (TMAX > 25) in the trailing 30 rows.
df['Frost_days_30d'] = df.groupby(['lat', 'long'])['TMIN'].transform(lambda x: (x < 0).rolling(30, min_periods=1).sum())
df['Heat_days_30d'] = df.groupby(['lat', 'long'])['TMAX'].transform(lambda x: (x > 25).rolling(30, min_periods=1).sum())

# Temperature volatility over the trailing 365 rows. This column name used
# to be written twice (30-row, then 365-row), so only the 365-row value ever
# survived; it keeps that meaning and its position among the columns.
df['TMAX_fluctuation'] = df.groupby(['lat', 'long'])['TMAX'].rolling(365, min_periods=1).std().reset_index(level=[0,1], drop=True)

# Frost rows and heat rows in the trailing 365 rows.
df['Frost_days_365d'] = df.groupby(['lat', 'long'])['TMIN'].transform(lambda x: (x < 0).rolling(365, min_periods=1).sum())
df['Heat_days_365d'] = df.groupby(['lat', 'long'])['TMAX'].transform(lambda x: (x > 25).rolling(365, min_periods=1).sum())

# Temperature volatility over the trailing 30 rows, stored under its own
# name so it is no longer overwritten by the 365-row variant.
df['TMAX_fluctuation_30d'] = df.groupby(['lat', 'long'])['TMAX'].rolling(30, min_periods=1).std().reset_index(level=[0,1], drop=True)

# Collapse to one row per (lat, long, seasonal_year), taken from the February
# rows, i.e. the final month of each seasonal year.
# NOTE(review): GroupBy.last() returns the last non-null value of each column
# separately, so a column that is NaN on the final February row is filled from
# an earlier row -- confirm this is intended.
df = df[df['DATE'].dt.month == 2].groupby(['lat', 'long', 'seasonal_year']).last().reset_index()


In [None]:
# Drop long stretches of consecutive rows in which TMAX or TMIN is missing.
def remove_long_missing_periods(group, min_missing_run=15):
    """Remove every long run of consecutive missing TMAX or TMIN values.

    The rows are sorted by ``DATE`` and renumbered from zero. A run is a
    maximal block of consecutive rows in which the column is NaN; when a run
    is at least ``min_missing_run`` rows long, all of its rows are dropped
    (including the first ones, not only the rows past the threshold).
    Shorter runs are left in place.

    Args:
        group: DataFrame with ``DATE``, ``TMAX`` and ``TMIN`` columns.
        min_missing_run: Minimum length, in rows, of a missing run that
            gets removed.

    Returns:
        The sorted DataFrame without the rows belonging to long missing
        runs; the surviving rows keep their post-sort positions as index.
    """
    group = group.sort_values(by='DATE').reset_index(drop=True)

    def _in_long_missing_run(column):
        missing = group[column].isna()
        # A new run starts whenever the missing flag differs from the
        # previous row, so the cumulative count of flips labels each run.
        run_id = missing.ne(missing.shift()).cumsum()
        run_length = missing.groupby(run_id).transform('size')
        return missing & (run_length >= min_missing_run)

    mask = _in_long_missing_run('TMAX') | _in_long_missing_run('TMIN')
    return group[~mask]

# Group by lat and long and apply the missing-period filter to each group.
df_cleaned1 = df.groupby(['lat', 'long'], group_keys=False).apply(remove_long_missing_periods)

In [None]:
# Fill missing values of one column with a trailing rolling mean.
def rolling_mean_cudf(group, col, window=30):
    """Fill NaNs in ``group[col]`` with a trailing rolling mean.

    The rows are sorted by ``DATE`` and renumbered from zero. The rolling
    mean is taken over the forward-filled column (``min_periods=1``), and
    only positions that were NaN receive it; existing values are untouched.
    NaNs at the very start of the group have nothing to forward-fill from
    and therefore stay NaN.

    Args:
        group: DataFrame with a ``DATE`` column and the column ``col``.
        col: Name of the column to fill.
        window: Number of trailing rows in the rolling mean.

    Returns:
        A sorted, re-indexed copy of ``group`` with ``col`` filled.
    """
    group = group.sort_values(by='DATE').reset_index(drop=True)

    # .ffill() replaces fillna(method='ffill'), whose ``method`` argument is
    # deprecated in recent pandas releases.
    rolling_means = group[col].ffill().rolling(window=window, min_periods=1).mean()

    # Only the missing positions take the rolling mean.
    group[col] = group[col].fillna(rolling_means)

    return group

# Fill missing TMAX, then missing TMIN, separately within each
# (lat, long) group.
for column in ('TMAX', 'TMIN'):
    df = df.groupby(['lat', 'long'], group_keys=False).apply(
        lambda g: rolling_mean_cudf(g, column)
    )

In [None]:
# Per-group helper (applied to each lat/long group): keep only the rows
# after the last break in the date sequence.
def remove_non_continuous_dates(group):
    """Drop everything up to and including the last date discontinuity.

    The rows are sorted by ``DATE`` and renumbered from zero. A row is
    discontinuous when it lies more than one day after the previous row.
    If such rows exist, only the rows dated strictly after the last of them
    are kept; otherwise the sorted group is returned unchanged.

    Args:
        group: DataFrame with a datetime ``DATE`` column.

    Returns:
        The filtered DataFrame; surviving rows keep their post-sort
        positions as index.
    """
    ordered = group.sort_values("DATE").reset_index(drop=True)
    # Gap, in days, between each row and the one before it.
    ordered = ordered.assign(date_diff=ordered["DATE"].diff().dt.days)

    # DATE values of the rows that follow a gap of more than one day.
    break_dates = ordered.loc[ordered["date_diff"] > 1, "DATE"]

    if len(break_dates) > 0:
        # The last break need not be the latest date in the group; it and
        # everything before it are removed.
        cutoff = break_dates.iloc[-1]
        ordered = ordered.loc[ordered["DATE"] > cutoff]

    return ordered.drop(columns=["date_diff"])

# Convert df to pandas when it is a cudf.DataFrame; otherwise use it as-is.
# df1 is assigned on both paths so the groupby below never hits an
# undefined name when df is not a cudf frame.
if isinstance(df, cudf.DataFrame):
    df1 = df.to_pandas()
else:
    df1 = df

# Apply the continuity filter to each (lat, long) group.
df1 = df1.groupby(["lat", "long"], group_keys=False).apply(remove_non_continuous_dates)