In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../dataset/training.csv")

In [3]:
# -----------------------------
# 1️⃣ Heat probability based on 28-25°C
# -----------------------------
df['heat_prob'] = df['t2m_C'].apply(lambda x: 1.0 if x >= 28 else 0.5 if x >= 26 else 0)

# -----------------------------
# 2️⃣ Create a 7-day max target (next 7 days)
# -----------------------------
df['heat_next_7d_max'] = df.groupby(['latitude','longitude'])['heat_prob']\
                            .transform(lambda x: x.shift(-1).rolling(7, min_periods=1).max())

# -----------------------------
# 3️⃣ Drop NaNs caused by rolling/shift
# -----------------------------
df_forecast = df.dropna(subset=['heat_next_7d_max']).reset_index(drop=True)

# -----------------------------
# 4️⃣ Features for forecasting
# -----------------------------
lag_days = [1,3,7]
for lag in lag_days:
    df_forecast[f't2m_C_lag{lag}'] = df_forecast.groupby(['latitude','longitude'])['t2m_C'].shift(lag)

# Drop rows with lag NaNs
df_forecast = df_forecast.dropna(subset=[f't2m_C_lag{lag}' for lag in lag_days]).reset_index(drop=True)

lag_features = [f't2m_C_lag{lag}' for lag in lag_days] + [
    't2m_C','anomaly_T2m_C','tp_mm','tp_7d_cum','tp_14d_cum','tp_7d_avg',
    'consec_rain_days','month_sin','month_cos','tp_anomaly','tp_std_anomaly'
]

X = df_forecast[lag_features]
y = df_forecast['heat_next_7d_max']

print("Features shape:", X.shape)
print("Target distribution:\n", y.value_counts())


Features shape: (292965, 14)
Target distribution:
 heat_next_7d_max
0.0    292965
Name: count, dtype: int64


In [4]:
df

Unnamed: 0,latitude,longitude,year,month,day,date,tp_mm,t2m,t2m_C,anomaly_T2m_C,...,tp_std_anomaly,heatwave_flag,check_next_day,check_next_2days,check_next_3days,next_day_match,next_2days_match,next_3days_match,heat_prob,heat_next_7d_max
0,27.5,85.0,1940,1,1,1940-01-01,-0.574015,290.56674,-0.221850,-0.221850,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
1,27.5,85.0,1940,1,2,1940-01-02,-0.574015,291.19666,-0.121837,-0.121837,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
2,27.5,85.0,1940,1,3,1940-01-03,-0.574015,290.42987,-0.244075,-0.244075,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
3,27.5,85.0,1940,1,4,1940-01-04,-0.574015,290.29614,-0.264713,-0.264713,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
4,27.5,85.0,1940,1,5,1940-01-05,-0.574015,291.80084,-0.026587,-0.026587,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293065,28.0,86.0,2025,9,2,2025-09-02,-0.334395,287.61624,-0.690166,-0.690166,...,-0.719800,0,1.0,1.0,1.0,True,True,True,0,0.0
293066,28.0,86.0,2025,9,3,2025-09-03,0.472795,290.17860,-0.283763,-0.283763,...,0.011375,0,1.0,1.0,1.0,True,True,True,0,0.0
293067,28.0,86.0,2025,9,4,2025-09-04,1.024964,290.30640,-0.263126,-0.263126,...,0.511545,0,1.0,1.0,,True,True,False,0,0.0
293068,28.0,86.0,2025,9,5,2025-09-05,0.521716,289.67624,-0.363139,-0.363139,...,0.055689,0,1.0,,,True,False,False,0,0.0


In [5]:
df.columns


Index(['latitude', 'longitude', 'year', 'month', 'day', 'date', 'tp_mm', 't2m',
       't2m_C', 'anomaly_T2m_C', 'heat_stress_proxy', 'tp_7d_cum',
       'tp_14d_cum', 'tp_7d_avg', 'consec_rain_days', 'tp_lag1', 'tp_lag3',
       'tp_lag7', 'heavy_rain', 'month_sin', 'month_cos', 'heat_proxy',
       'heat_next_day', 'heat_next_2days', 'heat_next_3days', 'tp_anomaly',
       'tp_std_anomaly', 'heatwave_flag', 'check_next_day', 'check_next_2days',
       'check_next_3days', 'next_day_match', 'next_2days_match',
       'next_3days_match', 'heat_prob', 'heat_next_7d_max'],
      dtype='object')

In [6]:
df.head(20)

Unnamed: 0,latitude,longitude,year,month,day,date,tp_mm,t2m,t2m_C,anomaly_T2m_C,...,tp_std_anomaly,heatwave_flag,check_next_day,check_next_2days,check_next_3days,next_day_match,next_2days_match,next_3days_match,heat_prob,heat_next_7d_max
0,27.5,85.0,1940,1,1,1940-01-01,-0.574015,290.56674,-0.22185,-0.22185,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
1,27.5,85.0,1940,1,2,1940-01-02,-0.574015,291.19666,-0.121837,-0.121837,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
2,27.5,85.0,1940,1,3,1940-01-03,-0.574015,290.42987,-0.244075,-0.244075,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
3,27.5,85.0,1940,1,4,1940-01-04,-0.574015,290.29614,-0.264713,-0.264713,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
4,27.5,85.0,1940,1,5,1940-01-05,-0.574015,291.80084,-0.026587,-0.026587,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
5,27.5,85.0,1940,1,6,1940-01-06,-0.574015,291.88464,-0.013887,-0.013887,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
6,27.5,85.0,1940,1,11,1940-01-11,-0.574015,291.24286,-0.115487,-0.115487,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
7,27.5,85.0,1940,1,13,1940-01-13,-0.574015,292.04556,0.013101,0.013101,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
8,27.5,85.0,1940,1,14,1940-01-14,-0.574015,291.84808,-0.018649,-0.018649,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0
9,27.5,85.0,1940,1,15,1940-01-15,-0.574015,292.04285,0.011514,0.011514,...,-0.342657,0,0.0,0.0,0.0,True,True,True,0,0.0


In [7]:
# Check distribution before shifting
print(df.groupby(['latitude','longitude'])['heat_stress_proxy'].sum())

# Check for consecutive hot days
df['consec_hot_days'] = df.groupby(['latitude','longitude'])['heat_stress_proxy']\
                           .transform(lambda x: x.groupby((x != x.shift()).cumsum()).cumsum())
print(df['consec_hot_days'].value_counts())


latitude  longitude
27.50     85.00        15107
          85.25        13482
          85.50        12852
          85.75        13098
          86.00        10862
27.75     85.00        12736
          85.25        10240
          85.50         8987
          85.75         8158
          86.00         2369
28.00     85.00         9662
          85.25         5845
          85.50          289
          85.75            0
          86.00            0
Name: heat_stress_proxy, dtype: int64
consec_hot_days
0      169383
1       12671
2        8651
3        6795
4        5610
        ...  
178         5
179         5
180         4
181         3
182         2
Name: count, Length: 183, dtype: int64
