# COMMENTS

In [1]:
# ============================================
# Install & Import Dependencies
# ============================================
# !pip install scipy pandas

import pandas as pd
import numpy as np
from scipy.io import loadmat
from datetime import datetime, timedelta

# ============================================
# Helper Function: MATLAB datenum → datetime
# ============================================
def matlab2datetime(matlab_datenum):
    return datetime.fromordinal(int(matlab_datenum)) \
           + timedelta(days=matlab_datenum % 1) \
           - timedelta(days=366)

# ============================================
# Load .mat Dataset
# ============================================
data = loadmat('./NEUSTG_19502020_12stations.mat')

lat = data['lattg'].flatten()
lon = data['lontg'].flatten()
sea_level = data['sltg']
station_names = [s[0] for s in data['sname'].flatten()]
time = data['t'].flatten()
time_dt = np.array([matlab2datetime(t) for t in time])

# ============================================
# Select Target Stations
# ============================================

ALL_STATIONS = ['Annapolis','Atlantic_City','Charleston','Washington','Wilmington', 
                'Eastport', 'Portland', 'Sewells_Point', 'Sandy_Hook', 
                'Lewes', 'Fernandina_Beach', 'The_Battery']

selected_idx = [station_names.index(st) for st in ALL_STATIONS]
selected_names = [station_names[i] for i in selected_idx]
selected_lat = lat[selected_idx]
selected_lon = lon[selected_idx]
selected_sea_level = sea_level[:, selected_idx]  # time × selected_stations

# ============================================
# Build Preview DataFrame
# ============================================
df_preview = pd.DataFrame({
    'time': np.tile(time_dt[:5], len(selected_names)),
    'station_name': np.repeat(selected_names, 5),
    'latitude': np.repeat(selected_lat, 5),
    'longitude': np.repeat(selected_lon, 5),
    'sea_level': selected_sea_level[:5, :].T.flatten()
})

# ============================================
# Print Data Head
# ============================================
print(f"Number of stations: {len(selected_names)}")
print(f"Sea level shape (time x stations): {selected_sea_level.shape}")
df_preview.head()

Number of stations: 12
Sea level shape (time x stations): (622392, 12)


Unnamed: 0,time,station_name,latitude,longitude,sea_level
0,1950-01-01 00:00:00.000000,Annapolis,38.98328,-76.4816,1.341
1,1950-01-01 00:59:59.999997,Annapolis,38.98328,-76.4816,1.311
2,1950-01-01 02:00:00.000003,Annapolis,38.98328,-76.4816,1.28
3,1950-01-01 03:00:00.000000,Annapolis,38.98328,-76.4816,1.28
4,1950-01-01 03:59:59.999997,Annapolis,38.98328,-76.4816,1.341


In [2]:
import numpy as np

threshold_df = loadmat('./Seed_Coastal_Stations_Thresholds.mat')
mat = threshold_df  # whatever variable holds the loaded .mat dict

# sname is shape (1, 12) with each entry like array(['Annapolis'], dtype='<U9')
sname_raw = mat["sname"].ravel()

# Convert each cell to a plain Python string
station_names = [str(x.squeeze()) for x in sname_raw]

# thminor_stnd is shape (12, 1) floats
threshold_vals = mat["thminor_stnd"].astype(float).ravel().tolist()

thresholds = dict(zip(station_names, threshold_vals))

thresholds

{'Annapolis': 2.104,
 'Atlantic_City': 3.344,
 'Charleston': 2.98,
 'Eastport': 8.071,
 'Fernandina_Beach': 3.148,
 'Lewes': 2.675,
 'Portland': 6.267,
 'Sandy_Hook': 2.809,
 'Sewells_Point': 2.706,
 'The_Battery': 3.192,
 'Washington': 2.673,
 'Wilmington': 2.423}

In [3]:
# ============================================
# Convert Hourly → Daily per Station
# ============================================
# Convert time to pandas datetime
time_dt = pd.to_datetime(time_dt)

# Build hourly DataFrame for selected stations
df_hourly = pd.DataFrame({
    'time': np.tile(time_dt, len(selected_names)),
    'station_name': np.repeat(selected_names, len(time_dt)),
    'latitude': np.repeat(selected_lat, len(time_dt)),
    'longitude': np.repeat(selected_lon, len(time_dt)),
    'sea_level': selected_sea_level.flatten()
})

In [4]:
df_hourly

Unnamed: 0,time,station_name,latitude,longitude,sea_level
0,1950-01-01 00:00:00.000000,Annapolis,38.983280,-76.481600,1.341
1,1950-01-01 00:59:59.999997,Annapolis,38.983280,-76.481600,2.067
2,1950-01-01 02:00:00.000003,Annapolis,38.983280,-76.481600,1.890
3,1950-01-01 03:00:00.000000,Annapolis,38.983280,-76.481600,1.798
4,1950-01-01 03:59:59.999997,Annapolis,38.983280,-76.481600,1.646
...,...,...,...,...,...
7468699,2020-12-31 19:00:00.000007,The_Battery,40.700556,-74.014167,1.432
7468700,2020-12-31 20:00:00.000003,The_Battery,40.700556,-74.014167,1.481
7468701,2020-12-31 21:00:00.000000,The_Battery,40.700556,-74.014167,1.155
7468702,2020-12-31 22:00:00.000007,The_Battery,40.700556,-74.014167,1.187


In [5]:
thresholds_df = pd.DataFrame.from_dict(
    thresholds,
    orient="index",
    columns=["flood_threshold"]
).reset_index()

thresholds_df.columns = ["station_name", "flood_threshold"]
thresholds_df

Unnamed: 0,station_name,flood_threshold
0,Annapolis,2.104
1,Atlantic_City,3.344
2,Charleston,2.98
3,Eastport,8.071
4,Fernandina_Beach,3.148
5,Lewes,2.675
6,Portland,6.267
7,Sandy_Hook,2.809
8,Sewells_Point,2.706
9,The_Battery,3.192


In [6]:
df_hourly

Unnamed: 0,time,station_name,latitude,longitude,sea_level
0,1950-01-01 00:00:00.000000,Annapolis,38.983280,-76.481600,1.341
1,1950-01-01 00:59:59.999997,Annapolis,38.983280,-76.481600,2.067
2,1950-01-01 02:00:00.000003,Annapolis,38.983280,-76.481600,1.890
3,1950-01-01 03:00:00.000000,Annapolis,38.983280,-76.481600,1.798
4,1950-01-01 03:59:59.999997,Annapolis,38.983280,-76.481600,1.646
...,...,...,...,...,...
7468699,2020-12-31 19:00:00.000007,The_Battery,40.700556,-74.014167,1.432
7468700,2020-12-31 20:00:00.000003,The_Battery,40.700556,-74.014167,1.481
7468701,2020-12-31 21:00:00.000000,The_Battery,40.700556,-74.014167,1.155
7468702,2020-12-31 22:00:00.000007,The_Battery,40.700556,-74.014167,1.187


In [7]:
df_hourly_2 = df_hourly.merge(thresholds_df, on='station_name', how='left')
df_hourly_2

Unnamed: 0,time,station_name,latitude,longitude,sea_level,flood_threshold
0,1950-01-01 00:00:00.000000,Annapolis,38.983280,-76.481600,1.341,2.104
1,1950-01-01 00:59:59.999997,Annapolis,38.983280,-76.481600,2.067,2.104
2,1950-01-01 02:00:00.000003,Annapolis,38.983280,-76.481600,1.890,2.104
3,1950-01-01 03:00:00.000000,Annapolis,38.983280,-76.481600,1.798,2.104
4,1950-01-01 03:59:59.999997,Annapolis,38.983280,-76.481600,1.646,2.104
...,...,...,...,...,...,...
7468699,2020-12-31 19:00:00.000007,The_Battery,40.700556,-74.014167,1.432,3.192
7468700,2020-12-31 20:00:00.000003,The_Battery,40.700556,-74.014167,1.481,3.192
7468701,2020-12-31 21:00:00.000000,The_Battery,40.700556,-74.014167,1.155,3.192
7468702,2020-12-31 22:00:00.000007,The_Battery,40.700556,-74.014167,1.187,3.192


In [8]:
# ============================================
# Daily Aggregation + Flood Flag
# ============================================
df_daily = df_hourly_2.groupby(['station_name', pd.Grouper(key='time', freq='D')]).agg({
    'sea_level': 'mean',
    'latitude': 'first',
    'longitude': 'first',
    'flood_threshold': 'first'
}).reset_index()
df_daily

Unnamed: 0,station_name,time,sea_level,latitude,longitude,flood_threshold
0,Annapolis,1950-01-01,2.299667,38.98328,-76.481600,2.104
1,Annapolis,1950-01-02,1.941625,38.98328,-76.481600,2.104
2,Annapolis,1950-01-03,1.562000,38.98328,-76.481600,2.104
3,Annapolis,1950-01-04,1.518958,38.98328,-76.481600,2.104
4,Annapolis,1950-01-05,1.922667,38.98328,-76.481600,2.104
...,...,...,...,...,...,...
311191,Wilmington,2020-12-27,1.928364,34.22750,-77.953611,2.423
311192,Wilmington,2020-12-28,1.692818,34.22750,-77.953611,2.423
311193,Wilmington,2020-12-29,1.889909,34.22750,-77.953611,2.423
311194,Wilmington,2020-12-30,2.333591,34.22750,-77.953611,2.423


In [9]:
# 1) daily max from hourly
hourly_max = (
    df_hourly
    .groupby(["station_name", pd.Grouper(key="time", freq="D")])["sea_level"]
    .max()
    .reset_index(name="daily_max")
)

# 2) daily mean/std
df_daily = (
    df_hourly
    .set_index("time")
    .groupby("station_name")["sea_level"]
    .resample("D")
    .agg(daily_mean="mean", daily_std="std")
    .reset_index()
)

# 3) merge in daily max + thresholds
df_daily = df_daily.merge(hourly_max, on=["station_name", "time"], how="left")
df_daily = df_daily.merge(
    thresholds_df, 
    on="station_name", how="left"
)

# 4) flood label: max > threshold
df_daily["flood"] = (df_daily["daily_max"] > df_daily["flood_threshold"]).astype("int8")

In [10]:
df_daily

Unnamed: 0,station_name,time,daily_mean,daily_std,daily_max,flood_threshold,flood
0,Annapolis,1950-01-01,2.299667,1.476769,6.288,2.104,1
1,Annapolis,1950-01-02,1.941625,1.544821,6.105,2.104,1
2,Annapolis,1950-01-03,1.562000,1.159594,4.612,2.104,1
3,Annapolis,1950-01-04,1.518958,0.698805,3.200,2.104,1
4,Annapolis,1950-01-05,1.922667,0.614369,3.414,2.104,1
...,...,...,...,...,...,...,...
311191,Wilmington,2020-12-27,1.928364,1.373229,5.386,2.423,1
311192,Wilmington,2020-12-28,1.692818,0.804025,3.594,2.423,1
311193,Wilmington,2020-12-29,1.889909,0.559250,3.133,2.423,1
311194,Wilmington,2020-12-30,2.333591,0.734587,4.157,2.423,1


In [11]:
import numpy as np
import pandas as pd

df_daily = df_daily.sort_values(["station_name", "time"]).reset_index(drop=True)

# --- Seasonality (yearly cycle) ---
df_daily["day_of_year"] = df_daily["time"].dt.dayofyear
df_daily["doy_sin"] = np.sin(2 * np.pi * df_daily["day_of_year"] / 365.25)
df_daily["doy_cos"] = np.cos(2 * np.pi * df_daily["day_of_year"] / 365.25)

# --- Lag features (use daily_max as the main signal) ---
for lag in [1, 2, 3, 7, 14]:
    df_daily[f"max_lag_{lag}"] = df_daily.groupby("station_name")["daily_max"].shift(lag)
    df_daily[f"mean_lag_{lag}"] = df_daily.groupby("station_name")["daily_mean"].shift(lag)
    df_daily[f"flood_lag_{lag}"] = df_daily.groupby("station_name")["flood"].shift(lag)

# --- Rolling stats (shift(1) to avoid peeking at today's value) ---
for w in [3, 7, 14]:
    g = df_daily.groupby("station_name")["daily_max"]
    df_daily[f"max_roll_mean_{w}"] = g.shift(1).rolling(w).mean()
    df_daily[f"max_roll_std_{w}"]  = g.shift(1).rolling(w).std()
    df_daily[f"max_roll_min_{w}"]  = g.shift(1).rolling(w).min()
    df_daily[f"max_roll_max_{w}"]  = g.shift(1).rolling(w).max()

# --- Recent flood counts (very predictive) ---
for w in [7, 14, 30]:
    df_daily[f"flood_count_{w}"] = (
        df_daily.groupby("station_name")["flood"]
        .shift(1)
        .rolling(w)
        .sum()
    )

# --- Optional: distance-to-threshold style feature ---
df_daily["max_minus_threshold"] = df_daily["daily_max"] - df_daily["flood_threshold"]

# added: normalized distance (OOD-friendly)
df_daily["distance_ratio"] = df_daily["max_minus_threshold"] / (df_daily["flood_threshold"] + 1e-6)
# added: how close we got recently
df_daily["recent_max_minus_thr_7"] = (
    df_daily.groupby("station_name")["max_minus_threshold"].shift(1).rolling(7).max()
)

In [12]:
df_daily

Unnamed: 0,station_name,time,daily_mean,daily_std,daily_max,flood_threshold,flood,day_of_year,doy_sin,doy_cos,...,max_roll_mean_14,max_roll_std_14,max_roll_min_14,max_roll_max_14,flood_count_7,flood_count_14,flood_count_30,max_minus_threshold,distance_ratio,recent_max_minus_thr_7
0,Annapolis,1950-01-01,2.299667,1.476769,6.288,2.104,1,1,0.017202,0.999852,...,,,,,,,,4.184,1.988592,
1,Annapolis,1950-01-02,1.941625,1.544821,6.105,2.104,1,2,0.034398,0.999408,...,,,,,,,,4.001,1.901615,
2,Annapolis,1950-01-03,1.562000,1.159594,4.612,2.104,1,3,0.051584,0.998669,...,,,,,,,,2.508,1.192015,
3,Annapolis,1950-01-04,1.518958,0.698805,3.200,2.104,1,4,0.068755,0.997634,...,,,,,,,,1.096,0.520912,
4,Annapolis,1950-01-05,1.922667,0.614369,3.414,2.104,1,5,0.085906,0.996303,...,,,,,,,,1.310,0.622623,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311191,Wilmington,2020-12-27,1.928364,1.373229,5.386,2.423,1,362,-0.055879,0.998438,...,5.150214,1.410250,3.264,6.770,7.0,14.0,30.0,2.963,1.222864,4.319
311192,Wilmington,2020-12-28,1.692818,0.804025,3.594,2.423,1,363,-0.038696,0.999251,...,5.051357,1.334441,3.264,6.742,7.0,14.0,30.0,1.171,0.483285,4.319
311193,Wilmington,2020-12-29,1.889909,0.559250,3.133,2.423,1,364,-0.021501,0.999769,...,4.856714,1.334054,3.264,6.742,7.0,14.0,30.0,0.710,0.293025,4.319
311194,Wilmington,2020-12-30,2.333591,0.734587,4.157,2.423,1,365,-0.004301,0.999991,...,4.761143,1.409600,3.133,6.742,7.0,14.0,30.0,1.734,0.715641,4.319


In [13]:
# Keep only the most important / robust features (based on feature importance)
keep_features = [
    # flood memory
    "flood_lag_1","flood_lag_2","flood_lag_3","flood_lag_7","flood_lag_14",
    "flood_count_7","flood_count_14","flood_count_30",
    # magnitude
    "distance_ratio","max_minus_threshold",
    "max_lag_1","max_lag_2","max_lag_3","max_lag_7","max_lag_14",
]

# Use only features that exist in the dataframe (guards against naming differences)
feature_cols = [f for f in keep_features if f in df_daily.columns]

print("Using features:", feature_cols)

model_df = df_daily.dropna(subset=feature_cols + ['flood']).copy()
model_df


Using features: ['flood_lag_1', 'flood_lag_2', 'flood_lag_3', 'flood_lag_7', 'flood_lag_14', 'flood_count_7', 'flood_count_14', 'flood_count_30', 'distance_ratio', 'max_minus_threshold', 'max_lag_1', 'max_lag_2', 'max_lag_3', 'max_lag_7', 'max_lag_14']


Unnamed: 0,station_name,time,daily_mean,daily_std,daily_max,flood_threshold,flood,day_of_year,doy_sin,doy_cos,...,max_roll_mean_14,max_roll_std_14,max_roll_min_14,max_roll_max_14,flood_count_7,flood_count_14,flood_count_30,max_minus_threshold,distance_ratio,recent_max_minus_thr_7
30,Annapolis,1950-01-31,2.595875,1.164657,5.852,2.104,1,31,0.508356,0.861147,...,4.667714,1.493375,2.682,6.919,7.0,14.0,30.0,3.748,1.781368,4.419
31,Annapolis,1950-02-01,2.608500,1.712253,7.193,2.104,1,32,0.523094,0.852275,...,4.861500,1.455282,2.682,6.919,7.0,14.0,30.0,5.089,2.418725,4.419
32,Annapolis,1950-02-02,2.101875,1.689728,6.736,2.104,1,33,0.537677,0.843151,...,5.074857,1.566498,2.682,7.193,7.0,14.0,30.0,4.632,2.201520,5.089
33,Annapolis,1950-02-03,1.535500,1.156793,4.602,2.104,1,34,0.552101,0.833777,...,5.101000,1.592612,2.682,7.193,7.0,14.0,30.0,2.498,1.187262,5.089
34,Annapolis,1950-02-04,1.408333,0.611631,3.018,2.104,1,35,0.566362,0.824157,...,4.935500,1.507259,2.682,7.193,7.0,14.0,30.0,0.914,0.434410,5.089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311191,Wilmington,2020-12-27,1.928364,1.373229,5.386,2.423,1,362,-0.055879,0.998438,...,5.150214,1.410250,3.264,6.770,7.0,14.0,30.0,2.963,1.222864,4.319
311192,Wilmington,2020-12-28,1.692818,0.804025,3.594,2.423,1,363,-0.038696,0.999251,...,5.051357,1.334441,3.264,6.742,7.0,14.0,30.0,1.171,0.483285,4.319
311193,Wilmington,2020-12-29,1.889909,0.559250,3.133,2.423,1,364,-0.021501,0.999769,...,4.856714,1.334054,3.264,6.742,7.0,14.0,30.0,0.710,0.293025,4.319
311194,Wilmington,2020-12-30,2.333591,0.734587,4.157,2.423,1,365,-0.004301,0.999991,...,4.761143,1.409600,3.133,6.742,7.0,14.0,30.0,1.734,0.715641,4.319


In [14]:
horizon = 14

X_list, Y_list, meta = [], [], []  # meta keeps station/time for debugging

for station, sdf in model_df.groupby("station_name"):
    sdf = sdf.sort_values("time").reset_index(drop=True)

    # index i is "today"; we predict next 14 days
    for i in range(len(sdf) - horizon):
        X_list.append(sdf.loc[i, feature_cols].to_numpy())
        Y_list.append(sdf.loc[i+1:i+horizon, "flood"].to_numpy())
        meta.append((station, sdf.loc[i, "time"]))

X = np.vstack(X_list)
Y = np.vstack(Y_list)  # shape: (n_samples, 14)

In [15]:
# from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier

pos = int((Y == 1).sum())
neg = int((Y == 0).sum())
spw = neg / max(pos, 1)


clf = XGBClassifier(
    n_estimators=260,
    max_depth=4,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    gamma=0.1,
    reg_lambda=2.0,
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
    scale_pos_weight=spw,
)

clf.fit(X, Y)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [16]:
TRAINING_STATIONS = [
    'Annapolis','Atlantic_City','Charleston','Washington',
    'Wilmington','Eastport','Portland','Sewells_Point','Sandy_Hook'
]

TESTING_STATIONS = [
    'Lewes','Fernandina_Beach','The_Battery'
]

In [17]:
X_train, Y_train = [], []
X_test, Y_test = [], []

for station, sdf in model_df.groupby("station_name"):
    sdf = sdf.sort_values("time").reset_index(drop=True)

    for i in range(len(sdf) - horizon):
        features = sdf.loc[i, feature_cols].to_numpy()
        target = sdf.loc[i+1:i+horizon, "flood"].to_numpy()

        if station in TRAINING_STATIONS:
            X_train.append(features)
            Y_train.append(target)
        elif station in TESTING_STATIONS:
            X_test.append(features)
            Y_test.append(target)

X_train = np.vstack(X_train)
Y_train = np.vstack(Y_train)
X_test = np.vstack(X_test)
Y_test = np.vstack(Y_test)

In [18]:
from sklearn.model_selection import train_test_split

X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=42,
    stratify=Y_train.max(axis=1)  # stratify by "any flood in horizon"
)

In [19]:
def proba_class1(multi_clf, probas_list):
    """
    Returns (n_samples, n_outputs) array of P(class=1) for MultiOutputClassifier,
    handling cases where an output has only one class.
    """
    # If sklearn returns a single ndarray already, just use it
    if isinstance(probas_list, np.ndarray):
        # sometimes shape could be (n, n_outputs) already
        if probas_list.ndim == 2:
            return probas_list
        raise ValueError(f"Unexpected probas ndarray shape: {probas_list.shape}")

    out = []
    for j, p in enumerate(probas_list):
        est = multi_clf.estimators_[j]
        classes = getattr(est, "classes_", None)

        p = np.asarray(p)

        if p.ndim == 1:
            # already P(class=1)
            out.append(p)

        elif p.ndim == 2 and p.shape[1] == 2:
            # choose the column that corresponds to class 1
            if classes is None:
                out.append(p[:, 1])
            else:
                idx1 = int(np.where(classes == 1)[0][0]) if 1 in classes else None
                out.append(p[:, idx1] if idx1 is not None else np.zeros(p.shape[0]))

        elif p.ndim == 2 and p.shape[1] == 1:
            # only one class was seen during training for this output
            if classes is not None and len(classes) == 1 and classes[0] == 1:
                out.append(np.ones(p.shape[0]))   # always class 1
            else:
                out.append(np.zeros(p.shape[0]))  # class 1 never occurs

        else:
            raise ValueError(f"Unexpected proba shape for output {j}: {p.shape}")

    return np.column_stack(out)

In [20]:
clf.fit(X_tr, Y_tr)

val_probas_list = clf.predict_proba(X_val)
val_probs = proba_class1(clf, val_probas_list)   # (n_val, 14)
test_probas_list = clf.predict_proba(X_test)
test_probs = proba_class1(clf, test_probas_list) # (n_test, 14)

In [21]:
from sklearn.metrics import f1_score
import numpy as np

best_thr, best_f1 = 0.5, -1.0
for thr in np.linspace(0.05, 0.95, 91):
    Y_val_pred = (val_probs > thr).astype(int)
    f1 = f1_score(Y_val.flatten(), Y_val_pred.flatten())
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

prob_shift = 0.5 - best_thr
print("best_thr:", best_thr, "best_f1:", best_f1, "prob_shift:", prob_shift)

best_thr: 0.24 best_f1: 0.9802812839253416 prob_shift: 0.26


In [22]:
test_probs = np.clip(test_probs + prob_shift, 0, 1)
Y_pred = (test_probs > 0.5).astype(int)

y_true = Y_test.flatten()
y_pred = Y_pred.flatten()

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix
)

acc = accuracy_score(y_true, y_pred)
f1  = f1_score(y_true, y_pred)
mcc = matthews_corrcoef(y_true, y_pred)
cm  = confusion_matrix(y_true, y_pred)

print("Accuracy:", acc)
print("F1:", f1)
print("MCC:", mcc)
print("Confusion matrix:\n", cm)

Accuracy: 0.9551291602514224
F1: 0.9747867108508206
MCC: 0.7711922326043238
Confusion matrix:
 [[ 95395  25452]
 [ 23334 943073]]


In [23]:
import pandas as pd

importances = clf.feature_importances_
feat_imp = pd.DataFrame({
    "feature": feature_cols,
    "importance": importances
}).sort_values("importance", ascending=False)

print(feat_imp.head(40))

                feature  importance
2           flood_lag_3    0.236382
1           flood_lag_2    0.194738
0           flood_lag_1    0.178735
4          flood_lag_14    0.165135
7        flood_count_30    0.045811
3           flood_lag_7    0.042962
8        distance_ratio    0.034582
6        flood_count_14    0.030738
5         flood_count_7    0.016949
9   max_minus_threshold    0.014238
14           max_lag_14    0.010675
10            max_lag_1    0.008854
11            max_lag_2    0.008112
12            max_lag_3    0.006634
13            max_lag_7    0.005455


In [25]:
import pickle

artifact = {
    "model": clf,
    "feature_cols": feature_cols,
    "horizon": 14
}

with open("260222_reduced_model.pkl", "wb") as f:
    pickle.dump(artifact, f)