# ICE Arrest Forecast Analysis
This notebook walks through loading the ICE arrest data, building features, training a Random Forest, validating on a hold-out period, and forecasting 12 months into the future. Each section includes explanations.

## 1. Imports
We start by importing the necessary libraries for data manipulation, modeling, and visualization.

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from pathlib import Path
import plotly.express as px

## 2. Constants & File Paths
Define directory structure, filenames, and model hyperparameters.

In [2]:
import os
print(os.getcwd())

# Project root and data paths
PROJECT_ROOT = Path().resolve().parents[1]
print(PROJECT_ROOT)
TARGET_FILE = "ICE_data.csv"
FILEPATH = PROJECT_ROOT / "cs506-final-project" / "data" / "raw" / TARGET_FILE

# Model & forecasting parameters
N_LAGS = 3
RF_ESTIMATORS = 100
RF_RANDOM_STATE = 42
HORIZON = 6       # months for validation
FREQ = "ME"       # month-end frequency

/home/pard3sh/School/cs506/cs506-final-project/notebooks
/home/pard3sh/School/cs506


## 3. Load & Rename Data
Read the CSV and rename columns for use in time series forecasting.

In [3]:
# Load dataset
df = pd.read_csv(FILEPATH)
df = df.rename(columns={
    "Month-Year": "ds",
    "Administrative Arrests": "y",
    "Country of Citizenship": "Citizenship"
})
# Parse dates
df["ds"] = pd.to_datetime(df["ds"], format="%b %Y")
df.head()

Unnamed: 0,Criminality,Area of Responsibility (AOR),Citizenship,Fiscal Year,Fiscal Quarter,Fiscal Month,ds,y
0,Criminal Conviction,Atlanta,COLOMBIA,2022,2,4,2022-01-01,10
1,Criminal Conviction,Atlanta,COLOMBIA,2022,3,8,2022-05-01,10
2,Criminal Conviction,Atlanta,COLOMBIA,2023,1,1,2022-10-01,12
3,Criminal Conviction,Atlanta,COLOMBIA,2023,2,5,2023-02-01,18
4,Criminal Conviction,Atlanta,COLOMBIA,2024,1,1,2023-10-01,16


## 4. Build Monthly Panel
Create a complete monthly time series for each citizenship, filling missing months with zero arrests.

In [4]:
# Unique citizenships and full date index
eths = df["Citizenship"].unique()
idx = pd.date_range(df["ds"].min(), df["ds"].max(), freq=FREQ)

# Build panel
panels = []
for eth in eths:
    sub = df[df["Citizenship"] == eth].set_index("ds")
    monthly = sub["y"].resample(FREQ).sum().reindex(idx, fill_value=0)
    tmp = monthly.to_frame().rename_axis("ds").reset_index()
    tmp["Citizenship"] = eth
    panels.append(tmp)
panel = pd.concat(panels, ignore_index=True)
panel.head()

Unnamed: 0,ds,y,Citizenship
0,2020-10-31,0,COLOMBIA
1,2020-11-30,0,COLOMBIA
2,2020-12-31,12,COLOMBIA
3,2021-01-31,10,COLOMBIA
4,2021-02-28,0,COLOMBIA


## 5. Feature Engineering
Create time index, seasonality encodings, lag features, and rolling statistics.

In [5]:
def make_features(data, n_lags=N_LAGS):
    g = data.sort_values("ds").copy()
    g["t"] = np.arange(len(g))
    g["month_sin"] = np.sin(2 * np.pi * g["ds"].dt.month / 12)
    g["month_cos"] = np.cos(2 * np.pi * g["ds"].dt.month / 12)
    for lag in range(1, n_lags + 1):
        g[f"y_lag{lag}"] = g["y"].shift(lag)
    g["roll_mean_3"] = g["y"].shift(1).rolling(3).mean()
    g["roll_std_3"] = g["y"].shift(1).rolling(3).std().fillna(0)
    return g.dropna()

feature_cols = ["t", "month_sin", "month_cos"] + [f"y_lag{i}" for i in range(1, N_LAGS+1)] + ["roll_mean_3", "roll_std_3"]


## 6. Train/Validation Split
Split the data temporally into a training set and a 6‑month validation window.

In [6]:
# Determine cutoff
last_date = panel["ds"].max()
offset = pd.tseries.frequencies.to_offset(FREQ)
cutoff = last_date - HORIZON * offset

train_panel = panel[panel["ds"] <= cutoff]
val_panel = panel[(panel["ds"] > cutoff) & (panel["ds"] <= last_date)]

## 7. Train Random Forest
Build the training matrix per citizenship and fit a single model on pooled data.

In [7]:
# Prepare training set
train_frames = []
for eth, grp in train_panel.groupby("Citizenship"):
    train_frames.append(make_features(grp))
train_df = pd.concat(train_frames, ignore_index=True)
X_train = train_df[feature_cols]
y_train = train_df["y"]

# Fit model
model = RandomForestRegressor(n_estimators=RF_ESTIMATORS, random_state=RF_RANDOM_STATE)
model.fit(X_train, y_train)

## 8. Walk‑Forward Validation
For each citizenship, forecast the next 6 months rolling forward, compute Euclidean error.

In [8]:
# Validate
val_preds = []
for eth, grp in train_panel.groupby("Citizenship"):
    buf = grp[["ds","y"]].reset_index(drop=True)
    future_dates = pd.date_range(start=cutoff+offset, periods=HORIZON, freq=FREQ)
    for dt in future_dates:
        feats = make_features(buf).iloc[[-1]][feature_cols]
        y_pred = model.predict(feats)[0]
        actual = val_panel[(val_panel.Citizenship==eth)&(val_panel.ds==dt)]["y"]
        y_true = actual.iloc[0] if not actual.empty else np.nan
        val_preds.append({"Citizenship":eth,"ds":dt,"y_true":y_true,"y_pred":y_pred})
        buf = pd.concat([buf,pd.DataFrame([{"ds":dt,"y":y_true}])],ignore_index=True)

val_df = pd.DataFrame(val_preds).dropna(subset=["y_true"])
mse = mean_squared_error(val_df["y_true"], val_df["y_pred"])
print(f"Validation MSE: {mse:.2f}")

mean_y = val_df["y_true"].mean()
std_y  = val_df["y_true"].std()
print(f"Mean true arrests/month: {mean_y:.1f}  (±{std_y:.1f})")
print(f"Model   RMSE: {np.sqrt(mse):.1f}")


from sklearn.metrics import mean_squared_error

# build naive: predict “y_t = y_{t-1}” -- needed to compare with naive approach to see if our model had any sort of sway
naive_preds = []
for eth, grp in train_panel.groupby("Citizenship"):
    buf = grp[["ds","y"]].reset_index(drop=True)
    future_dates = pd.date_range(start=cutoff+offset, periods=HORIZON, freq=FREQ)
    for dt in future_dates:
        y_prev = buf["y"].iloc[-1]
        naive_preds.append({"Citizenship": eth, "ds": dt, "y_naive": y_prev})
        # append the true future so the next step has the real last value
        actual = val_panel[(val_panel.Citizenship==eth)&(val_panel.ds==dt)]["y"].iloc[0]

naive_df = pd.DataFrame(naive_preds)
rmse_naive = mean_squared_error(val_df["y_true"], naive_df["y_naive"])
print(f"Naïve RMSE: {rmse_naive:.1f}")
print(f"Your RMSE:  {np.sqrt(mse):.1f}")

Validation MSE: 6711.42
Mean true arrests/month: 156.7  (±580.7)
Model   RMSE: 81.9
Naïve RMSE: 6629.0
Your RMSE:  81.9


# Naive Comparison 
1− (6629 / 82) ≈0.988

Compared to the Naive approach, our approach has a ~98.8% reduction in error. 

## 9. Forecast Next 12 Months
Roll forward each series 12 months to generate future predictions.

In [9]:
# Future forecast
FUTURE_HORIZON = 12
future_preds = []
for eth in eths:
    buf = panel[panel.Citizenship==eth][["ds","y"]].reset_index(drop=True)
    future_dates = pd.date_range(start=panel.ds.max()+offset, periods=FUTURE_HORIZON, freq=FREQ)
    for dt in future_dates:
        feats = make_features(buf).iloc[[-1]][feature_cols]
        y_fore = model.predict(feats)[0]
        future_preds.append({"Citizenship":eth,"ds":dt,"y_pred":y_fore})
        buf = pd.concat([buf,pd.DataFrame([{"ds":dt,"y":y_fore}])],ignore_index=True)

future_df = pd.DataFrame(future_preds)

## 10. Plot Forecasts
Visualize the 12‑month forecast for the top 5 citizenships and for all citizenships.

In [17]:
# Top 5 forecast
recent = panel[panel.ds > panel.ds.max() - 6*offset]
top5 = recent.groupby("Citizenship")["y"].sum().nlargest(5).index
fig1 = px.line(future_df[future_df.Citizenship.isin(top5)], x="ds", y="y_pred", color="Citizenship",
               title="12‑Month Forecast (Top 5)")
fig1.update_layout(xaxis_tickangle=-45)
fig1.show()

# All citizenships small multiples
fig2 = px.line(future_df, x="ds", y="y_pred", facet_col="Citizenship", facet_col_wrap=4, height=1500,
               title="12‑Month Forecast (All Citizenship)")
for anno in fig2.layout.annotations:
    anno.text = anno.text.split("=")[-1]
fig2.update_xaxes(tickangle=-45)
fig2.update_layout(showlegend=False)

fig2.show()