# Assignment — Tasks 1.1, 1.2 and 2.1 (Inline, CSV-only)

Este notebook contém:
- **Task 1.1**: preparação de dados, verificação do número de linhas por trajetória (258 vs 257), deteção de anomalias e colisões, visualização de trajetórias (plots inline);
- **Task 1.2**: baseline `StandardScaler → LinearRegression`, RMSE (train/val/test), y–ŷ (inline), submissão `baseline-model.csv`;
- **Task 2.1**: função `validate_poly_regression(...)`, testes por grau (1–14), `LinearRegression` e `RidgeCV`, gráfico RMSE vs grau + nº de features, 10 execuções e histograma do grau ótimo.

**Nota:** Os gráficos são mostrados **inline** (não se guardam imagens). Apenas são guardados **CSVs** de submissão.


In [None]:
# Imports & config
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

%matplotlib inline

# Global configuration shared by the cells below.
SEED = 123  # single seed reused wherever randomness is involved

# Input CSV locations, relative to the notebook's working directory.
TRAIN_PATH, TEST_PATH = '../data/X_train.csv', '../data/X_test.csv'

# Split fractions for train / validation / local test.
SPLIT_TRAIN, SPLIT_VAL, SPLIT_TEST = 0.70, 0.15, 0.15

# The three fractions must add up to 1 (within floating-point tolerance).
assert abs((SPLIT_TRAIN + SPLIT_VAL + SPLIT_TEST) - 1.0) < 1e-9

## Task 1.1 — Data Preparation & Validation (Inline Plots, CSV-only)

In [None]:
# Load data
# Read the training table and the test table from the configured CSV paths.
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

print('Train shape:', train.shape, '| Test shape:', test.shape)
display(train.head())

# Unique time steps
# The number of distinct `t` values is used below as the row count of one trajectory.
unique_t = np.sort(train['t'].unique())
STEPS_PER_TRAJ = len(unique_t)
print('Unique time steps (t):', STEPS_PER_TRAJ, '| t[0]=', unique_t[0], '| t[-1]=', unique_t[-1])

# Trajectory/step indices
# Rows are assigned to trajectories purely by position: each consecutive chunk of
# STEPS_PER_TRAJ rows becomes one trajectory, and step_idx is the offset inside it.
# NOTE(review): this assumes trajectories are stored contiguously with exactly
# STEPS_PER_TRAJ rows each — confirm against the dataset description.
train = train.reset_index(drop=True)
train['traj_idx'] = train.index // STEPS_PER_TRAJ
train['step_idx'] = train.index % STEPS_PER_TRAJ

# Count per trajectory
# Rows per positional trajectory id, followed by summary statistics of those counts.
per_traj_counts = train.groupby('traj_idx').size()
display(per_traj_counts.describe())

# Check 258 lines per trajectory
# Counts how many trajectories have exactly `expected_lines` rows and prints the tally.
# NOTE(review): per_traj_counts is built from a positional split into chunks of
# STEPS_PER_TRAJ rows, so this tally mirrors STEPS_PER_TRAJ rather than an
# independent trajectory identifier — confirm this is the intended check.
expected_lines = 258
n_traj = per_traj_counts.shape[0]
n_exact_258 = int((per_traj_counts == expected_lines).sum())
print(f'Trajetórias com exatamente {expected_lines} linhas: {n_exact_258}/{n_traj}')
if STEPS_PER_TRAJ != expected_lines:
    # Printed (in Portuguese) when the number of unique t values differs from 258.
    print('NOTA: Este dataset tem', STEPS_PER_TRAJ, 'instantes únicos de t, não 258. '
          'Documenta nos slides. O código lida com ambos.')

# Broadcast each trajectory's step-0 positions onto all of its rows, stored
# under x0_*/y0_* column names.
is_first_step = train['step_idx'] == 0
init_rows = train.loc[
    is_first_step,
    ['traj_idx', 'x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3'],
].rename(
    columns={
        'x_1': 'x0_1', 'y_1': 'y0_1',
        'x_2': 'x0_2', 'y_2': 'y0_2',
        'x_3': 'x0_3', 'y_3': 'y0_3',
    }
)
train = train.merge(init_rows, on='traj_idx', how='left')

# Sanity check: the absolute velocities of all step-0 rows should sum to zero.
velocity_cols = ['v_x_1', 'v_y_1', 'v_x_2', 'v_y_2', 'v_x_3', 'v_y_3']
start_velocities = train.loc[train['step_idx'] == 0, velocity_cols]
vel_zero_sum = start_velocities.abs().sum().sum()
print('Soma abs de velocidades em t=0 (deve ser 0):', float(vel_zero_sum))

# Collision detection
# Position columns; a row where all of them are zero is treated as a collision marker.
target_cols = ['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']


def first_zero_step(g):
    """Return the position of the first all-zero target row in ``g``.

    ``g`` holds the rows of one trajectory. The result is the 0-based row
    position (an ``int``) of the first row whose absolute target values sum
    to exactly zero, or ``np.nan`` when no row qualifies.
    """
    row_magnitude = g[target_cols].abs().sum(axis=1)
    zero_positions = np.flatnonzero(row_magnitude.to_numpy() == 0.0)
    if zero_positions.size == 0:
        return np.nan
    return int(zero_positions[0])

# Per trajectory, locate the first step whose targets are all zero (NaN when the
# trajectory never reaches zero). Only the target columns are handed to apply():
# first_zero_step reads nothing else, and this avoids pandas' deprecated
# behaviour of passing the grouping column to the applied function.
first_zero = train.groupby('traj_idx', group_keys=False)[target_cols].apply(first_zero_step)
n_collide  = int(first_zero.notna().sum())
print('Trajetórias com colisão:', n_collide)

# Verify rule: after first zero, all next are zero
# Vectorised check: rather than re-filtering the whole frame once per colliding
# trajectory, flag every row at or after its trajectory's first zero step that
# is NOT all-zero, then count the distinct trajectories owning such rows.
row_is_zero = train[target_cols].abs().sum(axis=1) == 0.0
first_zero_of_row = train['traj_idx'].map(first_zero)
# Comparisons against NaN are False, so trajectories without a zero step never match.
after_first_zero = train['step_idx'] >= first_zero_of_row
violations = int(train.loc[after_first_zero & ~row_is_zero, 'traj_idx'].nunique())
print('Violam a regra "após zero, tudo zero":', violations)

# Keep only rows that still carry non-zero positions; all-zero rows are dropped.
position_magnitude = train[target_cols].abs().sum(axis=1)
valid_mask = position_magnitude.gt(0.0)
train_valid = train.loc[valid_mask].copy()

# Model inputs: time plus the initial positions of the three bodies.
feature_cols = [
    't',
    'x0_1', 'y0_1',
    'x0_2', 'y0_2',
    'x0_3', 'y0_3',
]
print('train_valid shape:', train_valid.shape)

### Visualizações (inline)

In [None]:
# Histogram of the step at which each colliding trajectory first hits zero.
if n_collide <= 0:
    print('Sem colisões detetadas para o histograma.')
else:
    collision_steps = first_zero.dropna().astype(int)
    plt.figure(figsize=(6, 4))
    collision_steps.plot.hist(bins=30)
    plt.title('Distribuição do primeiro step zero (colisão)')
    plt.xlabel('step_idx da colisão')
    plt.ylabel('freq')
    plt.tight_layout()
    plt.show()

def _plot_three_bodies(traj_rows, title):
    """Draw the x–y paths of the three bodies of one trajectory on a new figure."""
    ordered = traj_rows.sort_values('step_idx')
    plt.figure(figsize=(5, 5))
    for x_col, y_col, label in (
        ('x_1', 'y_1', 'Body 1'),
        ('x_2', 'y_2', 'Body 2'),
        ('x_3', 'y_3', 'Body 3'),
    ):
        plt.plot(ordered[x_col], ordered[y_col], label=label)
    plt.title(title)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.axis('equal')
    plt.legend()
    plt.tight_layout()
    plt.show()


# Three trajectories drawn at random (the global NumPy RNG is seeded first).
np.random.seed(SEED)
all_traj = train['traj_idx'].unique()
sample_trajs = np.random.choice(all_traj, size=min(3, len(all_traj)), replace=False)

for i, ti in enumerate(sample_trajs, start=1):
    _plot_three_bodies(train[train['traj_idx'] == ti], f'Trajetória {ti} (sample #{i})')

# One trajectory that contains a collision, when any exists.
if n_collide > 0:
    collide_tid = first_zero.dropna().index[0]
    _plot_three_bodies(
        train[train['traj_idx'] == collide_tid],
        f'Trajetória com colisão (id={collide_tid})',
    )

## Task 1.2 — Baseline: StandardScaler + LinearRegression

In [None]:
# Split by trajectory id: every trajectory lands in exactly one of
# train / validation / local test, so no trajectory is shared between subsets.
traj_ids = np.sort(train_valid['traj_idx'].unique())
n_traj = traj_ids.shape[0]

# First carve off the combined validation+test share, then divide that share.
holdout_frac = SPLIT_VAL + SPLIT_TEST
traj_train, traj_temp = train_test_split(
    traj_ids, test_size=holdout_frac, random_state=SEED, shuffle=True,
)
traj_val, traj_test = train_test_split(
    traj_temp, test_size=SPLIT_TEST / holdout_frac, random_state=SEED, shuffle=True,
)

def subset_by_trajs(df, trajs):
    """Return the rows of ``df`` whose ``traj_idx`` value is contained in ``trajs``."""
    belongs = df['traj_idx'].isin(trajs)
    return df.loc[belongs]

# Materialise the three row subsets from their trajectory id lists.
df_tr, df_va, df_te = (
    subset_by_trajs(train_valid, ids) for ids in (traj_train, traj_val, traj_test)
)

# Report how many trajectories each subset received, in absolute and relative terms.
split_sizes = [len(traj_train), len(traj_val), len(traj_test)]
print('Traj counts -> train/val/test:', *split_sizes)
print('Proporções (traj):', *(round(size / n_traj, 3) for size in split_sizes))

# Evidence that the subsets do not share initial conditions.
def initial_tuple(df):
    """Return the set of initial-condition tuples found in the step-0 rows of ``df``.

    Each tuple holds the values of ``x0_1, y0_1, x0_2, y0_2, x0_3, y0_3`` of one
    row whose ``step_idx`` equals 0.
    """
    at_start = df['step_idx'] == 0
    init_cols = ['x0_1', 'y0_1', 'x0_2', 'y0_2', 'x0_3', 'y0_3']
    return {tuple(row) for row in df.loc[at_start, init_cols].to_numpy()}

init_tr, init_va, init_te = (initial_tuple(part) for part in (df_tr, df_va, df_te))

# Union of the pairwise intersections: any tuple present in two subsets.
overlap_any = set()
for left, right in ((init_tr, init_va), (init_tr, init_te), (init_va, init_te)):
    overlap_any |= left & right

overlap_answer = 'SIM' if overlap_any else 'NÃO'
print('Sobreposição de condições iniciais entre conjuntos? ->', overlap_answer)

In [None]:
# Column roles: model inputs and the six position targets.
feature_cols = [
    't',
    'x0_1', 'y0_1',
    'x0_2', 'y0_2',
    'x0_3', 'y0_3',
]
target_cols = ['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']


def _to_xy(frame):
    """Split a subset frame into its (features, targets) NumPy arrays."""
    return frame[feature_cols].to_numpy(), frame[target_cols].to_numpy()


def _rmse(y_true, y_hat):
    """Square root of ``mean_squared_error(y_true, y_hat)``."""
    return np.sqrt(mean_squared_error(y_true, y_hat))


X_tr, y_tr = _to_xy(df_tr)
X_va, y_va = _to_xy(df_va)
X_te_local, y_te_local = _to_xy(df_te)

# Baseline: standardise the inputs, then fit a plain linear regression.
baseline = Pipeline(steps=[('scaler', StandardScaler()), ('linreg', LinearRegression())])
baseline.fit(X_tr, y_tr)

y_pred_tr, y_pred_va, y_pred_te = (
    baseline.predict(X) for X in (X_tr, X_va, X_te_local)
)

rmse_tr = _rmse(y_tr, y_pred_tr)
rmse_va = _rmse(y_va, y_pred_va)
rmse_te = _rmse(y_te_local, y_pred_te)

print(f'RMSE treino:      {rmse_tr:.6f}')
print(f'RMSE validação:   {rmse_va:.6f}')
print(f'RMSE teste local: {rmse_te:.6f}')

In [None]:
# True-vs-predicted scatter plots (validation set), shown inline.
def plot_y_yhat_inline(y_test, y_pred):
    """Show a 3x2 grid of true-vs-predicted scatter plots, one per target.

    At most 500 rows are drawn at random (without replacement, using NumPy's
    global RNG) and each panel also shows the identity line spanning the
    sampled true values.

    Parameters
    ----------
    y_test, y_pred : array
        2-D arrays indexed as ``[rows, column]``; columns 0..5 are plotted
        and labelled x_1, y_1, x_2, y_2, x_3, y_3.
    """
    target_names = ('x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3')
    max_points = 500
    n_rows = len(y_test)
    picked = np.random.choice(n_rows, min(max_points, n_rows), replace=False)
    true_sample = y_test[picked]
    pred_sample = y_pred[picked]

    plt.figure(figsize=(10, 10))
    for col, name in enumerate(target_names):
        truth = true_sample[:, col]
        lo, hi = np.min(truth), np.max(truth)
        ax = plt.subplot(3, 2, col + 1)
        ax.scatter(truth, pred_sample[:, col], s=8)
        ax.plot([lo, hi], [lo, hi])  # identity line: perfect predictions fall on it
        ax.set_xlabel('True ' + name)
        ax.set_ylabel('Predicted ' + name)
        ax.set_aspect('equal', 'box')
    plt.tight_layout()
    plt.show()


plot_y_yhat_inline(y_va, y_pred_va)

### Submissão Kaggle — `baseline-model.csv` (treino em train+val)

In [None]:
# Refit the baseline on train + validation rows before predicting the test table.
X_tr_full = pd.concat([df_tr[feature_cols], df_va[feature_cols]], axis=0).values
y_tr_full = pd.concat([df_tr[target_cols],  df_va[target_cols]],  axis=0).values

baseline_full = Pipeline([('scaler', StandardScaler()), ('linreg', LinearRegression())]).fit(X_tr_full, y_tr_full)

# Read the test inputs through feature_cols (instead of a second hard-coded
# list) so training and prediction cannot drift apart in column choice or order.
X_submit = test[feature_cols].values
y_submit = baseline_full.predict(X_submit)

# One prediction column per target, in the order the model was trained on,
# with the test row identifier as the first column.
submission = pd.DataFrame(y_submit, columns=target_cols)
submission.insert(0, 'Id', test['Id'].to_numpy())

# Create the output folder on demand: to_csv raises if the directory is missing.
out_path = Path('../outputs/baseline-model.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(out_path, index=False)
print('Wrote baseline-model.csv')

## Task 2.1 — Polynomial Regression Validation

In [None]:
def validate_poly_regression(X_train, y_train, X_val, y_val,
                             regressor=None,
                             degrees=range(1,15),
                             subset_frac=None,
                             random_state=None):
    """Fit polynomial-regression pipelines for several degrees and keep the best.

    For every degree a pipeline ``PolynomialFeatures -> StandardScaler ->
    regressor`` is fitted on the (optionally subsampled) training data and
    scored by RMSE on the validation data. One line per degree is printed and
    a plot of RMSE and number of polynomial features versus degree is shown.

    Parameters
    ----------
    X_train, y_train : ndarray
        Training inputs and targets (indexed positionally when subsampling).
    X_val, y_val : ndarray
        Validation inputs and targets used to compute the RMSE.
    regressor : None, 'linear', 'ridge' or estimator, default None
        ``None`` / ``'linear'`` selects ``LinearRegression``; ``'ridge'``
        selects ``RidgeCV`` over ``np.logspace(-4, 4, 13)``; any other
        non-string object is used as the final estimator. The estimator is
        cloned for each degree, so the object passed in is never fitted.
    degrees : iterable of int, default range(1, 15)
        Polynomial degrees to evaluate, in order.
    subset_frac : float or None, default None
        When strictly between 0 and 1, train on a random subset of
        ``max(1000, int(len(X_train) * subset_frac))`` rows, capped at the
        number of available rows. Otherwise all rows are used.
    random_state : int or None, default None
        Seed of the RNG used for subsampling.

    Returns
    -------
    best_model : Pipeline or None
        Fitted pipeline with the lowest validation RMSE (first one on ties);
        ``None`` when ``degrees`` is empty.
    best_rmse : float
        Validation RMSE of ``best_model`` (``np.inf`` when ``degrees`` is empty).
    results : dict
        Maps each degree to ``{'rmse': ..., 'n_features': ...}``.

    Raises
    ------
    ValueError
        If ``regressor`` is a string other than ``'linear'`` or ``'ridge'``.
    """
    if regressor is None:
        reg = LinearRegression()
    elif isinstance(regressor, str):
        if regressor == 'linear':
            reg = LinearRegression()
        elif regressor == 'ridge':
            reg = RidgeCV(alphas=np.logspace(-4, 4, 13))
        else:
            # Fail early instead of putting a bare string into the pipeline.
            raise ValueError(
                f"Unknown regressor {regressor!r}; expected 'linear', 'ridge' or an estimator."
            )
    else:
        reg = regressor

    # Materialise once: the degrees are needed both for the loop and for the
    # plot, and a one-shot iterator would be exhausted after the loop.
    degrees = list(degrees)

    # Optional subsample for speed
    if subset_frac is not None and 0 < subset_frac < 1.0:
        # Never request more rows than exist; choice(replace=False) would raise.
        n_sub = min(len(X_train), max(1000, int(len(X_train) * subset_frac)))
        rng = np.random.RandomState(random_state)
        idx = rng.choice(len(X_train), size=n_sub, replace=False)
        X_tr = X_train[idx]
        y_tr = y_train[idx]
    else:
        X_tr, y_tr = X_train, y_train

    best_rmse = np.inf
    best_model = None
    rmse_by_d = []
    feats_by_d = []
    results = {}

    for d in degrees:
        # Each pipeline gets its own unfitted copy of the regressor. Sharing one
        # instance would let later fits overwrite the coefficients inside the
        # pipeline already stored as best_model.
        pipe = Pipeline([
            ('poly', PolynomialFeatures(degree=d, include_bias=False)),
            ('scaler', StandardScaler()),
            ('reg', clone(reg)),
        ])
        pipe.fit(X_tr, y_tr)
        y_pred = pipe.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        n_out_feats = pipe.named_steps['poly'].n_output_features_
        results[d] = {'rmse': rmse, 'n_features': n_out_feats}
        rmse_by_d.append(rmse)
        feats_by_d.append(n_out_feats)
        print(f"grau={d:2d} | n_features={n_out_feats:5d} | RMSE_val={rmse:.6f}")
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = pipe

    # Plot inline: RMSE vs degree and number of features (twin axis)
    fig, ax1 = plt.subplots(figsize=(7,4))
    ax1.plot(degrees, rmse_by_d, marker='o')
    ax1.set_xlabel('Grau polinomial')
    ax1.set_ylabel('RMSE (validação)')
    ax2 = ax1.twinx()
    ax2.plot(degrees, feats_by_d, marker='x')
    ax2.set_ylabel('Nº de features')
    plt.title('RMSE vs Grau (linha) e Nº de Features (cruzes)')
    plt.tight_layout()
    plt.show()

    return best_model, best_rmse, results

### Run: LinearRegression

In [None]:
# Degree sweep (1-14) with plain linear regression on a 15% training subsample.
best_lin_model, best_lin_rmse, lin_results = validate_poly_regression(
    X_train=X_tr, y_train=y_tr, X_val=X_va, y_val=y_va,
    regressor='linear', degrees=range(1, 15), subset_frac=0.15, random_state=SEED,
)
print('\nMelhor (Linear): RMSE_val =', best_lin_rmse)

### Run: RidgeCV

In [None]:
# Degree sweep (1-14) with RidgeCV on a 15% training subsample.
best_ridge_model, best_ridge_rmse, ridge_results = validate_poly_regression(
    X_train=X_tr, y_train=y_tr, X_val=X_va, y_val=y_va,
    regressor='ridge', degrees=range(1, 15), subset_frac=0.15, random_state=SEED,
)
print('\nMelhor (RidgeCV): RMSE_val =', best_ridge_rmse)

### 10 execuções (distribuição do grau ótimo)

In [None]:
def run_many(n_runs=10, regressor='ridge', subset_frac=0.10, degrees=range(1,15)):
    """Repeat the degree sweep with a different subsampling seed per run.

    Run ``i`` (0-based) calls ``validate_poly_regression`` on the notebook-level
    ``X_tr, y_tr, X_va, y_va`` arrays with ``random_state=SEED + i`` and prints
    the degree with the lowest validation RMSE.

    Returns
    -------
    tuple of ndarray
        The selected degree of each run and the best validation RMSE of each run.
    """
    chosen_degrees, chosen_rmses = [], []
    for i in range(n_runs):
        _, rmse, res = validate_poly_regression(
            X_tr, y_tr, X_va, y_va,
            regressor=regressor,
            degrees=degrees,
            subset_frac=subset_frac,
            random_state=SEED + i,
        )
        # Degree with the lowest validation RMSE (the first one wins on ties).
        d_best = min(res, key=lambda deg: res[deg]['rmse'])
        chosen_degrees.append(d_best)
        chosen_rmses.append(rmse)
        print(f"[Run {i+1:02d}] Best degree = {d_best} | RMSE_val = {rmse:.6f}")
    return np.array(chosen_degrees), np.array(chosen_rmses)

best_deg_ridge, best_rmse_ridge = run_many(n_runs=10, regressor='ridge', subset_frac=0.10)

# Histogram of the selected degree, with one unit-wide bin centred on each degree 1..14.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(best_deg_ridge, bins=np.arange(0.5, 14.6, 1), rwidth=0.85)
ax.set_xticks(range(1, 15))
ax.set_xlabel('Grau selecionado (RidgeCV)')
ax.set_ylabel('Frequência em 10 execuções')
ax.set_title('Distribuição do grau ótimo (amostra 10 execuções)')
fig.tight_layout()
plt.show()

print('Graus escolhidos (RidgeCV):', best_deg_ridge.tolist())
# mode() returns the most frequent values in sorted order; take the first.
most_common_degree = pd.Series(best_deg_ridge).mode().iloc[0]
print('Mais frequente:', int(most_common_degree))