In [57]:
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so the random draws made further down are repeatable.
np.random.seed(0)

# Expected layout of X_train.csv: N_TRAJECTORIES trajectories of TRAJECTORY_LENGTH rows each.
N_TRAJECTORIES = 5000
TRAJECTORY_LENGTH = 257
VALIDATION_SPLIT = 0.2  # fraction of the training data used for validation

In [59]:
# Load the raw training table. Expected shape: (1285000, 14), i.e.
# 1285000 / 257 = 5000 complete trajectories.
data = pd.read_csv("X_train.csv")
n_lines = len(data)
# Sanity check: the file must contain exactly N_TRAJECTORIES full trajectories.
assert n_lines == N_TRAJECTORIES * TRAJECTORY_LENGTH

### Train/Validation Split

In [61]:
# array([0, 1, ..., 4999]) with the IDs of all trajectories
all_trajectories = np.arange(N_TRAJECTORIES)
# number of trajectories (e.g., 1000) held out for validation; driven by VALIDATION_SPLIT
# so that editing the constant actually changes the split
n_trajectories_val = int(N_TRAJECTORIES * VALIDATION_SPLIT)
# IDs (from 0 to 4999) of the validation trajectories, sampled without replacement
trajectories_val = np.random.choice(N_TRAJECTORIES, n_trajectories_val, replace=False)
# the train trajectory IDs are whatever remains after removing the validation ones
trajectories_train = np.setdiff1d(all_trajectories, trajectories_val)
n_trajectories_train = N_TRAJECTORIES - n_trajectories_val

# Row indices belonging to each set. They address arrays that hold TRAJECTORY_LENGTH - 1
# rows per trajectory (every row except the trajectory's first one), hence the stride.
rows_per_trajectory = TRAJECTORY_LENGTH - 1
row_offsets = np.arange(rows_per_trajectory)
# (n_trajectories, 1) * stride + (rows_per_trajectory,) -> one row of indices per trajectory,
# flattened trajectory by trajectory
validation_indices = (trajectories_val[:, None] * rows_per_trajectory + row_offsets).ravel()
train_indices = (trajectories_train[:, None] * rows_per_trajectory + row_offsets).ravel()

# every non-initial row must land in exactly one of the two sets
assert len(train_indices) + len(validation_indices) == N_TRAJECTORIES * rows_per_trajectory

In [62]:
def _drop_all_zero_rows(features, targets):
    """Drop rows whose feature columns (all but the last) are all zero or whose targets are all zero."""
    keep = features[:, :-1].any(axis=1) & targets.any(axis=1)
    return features[keep], targets[keep]


position_columns = ["x_1", "y_1", "x_2", "y_2", "x_3", "y_3"]
# True for every row that is not the first row of its trajectory
is_later_row = data.index % TRAJECTORY_LENGTH != 0

# Inputs: the positions found on each trajectory's first row, repeated once per later row,
# with that later row's t appended as the last column.
initial_positions = data.iloc[all_trajectories * TRAJECTORY_LENGTH][position_columns].to_numpy()
X = np.concatenate(
    (
        initial_positions.repeat(TRAJECTORY_LENGTH - 1, axis=0),
        data[is_later_row][["t"]].to_numpy(),
    ),
    axis=1,
)

# Targets: the positions recorded on every later row.
y = data[is_later_row][position_columns].to_numpy()

# Select each set's rows, then discard the all-zero rows from it.
X_train, y_train = _drop_all_zero_rows(X[train_indices], y[train_indices])
X_val, y_val = _drop_all_zero_rows(X[validation_indices], y[validation_indices])

### Baseline

In [64]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [65]:
# Baseline: standardize the inputs, then fit an ordinary least-squares model.
pipeline_baseline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("lr", LinearRegression()),
    ]
)
pipeline_baseline.fit(X_train, y_train)
y_pred = pipeline_baseline.predict(X_val)
# Validation RMSE, passed as (y_true, y_pred); left as the last expression so the cell displays it.
rms = root_mean_squared_error(y_val, y_pred)
rms

1.3156123818381802

In [79]:
def prepare_submission(pipeline, out_filename, test_filename="X_test.csv"):
    """Predict on the test set and write the predictions as a submission CSV.

    The test CSV is read with its first column as the row identifier, its second
    column as the time and the remaining columns as the initial positions. The
    feature columns are reordered to (positions..., time) before prediction.

    Parameters
    ----------
    pipeline : fitted estimator
        Any object exposing ``predict(X)`` that returns six values per row.
    out_filename : str or path-like
        Destination of the submission CSV (columns: Id, x_1, y_1, x_2, y_2, x_3, y_3).
    test_filename : str or path-like, default "X_test.csv"
        Location of the test CSV.
    """
    data_test = pd.read_csv(test_filename)
    columns = data_test.columns.tolist()
    id_column = data_test[columns[0]]
    # positions first, time last
    X_test_columns = columns[2:] + [columns[1]]
    # Pass a plain ndarray rather than a DataFrame: the pipelines in this notebook are
    # fitted on ndarrays, so a DataFrame would trigger scikit-learn's feature-name
    # warning and would break any step that slices its input positionally.
    X_test = data_test[X_test_columns].to_numpy()

    y_test = pipeline.predict(X_test)
    y_test_df = pd.DataFrame(y_test, columns=["x_1", "y_1", "x_2", "y_2", "x_3", "y_3"])
    y_test_df.insert(0, "Id", id_column)
    y_test_df.to_csv(out_filename, index=False)

In [81]:
# Write the baseline model's test-set predictions to "baseline-model.csv".
prepare_submission(pipeline_baseline, "baseline-model.csv")



### Polynomial Regression

In [83]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import RidgeCV

In [84]:
def validate_poly_regression(X_train, y_train, X_val, y_val, regressor=None, degrees=range(1, 15), max_features=None):
    """Fit a scaler -> polynomial features -> regressor pipeline per degree and keep the best one.

    Each candidate is fitted on ``(X_train, y_train)`` and scored by RMSE on
    ``(X_val, y_val)``; the score of every degree is printed as it is computed.

    Parameters
    ----------
    X_train, y_train : array-like
        Data the pipelines are fitted on.
    X_val, y_val : array-like
        Data the pipelines are scored on.
    regressor : estimator, optional
        Template for the final step. An unfitted copy is made for every degree, so
        the object passed in is never fitted. Defaults to ``LinearRegression()``.
    degrees : iterable of int
        Polynomial degrees to try.
    max_features : optional
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    (best_model, best_rms, best_degree)
        The fitted pipeline with the lowest validation RMSE, that RMSE and its degree.
        If ``degrees`` is empty this is ``(None, np.inf, 1)``.
    """
    from sklearn.base import clone  # local import: only this function needs it

    if regressor is None:
        regressor = LinearRegression()

    best_rms = np.inf
    best_model = None
    best_degree = 1
    for d in degrees:
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("poly", PolynomialFeatures(degree=d)),
            # Pipeline.fit fits its step objects in place. Sharing one regressor instance
            # across degrees would leave every pipeline (including the returned best one)
            # holding the coefficients of the last degree fitted, so each gets its own copy.
            ("regressor", clone(regressor)),
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_val)
        rms = root_mean_squared_error(y_val, y_pred)
        print(f"Degree {d}:\t{rms}")
        if rms < best_rms:
            best_rms = rms
            best_model = pipeline
            best_degree = d
    return best_model, best_rms, best_degree

# fraction of the training rows drawn for each repetition of the degree search
TRAIN_SUBSET = 0.05

n_train_rows = X_train.shape[0]
n_examples_poly = int(n_train_rows * TRAIN_SUBSET)
# best degree found in each repetition
degrees = []
for _ in range(10):
    # fresh random subset of the training rows (no repeats within a subset)
    examples_poly = np.random.choice(n_train_rows, n_examples_poly, replace=False)
    X_poly = X_train[examples_poly]
    y_poly = y_train[examples_poly]
    degree = validate_poly_regression(
        X_poly,
        y_poly,
        X_val,
        y_val,
        regressor=LinearRegression(),
        degrees=range(1, 5),
    )[2]
    degrees.append(degree)

# NOTES:
# - Look at the value of `n_output_features_`.
#   NOTE(review): the original note pointed at `validate_poly_regression.n_output_features_`;
#   presumably the attribute of the fitted PolynomialFeatures step is meant.
#   For the report, derive an expression that gives the number of features for a given
#   polynomial degree and a given number of initial features.
# - Pick the best PolynomialFeatures degree and, in a cell below, test a pipeline with several
#   RidgeCV values. The "best" degree is not necessarily the one with the lowest RMS;
#   take computation into account.
# - Plot a histogram of the values in `degrees`.

Degree 1:	1.3156620397000758
Degree 2:	1.2797493522311183
Degree 3:	1.2396277227105201
Degree 4:	1.1983344105136073
Degree 1:	1.315745705734393
Degree 2:	1.2804139633966338
Degree 3:	1.2409536341127765
Degree 4:	1.1974383421807977
Degree 1:	1.315663001254351
Degree 2:	1.2796200171114434
Degree 3:	1.239410845102844
Degree 4:	1.2082505645988686
Degree 1:	1.3156513665304541
Degree 2:	1.2805047257427924
Degree 3:	1.24242588270372
Degree 4:	1.2036115116951076
Degree 1:	1.3158078480921622
Degree 2:	1.2801106608133876
Degree 3:	1.239416615665233
Degree 4:	1.1983902416890981
Degree 1:	1.3158190070082123
Degree 2:	1.2808869972774277
Degree 3:	1.2485283230808177
Degree 4:	1.20214364727661
Degree 1:	1.3158246693670346
Degree 2:	1.2808922978504744
Degree 3:	1.2412462188614117
Degree 4:	1.2032635149574524
Degree 1:	1.315596353435877
Degree 2:	1.2801407384578927
Degree 3:	1.2397331994357053
Degree 4:	1.1974839616765305
Degree 1:	1.3157213140295294
Degree 2:	1.2807503933709146
Degree 3:	1.25665507526

In [85]:
# degree chosen from the experiments above
poly_degree = 3
# ridge regression that picks its own alpha from the candidates below
ridge_regressor = RidgeCV(alphas=[0.1, 1.0, 10.0], scoring="neg_root_mean_squared_error")
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("poly", PolynomialFeatures(degree=poly_degree)),
        ("regressor", ridge_regressor),
    ]
)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)
rms = root_mean_squared_error(y_val, y_pred)
print(rms)
# NOTES:
# - Try poly_degree as high as possible (max. 5).
# - Using Ridge seems to make no difference at all. This may be because we are using polynomials
#   of relatively low degree (this could be useful in the report). Maybe with poly_degree = 5
#   it makes a difference.

1.2398491225971104


In [86]:
# Inspect the fitted RidgeCV step of `pipeline`: the alpha it selected and the score of
# that choice. With scoring="neg_root_mean_squared_error" the score is a negated RMSE,
# so values closer to zero are better.
ridge_cv = pipeline.named_steps['regressor']
print(f"best alpha:\t{ridge_cv.alpha_}")
print(f"best score:\t{ridge_cv.best_score_}")

best alpha:	10.0
best score:	-1.3421121858473835


In [None]:
# Ridge was judged not to help, so the polynomial submission uses plain least squares on
# degree-3 polynomial features.
# NOTE(review): the previous comment said the submission "will just be 7", which matches
# nothing in this cell -- confirm the intended degree.
pipeline_poly = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("poly", PolynomialFeatures(degree=3)),
        ("regressor", LinearRegression()),
    ]
)
pipeline_poly.fit(X_train, y_train)
# write the test-set predictions first, then report the validation RMSE
prepare_submission(pipeline_poly, "polynomial_submission.csv")
y_pred = pipeline_poly.predict(X_val)
rms = root_mean_squared_error(y_val, y_pred)
print(rms)



### Feature Engineering

In [None]:
from sklearn.preprocessing import FunctionTransformer

In [None]:
# create new features (pairwise distances, pairwise angles and position sum of the three bodies)
def add_features(X):
    """Append geometric features of the three bodies to the input matrix.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_columns), n_columns >= 6
        Columns 0-5 are read as (x_1, y_1, x_2, y_2, x_3, y_3). Any further
        columns are passed through untouched.

    Returns
    -------
    ndarray of shape (n_samples, n_columns + 8)
        ``X`` followed by, in order: the distances d_1_2, d_1_3, d_2_3; the angles
        a_1_2, a_1_3, a_2_3; and the two components of the summed positions.
    """
    p_1, p_2, p_3 = X[:, 0:2], X[:, 2:4], X[:, 4:6]

    def distance(a, b):
        # Euclidean distance between two bodies, as a column vector
        return np.linalg.norm(a - b, axis=1).reshape(-1, 1)

    def angle(a, b):
        # Angle of the vector from body a to body b. arctan2 takes dy and dx separately,
        # so a zero dx never causes a division by zero (the previous arctan(dy / dx) gave
        # NaN for coincident bodies) and the quadrant of the direction is preserved.
        delta = b - a
        return np.arctan2(delta[:, 1], delta[:, 0]).reshape(-1, 1)

    # pairwise distances
    d_1_2 = distance(p_1, p_2)
    d_1_3 = distance(p_1, p_3)
    d_2_3 = distance(p_2, p_3)
    # pairwise angles (a_2_3 now uses x_3 - x_2; it previously subtracted y_1 by mistake)
    a_1_2 = angle(p_1, p_2)
    a_1_3 = angle(p_1, p_3)
    a_2_3 = angle(p_2, p_3)
    # sum of the three positions (the unnormalized "center of mass" feature; not divided by 3)
    cm = p_1 + p_2 + p_3

    return np.hstack([X, d_1_2, d_1_3, d_2_3, a_1_2, a_1_3, a_2_3, cm])

# Standardize, append the engineered features, expand to degree-2 polynomial terms and
# fit ordinary least squares. Note the engineered features are computed on the
# standardized coordinates, because the scaler runs first.
feature_steps = [
    ("scaler", StandardScaler()),
    ("distance_features", FunctionTransformer(add_features)),
    ("poly", PolynomialFeatures(degree=2)),
    ("regressor", LinearRegression()),
]
pipeline = Pipeline(feature_steps)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)
rms = root_mean_squared_error(y_val, y_pred)
print(rms)
# Recorded validation RMSE per configuration ("P2" presumably stands for the degree-2
# polynomial step):
# no feats -> 1.316
# only distances -> 1.301
# distances + angles -> 1.290
# distances + angles + cm -> 1.290
# P2 + only distances -> 1.252
# P2 + distances + angles -> 1.268
# P2 + distances + angles + cm -> 1.205 (13.8s)
# P2 + distances + cm -> 1.257
#
#
# FINAL: maybe do P3/4 + distances + angles + cm?


### k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
def validate_knn_regression(X_train, y_train, X_val, y_val, nns=range(30,40)):
    """Score a scaler -> k-NN regressor pipeline on the validation set for each neighbour count.

    Every pipeline is fitted on ``(X_train, y_train)`` and scored by RMSE on
    ``(X_val, y_val)``.

    Returns
    -------
    dict
        Maps each ``k`` in ``nns`` (in iteration order) to its validation RMSE.
    """
    def validation_rmse(n_neighbors):
        # fit one candidate and return its validation RMSE
        model = Pipeline([
            ("scaler", StandardScaler()),
            ("regressor", KNeighborsRegressor(n_neighbors=n_neighbors)),
        ])
        model.fit(X_train, y_train)
        return root_mean_squared_error(y_val, model.predict(X_val))

    return {k: validation_rmse(k) for k in nns}
        
# fraction of the training rows used when searching for the number of neighbours
KNN_SUBSET = 1

n_train_rows_knn = X_train.shape[0]
n_examples_knn = int(n_train_rows_knn * KNN_SUBSET)
# random subset of the training rows, drawn without replacement
examples_knn = np.random.choice(n_train_rows_knn, n_examples_knn, replace=False)
X_knn = X_train[examples_knn]
y_knn = y_train[examples_knn]
# validation RMSE per neighbour count; left as the last expression so the cell displays it
results = validate_knn_regression(X_knn, y_knn, X_val, y_val)
results

In [None]:
# k-NN submission with 32 neighbours.
# NOTE(review): the previous comment here was copied from the polynomial-regression cell and
# did not describe this model. The "poly" step below also looks like a leftover of that copy:
# the neighbour search above scores a scaler -> k-NN pipeline without polynomial features,
# so this exact configuration has not been validated -- confirm whether it is intended.
pipeline_knn = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("poly", PolynomialFeatures(degree=3)),
        ("regressor", KNeighborsRegressor(32)),
    ]
)
pipeline_knn.fit(X_train, y_train)
# write the test-set predictions first, then report the validation RMSE
prepare_submission(pipeline_knn, "knn_submission.csv")
y_pred = pipeline_knn.predict(X_val)
rms = root_mean_squared_error(y_val, y_pred)
print(rms)