In [3]:
from book_matrix import S21UserBookMatrixBuilder
from pipeline_manager import S21AgePipelineManager

In [4]:
# Three fractions handed to build_split; presumably train / val / test shares,
# matching the X_train / X_val / X_test attributes read below — TODO confirm.
ratios = (0.7, 0.15, 0.15)

# NOTE(review): the two paths differ only in directory case ("Books" vs "books").
# On a case-sensitive filesystem these are different directories — confirm which
# spelling matches the repository layout.
builder = S21UserBookMatrixBuilder(
    '../../datasets/data/Books/Ratings.csv',
    '../../datasets/data/books/Users.csv',
)

# Fixed seed so the split is requested with the same seed on every run.
split = builder.build_split(ratios, seed=42, min_age=5, max_age=100, top_n_books=20000)

# Pull the pieces used by the later cells out of the split object.
X_train = split.X_train
X_val = split.X_val
X_test = split.X_test  # not referenced by the cells visible in this notebook
y = split.y

# One manager instance is shared by every fit_* / get_* / evaluate call below.
pm = S21AgePipelineManager(n_iter=5)

In [5]:
# Baseline linear fit: no third argument is passed to fit_linear here, unlike the
# 'PCA' / 'UMAP' cells. The result is labelled "Ridge" in the comparison cell.
# get_linear() is read immediately after fitting so these names capture this run
# (presumably the manager only exposes its most recent linear fit — TODO confirm).
pm.fit_linear(X_train, y)
lin_model, lin_params = pm.get_linear()

In [6]:
# Linear fit with 'PCA' as the third positional argument; presumably this selects a
# PCA dimensionality-reduction step inside the pipeline — TODO confirm in pipeline_manager.
# Read get_linear() right away and store under *_pca names so the next fit_linear
# call does not replace what these names refer to.
pm.fit_linear(X_train, y, 'PCA')
lin_model_pca, lin_params_pca = pm.get_linear()

In [7]:
# Linear fit with 'UMAP' as the third positional argument; presumably this selects a
# UMAP dimensionality-reduction step inside the pipeline — TODO confirm in pipeline_manager.
# The fitted model and its parameters are kept under *_umap names for the comparison cell.
pm.fit_linear(X_train, y, 'UMAP')
lin_model_umap, lin_params_umap = pm.get_linear()

In [8]:
# Baseline forest fit: no third argument is passed to fit_forest here, unlike the
# 'PCA' / 'UMAP' cells. The result is labelled "RF" in the comparison cell.
# NOTE(review): this cell uses the rf_* prefix while the next two cells use forest_*.
pm.fit_forest(X_train, y)
rf_model, rf_params = pm.get_forest()

In [9]:
# Forest fit with 'PCA' as the third positional argument; presumably this selects a
# PCA dimensionality-reduction step inside the pipeline — TODO confirm in pipeline_manager.
# get_forest() is read immediately so the *_pca names capture this run.
pm.fit_forest(X_train, y, 'PCA')
forest_model_pca, forest_params_pca = pm.get_forest()

In [10]:
# Forest fit with 'UMAP' as the third positional argument; presumably this selects a
# UMAP dimensionality-reduction step inside the pipeline — TODO confirm in pipeline_manager.
# The fitted model and its parameters are kept under *_umap names for the comparison cell.
pm.fit_forest(X_train, y, 'UMAP')
forest_model_umap, forest_params_umap = pm.get_forest()

In [11]:
# Registry of every fitted model, keyed by the label that evaluate() receives.
# dict preserves insertion order, so .items() yields the same (label, model)
# pairs in the same order as a hand-written tuple of pairs.
models = tuple(
    {
        # linear family
        "Ridge": lin_model,
        "Ridge+PCA": lin_model_pca,
        "Ridge+UMAP": lin_model_umap,
        # forest family
        "RF": rf_model,
        "RF+PCA": forest_model_pca,
        "RF+UMAP": forest_model_umap,
    }.items()
)

# Score every registered model on the train and validation features; the held-out
# X_test split is not passed here.
pm.evaluate(models, X_train, X_val, y)

Ridge [train] -> MAE: 6.079, RMSE: 9.303, R2: 0.535
Ridge [val]   -> MAE: 13.696, RMSE: 19.862, R2: -1.117
Ridge+PCA [train] -> MAE: 11.064, RMSE: 13.584, R2: 0.010
Ridge+PCA [val]   -> MAE: 11.118, RMSE: 13.634, R2: 0.002
Ridge+UMAP [train] -> MAE: 11.123, RMSE: 13.641, R2: 0.001
Ridge+UMAP [val]   -> MAE: 11.134, RMSE: 13.658, R2: -0.001
RF [train] -> MAE: 5.882, RMSE: 8.195, R2: 0.640
RF [val]   -> MAE: 11.187, RMSE: 13.835, R2: -0.027
RF+PCA [train] -> MAE: 7.303, RMSE: 9.625, R2: 0.503
RF+PCA [val]   -> MAE: 10.926, RMSE: 13.673, R2: -0.003
RF+UMAP [train] -> MAE: 9.943, RMSE: 12.316, R2: 0.186
RF+UMAP [val]   -> MAE: 11.180, RMSE: 13.687, R2: -0.005


### Ridge:
Дольше всего обучается модель с UMAP, затем идёт модель с PCA, а меньше всего времени занимает обучение без уменьшения размерности.
### Лес:
Здесь дольше всего также обучается модель с UMAP, затем идёт модель без уменьшения размерности, а быстрее всего обучилась модель с PCA.

---
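Все модели дают слабые результаты на валидации (R2 около нуля или ниже), вероятно, из-за слабой взаимосвязи между возрастом пользователя и тем, какие книги он оценил. Модели без уменьшения размерности заметно переобучаются: у Ridge R2 падает с 0.535 на обучении до -1.117 на валидации, у RF — с 0.640 до -0.027. RF+PCA тоже переобучается (0.503 против -0.003).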