In [1]:
from book_matrix import S21UserBookMatrixBuilder
from pipeline_manager import S21AgePipelineManager 

In [2]:
# Input CSVs, relative to the notebook's working directory.
RATINGS_CSV = '../../datasets/data/books/Ratings.csv'
USERS_CSV = '../../datasets/data/books/Users.csv'

# Three fractions handed to build_split; presumably train/val/test shares
# (they sum to 1.0) -- confirm against S21UserBookMatrixBuilder.
ratios = (0.7, 0.15, 0.15)

# Keyword options forwarded unchanged to build_split.
split_options = dict(
    seed=42,
    min_age=5,
    max_age=100,
    top_n_books=20000,
)

builder = S21UserBookMatrixBuilder(RATINGS_CSV, USERS_CSV)
split = builder.build_split(ratios, **split_options)

# Expose the pieces of the split under short names used by the cells below.
X_train = split.X_train
X_val = split.X_val
X_test = split.X_test
y = split.y

# One manager instance is shared by every fit/evaluate call in this notebook.
pm = S21AgePipelineManager(n_iter=5)

In [3]:
# Linear model fitted without a third (reducer) argument; it is reported
# under the "Ridge" label in the evaluation cell.
pm.fit_linear(X_train, y)
linear_raw = pm.get_linear()
lin_model, lin_params = linear_raw

In [4]:
# Same linear fit, with 'PCA' passed as the third argument (presumably the
# dimensionality-reduction step -- confirm in S21AgePipelineManager).
pm.fit_linear(X_train, y, 'PCA')
linear_pca = pm.get_linear()
lin_model_pca, lin_params_pca = linear_pca

In [5]:
# Same linear fit, with 'UMAP' passed as the third argument (presumably the
# dimensionality-reduction step -- confirm in S21AgePipelineManager).
pm.fit_linear(X_train, y, 'UMAP')
linear_umap = pm.get_linear()
lin_model_umap, lin_params_umap = linear_umap

In [6]:
# Forest model fitted without a third (reducer) argument; it is reported
# under the "RF" label in the evaluation cell.
pm.fit_forest(X_train, y)
forest_raw = pm.get_forest()
rf_model, rf_params = forest_raw

In [7]:
# Same forest fit, with 'PCA' passed as the third argument (presumably the
# dimensionality-reduction step -- confirm in S21AgePipelineManager).
pm.fit_forest(X_train, y, 'PCA')
forest_pca = pm.get_forest()
forest_model_pca, forest_params_pca = forest_pca

In [8]:
# Same forest fit, with 'UMAP' passed as the third argument (presumably the
# dimensionality-reduction step -- confirm in S21AgePipelineManager).
pm.fit_forest(X_train, y, 'UMAP')
forest_umap = pm.get_forest()
forest_model_umap, forest_params_umap = forest_umap

In [9]:
# Display labels and the fitted models they describe, kept in matching order.
model_labels = ("Ridge", "Ridge+PCA", "Ridge+UMAP", "RF", "RF+PCA", "RF+UMAP")
fitted_models = (
    lin_model,
    lin_model_pca,
    lin_model_umap,
    rf_model,
    forest_model_pca,
    forest_model_umap,
)

# Tuple of (label, model) pairs, in the order the results should be reported.
models = tuple(zip(model_labels, fitted_models))

# Report metrics for every model on the train and validation matrices.
pm.evaluate(models, X_train, X_val, y)

Ridge [train] -> MAE: 6.079, RMSE: 9.303, R2: 0.535
Ridge [val]   -> MAE: 13.696, RMSE: 19.862, R2: -1.117
Ridge+PCA [train] -> MAE: 11.071, RMSE: 13.587, R2: 0.009
Ridge+PCA [val]   -> MAE: 11.119, RMSE: 13.635, R2: 0.002
Ridge+UMAP [train] -> MAE: 11.109, RMSE: 13.627, R2: 0.003
Ridge+UMAP [val]   -> MAE: 11.142, RMSE: 13.659, R2: -0.001
RF [train] -> MAE: 5.879, RMSE: 8.194, R2: 0.640
RF [val]   -> MAE: 11.185, RMSE: 13.834, R2: -0.027
RF+PCA [train] -> MAE: 7.623, RMSE: 9.956, R2: 0.468
RF+PCA [val]   -> MAE: 11.032, RMSE: 13.560, R2: 0.013
RF+UMAP [train] -> MAE: 10.059, RMSE: 12.465, R2: 0.166
RF+UMAP [val]   -> MAE: 11.191, RMSE: 13.686, R2: -0.005


### Ridge
Дольше всего обучается модель с UMAP, затем идёт модель с PCA, а быстрее всего обучается модель без уменьшения размерности.
### Лес
Дольше всего также обучается модель с UMAP, затем идёт модель без уменьшения размерности, а быстрее всего обучилась модель с PCA.

---
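Модели дают слабые результаты (R2 на валидации не превышает 0.013) из-за слабой взаимосвязи между возрастом и тем, какие книги оценил пользователь. Есть переобучение: особенно оно заметно у моделей без уменьшения размерности — у Ridge R2 равен 0.535 на обучении и −1.117 на валидации, у RF — 0.640 и −0.027 соответственно.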