In [17]:
from final.cleaning import import_dataset, clean_whole_df, filter_df, clean_test_df, mongo_connect
from final.model_processing import partition, run_model
from final.dashboard import rate

In [18]:
host = 'mongodb://root:example@localhost'
port = 27017
db_name = 'movielens_2'

mongo_connect(host, port, db_name)

df = import_dataset(host, port, db_name)
cleaned_df = clean_whole_df(df)

In [19]:
%%time

filtering_opts = {
    "min_mean_rating": 1.5,
    "max_mean_rating": 4.5,
    "movies_threshold": 35,
    "movies_few_notes": True,
    "users_threshold": 45,
    "users_few_notes": True,
    "users_no_discriminating": True,
    "users_constant_dt": True,
}

filtered_df = filter_df(cleaned_df, **filtering_opts)

Nombre de ratings par utilisateur :
count    6040.000000
mean      163.412417
std       188.350206
min        16.000000
25%        44.000000
50%        95.000000
75%       204.000000
max      1999.000000
Name: count, dtype: float64


CPU times: total: 422 ms
Wall time: 1.27 s


In [20]:
partition_opts = {"test_size": 0.8, "mini_size": 0.03}

train_df, test_df, train_mini, test_mini = partition(filtered_df, partition_opts)
train_df.shape, test_df.shape

((42251, 4), (169007, 4))

In [21]:
opts = {
    "n_components": 10,
    "max_iter": 200,
    "normalize": {"should": True, "min": 1, "max": 5},
}

model, predict_matrix = run_model(train_df, opts)
model, predict_matrix.shape

(NMF(n_components=10), (10714678, 3))

In [22]:
cleaned_test_df = clean_test_df(train_df, test_df)

In [23]:
%%time

options = {
    "mse": True,
    "top_10": True,
    "bottom_10": True,
    "ndcg" : True
}


rating_train = rate(predict_matrix, train_df, options)
rating_test = rate(predict_matrix, cleaned_test_df, options)
rating_train_mse, rating_train_top_10, rating_train_bottom_10, rating_train_ndcg  = rating_train
rating_test_mse, rating_test_top_10, rating_test_bottom_10, rating_test_ndcg  = rating_test
rating_train, rating_test


CPU times: total: 1.73 s
Wall time: 3.38 s


([7.357743511108944,
  1.3320085600733722,
  1.6634821155609907,
  0.9701236690100279],
 [7.283043642232046,
  1.2918120066091427,
  1.6977694143565265,
  0.9736897438840799])

In [24]:
import mlflow


component = 10
iteration = 205

params = {
    'n_components' : component,
    'max_iter' : iteration
}



In [25]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [26]:


# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the loss metrics
    
    mlflow.log_metric("mse_test", rating_test_mse),
    mlflow.log_metric("top_10_test", rating_test_top_10),
    mlflow.log_metric("worse_10_test", rating_test_bottom_10),
    mlflow.log_metric("ndcg_test", rating_test_ndcg),
    mlflow.log_metric("mse_train", rating_train_mse),
    mlflow.log_metric("top_10_train", rating_train_top_10),
    mlflow.log_metric("worse_10_train", rating_train_bottom_10),
    mlflow.log_metric("ndcg_train", rating_train_ndcg)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("On training datas", f"On the training database, components {component}, iteration {iteration} ")

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="NMF_Model",
        signature=False,
        input_example=predict_matrix,
        registered_model_name=f"NMF_on_train_movies_users {component} components and {iteration} iteration max",
    )

Successfully registered model 'NMF_on_train_movies_users 10 components and 205 iteration max'.
2024/05/06 12:37:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: NMF_on_train_movies_users 10 components and 205 iteration max, version 1
Created version '1' of model 'NMF_on_train_movies_users 10 components and 205 iteration max'.
