In [None]:
!pip install mlflow



In [21]:
import pandas as pd
import numpy as np
import mlflow
from sklearn.preprocessing import LabelEncoder

In [20]:
import pickle

In [None]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


In [None]:
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix

In [None]:
from implicit.evaluation import ndcg_at_k, mean_average_precision_at_k

In [30]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [31]:
# запуск MLflow UI
mlflow.set_tracking_uri("/content/drive/MyDrive/mlflow_runs")

In [32]:
mlflow.set_experiment("ials")

2025/03/25 21:50:20 INFO mlflow.tracking.fluent: Experiment with name 'ials' does not exist. Creating a new experiment.


<Experiment: artifact_location='/content/drive/MyDrive/mlflow_runs/689428780021007894', creation_time=1742939420650, experiment_id='689428780021007894', last_update_time=1742939420650, lifecycle_stage='active', name='ials', tags={}>

In [13]:
history = pd.read_csv("User Listening History.csv")

In [14]:
import os

os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [15]:
def split_data(history, test_size=0.2, random_state=42):
    history = history.copy()
    np.random.seed(random_state)
    history["rand"] = np.random.rand(len(history))

    test_samples = history[history["rand"] < test_size].drop(columns=["rand"])
    train_data = history[history["rand"] >= test_size].drop(columns=["rand"])

    test_data = test_samples.groupby("user_id")["track_id"].apply(list).to_dict()
    return train_data, test_data

In [16]:
def prepare_data(history_path, minfo_path):
    history = pd.read_csv(history_path, dtype={"user_id": "str", "track_id": "str"})
    minfo = pd.read_csv(minfo_path, dtype={"track_id": "str"})

    history = history[
        (history["playcount"] > 0)
        & (history["user_id"].str.len() > 5)
        & (history["track_id"].isin(minfo["track_id"]))
    ].copy()
    history = history.groupby(["user_id", "track_id"])["playcount"].sum().reset_index()

    user_encoder = LabelEncoder()
    track_encoder = LabelEncoder()
    history["user_encoded"] = user_encoder.fit_transform(history["user_id"])
    history["track_encoded"] = track_encoder.fit_transform(history["track_id"])

    row = history["user_encoded"].values
    col = history["track_encoded"].values
    data = 1 + 0.5 * np.log1p(history["playcount"].values)

    sparse_matrix = csr_matrix(
        (data, (row, col)),
        shape=(len(user_encoder.classes_), len(track_encoder.classes_)),
    )

    # фильтрация неактивных пользователей
    user_activity = np.diff(sparse_matrix.indptr)
    active_users_mask = user_activity > 0
    sparse_matrix = sparse_matrix[active_users_mask]

    active_user_ids = user_encoder.classes_[active_users_mask]
    user_encoder = LabelEncoder()
    user_encoder.fit(active_user_ids)

    return sparse_matrix, user_encoder, track_encoder, minfo

In [17]:
def train_model(sparse_matrix):
    user_item_matrix = sparse_matrix.tocsr()

    model = AlternatingLeastSquares(
        factors=64, regularization=0.5, iterations=50, random_state=42, dtype=np.float32
    )

    alpha = 10  # слишком много треков с низким числом прослушиваний, и когда задаешь большую alpha, модель начинает их переоценивать
    model.fit(user_item_matrix * alpha)

    return model, user_item_matrix

In [18]:
def get_recommendations(
    user_id,
    model,
    user_encoder,
    track_encoder,
    minfo,
    user_item_matrix,
    history_df,
    top_k=75,
):
    try:
        if user_id not in user_encoder.classes_:
            return pd.DataFrame(columns=["artist", "name"])

        internal_id = user_encoder.transform([user_id])[0]

        if internal_id >= user_item_matrix.shape[0]:
            return pd.DataFrame(columns=["artist", "name"])

        # Получаем рекомендации
        item_ids, scores = model.recommend(
            internal_id,
            user_item_matrix[internal_id],
            N=top_k * 3,  # Берем с запасом
            filter_already_liked_items=False,
        )

        # прослушанные треки пользователя, чтобы рекомендавались не они
        listened_tracks = history_df[history_df["user_id"] == user_id][
            "track_id"
        ].unique()

        track_ids = track_encoder.inverse_transform(item_ids)
        recommendations = minfo[minfo["track_id"].isin(track_ids)]

        recommendations = recommendations[
            ~recommendations["track_id"].isin(listened_tracks)
        ]

        return recommendations.head(top_k)

    except Exception as e:
        print(f"Ошибка: {str(e)}")
        return pd.DataFrame(columns=["artist", "name"])

In [33]:
train, test = split_data(history, test_size=0.2)
sparse_matrix, user_encoder, track_encoder, minfo = prepare_data(
    "User Listening History.csv", "Music Info.csv"
)


with mlflow.start_run():

    # логируем параметры модели
    mlflow.log_param("factors", 64)
    mlflow.log_param("regularization", 0.5)
    mlflow.log_param("iterations", 80)
    mlflow.log_param("alpha", 10)
    mlflow.log_param("k", 75)

    model, user_item_matrix = train_model(sparse_matrix)

    user_id = "b80344d063b5ccb3212f76538f3d9e43d87dca9e"

    if user_id in user_encoder.classes_:
        recs = get_recommendations(
            user_id,
            model,
            user_encoder,
            track_encoder,
            minfo,
            user_item_matrix,
            history,
        )
        print(f"\nТоп-75 рекомендаций для {user_id}:")
        for i, (artist, name) in enumerate(zip(recs["artist"], recs["name"]), 1):
            print(f"{i}. {artist} - {name}")
    else:
        print("Пользователь не найден в системе")

    # подготовка данных к логам метрик

    user_encoder = dict(zip(user_encoder.classes_, range(len(user_encoder.classes_))))
    track_encoder = dict(
        zip(track_encoder.classes_, range(len(track_encoder.classes_)))
    )
    train["user_encoded"] = train["user_id"].map(user_encoder)
    train["track_encoded"] = train["track_id"].map(track_encoder)

    train_csr = csr_matrix(
        (
            train["playcount"].values,
            (train["user_encoded"].values, train["track_encoded"].values),
        )
    )
    user_idxn, track_idxn = [], []

    for user, tracks in test.items():
        if user in user_encoder:
            for track in tracks:
                if track in track_encoder:
                    user_idxn.append(user_encoder[user])
                    track_idxn.append(track_encoder[track])

    user_idxn = np.array(user_idxn)
    track_idxn = np.array(track_idxn)

    test_csr = csr_matrix(
        (np.ones_like(user_idxn), (user_idxn, track_idxn)),
        shape=(len(user_encoder), len(track_encoder)),
    )

    k = 75
    ndcg_at_75 = ndcg_at_k(model, train_csr, test_csr, k)
    map_at_75 = mean_average_precision_at_k(model, train_csr, test_csr, k)
    print(f"NDCG_{k}: {ndcg_at_75:.4f}, MAP_{k}: {map_at_75:.4f}")

    # лог метрик
    mlflow.log_metric("NDCG_75", ndcg_at_75)
    mlflow.log_metric("MAP_75", map_at_75)

    mlflow.sklearn.log_model(model, "ials")

    # сохраняем модель и энкодеры для использования в боте
    with open("als_model.pkl", "wb") as f:
        pickle.dump(
            {
                "model": model,
                "user_encoder": user_encoder,
                "track_encoder": track_encoder,
                "minfo": minfo,
                "user_item_matrix": user_item_matrix,
                "history_df": history,
            },
            f,
        )

    # логируем артефакты
    mlflow.log_artifact("als_model.pkl")
    mlflow.log_artifact("Music Info.csv")
    mlflow.log_artifact("User Listening History.csv")

  0%|          | 0/50 [00:00<?, ?it/s]


Топ-75 рекомендаций для b80344d063b5ccb3212f76538f3d9e43d87dca9e:
1. Weezer - Buddy Holly
2. Creedence Clearwater Revival - Fortunate Son
3. Rage Against the Machine - Bulls on Parade
4. Fleetwood Mac - Go Your Own Way
5. The Strokes - Is This It
6. Kings of Leon - Revelry
7. Weezer - My Name Is Jonas
8. The Shins - Caring Is Creepy
9. Alice Cooper - Poison
10. Bon Iver - For Emma
11. The Shins - Phantom Limb
12. The Shins - Sleeping Lessons
13. Incubus - Pardon Me
14. Tom Petty - Free Fallin'
15. Bon Iver - Team
16. Fleet Foxes - Tiger Mountain Peasant Song
17. Creedence Clearwater Revival - Lookin' Out My Back Door
18. Korn - Word Up!
19. The Shins - So Says I
20. Fleet Foxes - Your Protector
21. Foo Fighters - DOA
22. Incubus - Dig
23. Death Cab for Cutie - The New Year
24. The Killers - Losing Touch
25. Fleet Foxes - Sun It Rises
26. Incubus - Nice To Know You
27. Kiss - Rock And Roll All Nite
28. Kings of Leon - Charmer
29. Fleet Foxes - Quiet Houses
30. Fleet Foxes - Heard Them 

  0%|          | 0/636794 [00:00<?, ?it/s]

  0%|          | 0/636794 [00:00<?, ?it/s]

NDCG_75: 0.3766, MAP_75: 0.2543


