In [33]:
import pandas as pd
import numpy as np
import mlflow
from sklearn.preprocessing import LabelEncoder

In [15]:
!pip install implicit



In [27]:
history = pd.read_csv("User Listening History.csv")

0.7.2


In [17]:
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix

In [65]:
from implicit.evaluation import ndcg_at_k, mean_average_precision_at_k

In [79]:
mlflow.set_experiment("ials")

<Experiment: artifact_location='mlflow-artifacts:/117777105458341967', creation_time=1739880424523, experiment_id='117777105458341967', last_update_time=1739880424523, lifecycle_stage='active', name='ials', tags={}>

In [None]:
import os

os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [143]:
def split_data(history, test_size=0.2, random_state=42):
    # –°–æ–∑–¥–∞–µ–º –∫–æ–ø–∏—é, —á—Ç–æ–±—ã –Ω–µ –º–µ–Ω—è—Ç—å –∏—Å—Ö–æ–¥–Ω—ã–π DataFrame
    history = history.copy()
    np.random.seed(random_state)
    # –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º —Å–ª—É—á–∞–π–Ω–æ–µ —á–∏—Å–ª–æ –¥–ª—è –∫–∞–∂–¥–æ–π —Å—Ç—Ä–æ–∫–∏
    history["rand"] = np.random.rand(len(history))

    # –°—Ç—Ä–æ–∫–∏ —Å rand < test_size –ø–æ–ø–∞–¥—É—Ç –≤ —Ç–µ—Å—Ç–æ–≤—É—é –≤—ã–±–æ—Ä–∫—É
    test_samples = history[history["rand"] < test_size].drop(columns=["rand"])
    train_data = history[history["rand"] >= test_size].drop(columns=["rand"])

    # –§–æ—Ä–º–∏—Ä—É–µ–º —Ç–µ—Å—Ç–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ –∫–∞–∫ —Å–ª–æ–≤–∞—Ä—å {user_id: [track_id]}
    test_data = test_samples.groupby("user_id")["track_id"].apply(list).to_dict()
    return train_data, test_data

In [None]:
def prepare_data(history_path, minfo_path):
    history = pd.read_csv(history_path, dtype={"user_id": "str", "track_id": "str"})
    minfo = pd.read_csv(minfo_path, dtype={"track_id": "str"})

    history = history[
        (history["playcount"] > 0)
        & (history["user_id"].str.len() > 5)
        & (history["track_id"].isin(minfo["track_id"]))
    ].copy()
    history = history.groupby(["user_id", "track_id"])["playcount"].sum().reset_index()
    if len(history) == 0:
        raise ValueError("–ù–µ—Ç –¥–∞–Ω–Ω—ã—Ö –ø–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏")

    user_encoder = LabelEncoder()
    track_encoder = LabelEncoder()
    history["user_encoded"] = user_encoder.fit_transform(history["user_id"])
    history["track_encoded"] = track_encoder.fit_transform(history["track_id"])

    row = history["user_encoded"].values
    col = history["track_encoded"].values
    data = 1 + 0.5 * np.log1p(history["playcount"].values)

    sparse_matrix = csr_matrix(
        (data, (row, col)),
        shape=(len(user_encoder.classes_), len(track_encoder.classes_)),
    )

    # —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏—è –Ω–µ–∞–∫—Ç–∏–≤–Ω—ã—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π
    user_activity = np.diff(sparse_matrix.indptr)
    active_users_mask = user_activity > 0
    sparse_matrix = sparse_matrix[active_users_mask]

    active_user_ids = user_encoder.classes_[active_users_mask]
    user_encoder = LabelEncoder()
    user_encoder.fit(active_user_ids)

    return sparse_matrix, user_encoder, track_encoder, minfo

In [None]:
def train_model(sparse_matrix):
    user_item_matrix = sparse_matrix.tocsr()

    model = AlternatingLeastSquares(
        factors=64, regularization=0.5, iterations=50, random_state=42, dtype=np.float32
    )

    alpha = 10  # —Å–ª–∏—à–∫–æ–º –º–Ω–æ–≥–æ —Ç—Ä–µ–∫–æ–≤ —Å –Ω–∏–∑–∫–∏–º —á–∏—Å–ª–æ–º –ø—Ä–æ—Å–ª—É—à–∏–≤–∞–Ω–∏–π, –∏ –∫–æ–≥–¥–∞ –∑–∞–¥–∞–µ—à—å –±–æ–ª—å—à—É—é alpha, –º–æ–¥–µ–ª—å –Ω–∞—á–∏–Ω–∞–µ—Ç –∏—Ö –ø–µ—Ä–µ–æ—Ü–µ–Ω–∏–≤–∞—Ç—å
    model.fit(user_item_matrix * alpha)

    return model, user_item_matrix

In [None]:
def get_recommendations(
    user_id,
    model,
    user_encoder,
    track_encoder,
    minfo,
    user_item_matrix,
    history_df,
    top_k=10,
):
    try:
        if user_id not in user_encoder.classes_:
            return pd.DataFrame(columns=["artist", "name"])

        internal_id = user_encoder.transform([user_id])[0]

        if internal_id >= user_item_matrix.shape[0]:
            return pd.DataFrame(columns=["artist", "name"])

        # –ü–æ–ª—É—á–∞–µ–º —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏
        item_ids, scores = model.recommend(
            internal_id,
            user_item_matrix[internal_id],
            N=top_k * 2,  # –ë–µ—Ä–µ–º —Å –∑–∞–ø–∞—Å–æ–º
            filter_already_liked_items=False,
        )

        # –ø—Ä–æ—Å–ª—É—à–∞–Ω–Ω—ã–µ —Ç—Ä–µ–∫–∏ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è, —á—Ç–æ–±—ã —Ä–µ–∫–æ–º–µ–Ω–¥–∞–≤–∞–ª–∏—Å—å –Ω–µ –æ–Ω–∏
        listened_tracks = history_df[history_df["user_id"] == user_id][
            "track_id"
        ].unique()

        track_ids = track_encoder.inverse_transform(item_ids)
        recommendations = minfo[minfo["track_id"].isin(track_ids)]

        recommendations = recommendations[
            ~recommendations["track_id"].isin(listened_tracks)
        ]

        return recommendations.head(top_k)

    except Exception as e:
        print(f"–û—à–∏–±–∫–∞: {str(e)}")
        return pd.DataFrame(columns=["artist", "name"])

In [159]:
train, test = split_data(history, test_size=0.2)
sparse_matrix, user_encoder, track_encoder, minfo = prepare_data(
    "User Listening History.csv", "Music Info.csv"
)

mlflow.set_tracking_uri("http://127.0.0.1:5000")
with mlflow.start_run():

    # –ª–æ–≥–∏—Ä—É–µ–º –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –º–æ–¥–µ–ª–∏
    mlflow.log_param("factors", 64)
    mlflow.log_param("regularization", 0.5)
    mlflow.log_param("iterations", 50)
    mlflow.log_param("alpha", 10)
    mlflow.log_param("k", 20)

    model, user_item_matrix = train_model(sparse_matrix)

    user_id = "b80344d063b5ccb3212f76538f3d9e43d87dca9e"

    if user_id in user_encoder.classes_:
        recs = get_recommendations(
            user_id,
            model,
            user_encoder,
            track_encoder,
            minfo,
            user_item_matrix,
            history,
        )
        print(f"\n–¢–æ–ø-10 —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–π –¥–ª—è {user_id}:")
        for i, (artist, name) in enumerate(zip(recs["artist"], recs["name"]), 1):
            print(f"{i}. {artist} - {name}")
    else:
        print("–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å –Ω–µ –Ω–∞–π–¥–µ–Ω –≤ —Å–∏—Å—Ç–µ–º–µ")

    # –ø–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö –∫ –ª–æ–≥–∞–º –º–µ—Ç—Ä–∏–∫

    user_encoder = dict(zip(user_encoder.classes_, range(len(user_encoder.classes_))))
    track_encoder = dict(
        zip(track_encoder.classes_, range(len(track_encoder.classes_)))
    )
    train["user_encoded"] = train["user_id"].map(user_encoder)
    train["track_encoded"] = train["track_id"].map(track_encoder)

    train_csr = csr_matrix(
        (
            train["playcount"].values,
            (train["user_encoded"].values, train["track_encoded"].values),
        )
    )
    user_idxn, track_idxn = [], []

    for user, tracks in test.items():
        if user in user_encoder:  # –ü—Ä–æ–≤–µ—Ä—è–µ–º, –µ—Å—Ç—å –ª–∏ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å –≤ train
            for track in tracks:
                if track in track_encoder:  # –ü—Ä–æ–≤–µ—Ä—è–µ–º, –µ—Å—Ç—å –ª–∏ —Ç—Ä–µ–∫ –≤ train
                    user_idxn.append(user_encoder[user])
                    track_idxn.append(track_encoder[track])

    user_idxn = np.array(user_idxn)
    track_idxn = np.array(track_idxn)

    test_csr = csr_matrix(
        (np.ones_like(user_idxn), (user_idxn, track_idxn)),
        shape=(len(user_encoder), len(track_encoder)),
    )

    k = 10
    ndcg_at_10 = ndcg_at_k(model, train_csr, test_csr, 10)
    map_at_10 = mean_average_precision_at_k(model, train_csr, test_csr, 10)
    print(f"NDCG_{k}: {ndcg_at_10:.4f}, MAP_{k}: {map_at_10:.4f}")

    # –ª–æ–≥ –º–µ—Ç—Ä–∏–∫
    mlflow.log_metric("NDCG_10", ndcg_at_10)
    mlflow.log_metric("MAP_10", map_at_10)

    mlflow.sklearn.log_model(model, "ials")

  0%|          | 0/50 [00:00<?, ?it/s]


–¢–æ–ø-10 —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–π –¥–ª—è b80344d063b5ccb3212f76538f3d9e43d87dca9e:
1. Kings of Leon - Revelry
2. Fleet Foxes - Tiger Mountain Peasant Song
3. Fleet Foxes - Your Protector
4. Fleet Foxes - Sun It Rises
5. Jack Johnson - Staple It Together
6. Fleet Foxes - Oliver James
7. John Mayer - Heartbreak Warfare
8. Zero 7 - In the Waiting Line
9. Fleet Foxes - White Winter Hymnal
10. Jack Johnson - The News


  0%|          | 0/637276 [00:00<?, ?it/s]

  0%|          | 0/637276 [00:00<?, ?it/s]

NDCG_10: 0.3083, MAP_10: 0.2364




üèÉ View run shivering-boar-491 at: http://127.0.0.1:5000/#/experiments/117777105458341967/runs/6e266e528fe84a85b90b085e376a8856
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/117777105458341967
