In [3]:
import numpy as np
import pandas as pd
import gc

In [4]:
# Load the raw train and test tables from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")

In [None]:
# Give each frame the column it lacks, filled with a sentinel, so the rows can
# be told apart after stacking: train rows carry id == -1 and test rows carry
# target == -1.
# NOTE(review): assumes no real test id equals -1 - confirm against test.csv.
train["id"] = -1
test["target"] = -1
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, like append, keeps the original row labels of both frames.
data = pd.concat([train, test])
# Release the source frames so only the combined table stays in memory.
del train
del test
gc.collect()

In [None]:
# Encode the raw user ("msno") and song ("song_id") identifiers as integer
# category codes; the encoding is computed over train and test rows together
# so both parts share one id space.
data["user_id"] = data["msno"].astype("category").cat.codes.copy()
data["item_id"] = data["song_id"].astype("category").cat.codes.copy()
# `columns=` is required here: with a bare label list drop() looks the labels
# up in the row index (axis=0) and raises KeyError instead of removing the
# two identifier columns.
data.drop(columns=["msno", "song_id"], inplace=True)
gc.collect()
# Split the combined table back apart using the sentinel id written earlier:
# id == -1 marks training rows, anything else is a test row.
train_data = data[data["id"] == -1][["user_id", "item_id", "target"]]
test_data = data[data["id"] != -1][["user_id", "item_id", "id"]]
# The combined table is no longer needed once both parts are extracted.
del data
gc.collect()

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split - and every metric computed from it - is the
# same on each fresh run of the notebook.
SPLIT_SEED = 42

# Hold out 20% of the labelled rows for validation.
train_train, train_validation = train_test_split(
    train_data, train_size=0.8, random_state=SPLIT_SEED
)
# Only the two splits are used from here on.
del train_data
gc.collect()

In [None]:
# Build sparse matrices indexed as (user_id, item_id): rows are users and
# columns are items.
from scipy.sparse import coo_matrix

# Both matrices use the same coordinates: one entry per training interaction.
# NOTE(review): no explicit shape is passed, so the dimensions are presumably
# inferred from the indices present in train_train - confirm they also cover
# the ids that occur in the validation and test rows.
interaction_coords = (train_train["user_id"], train_train["item_id"])

# Observed target values, stored as float32.
sparse_train = coo_matrix(
    (train_train["target"].astype(np.float32), interaction_coords)
)

# Indicator matrix: 1.0 for every (user, item) pair that has a training row.
sample_weight = coo_matrix(
    (np.ones(train_train.shape[0]), interaction_coords)
)

In [None]:
# Helper that pretty-prints summary information about a sparse matrix.

def sparse_info(sparse_matrix) -> None:
    """Print a six-line summary of a sparse matrix.

    The report contains, in order: the matrix shape, the number of stored
    elements (``nnz``), the stored-element density (``nnz`` divided by the
    number of rows and then by the number of columns), and the mean, maximum
    and minimum of the stored values (``.data``).

    Args:
        sparse_matrix: object exposing ``shape``, ``nnz`` and ``data``.

    Returns:
        None. The summary is written to standard output.
    """
    dims = sparse_matrix.shape
    print("Размерности матрицы: {}".format(dims))

    stored = sparse_matrix.nnz
    print("Ненулевых элементов в матрице: {}".format(stored))

    # Successive division mirrors nnz / rows / cols.
    density = stored / dims[0] / dims[1]
    print("Доля ненулевых элементов: {}".format(density))

    values = sparse_matrix.data
    print("Среднее значение ненулевых элементов: {}".format(values.mean()))
    print("Максимальное значение ненулевых элементов: {}".format(values.max()))
    print("Минимальное значение ненулевых элементов: {}".format(values.min()))

In [None]:
# Baseline model: score every row with the mean target of the training split.
avg_target = train_train["target"].mean()
print(avg_target)


def get_recs(dataset, item_features):
    """Return the constant baseline score for every row of ``dataset``.

    Args:
        dataset: object with a ``shape`` attribute; only ``shape[0]`` (the
            row count) is used.
        item_features: accepted but not used by this baseline.

    Returns:
        A list of length ``dataset.shape[0]`` whose entries all equal the
        module-level ``avg_target``.
    """
    return [avg_target] * dataset.shape[0]


# The original cell also executed `train_validation["target"] = avg_target`,
# which replaced the ground-truth validation labels with the constant
# prediction; the validation AUC was then computed against a single-valued
# y_true. The labels are now left untouched and predictions come from
# get_recs() only.
# NOTE(review): if the intent was to fill a baseline submission column, assign
# it to test_data instead - confirm with the author.
item_features = None

In [None]:
from sklearn.metrics import roc_auc_score

# Score the training split with the current recommender and report ROC AUC.
train_recs = get_recs(train_train, item_features)
train_auc = roc_auc_score(train_train["target"], train_recs)
print("train AUC: {}".format(train_auc))

# Same for the held-out rows; the printed label says "test", but the data is
# the train_validation split.
validation_recs = get_recs(train_validation, item_features)
validation_auc = roc_auc_score(train_validation["target"], validation_recs)
print("test AUC: {}".format(validation_auc))