## Модель

In [1]:
import pandas as pd
import numpy as np
import collections
from surprise import Reader, Dataset, SVD, accuracy, KNNBasic
from surprise.model_selection import train_test_split
import pickle

In [None]:
ratings = pd.read_csv("../final_data/vydacha_rated.csv", usecols=["Штрих-код", "ИД читателя", "rating"])
ratings.rename({"Штрих-код": "itemid", "ИД читателя": "userid"}, inplace=True, axis=1)
# ratings["rating"] = 1.0
ratings.tail()

In [None]:
ratings.rename({"Штрих-код": "itemid", "ИД читателя": "userid"}, inplace=True, axis=1)

In [None]:
ratings.fillna(0, inplace=True)

In [None]:
count = collections.Counter(ratings.userid)

In [None]:
len(count)

In [None]:
reader = Reader(rating_scale=(0, 1))

In [None]:
data = Dataset.load_from_df(ratings, reader)
trainset, testset = train_test_split(data, test_size=.25)

In [None]:
algo = SVD(verbose=True)
# algo = KNNBasic(verbose=True)

In [None]:
algo.fit(trainset)

In [None]:
predictions = algo.test(testset)

In [None]:
# Then compute RMSE
accuracy.rmse(predictions)

### Train on full dataset

In [None]:
X = data.build_full_trainset()

In [None]:
algo = SVD(verbose=True)

In [None]:
algo.fit(X)

In [None]:
with open("recom_model.pickle","wb") as pickle_out:
    pickle.dump(algo, pickle_out)

In [None]:
algo.compute_similarities()

In [None]:
ratings[ratings["userid"] == 24001]

In [None]:
ratings.head(3)

In [None]:
algo.predict(uid=24001, iid=381000177719, verbose=True)

## Рекомендация книг для пользователей

In [2]:
from multiprocessing import Pool
import functools

In [3]:
# ratings2 = pd.read_csv(
#     "../final_data/vydacha_rated.csv",
#     usecols=[
#         "ИД читателя",
#         "Штрих-код",
#         "doc_id",
#         "p100a",
#         "p245a",
#         "p650a",
#         "p521a",
#         "rating",
#     ],
# )

In [4]:
# ratings.rename(
#     {
#         "Штрих-код": "itemid",
#         "ИД читателя": "userid",
#         "p100a": "author",
#         "p245a": "title",
#         "p650a": "category",
#         "p521a": "age"
#     },
#     inplace=True,
#     axis=1,
# )

In [5]:
# database = ratings.drop_duplicates(subset=["doc_id"]).copy()
# database[["age"]] = temp[["age"]].fillna(value="0+").copy()
# books = database.itemid.values.copy()

# database["age"].unique()

# database["age_cat"] = (
#     database["age"]
#     .map(
#         {
#             "0+": 0,
#             "16+": 16,
#             "12+": 12,
#             "18+": 18,
#             "6+": 6,
#             "16+ ; 18+": 16,
#             "6+ ; 0+": 0,
#             "0+ ; 6+": 0,
#             "6+ ; 12+": 6,
#         }
#     )
#     .values
# )

In [6]:
database = pd.read_csv("database.csv")

books = database.itemid.values.copy()

In [7]:
# readers_birthday = pd.read_csv(
#     "../final_data/Читатели.csv", usecols=["ID читателя", "Дата рождения"]
# )

# readers_birthday.rename(
#     {"ID читателя": "userid", "Дата рождения": "date"}, axis=1, inplace=True
# )

# readers_birthday["date"] = pd.to_datetime(readers_birthday["date"])

# readers_birthday["age"] = readers_birthday.date.apply(lambda x: 2020 - x.year)

In [8]:
readers_birthday = pd.read_csv("readers_birthday.csv")

algo = pickle.load(open("recom_model.pickle", "rb"))

In [10]:
def get_predictoins(input_iid, input_uid):
    return algo.predict(uid=input_uid, iid=input_iid, verbose=False).est

def get_books(input_uid):
    with Pool(15) as p:
        pred_rating = list(
            p.map(functools.partial(get_predictoins, input_uid=input_uid), books)
        )

    pred_book = [(rating, book) for rating, book in zip(pred_rating, books)]

    final_pred_books = [
        x[1] for x in sorted(pred_book, key=lambda x: x[0], reverse=True)[:30]
    ]
    user_age = (readers_birthday[readers_birthday.userid == input_uid]["age"]).values[0]
    df = database[database.itemid.isin(final_pred_books)]

    return df[(df.age_cat <= user_age)]

In [16]:
result = get_books(777)

In [19]:
result.head()

Unnamed: 0,userid,itemid,doc_id,author,title,category,age,rating,age_cat
365,479,381001176957,8264,Устинова Татьяна Витальевна,Седьмое небо,Художественная литература,0+,0.405556,0
14506,995,381001541557,115156,Пушкин Александр Сергеевич,Евгений Онегин; Драмы,Художественная литература,0+,0.737864,0
15245,7337,381000838665,141343,Твен Марк,Принц и нищий,Художественная литература,0+,1.0,0
21486,16292,381008170149,145403,Матюшкина Екатерина Александровна,Детективное бюро Фу-Фу и Кис-Киса,Художественная литература,0+,0.41954,0
26084,4377,381013343637,227102,Севела Эфраим,Викинг,Художественная литература,0+,0.61165,0


In [24]:
print("Основываясь на ваших предпочтениях мы рекомендуем вам следующие книги:")
i = 1
for _, row in result.iterrows():
    author = row["author"]
    title = row["title"]
    print(f"{i}) ", row["author"], '"'+row["title"]+ '"')
    i += 1
    if i == 5:
        break

Основываясь на ваших предпочтениях мы рекомендуем вам следующие книги:
1)  Устинова Татьяна Витальевна "Седьмое небо"
2)  Пушкин Александр Сергеевич "Евгений Онегин; Драмы"
3)  Твен Марк "Принц и нищий"
4)  Матюшкина Екатерина Александровна "Детективное бюро Фу-Фу и Кис-Киса"


In [28]:
key_words = result["category"].unique()

import string

table = str.maketrans("", "", string.punctuation)

with open("raw_dict.pickle","rb") as pickle_in:
    cat_to_vec = pickle.load(pickle_in)

request_words = [w.translate(table).lower() for w in " ".join(key_words).lower().split()]

request_input = np.array([cat_to_vec.get(key) for key in key_words if cat_to_vec.get(key) is not None])
embed_vector = compose_embedd_vector(words=request_input, age=[1, 1, 1, 1, 1])
age_request = user_age