In [52]:
import geopandas as gpd
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

Рестораны:
1. `chefmozaccepts.csv` (1314 строк): информация об оплате.
2. `chefmozcuisine.csv` (916 строк): типы кухни.
3. `chefmozhours4.csv` (2339 строк): часы работы.
4. `chefmozparking.csv` (702 строки): информация о парковке.
5. `geoplaces2.csv` (130 строк): геолокация и дополнительная информация.

Пользователи:
1. `usercuisine.csv` (330 строк): предпочтения кухни пользователей.
2. `userpayment.csv` (177 строк): способы оплаты.
3. `userprofile.csv` (138 строк): профили пользователей (например, курение, уровень дохода, активность).

Рейтинги:
1. `rating_final.csv` (1161 строк): оценки ресторанов пользователями по нескольким параметрам (еда, сервис).

### Функции

In [53]:
def create_matrix(df, n_users, n_items):
    matrix = np.zeros((n_users, n_items))
    for row in df.itertuples():
        matrix[row.user_idx, row.item_idx] = row.rating # Можно заменить колонку на 'food_rating' or 'service_rating'
    return matrix

In [54]:
# Функция для расчета RMSE
def calculate_rmse(R_test, model):
    xs, ys = R_test.nonzero()
    predicted = []
    actual = []
    # progress_bar = tqdm(total=len(xs), desc="Calculating RMSE")
    for x, y in zip(xs, ys):
        predicted.append(model.predict(x, y))
        actual.append(R_test[x, y])
        # progress_bar.update(1)
    # progress_bar.close()
    return np.sqrt(np.mean((np.array(predicted) - np.array(actual)) ** 2))


# Функция для расчета Average Precision (AP) для одного пользователя
def average_precision_at_k(recommended_items, relevant_items, k):
    # Проверяем, есть ли релевантные элементы
    if len(relevant_items) == 0:
        return 0.0
    
    score = 0.0
    num_hits = 0.0
    for i in range(1, k+1):
        if recommended_items[i-1] in relevant_items:
            num_hits += 1.0
            score += num_hits / i
    return score / min(len(relevant_items), k)


def mean_average_precision_at_k(model, R_train, R_test, top_k=50, grade=5):
    aps = []
    num_users = R_test.shape[0]
    for user in range(num_users):
        # Получаем релевантные айтемы из тестового набора (например, рейтинги >= 4)
        relevant_items = np.where(R_test[user, :] >= grade)[0]
        if len(relevant_items) == 0:
            continue  # Пропускаем пользователей без релевантных айтемов в тесте

        # Получаем предсказанные рейтинги для всех айтемов
        scores = model.predict(user, np.arange(model.num_items))
        
        # Убираем индексы из тренировочного набора данных
        train_items = np.where(R_train[user, :] > 0)[0]
        scores[train_items] = -np.inf  # Присваиваем им минимально возможное значение
        
        # Сортируем айтемы по убыванию предсказанных рейтингов
        recommended_items = np.argsort(-scores)

        # Вычисляем Average Precision для текущего пользователя
        ap = average_precision_at_k(recommended_items, relevant_items, top_k)
        aps.append(ap)

    return np.mean(aps)

In [55]:
def add_context_weight(row, name_column, type, rating='rating'):
    if row[name_column] == type:
        return row[rating] + 0.5  # добавляем вес
    else:
        return row[rating]

In [56]:
def split(data, test_size=0.3, random_state=42):
    n_users = data['userID'].nunique()
    n_items = data['placeID'].nunique()

    # Создание индексов для пользователей и фильмов
    user_ids = data['userID'].astype('category').cat.codes.values
    item_ids = data['placeID'].astype('category').cat.codes.values

    data['user_idx'] = user_ids
    data['item_idx'] = item_ids

    # Разделение данных на тренировочные и тестовые наборы
    train_df, test_df = train_test_split(data, test_size=test_size, random_state=random_state)
    return n_users, n_items, train_df, test_df

In [57]:
class SVD:
    def __init__(self, R, K=20, alpha=0.002, beta=0.02, iterations=100):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha  # Скорость обучения
        self.beta = beta  # Регуляризация
        self.iterations = iterations
        self.rmse_values = []  # Список для хранения значений RMSE
        self.map_values = []  # Список для хранения значений MAP


    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R > 0])

        self.samples = [
            (i, j, self.R[i, j])
            for i, j in zip(*np.nonzero(self.R))
        ]

        progress_bar = tqdm(total=self.iterations, desc="Training SVD")
        for iteration in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse = self.rmse()
            self.rmse_values.append(rmse)  # Сохраняем значение RMSE
            progress_bar.set_postfix({'RMSE': f"{rmse:.4f}"})
            progress_bar.update(1)
        progress_bar.close()

    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.predict(i, j)
            error = r - prediction

            # Обновление биасов
            self.b_u[i] += self.alpha * (error - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (error - self.beta * self.b_i[j])

            # Обновление скрытых факторов
            self.P[i, :] += self.alpha * (error * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (error * self.P[i, :] - self.beta * self.Q[j, :])


    def predict(self, i, j):
        return self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)


    def rmse(self):
        xs, ys = self.R.nonzero()
        predicted = []
        actual = []
        for x, y in zip(xs, ys):
            predicted.append(self.predict(x, y))
            actual.append(self.R[x, y])
        return np.sqrt(np.mean((np.array(predicted) - np.array(actual)) ** 2))

### Подготовка данных

In [58]:
# Загрузка файлов
accepts = pd.read_csv('data/restaurant/chefmozaccepts.csv')
cuisine = pd.read_csv('data/restaurant/chefmozcuisine.csv')
hours = pd.read_csv('data/restaurant/chefmozhours4.csv')
parking = pd.read_csv('data/restaurant/chefmozparking.csv')
geoplaces = gpd.read_file('data/restaurant/geoplaces2.csv')

user_cuisine = pd.read_csv('data/restaurant/usercuisine.csv')
user_payment = pd.read_csv('data/restaurant/userpayment.csv')
user_profile = pd.read_csv('data/restaurant/userprofile.csv')

rating = pd.read_csv('data/restaurant/rating_final.csv')

In [59]:
# Приведение всех placeID к строковому типу для merge
accepts['placeID'] = accepts['placeID'].astype(str)
cuisine['placeID'] = cuisine['placeID'].astype(str)
hours['placeID'] = hours['placeID'].astype(str)
parking['placeID'] = parking['placeID'].astype(str)
geoplaces['placeID'] = geoplaces['placeID'].astype(str)
rating['placeID'] = rating['placeID'].astype(str)
rating['placeID'] = rating['placeID'].astype(str)
cuisine['placeID'] = cuisine['placeID'].astype(str)

### Предварительная фильтрация

In [60]:
# Добавляем информацию о типе кухни и среднему чеку. Это будет использовано для фильтрации
data = rating.merge(cuisine, on='placeID', how='left') \
             .merge(geoplaces[['placeID', 'price']], on='placeID', how='left')

In [61]:
# Если кухня не мексиканская
data.loc[data['Rcuisine'] != 'Mexican', ['rating', 'food_rating', 'service_rating']] = 0

# Только дешевая кухня
# data.loc[data['price'].isin(['medium', 'high']), ['rating', 'food_rating', 'service_rating']] = 0

* Матрица рейтингов. Только по колонке "rating"

In [62]:
n_users, n_items, train_df, test_df = split(data)

R_train = create_matrix(train_df, n_users, n_items)
R_test = create_matrix(test_df, n_users, n_items)

In [63]:
param_grid = {
    'K': [2, 10, 50],
    'iterations': [10, 50, 100],
    'alpha': [0.001, 0.02, 0.2],
    'beta': [0.02]
}

results = []

for K in param_grid['K']:
    for iterations in param_grid['iterations']:
        for alpha in param_grid['alpha']:
            for beta in param_grid['beta']:
                # Запуск моделей
                svd = SVD(R_train, K=K, iterations=iterations, alpha=alpha, beta=beta)
                svd.train()
                svd_rmse = calculate_rmse(R_test, svd)
                svd_map_2 = mean_average_precision_at_k(svd, R_train, R_test, top_k=50, grade=2)
                svd_map_1 = mean_average_precision_at_k(svd, R_train, R_test, top_k=50, grade=1)
                results.append([K, iterations, alpha, beta, svd_rmse, svd_map_2, svd_map_1])

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

In [64]:
df = pd.DataFrame(results, columns=['K', 'Итерации', 'Alpha', 'Beta', 'SVD RMSE', 'SVD MAP 2', 'SVD MAP 1']).sort_values(['SVD MAP 2', 'SVD MAP 1'], ascending=False)
print(tabulate(df, headers='keys', tablefmt='psql'))  

+----+-----+------------+---------+--------+------------+-------------+-------------+
|    |   K |   Итерации |   Alpha |   Beta |   SVD RMSE |   SVD MAP 2 |   SVD MAP 1 |
|----+-----+------------+---------+--------+------------+-------------+-------------|
| 10 |  10 |         10 |   0.02  |   0.02 |   0.452507 |   0.237328  |   0.197209  |
| 24 |  50 |        100 |   0.001 |   0.02 |   0.461317 |   0.233085  |   0.196856  |
| 19 |  50 |         10 |   0.02  |   0.02 |   0.449413 |   0.232374  |   0.195875  |
| 21 |  50 |         50 |   0.001 |   0.02 |   0.475265 |   0.227508  |   0.191013  |
| 18 |  50 |         10 |   0.001 |   0.02 |   0.493369 |   0.220785  |   0.178029  |
| 12 |  10 |         50 |   0.001 |   0.02 |   0.476544 |   0.211106  |   0.168905  |
| 15 |  10 |        100 |   0.001 |   0.02 |   0.466944 |   0.191584  |   0.169009  |
| 13 |  10 |         50 |   0.02  |   0.02 |   0.470867 |   0.191198  |   0.144423  |
| 17 |  10 |        100 |   0.2   |   0.02 |   0.50129

### Контекстная рекомендация

In [65]:
data = rating.merge(cuisine, on='placeID', how='left') \
             .merge(geoplaces[['placeID', 'price']], on='placeID', how='left')

n_users, n_items, train_df, test_df = split(data)
train_df['rating'] = train_df.apply(add_context_weight, axis=1,args=('Rcuisine', 'Mexican'))
test_df.loc[test_df['Rcuisine'] != 'Mexican', ['rating', 'food_rating', 'service_rating']] = 0

R_train = create_matrix(train_df, n_users, n_items)
R_test = create_matrix(test_df, n_users, n_items)

In [66]:
param_grid = {
    'K': [2, 10, 50],
    'iterations': [10, 50, 100],
    'alpha': [0.001, 0.02],
    'beta': [0.02]
}

results = []

for K in param_grid['K']:
    for iterations in param_grid['iterations']:
        for alpha in param_grid['alpha']:
            for beta in param_grid['beta']:
                # Запуск моделей
                svd = SVD(R_train, K=K, iterations=iterations, alpha=alpha, beta=beta)
                svd.train()
                svd_rmse = calculate_rmse(R_test, svd)
                svd_map_2 = mean_average_precision_at_k(svd, R_train, R_test, top_k=50, grade=2)
                svd_map_1 = mean_average_precision_at_k(svd, R_train, R_test, top_k=50, grade=1)
                results.append([K, iterations, alpha, beta, svd_rmse, svd_map_2, svd_map_1])

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

In [67]:
df = pd.DataFrame(results, columns=['K', 'Итерации', 'Alpha', 'Beta', 'SVD RMSE', 'SVD MAP 2', 'SVD MAP 1']).sort_values(['SVD MAP 2', 'SVD MAP 1'], ascending=False)
print(tabulate(df, headers='keys', tablefmt='psql'))

+----+-----+------------+---------+--------+------------+-------------+-------------+
|    |   K |   Итерации |   Alpha |   Beta |   SVD RMSE |   SVD MAP 2 |   SVD MAP 1 |
|----+-----+------------+---------+--------+------------+-------------+-------------|
| 14 |  50 |         50 |   0.001 |   0.02 |   0.446261 |   0.257107  |   0.214148  |
| 12 |  50 |         10 |   0.001 |   0.02 |   0.47634  |   0.243842  |   0.204735  |
|  8 |  10 |         50 |   0.001 |   0.02 |   0.450366 |   0.243083  |   0.215028  |
| 10 |  10 |        100 |   0.001 |   0.02 |   0.443846 |   0.234929  |   0.198726  |
|  6 |  10 |         10 |   0.001 |   0.02 |   0.478632 |   0.229243  |   0.195271  |
| 16 |  50 |        100 |   0.001 |   0.02 |   0.442672 |   0.207717  |   0.17811   |
|  7 |  10 |         10 |   0.02  |   0.02 |   0.455672 |   0.202269  |   0.189521  |
| 13 |  50 |         10 |   0.02  |   0.02 |   0.462968 |   0.192889  |   0.166354  |
| 15 |  50 |         50 |   0.02  |   0.02 |   0.55733

### Classic SVD

In [68]:
data = rating.merge(cuisine, on='placeID', how='left') \
             .merge(geoplaces[['placeID', 'price']], on='placeID', how='left')

In [71]:
n_users, n_items, train_df, test_df = split(data)

test_df.loc[test_df['Rcuisine'] != 'Mexican', ['rating', 'food_rating', 'service_rating']] = 0

R_train = create_matrix(train_df, n_users, n_items)
R_test = create_matrix(test_df, n_users, n_items)

In [72]:
param_grid = {
    'K': [2, 10, 50],
    'iterations': [10, 50, 100],
    'alpha': [0.001, 0.02, 0.2],
    'beta': [0.02]
}

results = []

for K in param_grid['K']:
    for iterations in param_grid['iterations']:
        for alpha in param_grid['alpha']:
            for beta in param_grid['beta']:
                # Запуск моделей
                svd = SVD(R_train, K=K, iterations=iterations, alpha=alpha, beta=beta)
                svd.train()
                svd_rmse = calculate_rmse(R_test, svd)
                svd_map_2 = mean_average_precision_at_k(svd, R_train, R_test, top_k=50, grade=2)
                svd_map_1 = mean_average_precision_at_k(svd, R_train, R_test, top_k=50, grade=1)
                results.append([K, iterations, alpha, beta, svd_rmse, svd_map_2, svd_map_1])

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/10 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/50 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

Training SVD:   0%|          | 0/100 [00:00<?, ?it/s]

In [73]:
df = pd.DataFrame(results, columns=['K', 'Итерации', 'Alpha', 'Beta', 'SVD RMSE', 'SVD MAP 2', 'SVD MAP 1']).sort_values(['SVD MAP 2', 'SVD MAP 1'], ascending=False)
print(tabulate(df, headers='keys', tablefmt='psql'))

+----+-----+------------+---------+--------+------------+-------------+-------------+
|    |   K |   Итерации |   Alpha |   Beta |   SVD RMSE |   SVD MAP 2 |   SVD MAP 1 |
|----+-----+------------+---------+--------+------------+-------------+-------------|
|  4 |   2 |         50 |   0.02  |   0.02 |   0.503334 |   0.126704  |   0.0730712 |
| 12 |  10 |         50 |   0.001 |   0.02 |   0.46152  |   0.0682597 |   0.0460555 |
| 18 |  50 |         10 |   0.001 |   0.02 |   0.485122 |   0.0529464 |   0.0420663 |
| 15 |  10 |        100 |   0.001 |   0.02 |   0.436162 |   0.0519864 |   0.0379007 |
| 21 |  50 |         50 |   0.001 |   0.02 |   0.456798 |   0.0506502 |   0.0404747 |
|  0 |   2 |         10 |   0.001 |   0.02 |   0.665919 |   0.0481157 |   0.101338  |
| 24 |  50 |        100 |   0.001 |   0.02 |   0.442905 |   0.0466611 |   0.0387294 |
|  1 |   2 |         10 |   0.02  |   0.02 |   0.522874 |   0.0445338 |   0.0331465 |
| 11 |  10 |         10 |   0.2   |   0.02 |   0.48566

### Выводы

1. SVD работает.
2. Предварительная фильтрация работает еще лучше. Также обучается быстрее, так как данных становится меньше.
3. Контекстная рекомендация работает еще лучше, чем предварительная фильтрация.
    * Требует меньше эпох дял обучения для более высокого качества, чем контекстная рекомендация.
