In [7]:
import geopandas as gpd
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split



Рестораны:
1. `chefmozaccepts.csv` (1314 строк): информация об оплате.
2. `chefmozcuisine.csv` (916 строк): типы кухни.
3. `chefmozhours4.csv` (2339 строк): часы работы.
4. `chefmozparking.csv` (702 строки): информация о парковке.
5. `geoplaces2.csv` (130 строк): геолокация и дополнительная информация.

Пользователи:
1. `usercuisine.csv` (330 строк): предпочтения кухни пользователей.
2. `userpayment.csv` (177 строк): способы оплаты.
3. `userprofile.csv` (138 строк): профили пользователей (например, курение, уровень дохода, активность).

Рейтинги:
1. `rating_final.csv` (1161 строк): оценки ресторанов пользователями по нескольким параметрам (еда, сервис).

In [2]:
# Загрузка файлов
accepts = pd.read_csv('data/restaurant/chefmozaccepts.csv')
cuisine = pd.read_csv('data/restaurant/chefmozcuisine.csv')
hours = pd.read_csv('data/restaurant/chefmozhours4.csv')
parking = pd.read_csv('data/restaurant/chefmozparking.csv')
geoplaces = gpd.read_file('data/restaurant/geoplaces2.csv')

user_cuisine = pd.read_csv('data/restaurant/usercuisine.csv')
user_payment = pd.read_csv('data/restaurant/userpayment.csv')
user_profile = pd.read_csv('data/restaurant/userprofile.csv')

rating = pd.read_csv('data/restaurant/rating_final.csv')

In [3]:
n_users = rating['userID'].nunique()
n_items = rating['placeID'].nunique()

# Создание индексов для пользователей и фильмов
user_ids = rating['userID'].astype('category').cat.codes.values
item_ids = rating['placeID'].astype('category').cat.codes.values

rating['user_idx'] = user_ids
rating['item_idx'] = item_ids

# Разделение данных на тренировочные и тестовые наборы
train_df, test_df = train_test_split(rating, test_size=0.3, random_state=42)

# Матрица рейтингов
def create_matrix(df, n_users, n_items):
    matrix = np.zeros((n_users, n_items))
    for row in df.itertuples():
        matrix[row.user_idx, row.item_idx] = row.rating
    return matrix

R_train = create_matrix(train_df, n_users, n_items)
R_test = create_matrix(test_df, n_users, n_items)

In [4]:
class SVD:
    def __init__(self, R, K=20, alpha=0.002, beta=0.02, iterations=100):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha  # Скорость обучения
        self.beta = beta  # Регуляризация
        self.iterations = iterations
        self.rmse_values = []  # Список для хранения значений RMSE
        self.map_values = []  # Список для хранения значений MAP


    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R > 0])

        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]

        progress_bar = tqdm(total=self.iterations, desc="Training SVD")
        for iteration in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse = self.rmse()
            self.rmse_values.append(rmse)  # Сохраняем значение RMSE
            progress_bar.set_postfix({'RMSE': f"{rmse:.4f}"})
            progress_bar.update(1)
        progress_bar.close()

    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.predict(i, j)
            error = r - prediction

            # Обновление биасов
            self.b_u[i] += self.alpha * (error - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (error - self.beta * self.b_i[j])

            # Обновление скрытых факторов
            self.P[i, :] += self.alpha * (error * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (error * self.P[i, :] - self.beta * self.Q[j, :])


    def predict(self, i, j):
        return self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)


    def rmse(self):
        xs, ys = self.R.nonzero()
        predicted = []
        actual = []
        for x, y in zip(xs, ys):
            predicted.append(self.predict(x, y))
            actual.append(self.R[x, y])
        return np.sqrt(np.mean((np.array(predicted) - np.array(actual)) ** 2))

In [6]:
K = 10
iterations = 10000
alpha = 0.02
beta = 0.02

svd = SVD(R_train, K=K, iterations=iterations, alpha=alpha, beta=beta)
svd.train()





KeyboardInterrupt: 

Training SVD:   6%|▋         | 637/10000 [00:20<02:11, 71.04it/s, RMSE=0.0369]

In [3]:
# Приведение всех placeID к строковому типу
accepts['placeID'] = accepts['placeID'].astype(str)
cuisine['placeID'] = cuisine['placeID'].astype(str)
hours['placeID'] = hours['placeID'].astype(str)
parking['placeID'] = parking['placeID'].astype(str)
geoplaces['placeID'] = geoplaces['placeID'].astype(str)
rating['placeID'] = rating['placeID'].astype(str)

# Объединение таблиц по placeID
restaurants = accepts.merge(cuisine, on='placeID', how='left') \
                     .merge(hours, on='placeID', how='left') \
                     .merge(parking, on='placeID', how='left') \
                     .merge(geoplaces, on='placeID', how='left')

# Объединение информации о пользователях
users = user_cuisine.merge(user_payment, on='userID', how='left') \
                   .merge(user_profile, on='userID', how='left')

data = rating.merge(users, on='userID', how='left') \
             .merge(restaurants, on='placeID', how='left')

In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [5]:
# Пример: Фильтрация ресторанов по типу кухни и цене
filtered_restaurants = restaurants[(restaurants['Rcuisine'].isin(['Italian', 'Japanese', 'Mexican'])) &
                                   (restaurants['price'].isin(['medium', 'high']))]

In [6]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Подготовка данных для Surprise
reader = Reader(rating_scale=(0, 2))
surprise_data = Dataset.load_from_df(train[['userID', 'placeID', 'rating']], reader)
trainset = surprise_data.build_full_trainset()

# Обучение модели
algo = SVD()
algo.fit(trainset)

# Предсказания на тестовой выборке
test_filtered = test[test['placeID'].isin(filtered_restaurants['placeID'])]
predictions = algo.test(test_filtered[['userID', 'placeID', 'rating']].values.tolist())

In [7]:
def map_at_k(predictions, k=50):
    # Группируем предсказания по userID
    from collections import defaultdict
    user_pred = defaultdict(list)
    for pred in predictions:
        user_pred[pred.uid].append((pred.iid, pred.est))
    
    average_precisions = []
    for user, items in user_pred.items():
        # Сортируем по оценке
        items_sorted = sorted(items, key=lambda x: x[1], reverse=True)[:k]
        # Истинные релеванты
        true_items = test_filtered[test_filtered['userID'] == user]['placeID'].tolist()
        hits = 0
        sum_precisions = 0
        for i, (item, _) in enumerate(items_sorted, 1):
            if item in true_items:
                hits += 1
                sum_precisions += hits / i
        if hits > 0:
            average_precisions.append(sum_precisions / hits)
        else:
            average_precisions.append(0)
    
    return sum(average_precisions) / len(average_precisions)

map50 = map_at_k(predictions, k=50)
print(f'MAP@50: {map50}')

MAP@50: 1.0
