In [None]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [None]:

# Load the raw data
events = pd.read_csv("dfev.csv")
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the two part files overlap, which breaks any later
# label-based alignment or reindexing.
properties = pd.concat(
    [
        pd.read_csv("item_properties_part1.csv.zip"),
        pd.read_csv("item_properties_part2.csv.zip"),
    ],
    ignore_index=True,
)
categories = pd.read_csv("category_tree.csv")

# Derive calendar features from the millisecond epoch timestamps
events['event_datetime'] = pd.to_datetime(events['timestamp'], unit='ms')
properties['event_datetime'] = pd.to_datetime(properties['timestamp'], unit='ms')

# Vectorised .dt accessors replace a Python-level lambda call per row.
events['day_of_week'] = events['event_datetime'].dt.weekday
events['Year'] = events['event_datetime'].dt.year
events['Month'] = events['event_datetime'].dt.month
events['Day'] = events['event_datetime'].dt.day
events['Hour'] = events['event_datetime'].dt.hour
events['minute'] = events['event_datetime'].dt.minute

def get_time_periods(hour):
    """Map an hour of the day to a coarse day-period label.

    Args:
        hour: Hour value compared against the window bounds below.

    Returns:
        'Dawn' for [3, 7), 'Morning' for [7, 12), 'Afternoon' for [12, 16),
        'Evening' for [16, 22), and 'Night' for anything else.
    """
    # Half-open [start, end) windows, checked in order.
    day_windows = (
        (3, 7, 'Dawn'),
        (7, 12, 'Morning'),
        (12, 16, 'Afternoon'),
        (16, 22, 'Evening'),
    )
    for start, end, label in day_windows:
        if start <= hour < end:
            return label
    # Everything outside the windows above (including 22:00-02:59) is night.
    return 'Night'

# Label every event with the part of the day it happened in
events['Day Period'] = events['Hour'].map(get_time_periods)

# Keep purchase events only
transaction_events = events.loc[events['event'].eq('transaction')]

# Train/test split without shuffling: row order is preserved and the final
# 20% of the rows become the test set.
# NOTE(review): this is only a chronological split if dfev.csv is already
# ordered by timestamp - confirm.
train, test = train_test_split(
    transaction_events,
    test_size=0.2,
    shuffle=False,
)

# Preprocessing: keep the 20 properties attached to the largest number of
# distinct items.
top_properties = (
    properties.drop_duplicates(['itemid', 'property'])
    .groupby("property")['itemid']
    .count()
    .sort_values(ascending=False)[:20]
)

properties_filtered = properties[properties['property'].isin(set(top_properties.index))]

# Only the first N_PROPERTY_ROWS rows are encoded (prototype-sized sample).
# The encoding below goes through dense arrays, so raise this with care.
N_PROPERTY_ROWS = 10
property_sample = properties_filtered.iloc[:N_PROPERTY_ROWS]

# Turn the space-separated alphanumeric property values into numeric features:
# token counts first, then a one-hot encoding of every count column.
vectorizer = CountVectorizer()
properties_vectorized = vectorizer.fit_transform(property_sample['value'])

# Densify explicitly instead of passing sparse=False: that keyword was renamed
# to sparse_output and later removed from scikit-learn, whereas the default
# (sparse) output followed by .toarray() works on every version.
encoder = OneHotEncoder()
properties_encoded = encoder.fit_transform(properties_vectorized.toarray()).toarray()
properties_df = pd.DataFrame(properties_encoded, columns=encoder.get_feature_names_out())

# Assign by position. properties_df has a fresh 0..n-1 index while the sample
# keeps its original row labels, so assigning the Series directly would align
# on labels and produce NaN / mismatched item ids.
properties_df['itemid'] = property_sample['itemid'].to_numpy()

# One row per itemid: average the encoded features over that item's rows
item_properties = properties_df.groupby('itemid').mean().reset_index()

# Build the data for the model.
# Surprise needs exactly three columns in the order (user, item, rating).
# Transactions carry no explicit rating, so the number of purchases of an item
# by a visitor is used as an implicit rating, capped at the top of the scale.
rating_scale = (0, 5)
reader = Reader(rating_scale=rating_scale)


def _implicit_ratings(transactions, max_rating=rating_scale[1]):
    """Collapse transaction rows into one (visitorid, itemid, rating) row per pair.

    Args:
        transactions: DataFrame with 'visitorid' and 'itemid' columns.
        max_rating: Upper cap applied to the per-pair purchase count.

    Returns:
        DataFrame with columns ['visitorid', 'itemid', 'rating'], where rating
        is the number of rows for that pair, clipped to max_rating.
    """
    return (
        transactions.groupby(['visitorid', 'itemid'])
        .size()
        .clip(upper=max_rating)
        .rename('rating')
        .reset_index()
    )


train_data = Dataset.load_from_df(_implicit_ratings(train), reader)
test_data = Dataset.load_from_df(_implicit_ratings(test), reader)

# Create, cross-validate and fit the model
svd = SVD()
# cross_validate resolves each measure name against surprise.accuracy;
# 'Precision@3' is not defined there, so only RMSE and MAE are requested.
cross_validate(svd, train_data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
svd.fit(train_data.build_full_trainset())

# Item factor matrix. svd.qi already holds one row per item, ordered by the
# trainset's inner item id, so it is used as-is (transposing it would put the
# factors on the rows and make the per-item hstack below impossible).
trainset = svd.trainset
raw_item_ids = [trainset.to_raw_iid(inner_iid) for inner_iid in trainset.all_items()]
item_factors = svd.qi

# Attach the property features to the factor matrix. The property rows are
# re-ordered to match the factor rows; items without property features get
# zeros.
item_property_matrix = (
    item_properties.set_index('itemid')
    .reindex(raw_item_ids)
    .fillna(0.0)
    .to_numpy()
)
item_factors_with_properties = np.hstack((item_factors, item_property_matrix))

# Normalise every row to unit length; all-zero rows are left as zeros rather
# than being divided by zero.
row_norms = np.linalg.norm(item_factors_with_properties, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0
item_factors_with_properties_normalized = item_factors_with_properties / row_norms

# Cosine similarity between all pairs of items
item_similarity = np.dot(item_factors_with_properties_normalized, item_factors_with_properties_normalized.T)

# Predict the 3 most preferred items for a given user.
# The first visitor of the training set is used as the example; replace it
# with the visitor id of interest.
user_id = train['visitorid'].iloc[0]
user_items = train[train['visitorid'] == user_id]['itemid'].tolist()

# Rows of the item matrix that belong to the items this user bought
inner_id_by_raw = {raw_iid: inner_iid for inner_iid, raw_iid in enumerate(raw_item_ids)}
user_item_rows = sorted({inner_id_by_raw[iid] for iid in user_items if iid in inner_id_by_raw})
if not user_item_rows:
    raise ValueError("No training items found for user {}".format(user_id))

# User profile: mean feature vector of the items the user bought
user_factors = np.mean(item_factors_with_properties[user_item_rows], axis=0)

# Similarity between the user profile and every item
user_item_similarity = np.dot(user_factors, item_factors_with_properties_normalized.T)

# Sort items by descending similarity and keep the top 3.
# NOTE(review): items the user already bought are not excluded - confirm
# whether they should be filtered out.
top_3_items_indices = np.argsort(user_item_similarity)[::-1][:3]
top_3_items = [
    (raw_item_ids[inner_iid], svd.predict(user_id, raw_item_ids[inner_iid]).est)
    for inner_iid in top_3_items_indices
]
print("Top 3 recommended items for user {}: {}".format(user_id, top_3_items))



In [None]:
# Gemini
from surprise import KNNWithMeans, Dataset, Reader
from sklearn.preprocessing import OneHotEncoder, StandardScaler
events = pd.read_csv("dfev.csv")
# ignore_index=True avoids overlapping row labels from the two part files.
properties = pd.concat(
    [
        pd.read_csv("item_properties_part1.csv.zip"),
        pd.read_csv("item_properties_part2.csv.zip"),
    ],
    ignore_index=True,
)
# # Preprocess item properties
# categorical_properties = ["property1", "property2", ...]  # Replace with actual categorical property names
# numerical_properties = ["property3", "property4", ...]  # Replace with actual numerical property names

# # One-hot encode categorical properties
# ohe = OneHotEncoder(sparse=False)
# encoded_categorical = ohe.fit_transform(properties_filtered[categorical_properties])

# # Scale numerical properties
# scaler = StandardScaler()
# scaled_numerical = scaler.fit_transform(properties_filtered[numerical_properties])

# # Combine encoded features
# properties_features = pd.DataFrame(
#     np.hstack([encoded_categorical, scaled_numerical]),
#     columns=ohe.get_feature_names_out().tolist() + numerical_properties,
#     index=properties_filtered.index,
# )


# Prepare data for Surprise.
# .assign() returns a new frame, so the constant rating column is not written
# into a filtered slice of `events` (which triggers SettingWithCopyWarning).
transactions = events[events["event"] == "transaction"].assign(rating=1)
# NOTE(review): every rating is 1 and the scale is (1, 1), so the model only
# sees which (visitor, item) pairs exist - confirm this is intended.
reader = Reader(rating_scale=(1, 1))
data = Dataset.load_from_df(transactions[["visitorid", "itemid", "rating"]].rename(columns={"visitorid": "uid", "itemid": "iid"}), reader)

# # Integrate item properties
# NOTE(review): Surprise's Dataset does not appear to provide
# add_item_features - confirm before enabling the lines below.
# for item_id, features in properties_features.iterrows():
#     data.add_item_features(item_id, features)

# Build and train the model (user-based neighbours, cosine similarity)
model = KNNWithMeans(sim_options={"name": "cosine", "user_based": True})
model.fit(data.build_full_trainset())
