In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [27]:
ratings = pd.read_csv('res.csv')
ratings = ratings.rename(columns={'client_id':'userId', 'item_name':'productId', 'score':'rating'})
del ratings['Unnamed: 0']
ratings_df = ratings

ratings_df.head(10)

Unnamed: 0,userId,productId,rating
0,14795,ГЖ-7-1,3.0
1,14795,ГМ-50-1,3.0
2,14795,ГМ-55-1,3.0
3,14795,РМ-8-1,3.0
4,14795,АКС-1-41,3.0
5,14795,АКС-1-42,3.0
6,14795,АО-8-28,3.0
7,14795,АО-8-70,3.0
8,14795,АО-9-56,3.0
9,14795,ДЖП-104-2,3.0


Далее для удобства переделаем немного ид пользователей и артикулы товаров. Потом все вернем взад)

In [3]:
ratings_df_sample = ratings_df

n_users = len(ratings_df_sample['userId'].unique())
n_products = len(ratings_df_sample['productId'].unique())
(n_users, n_products)

(2539, 4014)

In [4]:
product_ids = ratings_df_sample['productId'].unique()

def scale_product_id(product_id):
    scaled = np.where(product_ids == product_id)[0][0] + 1
    return scaled

ratings_df_sample['productId'] = ratings_df_sample['productId'].apply(scale_product_id)

In [5]:
user_ids = ratings_df_sample['userId'].unique()

def scale_user_id(user_id):
    scaled = np.where(user_ids == user_id)[0][0] + 1
    return scaled

ratings_df_sample['userId'] = ratings_df_sample['userId'].apply(scale_user_id)

In [6]:
ratings_df_sample.head()

Unnamed: 0,userId,productId,rating
0,1,1,3.0
1,1,2,3.0
2,1,3,3.0
3,1,4,3.0
4,1,5,3.0


In [7]:
ratings_df_sample.userId.max()

2539

Разобьем на тренировочные и валидационные наборы

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
train_data, test_data = train_test_split(ratings_df_sample, test_size=0.2)

print('Train shape: {}'.format(train_data.shape))
print('Test shape: {}'.format(test_data.shape))
print(train_data.userId.unique().size, train_data.productId.unique().size)
print(test_data.userId.unique().size, test_data.productId.unique().size)

Train shape: (14485, 3)
Test shape: (3622, 3)
2379 3726
1390 1973


Сделаем функцию, которая будет оценивать точность алгоритма (RMSE)

In [10]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, ground_truth):
    # Оставим оценки, предсказанные алгоритмом, только для соотвествующего набора данных
    prediction = np.nan_to_num(prediction)[ground_truth.nonzero()].flatten()
    # Оставим оценки, которые реально поставил пользователь, только для соотвествующего набора данных
    ground_truth = np.nan_to_num(ground_truth)[ground_truth.nonzero()].flatten()
    
    mse = mean_squared_error(prediction, ground_truth)
    return sqrt(mse)

Сформируем матрицы размера (n_users, n_products) для обучающего и тестового наборов таким образом, чтобы элемент в ячейке [i, j] отражал оценку i-го пользователя j-му товару:

In [11]:
train_data_matrix = np.zeros((n_users, n_products))
for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    
test_data_matrix = np.zeros((n_users, n_products))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

In [12]:
train_data_matrix.shape,test_data_matrix.shape

((2539, 4014), (2539, 4014))

In [13]:
train_data_matrix

array([[ 3.,  3.,  3., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., 10.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

### Высчитываем косинусное расстояние

Один из важных моментов в коллаборативной фильтрации — найти похожих пользователей для User-Based и похожие объекты (в нашем случае ткани) для Item-Based коллаборативной фильтрации.

In [14]:
from  sklearn.metrics.pairwise import pairwise_distances

# считаем косинусное расстояние для пользователей и товаров 
# (построчно и поколоночно соотвественно).
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')


In [15]:
user_similarity.shape, item_similarity.shape

((2539, 2539), (4014, 4014))

In [16]:
col = ratings_df_sample['userId'].unique()

In [17]:
df_similarity_user = pd.DataFrame(user_similarity, index=col, columns=col)

In [18]:
df_similarity_user.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,2530,2531,2532,2533,2534,2535,2536,2537,2538,2539
1,0.0,1.0,0.886285,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,0.0,1.0,0.946902,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.886285,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,0.946902,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
df_similarity_item = pd.DataFrame(item_similarity, index=ratings_df_sample['productId'].unique(), columns=ratings_df_sample['productId'].unique())

In [20]:
# ratings_df_sample['productId'].unique()

In [21]:
df_similarity_item.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,4005,4006,4007,4008,4009,4010,4011,4012,4013,4014
1,0.0,0.712652,0.0,0.051317,0.860124,0.916795,1.0,0.879127,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.712652,0.0,0.712652,0.727398,0.959807,0.976091,1.0,0.965267,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.0,0.712652,0.0,0.051317,0.860124,0.916795,1.0,0.879127,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.051317,0.727398,0.051317,0.0,0.867302,0.921065,1.0,0.885329,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.860124,0.959807,0.860124,0.867302,0.0,0.909479,0.905276,0.915464,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# User-based collaborative filtering
def naive_predict(top):
    # Структура для хранения для каждого пользователя оценки товаров top наиболее похожих на него пользователей:
    # top_similar_ratings[0][1] - оценки всех товаров одного из наиболее похожих пользователей на пользователя с ид 0.
    # Здесь 1 - это не ид пользователя, а просто порядковый номер.
    top_similar_ratings = np.zeros((n_users, top, n_products))
    # создали трехмерный массив (702,7,8227)
    for i in range(n_users):
        # Для каждого пользователя необходимо получить наиболее похожих пользователей:
        # Нулевой элемент не подходит, т.к. на этом месте находится похожесть пользователя самого на себя
        top_sim_users = user_similarity[i].argsort()[1:top + 1]
        # возвращает отсортированный список индексов пользователей
        
        # берём только оценки из "обучающей" выборки 
        top_similar_ratings[i] = train_data_matrix[top_sim_users]

    pred = np.zeros((n_users, n_products))
    for i in range(n_users):
        pred[i] = top_similar_ratings[i].sum(axis=0) / top
    
    return pred


def naive_predict_item(top):
    top_similar_ratings = np.zeros((n_products, top, n_users))

    for i in range(n_products):
        top_sim_movies = item_similarity[i].argsort()[1:top + 1]
        top_similar_ratings[i] = train_data_matrix.T[top_sim_movies]
        
    pred = np.zeros((n_products, n_users))
    for i in range(n_products):
        pred[i] = top_similar_ratings[i].sum(axis=0) / top
    
    return pred.T

naive_pred = naive_predict(15)
print('User-based CF RMSE: ', rmse(naive_pred, test_data_matrix))

naive_pred_item = naive_predict_item(15)
print('Item-based CF RMSE: ', rmse(naive_pred_item, test_data_matrix))



User-based CF RMSE:  6.484130139191519
Item-based CF RMSE:  6.486445834481041


Рекомендации с учётом средних оценок похожих пользователей

In [23]:
def k_fract_predict(top):
    top_similar = np.zeros((n_users, top))
    
    for i in range(n_users):
        user_sim = user_similarity[i]
        top_sim_users = user_sim.argsort()[1:top + 1]#[-top:]

        for j in range(top):
            top_similar[i, j] = top_sim_users[j]
            
    abs_sim = np.abs(user_similarity)
    pred = np.zeros((n_users, n_products))
    
    for i in range(n_users):
        indexes = top_similar[i].astype(np.int)
        numerator = user_similarity[i][indexes]
        
        product = numerator.dot(train_data_matrix[indexes])
        
        denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()
        
        pred[i] = product / denominator
    
    return pred


def k_fract_predict_item(top):
    flag = True
    top_similar = np.zeros((n_products, top))
    
    for i in range(n_products):
        movies_sim = item_similarity[i]
        top_sim_movies = movies_sim.argsort()[1:top + 1]

        for j in range(top):
            top_similar[i, j] = top_sim_movies.T[j]
            
    abs_sim = np.abs(item_similarity)
    pred = np.zeros((n_products, n_users))
    
    
    for i in range(n_users):
        indexes = top_similar[i].astype(np.int)
        numerator = item_similarity[i][indexes]
        
        product = numerator.dot(train_data_matrix.T[indexes])
        
        denominator = abs_sim[i][indexes].sum()
        denominator = denominator if denominator != 0 else 1
        
        pred[i] = product / denominator
        
    return pred.T


k_predict = k_fract_predict(7)
print('User-based CF RMSE: ', rmse(k_predict, test_data_matrix))

k_predict_item = k_fract_predict_item(7)
print('Item-based CF RMSE: ', rmse(k_predict_item, test_data_matrix))




User-based CF RMSE:  6.484989111053678
Item-based CF RMSE:  6.504408813671239


Рекомендации на основе средних оценок пользователей и матрицы “похожести”

In [24]:
def k_fract_mean_predict(top):
    top_similar = np.zeros((n_users, top))
    
    for i in range(n_users):
        user_sim = user_similarity[i]
        top_sim_users = user_sim.argsort()[1:top + 1]

        for j in range(top):
            top_similar[i, j] = top_sim_users[j]
            
    abs_sim = np.abs(user_similarity)
    pred = np.zeros((n_users, n_products))
    
    for i in range(n_users):
        indexes = top_similar[i].astype(np.int)
        numerator = user_similarity[i][indexes]
        
        mean_rating = np.array([x for x in train_data_matrix[i] if x > 0]).mean()
        diff_ratings = train_data_matrix[indexes] - train_data_matrix[indexes].mean()
        numerator = numerator.dot(diff_ratings)
        denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()
        
        pred[i] = mean_rating + numerator / denominator
        
    return pred

def k_fract_mean_predict_item(top):
    top_similar = np.zeros((n_products, top))
    
    for i in range(n_products):
        movie_sim = item_similarity[i]
        top_sim_movies = movie_sim.argsort()[1:top + 1]
        
        for j in range(top):
            top_similar[i, j] = top_sim_movies[j]
    
    abs_sim = np.abs(item_similarity)
    pred = np.zeros((n_products, n_users))
    
    for i in range(n_products):
        indexes = top_similar[i].astype(np.int)
        numerator = item_similarity[i][indexes]
        
        mean_rating = np.array([x for x in train_data_matrix.T[i] if x > 0]).mean()
        mean_rating = 0 if np.isnan(mean_rating) else mean_rating
        
        diff_ratings = train_data_matrix.T[indexes] - mean_rating
        numerator = numerator.dot(diff_ratings)
        denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()
        denominator = denominator if denominator != 0 else 1
        
        pred[i] = mean_rating + numerator / denominator
                
    return pred.T

k_predict = k_fract_mean_predict(7)
print('User-based CF RMSE: ', rmse(k_predict, test_data_matrix))

k_predict_item = k_fract_mean_predict_item(7)
print('Item-based CF RMSE: ', rmse(k_predict_item, test_data_matrix))


User-based CF RMSE:  3.1283462646174294
Item-based CF RMSE:  6.45662001795291


In [26]:
pd.DataFrame(k_fract_mean_predict(7), index=user_ids, columns=product_ids).head()

Unnamed: 0,ГЖ-7-1,ГМ-50-1,ГМ-55-1,РМ-8-1,АКС-1-41,АКС-1-42,АО-8-28,АО-8-70,АО-9-56,ДЖП-104-2,...,ПД150-24-2,СЖ-119-1,БГЛ-23-14,ПП-73-2,ПВГ-28-6,ПЛК-206-1,ПЛК-277-1,ПП-40-1,ПП-7-1,БГЛ-34-12
14795,3.138729,3.138729,3.138729,3.138729,7.452682,3.138729,3.138729,3.138729,3.138729,3.138729,...,3.138729,3.138729,3.138729,3.138729,3.138729,3.138729,3.138729,3.138729,3.138729,3.138729
17255,2.362905,2.362905,2.362905,2.362905,2.909695,2.362905,2.362905,2.362905,2.362905,2.362905,...,2.362905,2.362905,2.362905,2.362905,2.362905,2.362905,2.362905,2.362905,2.362905,2.362905
21303,10.376108,10.376108,10.376108,10.376108,10.376108,10.376108,9.989999,10.376108,9.989999,9.989999,...,9.989999,9.989999,9.989999,9.989999,9.989999,9.989999,9.989999,9.989999,9.989999,9.989999
22469,3.368427,3.368427,3.368427,3.368427,3.368427,3.368427,3.368427,3.368427,3.368427,3.368427,...,3.368427,3.368427,3.368427,3.368427,3.368427,3.368427,3.368427,3.368427,3.368427,3.368427
60327,10.418571,10.418571,10.418571,10.418571,10.418571,10.418571,9.989999,10.418571,9.989999,9.989999,...,9.989999,9.989999,9.989999,9.989999,9.989999,9.989999,9.989999,9.989999,9.989999,9.989999
