## Модуль 5.1 - Рекомендательные системы
### Подготовка датасетов для прототипа системы

## Импорт библиотек

In [1]:
import pandas as pd
import json 

## Получаем данные

In [2]:
# Load the training reviews from the zipped CSV.
# low_memory=False asks pandas to infer column dtypes from the whole file at once.
df = pd.read_csv('data4/train.zip', low_memory=False)

In [3]:
df.head()

Unnamed: 0,overall,verified,reviewTime,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image,userid,itemid,rating
0,5.0,True,"10 4, 2016",B01CPNIEQG,Heather,These are my FAVORITE spices in my collection....,Must Add to your Spice kitchen!,1475539200,,,,102179,37138,1.0
1,5.0,True,"03 1, 2016",B006F63M8U,Linda Odom,Add A package to my Coffee and it makes a good...,Milk Chocolate Swiis MIss Hot Cocoa Mix,1456790400,,{'Size:': ' 60-0.73 oz Envelopes'},,3625,17322,1.0
2,5.0,True,"06 26, 2016",B00112O8NG,DesertBlossom,"I love the Torani syrups, but not the prices o...","Love these sugar free syrups, but didn't love ...",1466899200,28.0,,,39495,5600,1.0
3,5.0,True,"09 5, 2013",B00MRAQUM8,Connie L. Reaves,Very fragrant. The price can't be beat. I do l...,Nice !,1378339200,,,,72854,30249,1.0
4,5.0,True,"09 14, 2016",B000R7RN0G,Mike P.,These are my favorite jalapeno chips so far. T...,These are my favorite jalapeno chips so far,1473811200,,{'Flavor:': ' Jalapeo'},,74859,4349,1.0


In [4]:
df.shape

(857895, 14)

## Обработка данных

In [5]:
# Drop rows that are exact copies of an earlier row (every column equal).
# Reassigning instead of inplace=True keeps the cell's data lineage explicit.
df = df.drop_duplicates()
df.shape

(826895, 14)

In [6]:
# Drop the rows where any of the text fields below is missing.
text_columns = ['reviewerName', 'reviewText', 'summary']
df = df.dropna(subset=text_columns)

In [7]:
# reviewTime: parse strings such as "10 4, 2016" (month day, year) into datetimes.
review_dates = pd.to_datetime(df['reviewTime'], format="%m %d, %Y")
df = df.assign(reviewTime=review_dates)

### Удаление дубликатов оценок

Один пользователь может оценить товар только один раз.

In [8]:
# Rows that repeat an earlier (itemid, userid) pair; the first row of each pair is not flagged.
is_repeat = df.duplicated(subset=['itemid', 'userid'])
duplicates = df.loc[is_repeat]
len(duplicates)

14192

In [9]:
duplicates.index.values

array([  9326,  10863,  11572, ..., 857835, 857836, 857890], dtype=int64)

In [10]:
# Remove the repeated (itemid, userid) rows by their index labels.
df = df.drop(index=duplicates.index)

In [11]:
df.shape

(812132, 14)

Сохраним в csv-файл только некоторые столбцы.

In [12]:
df[['overall', 'reviewTime', 'asin', 'userid', 'itemid', 'rating']].head()

Unnamed: 0,overall,reviewTime,asin,userid,itemid,rating
0,5.0,2016-10-04,B01CPNIEQG,102179,37138,1.0
1,5.0,2016-03-01,B006F63M8U,3625,17322,1.0
2,5.0,2016-06-26,B00112O8NG,39495,5600,1.0
3,5.0,2013-09-05,B00MRAQUM8,72854,30249,1.0
4,5.0,2016-09-14,B000R7RN0G,74859,4349,1.0


In [13]:
# Write the selected rating columns to a CSV file (without the index).
rating_columns = ['overall', 'reviewTime', 'asin', 'userid', 'itemid', 'rating']
df.loc[:, rating_columns].to_csv('rating.csv', index=False)

### Подготовка датафрейма товаров

In [17]:
# Per-item statistics computed in a single grouped aggregation:
#   rating_avg       - mean of `rating`
#   number_ratings   - count of non-null `rating` values
#   overall_avg      - mean of `overall`
#   satisfaction_avg - rating_avg * 100, truncated to an integer
items = (
    df.groupby('itemid')
    .agg(
        rating_avg=('rating', 'mean'),
        number_ratings=('rating', 'count'),
        overall_avg=('overall', 'mean'),
    )
    .assign(satisfaction_avg=lambda stats: (stats['rating_avg'] * 100).astype(int))
    .reset_index()
)

In [21]:
items.head()

Unnamed: 0,itemid,rating_avg,number_ratings,overall_avg,satisfaction_avg
0,0,1.0,8,4.875,100
1,1,0.761905,21,4.190476,76
2,2,1.0,6,4.833333,100
3,3,0.905405,74,4.581081,90
4,4,0.8,10,4.4,80


In [22]:
len(items)

41302

### Создание категории товара
Добавим к таблице товаров дополнительные столбцы: категорию товара, название и цену.

In [23]:
# Read the metadata file: one JSON document per line, collected into a list.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding; whitespace-only lines are skipped
# instead of crashing json.loads.
with open("data4/meta.json", 'r', encoding='utf-8') as json_file:
    meta = [json.loads(line) for line in json_file if line.strip()]

In [24]:
meta[0]['category']

['Grocery & Gourmet Food', 'Dairy, Cheese & Eggs', 'Cheese', 'Gouda']

In [25]:
meta[0]['price']

'$41.91'

In [26]:
meta[0]['title']

'Beemster Gouda - Aged 18/24 Months - App. 1.5 Lbs'

In [27]:
# Add an asin column to items: for every itemid take the asin of its first row in df.
# One lookup Series (itemid -> asin) replaces a full scan of df per item.
asin_by_itemid = df.drop_duplicates(subset='itemid').set_index('itemid')['asin']
items['asin'] = items['itemid'].map(asin_by_itemid)

Напишем функции для заполнения новых столбцов.

In [28]:
# Cache of the asin -> record lookup table built from the global `meta` list.
# It remembers which list object (and how many records) it was built from, so
# reloading or extending `meta` triggers a rebuild on the next lookup.
_META_INDEX = {'source': None, 'size': -1, 'by_asin': {}}


def _meta_record(asin):
    """Return the first record of `meta` whose 'asin' equals `asin`, or None.

    The lookup table is built once per `meta` list instead of scanning the
    whole list on every call. Records without an 'asin' key are skipped.
    """
    source = meta
    if _META_INDEX['source'] is not source or _META_INDEX['size'] != len(source):
        by_asin = {}
        for record in source:
            if 'asin' in record:
                # setdefault keeps the first record seen for an asin, matching
                # a front-to-back scan that stops at the first match.
                by_asin.setdefault(record['asin'], record)
        _META_INDEX.update(source=source, size=len(source), by_asin=by_asin)
    return _META_INDEX['by_asin'].get(asin)


def find_category(asin):
    """Return the category of the product with the given asin.

    The third entry of the record's 'category' value is preferred, then the
    second. Returns "" when the asin is unknown or neither entry exists.
    """
    record = _meta_record(asin)
    if record is None:
        return ""
    levels = record.get('category')
    for depth in (2, 1):
        try:
            return levels[depth]
        except (IndexError, KeyError, TypeError):
            # Entry absent at this depth (short list, missing or None value).
            continue
    return ""


def find_price(asin):
    """Return the 'price' value of the product with the given asin.

    Returns "" when the asin is unknown or the record has no 'price' key.
    """
    record = _meta_record(asin)
    if record is None:
        return ""
    return record.get('price', "")


def find_title(asin):
    """Return the 'title' value of the product with the given asin.

    Returns "" when the asin is unknown or the record has no 'title' key.
    """
    record = _meta_record(asin)
    if record is None:
        return ""
    return record.get('title', "")

In [29]:
# Look up the category of every item by its asin.
items['category'] = items['asin'].map(find_category)

In [30]:
# Look up the title of every item by its asin.
items['title'] = items['asin'].map(find_title)

In [31]:
# Look up the price of every item by its asin.
items['price'] = items['asin'].map(find_price)

In [33]:
# Number the distinct categories in order of first appearance: {category_id: category}.
tags_dict = dict(enumerate(items['category'].unique()))

# Same mapping as a two-column table.
df_tags = pd.DataFrame(list(tags_dict.items()), columns=['category_id', 'category'])
df_tags.head()

Unnamed: 0,category_id,category
0,0,"Coffee, Tea & Cocoa"
1,1,Sauces
2,2,Food Coloring
3,3,Jerky & Dried Meats
4,4,Hard Candy & Lollipops


In [34]:
len(df_tags)

235

In [35]:
# Attach category_id to every item by joining on the category name.
items = pd.merge(items, df_tags, on='category')

In [36]:
items.head()

Unnamed: 0,itemid,rating_avg,number_ratings,overall_avg,satisfaction_avg,asin,category,title,price,category_id
0,0,1.0,8,4.875,100,4639725183,"Coffee, Tea & Cocoa",Lipton Yellow Label Finest Blend Tea Bags 100 ...,$12.98,0
1,1,0.761905,21,4.190476,76,4639725043,"Coffee, Tea & Cocoa",Lipton Yellow Label Tea (loose tea) - 450g,$12.46,0
2,2,1.0,6,4.833333,100,5463213682,"Coffee, Tea & Cocoa",Organo Gold Cafe Supreme 100% Certified Ganode...,$29.90,0
3,8,0.913043,46,4.73913,91,B00005344V,"Coffee, Tea & Cocoa",Traditional Medicinals Organic Breathe Easy Se...,$28.68,0
4,10,0.6,5,3.4,60,B00005IX97,"Coffee, Tea & Cocoa","Espressione 100% Arabica Coffee, 150-Count Pods",$64.88,0


## Обучаем модель

In [39]:
import scipy.sparse as sparse

from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
import sklearn
from sklearn.model_selection import train_test_split

import scipy.sparse as sparse

In [40]:
# User x item interaction matrix: rows are userid, columns are itemid,
# values are the integer-cast `rating`. The shape is inferred from the largest ids.
interaction_values = df['rating'].astype(int)
user_rows = df['userid']
item_cols = df['itemid']
ratings_coo = sparse.coo_matrix((interaction_values, (user_rows, item_cols)))
ratings_coo

<127496x41320 sparse matrix of type '<class 'numpy.int32'>'
	with 812132 stored elements in COOrdinate format>

In [41]:
# Item x category indicator matrix: a 1 at (itemid, category_id) for every item.
# The shape is stated explicitly so the row count always equals the number of
# item columns in ratings_coo, rather than depending on the item with the
# largest id being present in `items`.
# NOTE(review): only category indicators are supplied as item features, so items
# of one category presumably share one representation — confirm this is intended.
n_items = ratings_coo.shape[1]
n_categories = len(df_tags)
feature_ratings = sparse.coo_matrix(
    ([1] * len(items), (items['itemid'], items['category_id'])),
    shape=(n_items, n_categories),
)
feature_ratings

<41320x235 sparse matrix of type '<class 'numpy.int32'>'
	with 41302 stored elements in COOrdinate format>

In [42]:
NUM_THREADS = 4       # number of training threads
NUM_COMPONENTS = 235  # dimensionality of the latent vectors
NUM_EPOCHS = 20       # number of training epochs
RANDOM_STATE = 42     # fixed seed so that model initialisation is repeatable

# NOTE(review): with NUM_THREADS > 1 the run may still not be bit-for-bit
# reproducible — confirm against the LightFM documentation if that matters.
model = LightFM(learning_rate=0.1, loss='logistic',
                no_components=NUM_COMPONENTS,
                random_state=RANDOM_STATE)
model = model.fit(ratings_coo, epochs=NUM_EPOCHS,
                  num_threads=NUM_THREADS,
                  item_features=feature_ratings,
                  verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [43]:
# Extract the learned item representations as a (biases, embeddings) pair.
item_representations = model.get_item_representations(features=feature_ratings)
item_biases, item_embeddings = item_representations

In [44]:
item_embeddings.shape

(41320, 235)

In [45]:
import nmslib

# Create the search index (HNSW method, cosine-similarity space)
nms_idx = nmslib.init(method='hnsw', space='cosinesimil')

# Add the item embeddings to the index, then build it
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)

# Helper for querying the search index
def nearest_items_nms(item_id, index, n=10):
    """Query `index` for the `n` nearest neighbours of an item.

    The query vector is `item_embeddings[item_id]`; whatever
    `index.knnQuery` returns is passed back unchanged.
    """
    query_vector = item_embeddings[item_id]
    return index.knnQuery(query_vector, k=n)

#### Проверяем модель

Давайте попробуем написать рекомендации к товару в категории Coffee, Tea & Cocoa.

In [46]:
df_tags[df_tags['category_id'] == 0]

Unnamed: 0,category_id,category
0,0,"Coffee, Tea & Cocoa"


In [47]:
items[items['category_id'] == 0].head()

Unnamed: 0,itemid,rating_avg,number_ratings,overall_avg,satisfaction_avg,asin,category,title,price,category_id
0,0,1.0,8,4.875,100,4639725183,"Coffee, Tea & Cocoa",Lipton Yellow Label Finest Blend Tea Bags 100 ...,$12.98,0
1,1,0.761905,21,4.190476,76,4639725043,"Coffee, Tea & Cocoa",Lipton Yellow Label Tea (loose tea) - 450g,$12.46,0
2,2,1.0,6,4.833333,100,5463213682,"Coffee, Tea & Cocoa",Organo Gold Cafe Supreme 100% Certified Ganode...,$29.90,0
3,8,0.913043,46,4.73913,91,B00005344V,"Coffee, Tea & Cocoa",Traditional Medicinals Organic Breathe Easy Se...,$28.68,0
4,10,0.6,5,3.4,60,B00005IX97,"Coffee, Tea & Cocoa","Espressione 100% Arabica Coffee, 150-Count Pods",$64.88,0


In [48]:
# Query the neighbours of item 0 and keep the first element of the result
# (used as item ids in the next cell).
neighbour_result = nearest_items_nms(0, nms_idx)
nearest_items = neighbour_result[0]
nearest_items

array([16541, 40695, 40788, 41143, 41228, 41196, 41268, 41274, 41318,
       37727])

In [49]:
#Выводим похожие товары.

items[items['itemid'].isin(nearest_items)]

Unnamed: 0,itemid,rating_avg,number_ratings,overall_avg,satisfaction_avg,asin,category,title,price,category_id
2776,16541,0.8,45,4.266667,80,B005RFBNP4,"Coffee, Tea & Cocoa",First Colony Organic Fair Trade Whole Bean Cof...,$15.99,0
6835,37727,0.7,10,4.0,70,B01F2TSY1A,"Coffee, Tea & Cocoa",Pink Stork Lactation: Herbal Mint Nursing Supp...,$11.99,0
7133,40695,1.0,4,5.0,100,B00KWL5STA,"Coffee, Tea & Cocoa",Ghirardelli Hot Cocoa Mix Mocha (Pack of 2),$14.99,0
7140,40788,1.0,3,4.666667,100,B00N3462T8,"Coffee, Tea & Cocoa",Victor Allen's Coffee 12-Count Single Serve Cu...,$14.24,0
7169,41143,1.0,4,4.75,100,B014JRXCLI,"Coffee, Tea & Cocoa",Great Iced Tea or Hot All year - Holiday Spice...,$15.99,0
7175,41196,0.5,4,4.0,50,B019J2LI8M,"Coffee, Tea & Cocoa","Twisted Pine Dark Italian Roast, Extra Bold Co...",,0
7177,41228,1.0,4,4.75,100,B01B7BVIUQ,"Coffee, Tea & Cocoa","MORINGA GINGER TEA - USDA Organic, Exotic Blen...",$13.95,0
7179,41268,0.8,5,4.2,80,B01EUM91YE,"Coffee, Tea & Cocoa",20 Count - Variety Starbucks Flavored Coffee K...,$19.99,0
7180,41274,1.0,4,4.75,100,B01F7AWF32,"Coffee, Tea & Cocoa",International Delight Hazelnut Coffee Creamer ...,,0
7181,41318,1.0,2,5.0,100,B01HGBEYRU,"Coffee, Tea & Cocoa","Nescafe Taster's Choice Instant Coffee, House ...",,0


Подготовим свою модель к внедрению в продакшен

In [50]:
import pickle

# Serialize the item embeddings to disk using the highest pickle protocol available.
with open('item_embeddings.pickle', 'wb') as out_file:
    pickle.dump(item_embeddings, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## Пример работы прототипа

In [51]:
# Load the prototype's input tables.
# NOTE(review): this notebook writes 'rating.csv' and 'items.csv' to the working
# directory (the latter only in its final cell), not to 'master/prod/'. These
# files must already exist before this cell runs — confirm how they get there.
rating_df = pd.read_csv('master/prod/rating.csv')
items_df = pd.read_csv('master/prod/items.csv')

- В поле "User ID" указывается id пользователя (целое число).

In [52]:
user_id = 39495

- На экран выводятся не более 5 последних оценок этого пользователя в порядке убывания даты отзыва (сначала самые свежие).

In [60]:
# Take the user's five most recent ratings, newest first.
recent_ratings = (
    rating_df[rating_df['userid'] == user_id]
    .sort_values(by='reviewTime', ascending=False)
    .head(5)
)

# Attach category and title from the catalogue loaded from disk (items_df),
# the same table the recommendation step uses, instead of the in-memory
# `items` frame left over from the training part of the notebook.
# Only the needed catalogue columns are merged, so `itemid` stays unique and
# no suffix/rename step is required.
# NOTE(review): assumes items_df carries the same asin/category/title columns
# that `items` has when it is exported — confirm for the deployed file.
user_df = recent_ratings.merge(
    items_df[['asin', 'category', 'title']], on='asin', how='inner'
)
user_df = user_df[['reviewTime', 'category', 'title', 'overall', 'itemid']]

In [61]:
user_df

Unnamed: 0,reviewTime,category,title,overall,itemid
0,2016-06-26,"Bottled Beverages, Water & Drink Mixes","Torani Syrup, Sugar Free Vanilla, 25.4 oz",5.0,5600
1,2016-06-26,Bouillon,Hormel Herb Ox Chicken Bouillon 50 Packets,5.0,19196
2,2016-06-26,Nut & Seed Butters,"Bell Plantation PB2 Powdered Peanut Butter, Ne...",5.0,26755
3,2015-01-07,Nuts & Seeds,"Planters Peanuts, Honey Roasted & Salted, 52 O...",3.0,4689
4,2015-01-07,Nuts & Seeds,"Planters Dry Roasted Peanuts, Dry Roasted, Lig...",3.0,1378


- На основе последних 3-х оценок делаются рекомендации

In [68]:
# Seed the recommendation with the item in the first row of user_df.
# NOTE(review): the text above mentions the last three ratings, but only this
# single row is used here — confirm the intended behaviour.
itemid = user_df.iloc[0]['itemid']
itemid

5600

In [69]:
# Query the neighbours of the seed item and keep the first element of the result
# (used as item ids in the next cell).
neighbour_result = nearest_items_nms(itemid, nms_idx)
nearest_items = neighbour_result[0]
nearest_items

array([ 9131,  9310,  9312, 40996, 41174, 41240, 41199, 41281, 41261,
       41267])

In [70]:
#Выводим похожие товары.

items_df[items_df['itemid'].isin(nearest_items)].sort_values(by='satisfaction_avg', ascending=False).head(3)

Unnamed: 0,itemid,rating_avg,number_ratings,overall_avg,satisfaction_avg,asin,category,title,price,category_id
21846,9310,1.0,4,5.0,100,B001OVSRW2,"Bottled Beverages, Water & Drink Mixes","Whiskey Sour Bar-Tenders Instant Cocktail Mix,...",$39.49,33
23593,41199,1.0,3,5.0,100,B019YK6AX2,"Bottled Beverages, Water & Drink Mixes","Crystal Light Tea Sticks, Lemonade, 32ct 2pk (...",$25.57,33
23597,41261,1.0,4,5.0,100,B01E5LYDRU,"Bottled Beverages, Water & Drink Mixes","Lakewood Organic Pure Veggie V12 Juice, 32 Oun...",,33


In [81]:
# Replace titles of 400 characters or more with an empty string.
items['title'] = [title if len(title) < 400 else '' for title in items['title']]

In [83]:
# Write the items table to a CSV file (without the index).
items.to_csv('items.csv', index = False)