# 1. Global settings

## 1.1 Import

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime, timedelta
import json

import pandas_profiling
import nltk

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse 

import pickle

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()


from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
import sklearn
from sklearn.model_selection import train_test_split


Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from pandas import Panel


## 1.2 Global settings

In [2]:
# Column names shared across the notebook.
COL_USER = "userid"
COL_ITEM = "itemid"
COL_RATING = "overall"
COL_PREDICTION = "rating"
COL_TIMESTAMP = "timestamp"
# English stop-word list from NLTK; consulted by get_sentence() when filtering tokens.
stop_words = stopwords.words("english")

# LightFM hyper-parameters, passed to the LightFM constructor further down.
LR = 0.07
LOSS_FUNCTION = 'logistic'
LEARNING_SCHEDULE = 'adagrad'
RANDOM_STATE = 42

NUM_THREADS = 4 # number of threads (used by the evaluation calls)
NUM_COMPONENTS = 30 # number of components in each embedding vector
NUM_EPOCHS = 20 # number of training epochs


# 2. DATA

## 2.1 Data downloading

In [3]:
# train = pd.read_csv('../input/recommendationsv4/train.csv')
train = pd.read_csv('../input/processed-data/train_processed(2).csv')
test = pd.read_csv('../input/recommendationsv4/test.csv')
submission = pd.read_csv('/kaggle/input/recommendationsv4/sample_submission.csv')

# The metadata file stores one JSON object per line. Iterating over the handle
# streams the file instead of first materialising every raw line with readlines().
with open('/kaggle/input/recommendationsv4/meta_Grocery_and_Gourmet_Food.json') as f:
    meta_list = [json.loads(line) for line in f]

meta = pd.DataFrame(meta_list)

# Drop exact duplicate rows from the training data.
train.drop_duplicates(inplace = True)

# Attach product metadata to the training rows via the asin column
# (Amazon Standard Identification Number).
train = pd.merge(train, meta, on='asin')

# The test rows must stay one-to-one and in order with the sample submission, because the
# predictions are later written into it positionally. A left join against metadata that is
# unique per asin keeps every test row exactly once; the default inner join would drop rows
# without metadata and repeat rows whose asin occurs more than once in the metadata.
test = pd.merge(test, meta.drop_duplicates(subset='asin'), on='asin', how='left')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# train.to_csv('train.csv') #checkpoint
# test.to_csv('test.csv')

In [4]:
del(meta) #to leave some RAM

## 2.2 Data understanding

In [None]:
train.info()

In [None]:
test.info()

In [None]:
train.head(3)

In [None]:
test.head(3)

## 2.3 Data transformation
### 2.3.1 Simple cleaning

In [None]:
def get_timestamp(df):
    """Convert the ``unixReviewTime`` column (seconds since the epoch) to UTC dates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``unixReviewTime`` column.

    Returns
    -------
    pandas.Series
        ``'YYYY-MM-DD'`` strings, indexed like ``df``.
    """
    # Vectorised conversion: one call for the whole column instead of a Python-level
    # lambda per row, and no reliance on the deprecated datetime.utcfromtimestamp().
    return pd.to_datetime(df['unixReviewTime'], unit='s').dt.strftime('%Y-%m-%d')
train[COL_TIMESTAMP] = get_timestamp(train)
test[COL_TIMESTAMP] = get_timestamp(test)

In [5]:
# Lookup table that turns the boolean "verified" flag into 0/1.
dic_verified = {True: 1, False: 0}

for frame in (train, test):
    # Missing main categories are collapsed into a single "Other" bucket.
    frame.main_cat = frame.main_cat.fillna('Other')
    # Replace the booleans with ints; values absent from the lookup become NaN.
    frame['verified'] = frame['verified'].map(dic_verified)

### 2.3.1 Brief feature engineering

For faster execution, and so as not to spend too much effort on feature processing, I'll build features only from review summaries and product titles, because they may contain information such as "gluten-free", "halal", "vegetarian", brand names, references to specific product types, and so on.

In [6]:
from tqdm import tqdm

stemmer = PorterStemmer()

def get_sentence(sentence, cond = "not"):
    """Remove English stop words from ``sentence`` and optionally stem what is left.

    Parameters
    ----------
    sentence : str
        Raw text (a review summary or a product title).
    cond : str, optional
        ``"stem"`` tokenises with ``nltk.word_tokenize`` and Porter-stems every kept
        word. Any other value (default ``"not"``) splits on single spaces and keeps the
        words unchanged.

    Returns
    -------
    str
        The kept words joined by single spaces.
    """
    # The stop-word list is lower-case, so compare the lower-cased word. Otherwise
    # capitalised stop words ("The", "No", ...) slip through the filter.
    if cond == "stem":
        words = nltk.word_tokenize(sentence)
        kept_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]
    else:
        kept_words = [word for word in sentence.split(' ') if word.lower() not in stop_words]
    return ' '.join(kept_words)

def get_features(series):
    """Build bag-of-words count features from a text column.

    Parameters
    ----------
    series : pandas.Series
        Text values; missing entries are replaced by the placeholder ``'noreview'``.

    Returns
    -------
    pandas.DataFrame
        One column per token that occurs in at least 5% of the documents
        (``min_df = 0.05``), one row per input entry, indexed like ``series``.
    """
    # Plain counts are used; per the original author's note, TF-IDF was considered but
    # not used because of limited RAM.
    vectorizer = CountVectorizer(min_df = 0.05)
    texts = series.fillna('noreview')
    counts = vectorizer.fit_transform(texts)

    # get_feature_names() was removed from recent scikit-learn releases; use its
    # replacement when available and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Keep the input index so later index-aligned assignments match the source rows.
    return pd.DataFrame(counts.toarray(), columns = feature_names, index = series.index)

It's possible to lemmatize features, but I won’t do it for now because lemmatization on this dataset would be a time-consuming procedure. Stemming would be a better option.

In [7]:
# Strip stop words from the product titles (no stemming) and store the result in 'token_title'.
# Non-string entries are passed through unchanged.
# Disabled alternative that would stem the review summaries in place:
#train['summary'] = train['summary'].progress_apply(lambda x: str(get_sentence(x, "stem")) if type(x) == str else x)
train['token_title'] = train['title'].progress_apply(lambda x: str(get_sentence(x)) if type(x) == str else x)

HBox(children=(FloatProgress(value=0.0, max=876561.0), HTML(value='')))




In [9]:
# Extract bag-of-words count features from the review summaries and the tokenised titles.
summary_features = get_features(train['summary'])
# Attach the item id of every row; the assignment aligns on the row index.
summary_features['itemid'] = train['itemid']
title_features = get_features(train['token_title'])

In [10]:
title_features['itemid'] = train['itemid']

In [11]:
# Columns carried over from `train` into the modelling frame `df_train`.
lst = ['overall', 'verified', 'unixReviewTime',
       'vote',  'userid', 'itemid',
       'rating', 'main_cat']
df_train = train.loc[:,lst]

In [12]:
del(train)

In [None]:
#summary_features.reset_index(inplace=True)

In [None]:
#summary_features.head(3)

In [13]:
# Label-based slice: keep the columns from 'bag' through 'itemid' (both inclusive), in
# their current order; any column positioned before 'bag' is dropped.
title_features = title_features.loc[:,'bag':'itemid']
# Display the resulting frame (pandas truncates the rendering of long frames).
title_features

Unnamed: 0,bag,bags,box,chocolate,coffee,count,dark,free,gluten,no,organic,ounce,oz,pack,sugar,tea,itemid
0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,37138
1,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,37138
2,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,37138
3,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,37138
4,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,37138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
876556,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,38934
876557,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,38250
876558,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,29571
876559,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,26244


In [None]:
# title_features and summary_features hold one row per review, in the same row order as
# df_train (all three are derived from `train`). Merging them on 'itemid' is a many-to-many
# join: every review of an item gets paired with every other review of that item, which
# multiplies the row count. Align on the shared row index instead, so each review keeps
# exactly its own feature row.
# The suffixes mirror what merge() would have produced for any overlapping column names.
df_train = df_train.join(
    title_features.drop(columns='itemid', errors='ignore'),
    lsuffix='_x', rsuffix='_y')
df_train = df_train.join(
    summary_features.drop(columns='itemid', errors='ignore'),
    lsuffix='_x', rsuffix='_y')

In [None]:
# train['also_buy'] = train['also_buy'].replace('[','').replace(']','')
# also_buy = train['itemid'].join(train['also_buy'].str.get_dummies(sep = ','))

In [None]:
# Assign the filled column back explicitly. fillna(inplace=True) on a Series obtained through
# attribute access is chained assignment, which newer pandas versions warn about and which
# does not update the frame under copy-on-write.
df_train['vote'] = df_train['vote'].fillna(0)
# Vectorised integer cast instead of a Python-level int() call per row.
df_train['overall'] = df_train['overall'].astype(int)
# NOTE(review): 'rating' is overwritten with the integer 'overall' value, exactly as the
# original cell did — confirm this is intended rather than a cast of 'rating' itself.
df_train['rating'] = df_train['overall'].astype(int)

In [None]:
df_train.to_csv('df_train.csv') #checkpoint

### 2.3.2 Further data preparation for LightFM

In [None]:
# df_train = pd.read_csv('../input/last-attempt/df_train.csv') #load from checkpoint
# df_test = pd.read_csv('../input/last-attempt/df_test.csv')
# with open('../input/last-attempt/model(1).pkl', 'rb') as f:
#     model = pickle.load(f)

In [None]:
# Per-row user side information: the user id plus the 'verified' and 'vote' columns.
features_user_train = df_train[['userid', 'verified', 'vote']]
# Per-row item side information: the item id, its main category and the token-count columns.
# 'bag' ... 'tea' match the title feature columns displayed earlier; 'five' ... 'tast' are
# presumably the summary feature columns — verify against summary_features.
features_item_train = df_train[['itemid', 'main_cat', 'five',
                                'good', 'great', 'love', 'star',
                                'tast','bag', 'bags', 'box', 'chocolate',
                                'coffee', 'count', 'dark', 'free', 'gluten', 'organic',
                                'ounce', 'oz', 'pack', 'sugar', 'tea']]
# From here on df_train holds only the interaction triple, in this column order.
df_train = df_train[['userid','itemid','rating']]
# User/item pairs for which predictions are required.
df_test = test[['userid','itemid']]

In [None]:
del(test)

#### Item features building

In [None]:
def get_items(item, features):
    """List every distinct ``"column:value"`` feature label found in a feature table.

    Parameters
    ----------
    item : str
        Name of the id column; it is excluded from the labels.
    features : pandas.DataFrame
        Table whose remaining columns hold the feature values.

    Returns
    -------
    list of str
        Labels grouped by column (in column order); within a column the values keep
        their order of first appearance.
    """
    item_f = []
    for column in features.drop([item], axis=1):
        # unique() is evaluated once per column. The labels are no longer printed one by
        # one: that was debugging output which flooded the cell for high-cardinality columns.
        item_f.extend(f"{column!s}:{value!s}" for value in features[column].unique())
    return item_f

# Collect the feature labels that are registered with the LightFM Dataset below.
item_f_train = get_items('itemid', features_item_train)
user_f_train = get_items('userid', features_user_train)
# The original cell also built item_f_test / user_f_test from `features_item_test` and
# `features_user_test`. Those frames are not created anywhere in this notebook, so the calls
# raised NameError on a fresh run, and their results were not used afterwards: the model is
# fitted and queried with the training feature matrices only.

In [None]:
from lightfm.data import Dataset
# Register every known user id, item id and user/item feature label with the Dataset so it
# can build its internal id/feature mappings.
dataset1 = Dataset()
dataset1.fit(
        df_train['userid'].unique(), # all the users
        df_train['itemid'].unique(), # all the items
        user_features = user_f_train, # "column:value" labels for users
        item_features = item_f_train) # "column:value" labels for items

In [None]:
# Feed (first, second, third) value of every df_train row to build_interactions; with the
# column order set earlier this is (userid, itemid, rating). Two matrices come back, named
# `interactions` and `weights` here.
(interactions, weights) = dataset1.build_interactions([(x[0], x[1], x[2]) for x in df_train.values ])

In [None]:
item_pattern = [x + ':' for x in features_item_train.drop(['itemid'], axis=1)]

def make_feat_list(llist, pattern):
    """Pair each feature prefix with its value to form ``"name:value"`` labels.

    Parameters
    ----------
    llist : sequence
        Feature values of one row, in column order.
    pattern : sequence of str
        Matching prefixes such as ``"main_cat:"``, one per value.

    Returns
    -------
    list of str
        ``prefix + str(value)`` for each aligned pair; pairing stops at the shorter input.
    """
    # Use the `pattern` argument. The previous body ignored it and always read the global
    # `item_pattern`, so user rows were labelled with item column names.
    return [str(prefix) + str(value) for prefix, value in zip(pattern, llist)]

In [None]:
# Feature values of every row (id column removed), converted to plain Python lists.
ad_subset = features_item_train.drop(['itemid'], axis=1)
ad_list = ad_subset.values.tolist()
# One list of "name:value" labels per row.
item_feature_list = [make_feat_list(row_values, item_pattern) for row_values in ad_list]
print(f'Sample: {item_feature_list[0:5]}')

In [None]:
item_tuple = list(zip(features_item_train.itemid, item_feature_list))
print(f'Sample:{item_tuple[0:5]}')

In [None]:
item_features_train = dataset1.build_item_features(item_tuple, normalize= False)
# Display the sparse matrix summary (shape, dtype, number of stored entries) instead of
# calling .todense(): densifying allocates every cell of the matrix just to render a preview,
# which can exhaust memory at this data size.
item_features_train

#### Building user features

In [None]:
user_pattern = [x + ':' for x in features_user_train.drop(['userid'], axis=1)]
ad_subset = features_user_train.drop(['userid'], axis=1)
# Read the values column by column so each keeps its own dtype. Going through `.values`
# would upcast columns of different numeric dtypes to a common one (e.g. 1 -> 1.0), and the
# resulting labels would no longer match the per-column labels registered with the Dataset.
ad_list = list(zip(*(ad_subset[column].tolist() for column in ad_subset.columns)))
# One list of "name:value" labels per row, built with the *user* prefixes. The original
# loop appended to item_feature_list, which left user_feature_list (and user_tuple) empty.
user_feature_list = [
    [prefix + str(value) for prefix, value in zip(user_pattern, row_values)]
    for row_values in ad_list
]
print(f'Sample: {user_feature_list[0:5]}')

user_tuple = list(zip(features_user_train.userid, user_feature_list))

In [None]:
user_features_train = dataset1.build_user_features(user_tuple, normalize= False)
# Display the sparse matrix summary (shape, dtype, number of stored entries) instead of
# calling .todense(): densifying allocates every cell of the matrix just to render a preview,
# which can exhaust memory at this data size.
user_features_train


### 2.3.3 Model creation for production

In [None]:

# Mappings built by the Dataset: user id, user feature, item id and item feature mappings
# (in that order). user_id_map / item_id_map are used later to translate test ids.
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset1.mapping()

# Model configured from the constants defined in the global settings cell.
model = LightFM(
    learning_rate=LR,
    loss=LOSS_FUNCTION,
    no_components=NUM_COMPONENTS,
    learning_schedule = LEARNING_SCHEDULE,
    random_state = RANDOM_STATE
)

from tqdm.notebook import tqdm
# NOTE(review): pbar is created but never updated or passed to fit() in this notebook.
pbar = tqdm()

model.fit(interactions, # sparse matrix of user-item interactions
    user_features = user_features_train,
    item_features = item_features_train, # user and item feature sparse matrices
    sample_weight = weights, # per-interaction weights returned by build_interactions
    epochs=NUM_EPOCHS)

### 2.3.3 Precision and recall at k score calculation

In [None]:
# Both metrics are evaluated on the training interactions (there is no hold-out split in this
# notebook), so they describe the fit rather than generalisation.
# The evaluation functions expect the sparse interaction matrix; the original cell passed the
# `df_train` DataFrame instead.
prec_score = precision_at_k(
    model,
    interactions,
    num_threads=NUM_THREADS,
    k=10,
    item_features=item_features_train,
    user_features=user_features_train).mean()

# Store the result under its own name: assigning it to `recall_at_k` replaced the imported
# function with a number, so the cell failed when it was run a second time.
rec_score = recall_at_k(
    model,
    interactions,
    num_threads=NUM_THREADS,
    k=10,
    user_features=user_features_train,
    item_features=item_features_train).mean()

print(rec_score, prec_score)

### 2.3.4 Pickling model

In [None]:
import pickle

In [None]:
# Persist the trained model. The context manager closes the file even if pickling raises.
with open('model.pkl', 'wb') as model_pickle:
    pickle.dump(model, model_pickle)

### 2.3.5 Predict/submit

In [None]:
# Translate the raw ids into LightFM's internal indices.
user_idx = df_test['userid'].map(user_id_map)
item_idx = df_test['itemid'].map(item_id_map)

# Ids that were never seen during fitting have no internal index. Fail with a message that
# says so (same exception type as the plain dictionary lookup used before).
unseen = user_idx.isna() | item_idx.isna()
if unseen.any():
    raise KeyError(
        f"{int(unseen.sum())} test rows reference a user or item that is absent from the training data")

# predict() takes int32 index arrays, not pandas Series.
user_ids = user_idx.to_numpy(dtype=np.int32)
item_ids = item_idx.to_numpy(dtype=np.int32)
preds = model.predict(user_ids, item_ids, user_features=user_features_train, item_features=item_features_train)

In [None]:
preds.min(), preds.max()

In [None]:
# Min-max scaling: shift the scores so the minimum is 0, then divide by the largest
# shifted score.
preds_shifted = preds - preds.min()
normalized_preds = preds_shifted / preds_shifted.max()

In [None]:
normalized_preds.min(), normalized_preds.max()

In [None]:
submission = pd.read_csv('/kaggle/input/recommendationsv4/sample_submission.csv')
submission['rating']= normalized_preds
submission.to_csv('submission_log.csv', index=False)

In [None]:
submission

In [None]:
submission.to_csv('submission_log.csv', index=False)

### 2.3.6 Extruding embeddings for application

In [None]:
item_biases, item_embeddings = model.get_item_representations(features=item_features_train)
user_biases, user_embeddings = model.get_user_representations(features=user_features_train)


In [None]:
# Persist both embedding matrices. Each context manager closes its file even if pickling
# raises.
with open('item_emb.pkl', 'wb') as item_emb_pickle:
    pickle.dump(item_embeddings, item_emb_pickle)
with open('user_emb.pkl', 'wb') as user_emb_pickle:
    pickle.dump(user_embeddings, user_emb_pickle)

In [None]:
pip install nmslib

In [None]:
import nmslib
# Two HNSW nearest-neighbour indexes using cosine similarity: one for item embeddings,
# one for user embeddings.
nms_idx_i = nmslib.init(method='hnsw', space='cosinesimil')
nms_idx_u = nmslib.init(method='hnsw', space='cosinesimil')
 
# Add the embedding vectors to their index and build it.
nms_idx_i.addDataPointBatch(item_embeddings)
nms_idx_i.createIndex(print_progress=True)
nms_idx_u.addDataPointBatch(user_embeddings)
nms_idx_u.createIndex(print_progress=True)

In [None]:
#getting nearest items on the graph
def nearest_items_nms(itemid, index, n=10):
    """Query ``index`` for the ``n`` nearest neighbours of one item embedding.

    ``itemid`` selects the query vector from the global ``item_embeddings``; the value
    returned by ``index.knnQuery`` is passed back unchanged.
    """
    query_vector = item_embeddings[itemid]
    return index.knnQuery(query_vector, k=n)

def nearest_items_nms_u(itemid, index, n=10):
    """Query ``index`` for the ``n`` nearest neighbours of one user embedding.

    Despite its name, ``itemid`` selects the query vector from the global
    ``user_embeddings``; the value returned by ``index.knnQuery`` is passed back unchanged.
    """
    query_vector = user_embeddings[itemid]
    return index.knnQuery(query_vector, k=n)

In [None]:
# Query the user index with row 14112 of `user_embeddings`; [0] keeps the first element of
# the knnQuery result.
# NOTE(review): the neighbours come from the *user* index, yet they are matched against
# df_train.itemid below, and they look like positions in the embedding matrix rather than
# raw ids — confirm which lookup was intended.
nbm = nearest_items_nms_u(14112,nms_idx_u)[0]
df_train[df_train.itemid.isin(nbm)]

### 3. Conclusion

* Score действительно падает при добавлении матрицы фичей, но, по моему мнению, всё равно важно опираться на признаки, относящиеся к составу и специфике продукта. В этом плане достаточно информативными оказались заголовки, токенизацией которых я и генерировал большинство признаков. Поэтому я не делал упор на достижение показателя, сильно большего, чем бейзлайн.
* В качестве метрики я бы скорее использовал не ROC AUC, а precision at k и recall at k, так как класс «релевантных товаров» для нас интереснее, чем класс «нерелевантных». Нам интересно, насколько хорошо модель вычисляет именно релевантные товары при данных базовых вероятностях.
* Проблемой оказалась достаточно большая ресурсоемкость предподготовки модели и самой модели, что отчасти решалось удалением неиспользуемых переменных.