In [1]:
from typing import Tuple, Dict, Any
!pip install -r ../../../requirements.txt



In [2]:
import os
import sys

parent_dir = os.path.abspath('..')
sys.path.insert(0, parent_dir)

import pandas as pd
import pickle
import kagglehub
import numpy as np
from scipy.sparse import csr_matrix

from library.rating import get_explicit_rating, get_implicit_rating_out_of_positive_ratings

from collections import defaultdict

# Feature selection

The only dataset that is necessary for our purposes is **review** dataset since:
- it contains the information about explicit ratings (the mean of the field **stars** for pairs of users and items, check the chapter **Feature engineering** for more details)
- it contains the information for implicit rating (check the chapter **Feature engineering** for more details)
- it already contains only those users who provided at least one review and those items that received at least one estimation

Normally datasets are downloaded from the Kaggle, but in the purpose of time-saving the local path placed instead.
If you want to download the datasets again, just uncomment the code below.

In [3]:
path = "/Users/simon/.cache/kagglehub/datasets/yelpprojectpurpose/yelp-parquet/versions/1"

# path = kagglehub.dataset_download("yelpprojectpurpose/yelp-parquet")
print("Path to dataset files:", path)

Path to dataset files: /Users/simon/.cache/kagglehub/datasets/yelpprojectpurpose/yelp-parquet/versions/1


In [4]:
!ls $path

Attribute.parquet Checkin.parquet   Review.parquet    User .parquet
Business.parquet  Hours.parquet     Tip.parquet


Reviews' features:
- `review_id` | `user_id` | `business_id` - id of the review and foreign keys (one user can leave several reviews for one item)
- `stars` - **explicit rating** provided by user for the particular item in the particular moment
- `useful` | `funny` | `cool`  - user's flags about (presumably) review. We don't drop this feature since it's necessary to get any evidences that theory about the nature of the feature is right - check the **opportunity of usage them for implicit rating** 
- `text` - the content of review (can be useful for potential sentimental analysis)
- `date` - the timestamp of review

In [5]:
review_df = pd.read_parquet(f"{path}/Review.parquet", engine="pyarrow")
review_df

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0
...,...,...,...,...,...,...,...,...,...
5261663,PoGSiNz1X5SUu0qEt-qM5w,mPjPyipaD0C_myqWqDipZg,Ngk84Ax1tXgpoJFEGxot3w,1,2011-11-21,"Bought groupon $39 for 4 months from groupon, ...",36,3,3
5261664,-CJNPrDWgIkorx4iEZJXIg,mPjPyipaD0C_myqWqDipZg,pOEL97ld-FJMKO8Ki8JmYg,3,2016-04-30,"Spring rolls was pretty good, cod was a bit ra...",0,0,0
5261665,W9eVvOcpBvG6lpJPoJOxuA,mPjPyipaD0C_myqWqDipZg,5ubokMNw8qfbX2WtxgJG1Q,4,2011-10-23,"Had a 8 dish set meal, was enough for 10 peopl...",3,0,1
5261666,hqQ1UTFKMN2P1ezUow48OQ,mPjPyipaD0C_myqWqDipZg,EO3i5kTUG7_S2OIQ23sdSA,3,2011-11-07,"A small, cozy family run Authentic korean rest...",2,0,0


Reasons of feature dropping:
- since `useful | funny | cool` features describe the preferences of other users about this particular review, not an item, they can't be used for calculations of **explicit** and **implicit** ratings (our assumption is that these features are reactions that user can give to review that theoretically describes the user-to-user relations) 
- `review_id` won't be dropped, but will be used for indexing since it's unique field
- `text` of review won't be used for implicit or explicit ratings so this feature can be also dropped 

In [6]:
REMAINED_FEATURES = ['review_id', 'user_id', 'business_id', 'stars', 'date']

filtered_review_df = review_df[REMAINED_FEATURES]
filtered_review_df.set_index('review_id', inplace=True)
filtered_review_df

Unnamed: 0_level_0,user_id,business_id,stars,date
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28
n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28
MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28
IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28
L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28
...,...,...,...,...
PoGSiNz1X5SUu0qEt-qM5w,mPjPyipaD0C_myqWqDipZg,Ngk84Ax1tXgpoJFEGxot3w,1,2011-11-21
-CJNPrDWgIkorx4iEZJXIg,mPjPyipaD0C_myqWqDipZg,pOEL97ld-FJMKO8Ki8JmYg,3,2016-04-30
W9eVvOcpBvG6lpJPoJOxuA,mPjPyipaD0C_myqWqDipZg,5ubokMNw8qfbX2WtxgJG1Q,4,2011-10-23
hqQ1UTFKMN2P1ezUow48OQ,mPjPyipaD0C_myqWqDipZg,EO3i5kTUG7_S2OIQ23sdSA,3,2011-11-07


# Train / validation / test split

# Feature engineering 

Below predefined constants for this section

In [7]:
# The threshold for implicit ratings calculations (only positive ratings are considered and explicit ratings are from 1 to 5)
IMPLICIT_THRESHOLD = 4

## Explicit rating 

### Train data

In [8]:
explicit_ratings, user_to_index, item_to_index = get_explicit_rating(filtered_review_df, "user_id", "business_id", "stars")
explicit_ratings

<1326101x174567 sparse matrix of type '<class 'numpy.int64'>'
	with 5261666 stored elements in Compressed Sparse Row format>

In [9]:
implicit_ratings = get_implicit_rating_out_of_positive_ratings(filtered_review_df, "user_id", "business_id", "stars", IMPLICIT_THRESHOLD)
implicit_ratings

defaultdict(dict,
            {'---1lKK3aKOuomHnwAkAow': {'--9e1ONYQuAa-CB_Rrw7Tw': 1,
              '-ErwgUmZ1-jHW_rSu55jlg': 1,
              '1Vn_lex3LGGwuTo-xeJnww': 1,
              '2Cs9bSN-fMnY3H-0pFP1mg': 1,
              '2VNa2kbbt4o8nQmPKIcHVQ': 1,
              '4RoTEeqB_MNn6yaqZmlZHg': 1,
              '5FPQOwwPkBEiy8df8d0SPQ': 1,
              '5aeR9KcboZmhDZlFscnYRA': 1,
              '5rxJpTkeJa5rxMvL2NbSnQ': 1,
              'A0X1baHPgw9IiBRivu0G9g': 1,
              'AZlnpvILz5cEWJifjr2CSQ': 1,
              'Bz8iVsCAv-8t8FsQLCXFPQ': 1,
              'CWNMLT-ppaUjLMmrnYDPVg': 1,
              'CeqWpwHBoaxwRcv5btnv6g': 1,
              'D1PhUlkQA1ZsVe9Cx4yqOw': 1,
              'DV13F0bhe55dV1AhwoO50g': 1,
              'DXlDzOcpdUE_F21tok0fgw': 1,
              'E3m-twP4h0-qzakUTBYDpw': 1,
              'GGCVNcBQ9WGviYNiaR8tBw': 1,
              'GT0K4EdSSxe_LMU6SPr-_A': 1,
              'Gaasy9YbPGVc8KcXcAIqEw': 1,
              'Gdv3qhsDeQzZ2Ag-Tzq6vA': 1,
          

### Validation data

### Test data

## Implicit rating

# Hyperparameters tuning

# Model testing

# Model saving 

The following code saves the result object to reuse **the trained model** in the service

In [10]:
with open("./models/svd_pp.pkl", "wb") as f:
    pickle.dump(svd_pp, f)

NameError: name 'svd_pp' is not defined