## Setup

In [4]:
import os
# Repository coordinates: GitHub account, repository name and working branch.
account = "sparsh-ai"
project_name = "reco-chef"
branch = "30music"

# Local checkout location of the repository.
project_path = os.path.join('/content', project_name)

In [5]:
# First run in this session: bootstrap the checkout. Otherwise just enter it.
if not os.path.exists(project_path):
    # Install DVC together with its Google Drive extra.
    !pip install -U -q dvc dvc[gdrive]
    # Copy the stored git credential files from the mounted Drive folder into the home directory.
    !cp -r /content/drive/MyDrive/git_credentials/. ~
    # Same location as project_path, rebuilt here by string concatenation.
    path = "/content/" + project_name; 
    !mkdir "{path}"
    %cd "{path}"
    # Start an empty repository, point it at the GitHub remote, then fetch and
    # switch to the working branch.
    !git init
    !git remote add origin https://github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout "{branch}"
else:
    # Checkout already exists: only change into it.
    %cd "{project_path}"

/content/reco-chef
Initialized empty Git repository in /content/reco-chef/.git/
remote: Enumerating objects: 198, done.
remote: Counting objects: 100% (198/198), done.
remote: Compressing objects: 100% (134/134), done.
remote: Total 198 (delta 63), reused 176 (delta 42), pack-reused 0
Receiving objects: 100% (198/198), 50.75 KiB | 12.69 MiB/s, done.
Resolving deltas: 100% (63/63), done.
From https://github.com/sparsh-ai/reco-chef
 * branch            30music    -> FETCH_HEAD
 * [new branch]      30music    -> origin/30music
Branch '30music' set up to track remote branch '30music' from 'origin'.
Switched to a new branch '30music'


In [32]:
# Inspect pending working-tree changes before committing.
!git status

On branch 30music
Your branch is up to date with 'origin/30music'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	modified:   params.yaml
	modified:   src/eval/__init__.py
	deleted:    src/eval/itempop.py
	modified:   src/train.py

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	src/evaluate.py
	src/models/knn.py

no changes added to commit (use "git add" and/or "git commit -a")


In [33]:
# Stage every change, commit it, and push to the working branch on origin.
# Each step only runs if the previous one succeeded (&&).
!git add . && git commit -m 'commit' && git push origin "{branch}"

[30music 8bdd8b4] commit
 6 files changed, 2495 insertions(+), 65 deletions(-)
 delete mode 100644 src/eval/itempop.py
 create mode 100644 src/evaluate.py
 create mode 100644 src/models/knn.py
 rewrite src/train.py (64%)
Counting objects: 10, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (10/10), done.
Writing objects: 100% (10/10), 10.91 KiB | 5.45 MiB/s, done.
Total 10 (delta 4), reused 0 (delta 0)
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.
To https://github.com/sparsh-ai/reco-chef.git
   f57d404..8bdd8b4  30music -> 30music


In [None]:
# Check whether DVC-tracked data and stages are up to date.
!dvc status

In [6]:
!pip install -q -U -e .

In [8]:
# Reload edited project modules automatically, so changes under src/ are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
# Download the sessions sample file referenced by this .dvc pointer.
!dvc pull ./data/bronze/30music/sessions_sample_10.parquet.snappy.dvc

In [16]:
# Run the preparation script on the bronze sample. The second argument is
# presumably the output directory (the train/test parquet files are later read
# from data/silver/30music) -- TODO confirm against src/prepare.py.
!python src/prepare.py data/bronze/30music/sessions_sample_10.parquet.snappy data/silver/30music

Number of items: 1000
Number of users: 4165
Number of sessions: 6765
Session length:
	Average: 4.29
	Median: 3.0
	Min: 1
	Max: 148
Sessions per user:
	Average: 1.62
	Median: 1.0
	Min: 1
	Max: 13
Most popular items: [('443', 207), ('1065', 155), ('67', 146), ('2308', 138), ('658', 131)]
Train sessions: 2600 - Test sessions: 4165


## Development

In [None]:

import operator

from src.models.abstract import ISeqRecommender


class PopularityRecommender(ISeqRecommender):
    """Non-personalised baseline that ranks items by training-set frequency.

    After ``fit`` the attribute ``top`` holds ``([item], count)`` pairs sorted
    by descending count. The same list is returned for every user.
    """

    def __init__(self):
        super().__init__()

    def fit(self, train_data):
        """Count how often each item occurs across all training sequences.

        Args:
            train_data: object indexable by ``'sequence'`` whose ``.values``
                is an iterable of item sequences.
        """
        # Function-scope import keeps this block self-contained.
        from collections import Counter

        sequences = train_data['sequence'].values
        counts = Counter(item for sequence in sequences for item in sequence)

        # most_common() sorts by descending count and keeps first-seen order
        # for ties, which matches a stable sort over the insertion-ordered dict.
        # Each item is wrapped in a one-element list.
        self.top = [([item], count) for item, count in counts.most_common()]

    def recommend(self, user_profile, user_id=None):
        """Return the global popularity ranking; both arguments are ignored."""
        return self.top

    def get_popular_list(self):
        """Return the ``([item], count)`` ranking computed by ``fit``."""
        return self.top

### Train

In [17]:
import os
import sys
import pickle
import pandas as pd

from src.models import PopularityRecommender, KNNRecommender


# import yaml
# params = yaml.safe_load(open("params.yaml"))["train"]
# modelname = params['model_name']


def load_data(datapath):
    """Read the train and test splits stored as parquet files under ``datapath``.

    Returns:
        ``(train, test)``: the results of ``pd.read_parquet`` for
        ``train.parquet.snappy`` and ``test.parquet.snappy`` respectively.
    """
    train, test = (
        pd.read_parquet(os.path.join(datapath, filename))
        for filename in ('train.parquet.snappy', 'test.parquet.snappy')
    )
    return train, test


def train_model(modelname, train):
    """Fit and return the recommender registered under ``modelname``.

    Args:
        modelname: registry key; only ``'itempop'`` is available here.
        train: training data handed to the model's ``fit``.

    Raises:
        KeyError: if ``modelname`` is not a registered key.
    """
    registry = {'itempop': PopularityRecommender()}
    chosen = registry[modelname]
    chosen.fit(train)
    return chosen


def save_model(model, modelname, modelpath):
    """Pickle ``model`` to ``<modelpath>/<modelname>.pkl``.

    Args:
        model: any picklable object.
        modelname: file stem of the artifact.
        modelpath: target directory; created (with parents) if missing.
    """
    # An empty modelpath means "current directory"; makedirs('') would raise.
    if modelpath:
        os.makedirs(modelpath, exist_ok=True)
    target = os.path.join(modelpath, modelname + '.pkl')
    # The context manager guarantees the handle is flushed and closed.
    with open(target, 'wb') as fh:
        pickle.dump(model, fh)


if __name__ == "__main__":
    # settings for this run (hard-coded while developing)
    datapath, modelpath = './data/silver/30music', './artifacts/30music/models'
    modelname = 'itempop'
    # read both splits; only the training split is needed for fitting
    train, test = load_data(datapath)
    # fit the selected model, then persist it
    model = train_model(modelname, train)
    save_model(model, modelname, modelpath)

### Evaluate

In [None]:
import os
import sys
import pickle
import pandas as pd

from src.eval import SequentialEvaluator


def load_data(datapath):
    """Load the train/test parquet splits found in ``datapath``.

    Returns:
        ``(train, test)`` as read by ``pd.read_parquet`` from
        ``train.parquet.snappy`` and ``test.parquet.snappy``.
    """
    def _read(filename):
        return pd.read_parquet(os.path.join(datapath, filename))

    return _read('train.parquet.snappy'), _read('test.parquet.snappy')


def load_model(modelpath):
    """Unpickle and return the object stored at ``modelpath``.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only load artifacts written by this project's own training step.
    """
    # The context manager closes the handle even if unpickling fails.
    with open(modelpath, 'rb') as fh:
        return pickle.load(fh)


def save_results(evaluator, resultspath):
    """Run the evaluator's four evaluations and pickle the outcomes.

    Args:
        evaluator: object exposing ``eval_seqreveal``, ``eval_staticprofile``,
            ``eval_reclength`` and ``eval_profilelength``.
        resultspath: destination file; its parent directory is created if
            missing.
    """
    # Dict displays evaluate left to right, so the calls run in this order.
    results = {
        'seq_reveal': evaluator.eval_seqreveal(),
        'static_profile': evaluator.eval_staticprofile(),
        'rec_length': evaluator.eval_reclength(),
        'profile_length': evaluator.eval_profilelength(),
    }
    parent = os.path.dirname(resultspath)
    # A bare filename has no parent component; makedirs('') would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # The context manager guarantees the handle is flushed and closed.
    with open(resultspath, 'wb') as fh:
        pickle.dump(results, fh)


if __name__ == "__main__":
    # fixed locations for this development run
    datapath = './data/silver/30music'
    modelpath, resultspath = ('./artifacts/30music/models/itempop.pkl',
                              './artifacts/30music/results/itempop.pkl')
    # read the splits and restore the trained model
    train, test = load_data(datapath)
    model = load_model(modelpath)
    # evaluate the model and store the results
    evaluator = SequentialEvaluator(train, test, model)
    save_results(evaluator, resultspath)

## Scripting

In [None]:
import os
import sys
import yaml
import pickle
import pandas as pd

from src.models import PopularityRecommender, KNNRecommender


params = yaml.safe_load(open("params.yaml"))["train"]


def load_data(datapath):
    """Return ``(train, test)`` read from the parquet splits in ``datapath``.

    The files read are ``train.parquet.snappy`` and ``test.parquet.snappy``,
    in that order, via ``pd.read_parquet``.
    """
    split_files = ('train.parquet.snappy', 'test.parquet.snappy')
    return tuple(
        pd.read_parquet(os.path.join(datapath, name)) for name in split_files
    )


def train_model(modelname, train):
    """Build the recommender registered under ``modelname`` and fit it.

    Args:
        modelname: ``'itempop'`` or ``'knn'``.
        train: training data handed to the model's ``fit``.

    Returns:
        The fitted model instance.

    Raises:
        KeyError: if ``modelname`` is not a registered key.
    """
    # Register factories rather than instances, so only the requested model
    # is ever constructed.
    factories = {'itempop': PopularityRecommender,
                 'knn': lambda: KNNRecommender(model='sknn', k=10)}
    model = factories[modelname]()
    model.fit(train)
    return model


def save_model(model, modelname, modelpath):
    """Pickle ``model`` to ``<modelpath>/<modelname>.pkl``.

    Args:
        model: any picklable object.
        modelname: file stem of the artifact.
        modelpath: target directory; created (with parents) if missing.
    """
    # An empty modelpath means "current directory"; makedirs('') would raise.
    if modelpath:
        os.makedirs(modelpath, exist_ok=True)
    target = os.path.join(modelpath, modelname + '.pkl')
    # The context manager guarantees the handle is flushed and closed.
    with open(target, 'wb') as fh:
        pickle.dump(model, fh)


if __name__ == "__main__":
    # Fail with a usage message instead of a bare IndexError when the two
    # positional arguments are missing.
    if len(sys.argv) < 3:
        sys.exit("usage: {} <datapath> <modelpath>".format(sys.argv[0]))
    # load the params: model name from params.yaml, paths from the command line
    modelname = params['model_name']
    datapath, modelpath = sys.argv[1:3]
    # load the data
    train, test = load_data(datapath)
    # train the model
    model = train_model(modelname, train)
    # save the model
    save_model(model, modelname, modelpath)

In [None]:
import os
import sys
import pickle
import pandas as pd

from src.eval import SequentialEvaluator


def load_data(datapath):
    """Read ``train.parquet.snappy`` and ``test.parquet.snappy`` from ``datapath``.

    Returns:
        ``(train, test)`` as produced by ``pd.read_parquet``.
    """
    split_paths = [
        os.path.join(datapath, 'train.parquet.snappy'),
        os.path.join(datapath, 'test.parquet.snappy'),
    ]
    train, test = map(pd.read_parquet, split_paths)
    return train, test


def load_model(modelpath):
    """Unpickle and return the object stored at ``modelpath``.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only load artifacts written by this project's own training step.
    """
    # The context manager closes the handle even if unpickling fails.
    with open(modelpath, 'rb') as fh:
        return pickle.load(fh)


def save_results(evaluator, resultspath):
    """Run the evaluator's four evaluations and pickle the outcomes.

    Args:
        evaluator: object exposing ``eval_seqreveal``, ``eval_staticprofile``,
            ``eval_reclength`` and ``eval_profilelength``.
        resultspath: destination file; its parent directory is created if
            missing.
    """
    # Dict displays evaluate left to right, so the calls run in this order.
    results = {
        'seq_reveal': evaluator.eval_seqreveal(),
        'static_profile': evaluator.eval_staticprofile(),
        'rec_length': evaluator.eval_reclength(),
        'profile_length': evaluator.eval_profilelength(),
    }
    parent = os.path.dirname(resultspath)
    # A bare filename has no parent component; makedirs('') would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # The context manager guarantees the handle is flushed and closed.
    with open(resultspath, 'wb') as fh:
        pickle.dump(results, fh)


if __name__ == "__main__":
    # Fail with a usage message instead of a bare IndexError when the three
    # positional arguments are missing.
    if len(sys.argv) < 4:
        sys.exit("usage: {} <datapath> <modelpath> <resultspath>".format(sys.argv[0]))
    # load the params from the command line
    datapath, modelpath, resultspath = sys.argv[1:4]
    # load the data
    train, test = load_data(datapath)
    # load the model
    model = load_model(modelpath)
    # evaluate and save the results
    evaluator = SequentialEvaluator(train, test, model)
    save_results(evaluator, resultspath)