In [1]:
import os
project_name = "reco-tut-cris"; branch = "main"; account = "sparsh-ai"
project_path = os.path.join('/content', project_name)

if not os.path.exists(project_path):
    !cp /content/drive/MyDrive/mykeys.py /content
    import mykeys
    !rm /content/mykeys.py
    path = "/content/" + project_name; 
    !mkdir "{path}"
    %cd "{path}"
    import sys; sys.path.append(path)
    !git config --global user.email "recotut@recohut.com"
    !git config --global user.name  "reco-tut"
    !git init
    !git remote add origin https://"{mykeys.git_token}":x-oauth-basic@github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout main
else:
    %cd "{project_path}"

/content/reco-tut-cris
Initialized empty Git repository in /content/reco-tut-cris/.git/
remote: Enumerating objects: 26, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 26 (delta 5), reused 22 (delta 2), pack-reused 0[K
Unpacking objects: 100% (26/26), done.
From https://github.com/sparsh-ai/reco-tut-cris
 * branch            main       -> FETCH_HEAD
 * [new branch]      main       -> origin/main
Branch 'main' set up to track remote branch 'main' from 'origin'.
Switched to a new branch 'main'


In [2]:
import os
import sys
import csv
import pdb
import copy
import random
import numpy as np
import itertools
from collections import Counter

In [3]:
def replace_id2idx(trn, vld, tst):
    """Replace raw user/item IDs with contiguous integer indices.

    The index vocabularies are built from the training split only and then
    applied to all three splits.

    Args:
        trn, vld, tst: sequences of 4-field rows ``(user, item, rating, timestamp)``.
            IDs must be mutually sortable (they are strings when the rows come
            from ``csv.reader``).

    Returns:
        ``(trn, vld, tst, user_dict, item_dict)`` where each split is a new list
        of ``[user_idx, item_idx, rating]`` rows (timestamp dropped, rating left
        untouched) and the dicts map raw ID -> index.

    Raises:
        KeyError: a validation/test row references a user or item that does not
            occur in the training split.
        ValueError: a row does not have exactly four fields.
    """

    def build_dict(category):
        # Sort the unique IDs before numbering them: iterating a plain set of
        # strings yields a different order in every interpreter session (hash
        # randomisation), which made the saved indices irreproducible.
        return {c: i for i, c in enumerate(sorted(set(category)))}

    def id2idx(uir, udict, idict):  # Convert IDs in string into IDs in numbers
        newuir = []
        for user, item, rating, _ in uir:  # Fourth element is a time stamp for the interaction
            newuir.append([udict[user], idict[item], rating])
        return newuir

    user_dict = build_dict(row[0] for row in trn)
    item_dict = build_dict(row[1] for row in trn)

    trn = id2idx(trn, user_dict, item_dict)
    vld = id2idx(vld, user_dict, item_dict)
    tst = id2idx(tst, user_dict, item_dict)

    return trn, vld, tst, user_dict, item_dict

In [4]:
def load_raw_data(fn):
    """Read a CSV file and return all of its rows.

    Args:
        fn: path of the CSV file.

    Returns:
        A list with one entry per CSV record, each a list of string fields.
    """
    print('Load ' + fn)
    # Use a context manager so the handle is closed once the rows are read
    # (the bare open() leaked it). newline='' lets the csv module do its own
    # line-ending handling, as its documentation requires.
    with open(fn, newline='') as f:
        rawdata = list(csv.reader(f))
    return rawdata

In [5]:
def find_negatives(dataset, num_neg=100, seed=None):
    """Attach randomly sampled negative items to every validation/test row.

    For each evaluation row ``[user, item, rating]`` the rating is dropped and
    ``num_neg`` distinct items are appended, drawn from the training items that
    are neither the row's target item nor any item the user has in the training
    split. The validation and test lists are modified in place.

    Args:
        dataset: ``[trn, vld, tst]``; every row must have exactly three elements.
        num_neg: number of negatives appended per row (default 100).
        seed: optional seed for a private random generator. ``None`` (default)
            draws from the global ``random`` module state.

    Returns:
        ``(trn, vld, tst)``: ``trn`` unchanged, the other two with rows of the
        form ``[user, item, neg_1, ..., neg_num_neg]``.

    Raises:
        ValueError: fewer than ``num_neg`` candidate items exist for some row.
    """
    trn, vld, tst = dataset
    rng = random if seed is None else random.Random(seed)

    allitems = set(row[1] for row in trn)

    uidict = {}  # {u: [items consumed by user u]}
    for user, item, _rating in trn:
        uidict.setdefault(user, []).append(item)

    def _append_negatives(rows):
        # Shared by the validation and test splits; rewrites each row in place.
        for i, row in enumerate(rows):
            user, item, _ = row

            # Target item and the user's consumed items must never be sampled.
            # A user without training interactions simply has no history.
            useritems = set(uidict.get(user, ())) | {item}
            candidates = list(allitems - useritems)
            if len(candidates) < num_neg:
                raise ValueError(
                    'Cannot sample {} negatives for user {!r}: only {} candidate items'
                    .format(num_neg, user, len(candidates)))

            # Drop the rating and append the negative items for evaluation
            rows[i] = row[:-1] + rng.sample(candidates, num_neg)

    _append_negatives(vld)
    _append_negatives(tst)

    return trn, vld, tst

In [6]:
data_path = './data/silver/amazon_tools/'

print('\n🧰 Building a dataset for training the recommender system \n')

# Seed the global RNG so the negative sampling below is repeatable across runs.
SEED = 42
random.seed(SEED)

# Pick the split files by the keyword contained in their file names. The
# listing is sorted so that the choice is stable if several names match.
trndata_name = vlddata_name = tstdata_name = None
for fn in sorted(os.listdir(data_path)):
    if 'train' in fn: trndata_name = data_path+fn
    if 'valid' in fn: vlddata_name = data_path+fn
    if 'test' in fn: tstdata_name = data_path+fn

# Fail with a clear message instead of a NameError further down
missing_splits = [split for split, name in (('train', trndata_name),
                                            ('valid', vlddata_name),
                                            ('test', tstdata_name)) if name is None]
if missing_splits:
    raise FileNotFoundError('No file for split(s) {} in {}'.format(missing_splits, data_path))

# Load datasets and review features from csv format
trndata = load_raw_data(trndata_name)
vlddata = load_raw_data(vlddata_name)
tstdata = load_raw_data(tstdata_name)

# org_* keep the index-mapped evaluation rows without negatives
trndata, org_vlddata, org_tstdata, user2id_dict, item2id_dict = replace_id2idx(trndata, vlddata, tstdata)

# find_negatives rewrites the evaluation rows in place, hence the deep copies.
# (The first target used to be misspelled `trndat`.)
trndata, vlddata, tstdata = find_negatives([trndata, copy.deepcopy(org_vlddata), copy.deepcopy(org_tstdata)])

print('\nTRAIN:{}\tVALID:{}\tTEST:{}'.format(len(trndata), len(vlddata), len(tstdata)))


🧰 Building a dataset for training the recommender system 

Load ./data/silver/amazon_tools/train.csv
Load ./data/silver/amazon_tools/valid.csv
Load ./data/silver/amazon_tools/test.csv

TRAIN:126568	VALID:3446	TEST:2956


In [7]:
print('\n📂 Starting to save datasets')
data_path = './data/gold/amazon_tools/'
os.makedirs(data_path, exist_ok=True)

# Pass the path to np.save so NumPy opens and closes each file itself; the
# previous open(..., 'wb') handles were never closed.
# Rows are cast via float before int because the rating column is still text
# at this point (presumably values such as '5.0' -- confirm against the CSVs).
np.save(data_path+'train.npy', np.array(trndata).astype(float).astype(int))
np.save(data_path+'valid.npy', np.array(vlddata).astype(float).astype(int))
np.save(data_path+'test.npy', np.array(tstdata).astype(float).astype(int))
# The dicts are stored as pickled object arrays; read them back with
# np.load(path, allow_pickle=True).item()
np.save(data_path+'user_dict.npy', user2id_dict)
np.save(data_path+'item_dict.npy', item2id_dict)

print('\nDatasets saved to the data directory: {}\n'.format(data_path))


📂 Starting to save datasets

Datasets saved to the data directory: ./data/gold/amazon_tools/



In [8]:
# Show the working-tree status of the repository in the current directory
!git status

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mdata/gold/[m

nothing added to commit but untracked files present (use "git add" to track)


In [9]:
# Stage everything under the current directory, commit it and push to
# origin/main; the && chain stops at the first step that fails.
!git add . && git commit -m 'ADD data in gold layer amazon tools' && git push origin main

[main aa75f54] ADD data in gold layer amazon tools
 5 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 data/gold/amazon_tools/item_dict.npy
 create mode 100644 data/gold/amazon_tools/test.npy
 create mode 100644 data/gold/amazon_tools/train.npy
 create mode 100644 data/gold/amazon_tools/user_dict.npy
 create mode 100644 data/gold/amazon_tools/valid.npy
Counting objects: 10, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (9/9), done.
Writing objects: 100% (10/10), 2.56 MiB | 1.96 MiB/s, done.
Total 10 (delta 1), reused 0 (delta 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/sparsh-ai/reco-tut-cris.git
   9da3184..aa75f54  main -> main
