In [None]:
# default_exp synth

# Synth Dataset Generator

> Module to generate Synthetic Datasets to perform tests

This module is used to initialize datasets to test the utils from this library

In [None]:
# hide
# Resolve the parent directory ('..' relative to the current working directory)
# and append it to sys.path once, presumably so the library package that lives
# next to this notebook can be imported -- TODO confirm the intended layout.
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
# Guard against adding the same entry again when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
# export
import pandas as pd
import numpy as np
import random
from lightfm import LightFM
from lightfm.data import Dataset
from fastcore.all import * 

# Seed Python's global `random` generator so the random draws made by the
# generators in this notebook are repeatable when the cells run in order.
# NOTE(review): this line sits in an exported cell, so the seed is presumably
# also applied whenever the generated module is imported -- confirm this is intended.
random.seed(42)

In [None]:
# export
class SynthDataset():
    '''
    Instance of two states of a dataset, one at time **T** and the other at time **T+1**,
    where some users and items could have been added or deleted, as well as their features.

    A new instance is empty: every attribute starts as an empty list or dict.
    '''
    def __init__(self):
        # Complete universes of ids / feature names.
        self.all_users = []
        self.all_items = []
        self.all_user_features = []
        self.all_item_features = []
        # Snapshots of the dataset at time T (`before`) and T+1 (`after`).
        self.before = {}
        self.after = {}
        # Ids that differ between the two snapshots.
        self.users_added = []
        self.users_deleted = []
        # Initialised here as well so that they can be read on a fresh instance
        # without raising AttributeError, mirroring the user attributes above.
        self.items_added = []
        self.items_deleted = []

In [None]:
# export
def gen_values(n_values=10, prefix='u'):
    """Return the ids `prefix0`, `prefix1`, ... as a list of `n_values` strings.

    Each id is `prefix` followed by its zero-based position, so the default
    call yields `['u0', ..., 'u9']`.
    """
    return [prefix + str(position) for position in range(n_values)]

In [None]:
# Default call: ten ids built with the 'u' prefix.
gen_values()

['u0', 'u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7', 'u8', 'u9']

In [None]:
# export
def gen_added_n_deleted(l_values, max_added=3, max_deleted=3):
    '''
    Randomly split some of `l_values` into values to add and values to delete.

    The values are visited in order and one number is drawn from the global
    `random` generator for each of them. A value goes to the deleted list while
    that list has room and the draw is below 0.8; otherwise it goes to the added
    list while that list has room and the draw is above 0.2; otherwise it is
    left in neither list.

    Returns the tuple `(added, deleted)`; each list holds at most `max_added`
    and `max_deleted` values respectively and no value appears in both.
    '''
    added, deleted = [], []
    for value in l_values:
        # One draw per value, even when both lists are already full, so the
        # state of the global generator advances once per element.
        draw = random.random()
        if len(deleted) < max_deleted and draw < 0.8:
            deleted.append(value)
            continue
        if len(added) < max_added and draw > 0.2:
            added.append(value)
    return added, deleted

In [None]:
# Example with the default limits (at most 3 added and 3 deleted values);
# the result depends on the current state of the global `random` generator.
gen_added_n_deleted(gen_values())

(['u3', 'u4', 'u5'], ['u0', 'u1', 'u2'])

In [None]:
# export
def exclude_element(l, values_to_exclude, shuffle=False):
    """Return a new list with the elements of **l** that are not in **values_to_exclude**.

    The surviving elements keep their original order unless `shuffle` is true,
    in which case the new list is shuffled in place with the global `random`
    generator. **l** itself is never modified. Elements must be hashable.
    """
    # Build the lookup set once instead of once per element.
    excluded = set(values_to_exclude)
    new_l = [x for x in l if x not in excluded]
    # Lists have no `.shuffle()` method; `random.shuffle` shuffles in place.
    if shuffle: random.shuffle(new_l)
    return new_l

In [None]:
# hide
# Sanity check: removing the values of `b` from `a` keeps the remaining
# values in their original order.
a = [1,2,3,4]
b = [2,3]

assert exclude_element(a, b) == [1,4], 'Both lists should be equal'

In [None]:
# export
def build_interactions(l1, l2, l1_col_name='user_id', l2_col_name='item_id', sparsity=0.5):
    '''
    Build a random table of interactions between the elements of l1 and l2.

    Every (l1 element, l2 element) pair is visited in nested order and kept when
    a draw from the global `random` generator is below `sparsity`, so `sparsity`
    acts as the probability of keeping each pair: 0 keeps none, 1 keeps all.

    Returns a `pandas.DataFrame` with one row per kept pair and the columns
    `l1_col_name` and `l2_col_name`.
    '''
    table = {l1_col_name: [], l2_col_name: []}
    for left in l1:
        for right in l2:
            # One draw per pair; pairs that fail the draw are skipped.
            if not random.random() < sparsity:
                continue
            table[l1_col_name].append(left)
            table[l2_col_name].append(right)
    return pd.DataFrame(table)

In [None]:
# hide
# Random user/item interactions between ten 'u' ids and ten 'i' ids,
# using the default sparsity of 0.5.
build_interactions(gen_values(prefix='u'), gen_values(prefix='i'))

Unnamed: 0,user_id,item_id
0,u0,i0
1,u0,i2
2,u0,i3
3,u0,i6
4,u0,i9
5,u1,i2
6,u1,i3
7,u1,i5
8,u1,i6
9,u1,i7


In [None]:
# export
def build_features_from_df(feature_interactions_df, element_id_column, feature_column, tolist=True):
    '''
    Group the features of each element into `(element, features)` tuples.

    Elements are taken from `element_id_column` in order of first appearance.
    For each one, the distinct values of `feature_column` on its rows are
    collected, as a Python list when `tolist` is true and otherwise as the
    array returned by `unique()`.

    Returns the list of tuples, one per distinct element.
    '''
    element_ids = feature_interactions_df[element_id_column]
    pairs = []
    for element in element_ids.unique():
        # Rows belonging to this element, then its distinct features.
        own_rows = feature_interactions_df[element_ids == element]
        own_features = own_rows[feature_column].unique()
        pairs.append((element, own_features.tolist() if tolist else own_features))
    return pairs

In [None]:
# hide
# Small example: random user/feature interactions between 4 'u' ids and
# 4 'f' ids, kept with probability 0.5 per pair.
user_features = build_interactions(gen_values(n_values=4, prefix='u'), 
                   gen_values(n_values=4, prefix='f'), 
                   l1_col_name='user_id', 
                   l2_col_name='feature_id', 
                   sparsity=0.5
                  )

# Collapse the interaction rows into (user_id, [feature_id, ...]) tuples.
build_features_from_df(user_features, 'user_id', 'feature_id')

[('u0', ['f1', 'f3']), ('u1', ['f2']), ('u2', ['f1']), ('u3', ['f2', 'f3'])]

In [None]:
# hide
# Ten ids of each kind: users, items, user features and item features.
users = gen_values(prefix='u')
items = gen_values(prefix='i')
all_user_features = gen_values(prefix='uf')
all_item_features = gen_values(prefix='if')
# Random user/feature interactions grouped into (user_id, [features]) tuples.
user_features = build_features_from_df(
    build_interactions(
        users,
        all_user_features,
        l1_col_name='user_id', 
        l2_col_name='feature_id', 
        sparsity=0.5
    ),
    element_id_column='user_id',
    feature_column='feature_id'
)
# Same construction for items and their features.
item_features = build_features_from_df(
    build_interactions(
        items,
        all_item_features,
        l1_col_name='item_id', 
        l2_col_name='feature_id', 
        sparsity=0.5
    ),
    element_id_column='item_id',
    feature_column='feature_id'
)
# Display everything that was generated in this cell.
users, items, all_user_features, all_item_features, user_features, item_features

(['u0', 'u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7', 'u8', 'u9'],
 ['i0', 'i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7', 'i8', 'i9'],
 ['uf0', 'uf1', 'uf2', 'uf3', 'uf4', 'uf5', 'uf6', 'uf7', 'uf8', 'uf9'],
 ['if0', 'if1', 'if2', 'if3', 'if4', 'if5', 'if6', 'if7', 'if8', 'if9'],
 [('u0', ['uf0', 'uf4', 'uf5', 'uf8', 'uf9']),
  ('u1', ['uf0', 'uf3', 'uf4', 'uf6', 'uf8', 'uf9']),
  ('u2', ['uf2', 'uf3', 'uf6', 'uf8', 'uf9']),
  ('u3', ['uf0', 'uf2', 'uf3', 'uf4', 'uf6', 'uf9']),
  ('u4', ['uf0', 'uf2', 'uf3', 'uf6', 'uf9']),
  ('u5', ['uf0', 'uf1', 'uf2', 'uf3', 'uf7', 'uf8', 'uf9']),
  ('u6', ['uf1', 'uf2', 'uf3', 'uf4', 'uf5', 'uf6', 'uf8']),
  ('u7', ['uf1', 'uf7', 'uf8', 'uf9']),
  ('u8', ['uf0', 'uf1', 'uf2', 'uf4', 'uf6', 'uf7']),
  ('u9', ['uf2', 'uf3', 'uf8'])],
 [('i0', ['if2', 'if4', 'if5', 'if8', 'if9']),
  ('i1', ['if1', 'if4', 'if8']),
  ('i2', ['if0', 'if1', 'if3', 'if4', 'if6', 'if7']),
  ('i3', ['if0', 'if1', 'if2', 'if6', 'if7']),
  ('i4', ['if3', 'if5', 'if8', 'if9']),
  ('i5',

In [None]:
#export
@patch
def build_synth_dataset(self:SynthDataset, n_users=10, n_items=10, max_added=3, max_deleted=3, print_added_n_deleted=False):
    '''
    Fill this instance with two snapshots of the same synthetic dataset.

    The complete lists of user and item ids are generated first (`n_users` and
    `n_items` ids), together with the user and item feature names (default
    number of names). Then a random subset of users and of items is marked as
    added and another as deleted, each limited by `max_added` / `max_deleted`.

    `self.before` (time *t*) receives, under the keys `'user_id'` and
    `'item_id'`, every id except the added ones; `self.after` (time *t+1*)
    receives every id except the deleted ones.

    When `print_added_n_deleted` is true the added/deleted ids and both
    snapshots are printed. Nothing is returned; the instance is updated in place.
    '''
    verbose = print_added_n_deleted
    limits = {'max_added': max_added, 'max_deleted': max_deleted}

    # Complete universes of ids and feature names.
    self.all_users = gen_values(n_values=n_users, prefix='u')
    self.all_items = gen_values(n_values=n_items, prefix='i')
    self.all_user_features = gen_values(prefix='uf')
    self.all_item_features = gen_values(prefix='if')

    # Users are drawn before items so the random draws happen in a fixed order.
    self.users_added, self.users_deleted = gen_added_n_deleted(self.all_users, **limits)
    if verbose:
        print('added users: {}\t deleted users: {}'.format(self.users_added, self.users_deleted))

    self.items_added, self.items_deleted = gen_added_n_deleted(self.all_items, **limits)
    if verbose:
        print('added items: {}\t deleted items: {}'.format(self.items_added, self.items_deleted))

    # The "before" snapshot lacks the ids that get added later; the "after"
    # snapshot lacks the ids that were deleted.
    self.before['user_id'] = exclude_element(self.all_users, self.users_added)
    self.before['item_id'] = exclude_element(self.all_items, self.items_added)
    self.after['user_id'] = exclude_element(self.all_users, self.users_deleted)
    self.after['item_id'] = exclude_element(self.all_items, self.items_deleted)

    if verbose:
        print('users before:\t{}\nusers after:\t{}'.format(self.before['user_id'], self.after['user_id']))
        print('items before:\t{}\nitems after:\t{}'.format(self.before['item_id'], self.after['item_id']))

In [None]:
# Build a synthetic dataset with the default sizes and print which users and
# items were added and deleted, plus the ids present before and after.
x = SynthDataset()
x.build_synth_dataset(print_added_n_deleted=True)
# x.before['user_id'], x.after['user_id'], x.before['item_id'], x.after['item_id']

added users: ['u1', 'u2', 'u3']	 deleted users: ['u0', 'u5', 'u6']
added items: ['i1', 'i4', 'i5']	 deleted items: ['i0', 'i2', 'i3']
users before:	['u0', 'u4', 'u5', 'u6', 'u7', 'u8', 'u9']
users after:	['u1', 'u2', 'u3', 'u4', 'u7', 'u8', 'u9']
items before:	['i0', 'i2', 'i3', 'i6', 'i7', 'i8', 'i9']
items after:	['i1', 'i4', 'i5', 'i6', 'i7', 'i8', 'i9']
