In [None]:
# default_exp synth

# Synth Dataset Generator

> Module to generate Synthetic Datasets to perform tests

This module builds small synthetic datasets that are used to test the utilities in this library.

In [None]:
# hide
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
# hide
import pandas as pd
import numpy as np
import random
from lightfm import LightFM
from lightfm.data import Dataset

random.seed(42)

In [None]:
# export
def gen_values(n_values=10, prefix='u'):
    """Return a list of `n_values` identifiers used to build the dataset.

    Each identifier is `prefix` followed by its position, counting from 0,
    so the defaults produce `['u0', 'u1', ..., 'u9']`.
    """
    return [prefix + str(index) for index in range(n_values)]

In [None]:
# With the defaults this yields ten ids prefixed with 'u'
gen_values()

['u0', 'u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7', 'u8', 'u9']

In [None]:
# export
def gen_added_n_deleted(l_values, max_added=3, max_deleted=3):
    '''
    Randomly picks two disjoint lists of values from **l_values**: the values that will be
    deleted from the dataset and the values that will be added to it.

    For every value a fresh random number `r` in [0, 1) is drawn:

    * if fewer than **max_deleted** values were picked for deletion and `r < 0.8`,
      the value goes to the deleted list;
    * otherwise, if fewer than **max_added** values were picked for addition and `r > 0.2`,
      the value goes to the added list;
    * otherwise the value is left untouched.

    Returns the tuple `(added, deleted)`; both lists keep the order of **l_values**.
    The result depends on the state of the global `random` generator, so seed it
    (`random.seed(...)`) for reproducible output.
    '''
    deleted = []
    added = []
    for value in l_values:
        # Draw once per element. A single draw made before the loop would send every
        # element down the same branch, so the split would never be random per value.
        r = random.random()
        if len(deleted) < max_deleted and r < 0.8:
            deleted.append(value)
        elif len(added) < max_added and r > 0.2:
            added.append(value)

    return added, deleted

In [None]:
# Returns the tuple (added, deleted) picked from the default ids
gen_added_n_deleted(gen_values())

(['u3', 'u4', 'u5'], ['u0', 'u1', 'u2'])

In [None]:
# export
def exclude_element(l, values_to_exclude):
    """Excludes the elements from **values_to_exclude** from **l**

    Returns a new list with the elements of **l**, in their original order
    (duplicates included), that do not appear in **values_to_exclude**.
    The elements must be hashable. **values_to_exclude** may be any iterable,
    including a one-shot iterator or generator.
    """
    # Build the lookup set a single time: rebuilding it for every element is
    # quadratic and silently empties a one-shot iterator after the first element.
    excluded = set(values_to_exclude)
    return [x for x in l if x not in excluded]

In [None]:
# hide
# Sanity check: dropping b's elements from a keeps the rest in their original order
a = [1,2,3,4]
b = [2,3]

assert exclude_element(a, b) == [1,4], 'Both lists should be equal'

In [None]:
# export
def build_interactions(l1, l2, l1_col_name='user_id', l2_col_name='item_id', sparcity=0.5):
    '''
    Builds a DataFrame of random interactions between the elements of l1 and l2.

    Every (l1 element, l2 element) pair is visited in order (l1 outer, l2 inner) and kept
    when a fresh `random.random()` draw is below **sparcity**, so a higher **sparcity**
    produces more rows. The result has one row per kept pair, with the l1 element
    in column **l1_col_name** and the l2 element in column **l2_col_name**.
    '''
    # Select the surviving pairs first; one random draw per pair, in visiting order.
    kept_pairs = [
        (left, right)
        for left in l1
        for right in l2
        if random.random() < sparcity
    ]

    columns = {l1_col_name: [], l2_col_name: []}
    for left, right in kept_pairs:
        columns[l1_col_name].append(left)
        columns[l2_col_name].append(right)
    return pd.DataFrame(columns)

In [None]:
# hide
# Random user/item interactions using the default sparcity of 0.5
build_interactions(gen_values(prefix='u'), gen_values(prefix='i'))

Unnamed: 0,user_id,item_id
0,u0,i0
1,u0,i1
2,u0,i2
3,u0,i6
4,u0,i7
5,u0,i8
6,u0,i9
7,u1,i1
8,u1,i2
9,u1,i5


In [None]:
# export
def build_features_from_df(feature_interactions_df, element_id_column, feature_column, tolist=True):
    '''
    Builds one `(element, features)` tuple per distinct element to build the dataset.

    Elements are the distinct values of **element_id_column**, in order of first appearance.
    For each one, `features` holds the distinct values of **feature_column** found in that
    element's rows: a plain list when **tolist** is truthy, otherwise whatever `.unique()` returns.
    '''
    element_ids = feature_interactions_df[element_id_column]

    def _features_for(element):
        # Keep only this element's rows, then reduce them to their distinct features.
        element_rows = feature_interactions_df[element_ids == element]
        distinct = element_rows[feature_column].unique()
        return distinct.tolist() if tolist else distinct

    return [(element, _features_for(element)) for element in element_ids.unique()]

In [None]:
# hide
# Reuse build_interactions to randomly attach features (f0..f3) to users (u0..u3)
user_features = build_interactions(gen_values(n_values=4, prefix='u'), 
                   gen_values(n_values=4, prefix='f'), 
                   l1_col_name='user_id', 
                   l2_col_name='feature_id', 
                   sparcity=0.5
                  )

# Collapse the long user/feature table into one (user, [features]) tuple per user
build_features_from_df(user_features, 'user_id', 'feature_id')

[('u0', ['f1', 'f2', 'f3']),
 ('u1', ['f2', 'f3']),
 ('u2', ['f1', 'f3']),
 ('u3', ['f0', 'f3'])]

## Experiments

In [None]:
# hide
values = gen_values()
added, deleted = gen_added_n_deleted(values)
# Initial state: everything except the values that only appear later (added)
initial_values = exclude_element(values, added)
# Final state: everything except the values that were removed (deleted)
final_values = exclude_element(values, deleted)
# Display all four pieces side by side
('added',added), ('deleted',deleted), initial_values, final_values

(('added', ['u3', 'u4', 'u5']),
 ('deleted', ['u0', 'u1', 'u2']),
 ['u0', 'u1', 'u2', 'u6', 'u7', 'u8', 'u9'],
 ['u3', 'u4', 'u5', 'u6', 'u7', 'u8', 'u9'])

In [None]:
# hide
# LightFM Dataset that is first fitted on the initial users/items only
dataset_inicial = Dataset()
all_users = gen_values()
all_items = gen_values(prefix='i')

# Pick which users/items are introduced later (added) or removed (deleted)
users_added, users_deleted = gen_added_n_deleted(all_users)
items_added, items_deleted = gen_added_n_deleted(all_items)

# The initial universe leaves out everything that is only added later
initial_users = exclude_element(all_users, users_added)
initial_items = exclude_element(all_items, items_added)

dataset_inicial.fit(initial_users, initial_items)
# Show the ids that were held out of the initial fit
users_added, items_added

(['u3', 'u4', 'u5'], ['i3', 'i4', 'i5'])

In [None]:
# hide
# Ids picked for deletion; they are not removed from dataset_inicial in the cells shown here
users_deleted, items_deleted

(['u0', 'u1', 'u2'], ['i0', 'i1', 'i2'])

In [None]:
# hide
# Inspect the id -> internal index dictionaries built by the initial fit
dataset_inicial.mapping()

({'u0': 0, 'u1': 1, 'u2': 2, 'u6': 3, 'u7': 4, 'u8': 5, 'u9': 6},
 {'u0': 0, 'u1': 1, 'u2': 2, 'u6': 3, 'u7': 4, 'u8': 5, 'u9': 6},
 {'i0': 0, 'i1': 1, 'i2': 2, 'i6': 3, 'i7': 4, 'i8': 5, 'i9': 6},
 {'i0': 0, 'i1': 1, 'i2': 2, 'i6': 3, 'i7': 4, 'i8': 5, 'i9': 6})

In [None]:
# hide
# Extend the already fitted dataset with the users/items that were held out as "added"
dataset_inicial.fit_partial(users_added, items_added)

In [None]:
# hide
# Inspect the mappings again after fit_partial to see where the added ids were placed
dataset_inicial.mapping()

({'u0': 0,
  'u1': 1,
  'u2': 2,
  'u6': 3,
  'u7': 4,
  'u8': 5,
  'u9': 6,
  'u3': 7,
  'u4': 8,
  'u5': 9},
 {'u0': 0,
  'u1': 1,
  'u2': 2,
  'u6': 3,
  'u7': 4,
  'u8': 5,
  'u9': 6,
  'u3': 7,
  'u4': 8,
  'u5': 9},
 {'i0': 0,
  'i1': 1,
  'i2': 2,
  'i6': 3,
  'i7': 4,
  'i8': 5,
  'i9': 6,
  'i3': 7,
  'i4': 8,
  'i5': 9},
 {'i0': 0,
  'i1': 1,
  'i2': 2,
  'i6': 3,
  'i7': 4,
  'i8': 5,
  'i9': 6,
  'i3': 7,
  'i4': 8,
  'i5': 9})