In [None]:
# default_exp synth

# Synth Dataset Generator

> Module to generate Synthetic Datasets to perform tests

This module builds synthetic datasets that are used to test the utilities provided by this library.

In [None]:
# hide
# Resolve the parent of the current working directory to an absolute path
# and make it importable from this notebook.
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
# Guard against duplicates so re-running the cell does not keep growing sys.path
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
# export
import pandas as pd
import numpy as np
import random
from lightfm import LightFM
from lightfm.data import Dataset
from fastcore.all import * 
from datetime import datetime

random.seed(42)

In [None]:
# export
class SynthDataset():
    '''
    Holds two states of the same dataset: `before` (time **T**) and `after` (time **T+1**),
    together with the users, items and metadata values that were added or deleted between both states.
    Every attribute starts out empty.
    '''
    def __init__(self):
        # Complete universe of ids and metadata values
        self.all_users, self.all_items = [], []
        self.all_user_metadata, self.all_item_metadata = [], []

        # Snapshots of the dataset at T and at T+1
        self.before, self.after = {}, {}

        # Users / items that differ between both snapshots
        self.users_added, self.users_deleted = [], []
        self.items_added, self.items_deleted = [], []

        # Metadata values that differ between both snapshots
        self.deleted_user_metadata, self.added_user_metadata = [], []
        self.deleted_item_metadata, self.added_item_metadata = [], []

In [None]:
# export
def gen_values(n_values:int = 10, prefix:str ='u') -> list:
    "Returns `n_values` identifiers built as **prefix** followed by a running index starting at 0 (e.g. `u0`, `u1`, ...)"
    return [prefix + str(idx) for idx in range(n_values)]

In [None]:
gen_values()

['u0', 'u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7', 'u8', 'u9']

In [None]:
# export
def gen_added_n_deleted(l_values:list, max_added:int = 3, max_deleted:int = 3) -> (list, list):
    '''
    Randomly splits part of **l_values** into two groups and returns them as `(added, deleted)`.

    One random number in [0, 1) is drawn per value. While fewer than **max_deleted** values were picked,
    a draw below 0.8 marks the value as deleted; otherwise, while fewer than **max_added** values were picked,
    a draw above 0.2 marks it as added. Values matching neither rule are left out of both lists.
    '''
    to_add, to_delete = [], []
    for value in l_values:
        # A draw is consumed for every value, even when both lists are already full
        draw = random.random()
        if draw < 0.8 and len(to_delete) < max_deleted:
            to_delete.append(value)
            continue
        if draw > 0.2 and len(to_add) < max_added:
            to_add.append(value)

    return to_add, to_delete

In [None]:
gen_added_n_deleted(gen_values())

(['u3', 'u4', 'u5'], ['u0', 'u1', 'u2'])

In [None]:
# export
def exclude_element(l:list, values_to_exclude:list, shuffle:bool = False) -> list:
    '''
    Returns a new list with the elements of **l** that are not present in **values_to_exclude**.
    The relative order of **l** is kept unless **shuffle** is `True`, in which case the result is
    shuffled in place with `random.shuffle` (which consumes the global `random` state).
    **l** itself is never modified. Elements must be hashable.
    '''
    # Build the lookup set once instead of once per element
    excluded = set(values_to_exclude)
    new_l = [x for x in l if x not in excluded]
    # Lists have no `.shuffle()` method; the stdlib helper shuffles the list in place
    if shuffle: random.shuffle(new_l)
    return new_l

In [None]:
# hide
# Sanity check: removing the values in `b` from `a` must keep the remaining
# elements in their original order.
a = [1,2,3,4]
b = [2,3]

assert exclude_element(a, b) == [1,4], 'Both lists should be equal'

In [None]:
# export
def build_interactions(l1:list, 
                       l2:list, 
                       l1_col_name:str = 'user_id', 
                       l2_col_name:str = 'item_id', 
                       sparsity:float = 0.5, 
                       feedback:bool = False, 
                       timestamp:bool = False) -> pd.DataFrame:
    '''
    Randomly samples interactions from the cartesian product of **l1** and **l2** and returns them as a DataFrame
    with one row per sampled pair (columns **l1_col_name** and **l2_col_name**).

    **sparsity** is the probability of keeping each pair, so higher values produce *denser* interactions.
    If **feedback** is `True` a `feedback` column filled with 1 is added.
    If **timestamp** is `True` a `timestamp` column is added holding the POSIX timestamp at which each row was generated,
    which can be used to order the interactions.
    '''
    columns = {l1_col_name:[], l2_col_name:[]}
    if feedback: columns['feedback'] = []
    if timestamp: columns['timestamp'] = []

    for left in l1:
        for right in l2:
            # One draw per pair; pairs that lose the draw are skipped
            if not random.random() < sparsity:
                continue
            columns[l1_col_name].append(left)
            columns[l2_col_name].append(right)
            if feedback:
                columns['feedback'].append(1)
            if timestamp:
                columns['timestamp'].append(datetime.now().timestamp())

    return pd.DataFrame(columns)

In [None]:
# hide
test_interactions = build_interactions(gen_values(prefix='u'), gen_values(prefix='i'))

In [None]:
# export
def build_metadata_from_df(metadata_interactions_df:pd.DataFrame, element_id_column:str, metadata_column:str, tolist=True) -> [tuple]:
    '''
    Groups the metadata of every element into `(element, metadata_values)` tuples.

    Elements are taken from **element_id_column** in order of first appearance and, for each of them,
    the distinct values of **metadata_column** are collected. The values are returned as a plain list
    when **tolist** is `True`, otherwise as the array produced by `Series.unique()`.
    '''
    element_ids = metadata_interactions_df[element_id_column]

    def _metadata_of(element):
        # Distinct metadata values found in the rows that belong to `element`
        values = metadata_interactions_df.loc[element_ids == element, metadata_column].unique()
        return values.tolist() if tolist else values

    return [(element, _metadata_of(element)) for element in element_ids.unique()]

In [None]:
# hide
# Randomly link 4 users (u0..u3) with 4 metadata values (f0..f3); each pair is kept with probability 0.5
user_metadata = build_interactions(gen_values(n_values=4, prefix='u'), 
                   gen_values(n_values=4, prefix='f'), 
                   l1_col_name='user_id', 
                   l2_col_name='metadata_id', 
                   sparsity=0.5
                  )

# Collapse the pair-per-row frame into one (user, [metadata values]) tuple per user
build_metadata_from_df(user_metadata, 'user_id', 'metadata_id')

[('u0', ['f1', 'f3']), ('u1', ['f2']), ('u2', ['f1']), ('u3', ['f2', 'f3'])]

In [None]:
# hide
# Universe of users, items and the metadata values each of them can take
users = gen_values(prefix='u')
items = gen_values(prefix='i')
all_user_metadata = gen_values(prefix='uf')
all_item_metadata = gen_values(prefix='if')
# Randomly assign user metadata (each user/value pair kept with probability 0.5)
# and group the result as (user, [metadata values]) tuples
user_metadata = build_metadata_from_df(
    build_interactions(
        users,
        all_user_metadata,
        l1_col_name='user_id', 
        l2_col_name='metadata_id', 
        sparsity=0.5
    ),
    element_id_column='user_id',
    metadata_column='metadata_id'
)
# Same procedure for the items
item_metadata = build_metadata_from_df(
    build_interactions(
        items,
        all_item_metadata,
        l1_col_name='item_id', 
        l2_col_name='metadata_id', 
        sparsity=0.5
    ),
    element_id_column='item_id',
    metadata_column='metadata_id'
)
# Display everything that was generated
users, items, all_user_metadata, all_item_metadata, user_metadata, item_metadata

(['u0', 'u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7', 'u8', 'u9'],
 ['i0', 'i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7', 'i8', 'i9'],
 ['uf0', 'uf1', 'uf2', 'uf3', 'uf4', 'uf5', 'uf6', 'uf7', 'uf8', 'uf9'],
 ['if0', 'if1', 'if2', 'if3', 'if4', 'if5', 'if6', 'if7', 'if8', 'if9'],
 [('u0', ['uf0', 'uf4', 'uf5', 'uf8', 'uf9']),
  ('u1', ['uf0', 'uf3', 'uf4', 'uf6', 'uf8', 'uf9']),
  ('u2', ['uf2', 'uf3', 'uf6', 'uf8', 'uf9']),
  ('u3', ['uf0', 'uf2', 'uf3', 'uf4', 'uf6', 'uf9']),
  ('u4', ['uf0', 'uf2', 'uf3', 'uf6', 'uf9']),
  ('u5', ['uf0', 'uf1', 'uf2', 'uf3', 'uf7', 'uf8', 'uf9']),
  ('u6', ['uf1', 'uf2', 'uf3', 'uf4', 'uf5', 'uf6', 'uf8']),
  ('u7', ['uf1', 'uf7', 'uf8', 'uf9']),
  ('u8', ['uf0', 'uf1', 'uf2', 'uf4', 'uf6', 'uf7']),
  ('u9', ['uf2', 'uf3', 'uf8'])],
 [('i0', ['if2', 'if4', 'if5', 'if8', 'if9']),
  ('i1', ['if1', 'if4', 'if8']),
  ('i2', ['if0', 'if1', 'if3', 'if4', 'if6', 'if7']),
  ('i3', ['if0', 'if1', 'if2', 'if6', 'if7']),
  ('i4', ['if3', 'if5', 'if8', 'if9']),
  ('i5',

In [None]:
# export

@patch
def add_user_metadata(self: SynthDataset):
    '''
    Randomly assigns metadata to the users of this dataset and returns it as a list of
    `(user_id, [metadata values])` tuples.

    Every user in `self.all_users` is paired with every value in `self.all_user_metadata`
    with probability 0.5. Users that end up without any metadata are not part of the result.
    '''
    users_metadata = build_metadata_from_df(
        build_interactions(
            # Read the values from the instance, not from notebook-level globals
            self.all_users,
            self.all_user_metadata,
            l1_col_name='user_id', 
            l2_col_name='metadata_id', 
            sparsity=0.5
        ),
        element_id_column='user_id',
        # `build_metadata_from_df` names this parameter `metadata_column`
        metadata_column='metadata_id'
    )
    return users_metadata

@patch
def add_item_metadata(self: SynthDataset):
    '''
    Randomly assigns metadata to the items of this dataset and returns it as a list of
    `(item_id, [metadata values])` tuples.

    Every item in `self.all_items` is paired with every value in `self.all_item_metadata`
    with probability 0.5. Items that end up without any metadata are not part of the result.
    '''
    items_metadata = build_metadata_from_df(
        build_interactions(
            # Read the values from the instance, not from notebook-level globals
            self.all_items,
            self.all_item_metadata,
            l1_col_name='item_id', 
            l2_col_name='metadata_id', 
            sparsity=0.5
        ),
        element_id_column='item_id',
        # `build_metadata_from_df` names this parameter `metadata_column`
        metadata_column='metadata_id'
    )
    return items_metadata

In [None]:
#export
@patch
def gen_users_n_items(self:SynthDataset, 
                           n_users:int, 
                           n_items:int):
    '''
    Fills `all_users` with **n_users** ids prefixed with `u` and `all_items` with **n_items** ids prefixed with `i`
    '''
    self.all_users = gen_values(n_users, 'u')
    self.all_items = gen_values(n_items, 'i')
    
@patch
def gen_metadata(self:SynthDataset, n_user_metadata, n_item_metadata):
    '''
    Fills `all_user_metadata` with **n_user_metadata** values prefixed with `uf`
    and `all_item_metadata` with **n_item_metadata** values prefixed with `if`
    '''
    self.all_user_metadata = gen_values(n_user_metadata, 'uf')
    self.all_item_metadata = gen_values(n_item_metadata, 'if')
    
@patch
def print_dataset_components(self:SynthDataset, 
                             print_added_n_deleted:bool, 
                             add_user_metadata:bool, 
                             add_item_metadata:bool):
    '''
    Prints a summary of the dataset: added/deleted users and items, and the users and items
    present before and after the change. Nothing at all is printed unless **print_added_n_deleted** is `True`;
    the metadata lines are additionally gated by **add_user_metadata** and **add_item_metadata**.
    '''
    # The metadata flags only matter when printing is enabled
    if not print_added_n_deleted:
        return

    print('added users: {}\t deleted users: {}'.format(self.users_added, self.users_deleted))
    print('added items: {}\t deleted items: {}'.format(self.items_added, self.items_deleted))
    print('users before:\t{}\nusers after:\t{}'.format(self.before['user_id'], self.after['user_id']))
    print('items before:\t{}\nitems after:\t{}'.format(self.before['item_id'], self.after['item_id']))

    if add_user_metadata:
        print('added user features: {}\t deleted user features: {}'.format(self.added_user_metadata, self.deleted_user_metadata))
    if add_item_metadata:
        print('added item features: {}\t deleted item features: {}'.format(self.added_item_metadata, self.deleted_item_metadata))
        




In [None]:
#export
@patch
def build_synth_dataset(self:SynthDataset, 
                        n_users:int = 10, 
                        n_items:int = 10, 
                        max_added:int = 3, 
                        max_deleted:int = 3, 
                        print_added_n_deleted:bool = False, 
                        add_user_metadata:bool = False,
                        add_item_metadata:bool = False,
                        n_user_metadata:int = 10,
                        n_item_metadata:int = 10
                       ):
    '''
    Generates two states of one **dataset** to simulate how it changes through time.
    `before` holds the state of the data at a time *t* and `after` the state at a time *t+1*, where some users
    and items were added and others deleted. When **add_user_metadata** / **add_item_metadata** are set, the metadata
    values that were added and deleted are generated as well (new metadata that better describes an element, or just corrections).
    A summary is printed when **print_added_n_deleted** is `True`.
    '''
    # Universe of users, items and metadata values everything else is drawn from
    self.gen_users_n_items(n_users=n_users, n_items=n_items)
    self.gen_metadata(n_user_metadata=n_user_metadata, n_item_metadata=n_item_metadata)

    limits = {'max_added': max_added, 'max_deleted': max_deleted}

    # Users (then items) that join the system and the ones that leave it
    self.users_added, self.users_deleted = gen_added_n_deleted(self.all_users, **limits)
    self.items_added, self.items_deleted = gen_added_n_deleted(self.all_items, **limits)

    # `before` must not contain what is added later; `after` must not contain what was deleted
    for snapshot, absent_users, absent_items in ((self.before, self.users_added, self.items_added),
                                                 (self.after, self.users_deleted, self.items_deleted)):
        snapshot['user_id'] = exclude_element(self.all_users, absent_users)
        snapshot['item_id'] = exclude_element(self.all_items, absent_items)

    if add_user_metadata:
        self.added_user_metadata, self.deleted_user_metadata = gen_added_n_deleted(self.all_user_metadata, **limits)

    if add_item_metadata:
        self.added_item_metadata, self.deleted_item_metadata = gen_added_n_deleted(self.all_item_metadata, **limits)

    self.print_dataset_components(print_added_n_deleted, add_user_metadata, add_item_metadata)
            

In [None]:
# Build a dataset with the default sizes (10 users, 10 items, 10 metadata values each)
# and print the generated components
x = SynthDataset()
x.build_synth_dataset(print_added_n_deleted=True, add_user_metadata=True, add_item_metadata=True)
# To inspect both states directly:
# x.before['user_id'], x.after['user_id'], x.before['item_id'], x.after['item_id']

added users: ['u3', 'u4', 'u5']	 deleted users: ['u0', 'u1', 'u2']
added items: ['i3', 'i4', 'i5']	 deleted items: ['i0', 'i1', 'i2']
users before:	['u0', 'u1', 'u2', 'u6', 'u7', 'u8', 'u9']
users after:	['u3', 'u4', 'u5', 'u6', 'u7', 'u8', 'u9']
items before:	['i0', 'i1', 'i2', 'i6', 'i7', 'i8', 'i9']
items after:	['i3', 'i4', 'i5', 'i6', 'i7', 'i8', 'i9']
added user features: ['uf2', 'uf4', 'uf5']	 deleted user features: ['uf0', 'uf1', 'uf3']
added item features: ['if3', 'if4', 'if5']	 deleted item features: ['if0', 'if1', 'if2']


In [None]:
# export
@patch
def build_user_item_interactions(self:SynthDataset, 
                                 new:bool = False, 
                                 sparsity:float = 0.5, 
                                 feedback:bool = True, 
                                 timestamp:bool = True) -> pd.DataFrame:
    '''
    Builds random interactions between users and items. With ```new=False``` the users and items come from
    the original state of the dataset (`before`); with ```new=True``` they come from the new state (`after`),
    i.e. with the added and deleted users and items applied.
    **sparsity**, **feedback** and **timestamp** are forwarded to `build_interactions`.
    '''
    state = self.after if new else self.before
    return build_interactions(state['user_id'], state['item_id'], sparsity=sparsity, feedback=feedback, timestamp=timestamp)

In [None]:
# export
def df_highlight_interaction(s):
    '''
    Returns one CSS string per entry of **s**: a green background with white text for the
    entries greater than 0 and an empty string for the rest (missing values included).
    Meant to be used with `DataFrame.style.apply`.
    '''
    highlighted = 'background-color: #1a8a1c; color: white'
    return [highlighted if positive else '' for positive in s > 0]

def df_color_positive_interaction(val):
    """
    Takes a scalar and returns a CSS `color` property: green (`#1db31f`)
    when the value is greater than 0, light grey (`#bfbfbf`) otherwise
    (missing values included). Meant to be used with `DataFrame.style.applymap`.
    """
    if val > 0:
        shade = '#1db31f'
    else:
        shade = '#bfbfbf'
    return 'color: %s' % shade

In [None]:
# export
@patch
def top_interactions_crosstab(self:SynthDataset, 
                              df_interactions:pd.DataFrame, 
                              hightlight_interaction:bool = False, 
                              color_interaction:bool = False,
                              top_n:int = 15) -> pd.DataFrame:
    '''
    Takes the interactions DataFrame and returns a crosstab (users as rows, items as columns)
    restricted to the **top_n** users and the **top_n** items with the most interactions.
    Each cell holds the sum of `feedback` for that user-item pair, or NaN when they never interacted.

    **df_interactions** must contain the columns `user_id`, `item_id` and `feedback`.
    **hightlight_interaction** and **color_interaction** are accepted for backward compatibility but are
    currently not used; style the returned crosstab with `df_highlight_interaction` or
    `df_color_positive_interaction` through `DataFrame.style` instead.
    '''
    # Most active users and most interacted items, ranked by number of interactions
    user_counts = df_interactions.groupby('user_id')['feedback'].count()
    top_users = user_counts.sort_values(ascending=False).head(top_n).index

    item_counts = df_interactions.groupby('item_id')['feedback'].count()
    top_items = item_counts.sort_values(ascending=False).head(top_n).index

    # Filter the frame that was passed in (not a notebook-level global) down to the top users and items
    is_top = df_interactions['user_id'].isin(top_users) & df_interactions['item_id'].isin(top_items)
    top_r = df_interactions[is_top]

    # The string alias avoids the pandas FutureWarning raised when a NumPy callable is passed as aggfunc
    crosstab = pd.crosstab(top_r['user_id'], top_r['item_id'], top_r['feedback'], aggfunc='sum')
    return crosstab

In [None]:
# Interactions for the original ("before") state of the dataset; with the defaults the frame
# also carries the `feedback` and `timestamp` columns
df = x.build_user_item_interactions()
# Users x items table restricted to the most active users and most interacted items
crosstab = x.top_interactions_crosstab(df, hightlight_interaction=True)

In [None]:
crosstab.style.apply(df_highlight_interaction)

item_id,i0,i1,i2,i6,i7,i8,i9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
u0,1.0,,,1.0,,,
u1,1.0,1.0,1.0,1.0,1.0,,1.0
u2,1.0,,1.0,,1.0,1.0,
u6,1.0,,,1.0,1.0,1.0,
u7,,1.0,1.0,,1.0,,
u9,,,,,1.0,1.0,1.0


In [None]:
crosstab.style.applymap(df_color_positive_interaction)

item_id,i0,i1,i2,i6,i7,i8,i9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
u0,1.0,,,1.0,,,
u1,1.0,1.0,1.0,1.0,1.0,,1.0
u2,1.0,,1.0,,1.0,1.0,
u6,1.0,,,1.0,1.0,1.0,
u7,,1.0,1.0,,1.0,,
u9,,,,,1.0,1.0,1.0
