In [1]:
# Import dependencies
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import data: load the full simulation output from a local pickle file.
# The cells below read its "simulations" and "simulation_parameters" entries.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load BigSim.pkl if it comes from a trusted source.
with open("BigSim.pkl", "rb") as file:
    full_data = pickle.load(file)

### A. Prepare training data for the inference model

Use 20 marketplaces to train the inference model. With 1,600 time series per marketplace, that equals 32,000 review time series.

In [2]:
# Training split of the review time series: the first 20 entries along the
# first axis (marketplaces) are reserved for the inference model.

simulations = full_data["simulations"]
train_simulations = simulations[:20]

# Report the shapes before and after the split
print(f"Shape of the full set of review time series: {simulations.shape}")
print(f"Shape of the defined train set of review time series: {train_simulations.shape}")

Shape of the full set of review time series: (32, 1600)
Shape of the defined train set of review time series: (20, 1600)


In [3]:
# Training split of the simulation parameters: keep the first 32000 entries
# along the second axis of every parameter array.

parameters = full_data["simulation_parameters"]
for key, value in parameters.items():
    print(f"Shape of the full set of parameter {key}: {value.shape}")
print("######################")

# Indexing only the first two axes leaves any trailing axis (such as the last
# axis of "rho") untouched, so a single expression covers every parameter.
train_parameters = {name: array[:, :32000] for name, array in parameters.items()}

for key, value in train_parameters.items():
    print(f"Shape of the train set of parameter {key}: {value.shape}")

Shape of the full set of parameter rho: (10, 51200, 2)
Shape of the full set of parameter h_p: (10, 51200)
Shape of the full set of parameter p_5: (10, 51200)
Shape of the full set of parameter p_4: (10, 51200)
Shape of the full set of parameter p_2: (10, 51200)
Shape of the full set of parameter p_1: (10, 51200)
Shape of the full set of parameter bias_5_star: (10, 51200)
######################
Shape of the train set of parameter rho: (10, 32000, 2)
Shape of the train set of parameter h_p: (10, 32000)
Shape of the train set of parameter p_5: (10, 32000)
Shape of the train set of parameter p_4: (10, 32000)
Shape of the train set of parameter p_2: (10, 32000)
Shape of the train set of parameter p_1: (10, 32000)
Shape of the train set of parameter bias_5_star: (10, 32000)


In [4]:
# Substitute train_simulations and train_parameters in a copy of the simulation to create the train simulation (train_data).
# Only the two top-level entries below are rebound on the copy; every other
# entry of full_data is carried over as-is.
# NOTE(review): assumes full_data is a plain dict, so .copy() is a shallow copy -- confirm.

train_data = full_data.copy()
train_data["simulations"] = train_simulations
train_data["simulation_parameters"] = train_parameters

In [5]:
# Store training data as .pkl
# (relative path: the file is written to the current working directory, and
# mode 'wb' overwrites an existing file of the same name)

with open('training_simulation_data.pkl', 'wb') as file:
    pickle.dump(train_data, file)

### B. Prepare experiments' data ("Base group")

All simulated time series not used to train the inference model (12/32 marketplaces - 19200 time series) will be used to build the "treatment" groups (See C. and D. below) and for conditional sampling.

In [6]:
# Experiment split of the review time series: everything from index 20 onwards
# along the first axis (marketplaces), i.e. the part not used for training.

simulations = full_data["simulations"]
experiment_simulations = simulations[20:]

# Report the shapes before and after the split
print(f"Shape of the full set of review time series: {simulations.shape}")
print(f"Shape of the defined experimental set of review time series: {experiment_simulations.shape}")

Shape of the full set of review time series: (32, 1600)
Shape of the defined experimental set of review time series: (12, 1600)


In [7]:
# Experiment split of the simulation parameters: keep every entry from index
# 32000 onwards along the second axis of every parameter array.

parameters = full_data["simulation_parameters"]
for key, value in parameters.items():
    print(f"Shape of the full set of parameter {key}: {value.shape}")
print("######################")

# Indexing only the first two axes leaves any trailing axis (such as the last
# axis of "rho") untouched, so a single expression covers every parameter.
experiment_parameters = {name: array[:, 32000:] for name, array in parameters.items()}

for key, value in experiment_parameters.items():
    print(f"Shape of the experiment set of parameter {key}: {value.shape}")

Shape of the full set of parameter rho: (10, 51200, 2)
Shape of the full set of parameter h_p: (10, 51200)
Shape of the full set of parameter p_5: (10, 51200)
Shape of the full set of parameter p_4: (10, 51200)
Shape of the full set of parameter p_2: (10, 51200)
Shape of the full set of parameter p_1: (10, 51200)
Shape of the full set of parameter bias_5_star: (10, 51200)
######################
Shape of the experiment set of parameter rho: (10, 19200, 2)
Shape of the experiment set of parameter h_p: (10, 19200)
Shape of the experiment set of parameter p_5: (10, 19200)
Shape of the experiment set of parameter p_4: (10, 19200)
Shape of the experiment set of parameter p_2: (10, 19200)
Shape of the experiment set of parameter p_1: (10, 19200)
Shape of the experiment set of parameter bias_5_star: (10, 19200)


In [8]:
# Substitute experiment_simulations and experiment_parameters in a copy of the simulation to create the base experiment data (experiment_data).
# Only the two top-level entries below are rebound on the copy; every other
# entry of full_data is carried over as-is.
# NOTE(review): assumes full_data is a plain dict, so .copy() is a shallow copy -- confirm.

experiment_data = full_data.copy()
experiment_data["simulations"] = experiment_simulations
experiment_data["simulation_parameters"] = experiment_parameters

In [9]:
# Store base experiment data as .pkl
# (relative path: the file is written to the current working directory, and
# mode 'wb' overwrites an existing file of the same name)

with open('inference_experiment_base_data.pkl', 'wb') as file:
    pickle.dump(experiment_data, file)

### C. Preparation for inference experiment 1: Jumble review time series

From the base group, modify the order in which reviews are left by simulated users in all time series while preserving the final review histogram shape.

In [10]:
# Start experiment 1 from a copy of the base experiment data; its "simulations"
# entry is replaced by the jumbled time series further below.
experiment_1_data = experiment_data.copy()

In [11]:
def generate_alternative_timeseries(final_state, rng=None):
    '''
    Given a review timeseries' final state (histogram), returns an alternative
    time series between the initial state ([1, 1, 1, 1, 1]) and the final state
    generated at random.

    One review is added per step to a randomly chosen rating level that still
    has reviews left to place, until the cumulative counts equal `final_state`.

    Parameters
    ----------
    final_state : array-like of length 5
        Final cumulative count per rating level (the last row of a series).
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state
        (np.random) is used, exactly as before this argument existed.

    Returns
    -------
    numpy.ndarray
        Float array of shape (number_of_reviews + 1, 5). Row 0 is the initial
        state and the last row equals `final_state`. A product without any
        review yields shape (1, 5), so the result is always two-dimensional
        and `result[-1]` is always the final histogram.
    '''

    # Pick the sampler once: the global NumPy state unless a generator is given
    choose = np.random.choice if rng is None else rng.choice

    # "Total distance": number of reviews posted per rating level
    total_distance = np.asarray(final_state) - np.ones(5)

    # Time series length (number of reviews posted for the considered product)
    number_of_reviews = int(total_distance.sum())

    # If time series is empty, return the initial state as a one-row series
    # (previously a flat length-5 vector, inconsistent with every other result)
    if number_of_reviews == 0:
        return np.ones((1, 5))

    # Remaining distance: number of reviews yet to be posted per rating level
    remaining_distance = total_distance.copy()

    # New (alternative) time series
    path = [np.ones(5)]

    # Iterate over total length of time series
    for _ in range(number_of_reviews):

        # Rating axes along which "steps" can be made
        axes = np.flatnonzero(remaining_distance)

        # New rating to be added (at random)
        # NOTE(review): the level is drawn uniformly among levels that still
        # have reviews left, not weighted by how many are left, so this is not
        # a uniform shuffle of the original review order -- confirm intended.
        new_rating_index = choose(axes)

        # Generate new rating
        new_rating_step = np.zeros(5)
        new_rating_step[new_rating_index] = 1

        # Update time series with new rating
        path.append(path[-1] + new_rating_step)

        # Update "remaining distance" after the addition of the last rating
        remaining_distance = remaining_distance - new_rating_step

    # Return new alternative time series
    return np.array(path)

In [12]:
# Seed NumPy's global random state so that re-running the notebook produces
# the same jumbled time series (the jumbling below draws from np.random).
np.random.seed(0)

# Allocate the output with the same layout as the original simulations
# (instead of a hard-coded (12, 1600)) so the cell follows the data.
jumbled_timeseries = np.empty(experiment_1_data["simulations"].shape, dtype=object)

# Loop over dimension 1 of simulated review timeseries (Marketplaces)
for i in range(experiment_1_data["simulations"].shape[0]):

    # Loop over dimension 2 of simulated review timeseries (Products)
    for e in range(experiment_1_data["simulations"].shape[1]):

        # "Select" original time series for the simulation
        series = experiment_1_data["simulations"][i][e]

        # Only the final state (last row) of the original is used: the jumbled
        # version is a new random path from the initial state to that histogram
        jumbled_timeseries[i][e] = generate_alternative_timeseries(series[-1])
        
        

In [13]:
# Shape sanity check: the jumbled array must mirror the layout of the original
# simulations so that it can replace them one-to-one.

print(f"Shape of array of jumbled time series: {jumbled_timeseries.shape}")

print(f"Shape of array of original simulation time series: {experiment_1_data['simulations'].shape}")

# Fail loudly instead of relying on a visual comparison of the two lines above
assert jumbled_timeseries.shape == experiment_1_data["simulations"].shape, (
    "Jumbled time series do not match the shape of the original simulations"
)

Shape of array of jumbled time series: (12, 1600)
Shape of array of original simulation time series: (12, 1600)


In [14]:
# Substitute original simulation time series with jumbled time series
# (only the "simulations" entry of experiment_1_data is rebound)

experiment_1_data["simulations"] = jumbled_timeseries

In [15]:
# Store experiment 1 data as .pkl
# (relative path: the file is written to the current working directory, and
# mode 'wb' overwrites an existing file of the same name)

with open('inference_experiment_1_data.pkl', 'wb') as file:
    pickle.dump(experiment_1_data, file)

### D. Preparation for inference experiment 2: Increase number of reviews

From the base group, increase the number of reviews in all time series by a factor of two while preserving the final review histogram shape.

In [16]:
# Start experiment 2 from a copy of the base experiment data; its "simulations"
# entry is replaced by the augmented time series further below.
experiment_2_data = experiment_data.copy()

In [17]:
# NOTE(review): looks like a leftover scratch cell -- it picks one example series
# (marketplace index 1, product index 0), but no later cell visible here reads
# this variable; augment_timeseries only uses its own parameter of the same name.
time_series = experiment_2_data["simulations"][1][0]

In [18]:
def augment_timeseries(time_series):
    '''
    Returns a longer version of a review time series in which the sequence of
    rating steps is played twice in a row.

    The step-to-step changes of `time_series` are computed, replayed two times
    starting from the initial state ([1, 1, 1, 1, 1]), and a closing row that
    adds one to every rating level is appended. For a series that itself starts
    at [1, 1, 1, 1, 1], the last row of the result is therefore exactly twice
    the last row of the input.

    The result is a numpy array with 2 * len(time_series) rows.
    '''

    # Change between each pair of consecutive states
    steps = [later - earlier for earlier, later in zip(time_series[:-1], time_series[1:])]

    # Replay the steps twice, starting from the initial state
    current_state = np.array([1, 1, 1, 1, 1])
    states = [current_state]
    for step in steps * 2:
        current_state = current_state + step
        states.append(current_state)

    # Closing row: one more on every rating level
    states.append(current_state + np.ones(5))

    return np.array(states)
        

In [19]:
# Allocate the output with the same layout as the original simulations
# (instead of a hard-coded (12, 1600)) so the cell follows the data.
augmented_timeseries = np.empty(experiment_2_data["simulations"].shape, dtype=object)

# Loop over dimension 1 of simulated review timeseries (Marketplaces)
for i in range(experiment_2_data["simulations"].shape[0]):

    # Loop over dimension 2 of simulated review timeseries (Products)
    for e in range(experiment_2_data["simulations"].shape[1]):

        # Store augmented version of the original time series in the same position
        augmented_timeseries[i][e] = augment_timeseries(experiment_2_data["simulations"][i][e])

In [20]:
# Shape sanity check: the augmented array must mirror the layout of the
# original simulations so that it can replace them one-to-one.

print(f"Shape of array of augmented time series: {augmented_timeseries.shape}")

print(f"Shape of array of original simulation time series: {experiment_2_data['simulations'].shape}")

# Fail loudly instead of relying on a visual comparison of the two lines above
assert augmented_timeseries.shape == experiment_2_data["simulations"].shape, (
    "Augmented time series do not match the shape of the original simulations"
)

Shape of array of augmented time series: (12, 1600)
Shape of array of original simulation time series: (12, 1600)


In [21]:
# Substitute original simulation time series with augmented time series
# (only the "simulations" entry of experiment_2_data is rebound)

experiment_2_data["simulations"] = augmented_timeseries

In [22]:
# Store experiment 2 data as .pkl
# (relative path: the file is written to the current working directory, and
# mode 'wb' overwrites an existing file of the same name)

with open('inference_experiment_2_data.pkl', 'wb') as file:
    pickle.dump(experiment_2_data, file)