Mainstream Bias in Dynamic Recommendation

In [21]:
# Import dependencies
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow logs
import warnings; warnings.simplefilter('ignore')  # Ignore warnings for cleaner output
# import time
import numpy as np
import argparse
# import utility
from Simulation_basic import Simulation
import pickle
import pandas as pd
# from tqdm import tqdm
from math import log
from scipy.sparse import coo_matrix
# import seaborn as sns
import matplotlib.pyplot as plt
# from scipy.stats import skew
# from scipy.stats import mode
# from sklearn.neighbors import LocalOutlierFactor

In [22]:
# Parse command-line arguments
# args = parser.parse_args()

# Define the default values for arguments
default_args = {
    'run': 1,
    'iteration': 1000,
    'exp': 1,
    'cycle_itr': 50,
    'epoch': 20,
    'K': 20,
    'lr': 0.001,
    'reg': 1e-5,
    'hidden': 100,
    'neg': 5,
    'data': 'ml1m'
}

# Create the args object with default values
args = argparse.Namespace(**default_args)

In [31]:
# Load truth data and set experiment parameters
truth = np.load('./Data/' + args.data + '/truth.npy')
args.num_user = truth.shape[0]
args.num_item = truth.shape[1]
audience_size = np.sum(truth, axis=0)
item_sorted = np.argsort(audience_size)
truth_like = np.load('./Data/' + args.data + '/user_truth_like.npy', allow_pickle=True)

print("Number of Items: " + str(len(truth_like)) + "\n")

Number of Items: 223869



In [35]:
# # Calculate mainstream scores (MS_similarity) using your code
# train_df = pd.read_csv('./Data/' + args.data + '/train_df.csv')

# # Calculate user popularity
# pos_user_array = train_df['userId'].values
# pos_item_array = train_df['itemId'].values
# train_mat = coo_matrix((np.ones(len(pos_user_array)), (pos_user_array, pos_item_array)), shape=(num_user, num_item)).toarray()
# user_pop = np.sum(train_mat, axis=1)

# # Calculate standard deviation of user interactions
# user_stddev = np.std(train_mat, axis=1)

# # Save the user standard deviations to a file (adjust the path accordingly)
# with open(f'./Data/{args.data}/user_stddev.npy', "wb") as f:
#     np.save(f, user_stddev)

# # Calculate Jaccard similarity matrix
# Jaccard_mat = np.matmul(train_mat, train_mat.T)
# deno = user_pop.reshape((-1, 1)) + user_pop.reshape((1, -1)) - Jaccard_mat + 1e-7
# Jaccard_mat /= deno
# Jaccard_mat = Jaccard_mat + np.eye(num_user) * -9999
# Jaccard_mat = Jaccard_mat[np.where(Jaccard_mat > -1)].reshape((num_user, num_user - 1))

# # Calculate Mainstream Similarity (MS_similarity) by taking the mean along axis 1
# MS_similarity = np.mean(Jaccard_mat, axis=1)

# # Save the MS similarity to a file (adjust the path accordingly)
# with open(f'./Data/{args.data}/MS_similarity.npy', "wb") as f:
#     np.save(f, MS_similarity)


# Since we're not using train_df, let's create a user-item matrix from truth
user_item_matrix = truth.copy()  # Assuming truth already represents user-item interactions

# Calculate user popularity based on the user-item matrix
user_pop = np.sum(user_item_matrix, axis=1)

# Calculate Jaccard similarity matrix based on the user-item matrix
Jaccard_mat = np.matmul(user_item_matrix, user_item_matrix.T)
deno = user_pop.reshape((-1, 1)) + user_pop.reshape((1, -1)) - Jaccard_mat + 1e-7
Jaccard_mat /= deno
np.fill_diagonal(Jaccard_mat, 0)  # Set diagonal to zero to exclude self-similarity

# Calculate Mainstream Similarity (MS_similarity) by taking the mean along axis 1
MS_similarity = np.mean(Jaccard_mat, axis=1)

# Save the MS similarity to a file (adjust the path accordingly)
with open(f'./Data/{args.data}/MS_similarity.npy', "wb") as f:
    np.save(f, MS_similarity)

print("Mainstream scores: " + str(len(MS_similarity)) + "\n")
print("Args Number of Users: " + str(args.num_user) + "\n")

Mainstream scores: 1000

Args Number of Users: 1000



In [40]:
# Initialize a list to store Gini coefficients after each epoch
gini_coefficients = []

# Run the experiment for a specified number of runs
for r in range(args.run):
    print('')
    print('#' * 100)
    print('#' * 100)
    print(' ' * 50 + ' Experiment run ' + str(r + 1) + ' ' * 50)
    print('#' * 100)
    print('#' * 100)

    # Initialize the simulation with provided arguments and data
    simulation = Simulation(args, truth, truth_like)
    init_popularity = simulation.initial_iterations()  # Perform initial iterations to gather feedback
    gini_coefficients.append(simulation.run_simulation())  # Run the main simulation


####################################################################################################
####################################################################################################
                                                   Experiment run 1                                                  
####################################################################################################
####################################################################################################
Namespace(K=20, cycle_itr=50, data='ml1m', epoch=20, exp=1, hidden=100, iteration=1000, lr=0.001, neg=5, num_item=3406, num_user=1000, reg=1e-05, run=1)
****************************** Start initial random iterations ******************************
---------- Iteration 1 ----------


100%|██████████| 1000/1000 [00:00<00:00, 6601.18it/s]

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Generate 463 records.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
****************************** Train MF until converge ******************************
Update bs to 55





******************************MF initialization done ******************************


100%|██████████| 51/51 [00:00<00:00, 99.03it/s] 


Training // Epoch 0 //  Total cost = 35.3854  Total cost1 = 35.3450  Total cost2 = 0.0404


100%|██████████| 51/51 [00:00<00:00, 134.74it/s]


Training // Epoch 1 //  Total cost = 35.0200  Total cost1 = 34.9930  Total cost2 = 0.0270


100%|██████████| 51/51 [00:00<00:00, 154.08it/s]


Training // Epoch 2 //  Total cost = 34.4976  Total cost1 = 34.4632  Total cost2 = 0.0344


100%|██████████| 51/51 [00:00<00:00, 141.86it/s]


Training // Epoch 3 //  Total cost = 33.6552  Total cost1 = 33.6073  Total cost2 = 0.0479


100%|██████████| 51/51 [00:00<00:00, 123.19it/s]


Training // Epoch 4 //  Total cost = 32.3739  Total cost1 = 32.3067  Total cost2 = 0.0673


100%|██████████| 51/51 [00:00<00:00, 150.22it/s]


Training // Epoch 5 //  Total cost = 30.5303  Total cost1 = 30.4357  Total cost2 = 0.0945


100%|██████████| 51/51 [00:00<00:00, 152.47it/s]


Training // Epoch 6 //  Total cost = 27.9861  Total cost1 = 27.8565  Total cost2 = 0.1296


100%|██████████| 51/51 [00:00<00:00, 155.72it/s]


Training // Epoch 7 //  Total cost = 24.9615  Total cost1 = 24.7888  Total cost2 = 0.1726


100%|██████████| 51/51 [00:00<00:00, 159.38it/s]


Training // Epoch 8 //  Total cost = 21.2293  Total cost1 = 21.0060  Total cost2 = 0.2233


100%|██████████| 51/51 [00:00<00:00, 139.15it/s]


Training // Epoch 9 //  Total cost = 16.9608  Total cost1 = 16.6784  Total cost2 = 0.2824


100%|██████████| 51/51 [00:00<00:00, 127.98it/s]


Training // Epoch 10 //  Total cost = 12.1784  Total cost1 = 11.8312  Total cost2 = 0.3472


100%|██████████| 51/51 [00:00<00:00, 138.97it/s]


Training // Epoch 11 //  Total cost = 6.8911  Total cost1 = 6.4701  Total cost2 = 0.4210


100%|██████████| 51/51 [00:00<00:00, 112.58it/s]


Training // Epoch 12 //  Total cost = 1.2068  Total cost1 = 0.7042  Total cost2 = 0.5026


100%|██████████| 51/51 [00:00<00:00, 139.34it/s]


Training // Epoch 13 //  Total cost = -4.8566  Total cost1 = -5.4465  Total cost2 = 0.5899


100%|██████████| 51/51 [00:00<00:00, 153.15it/s]


Training // Epoch 14 //  Total cost = -11.3488  Total cost1 = -12.0338  Total cost2 = 0.6850


100%|██████████| 51/51 [00:00<00:00, 160.13it/s]


Training // Epoch 15 //  Total cost = -18.3846  Total cost1 = -19.1712  Total cost2 = 0.7866


100%|██████████| 51/51 [00:00<00:00, 163.72it/s]


Training // Epoch 16 //  Total cost = -25.9405  Total cost1 = -26.8339  Total cost2 = 0.8935


100%|██████████| 51/51 [00:00<00:00, 138.02it/s]


Training // Epoch 17 //  Total cost = -34.3392  Total cost1 = -35.3478  Total cost2 = 1.0086


100%|██████████| 51/51 [00:00<00:00, 136.18it/s]


Training // Epoch 18 //  Total cost = -41.4635  Total cost1 = -42.5930  Total cost2 = 1.1295


100%|██████████| 51/51 [00:00<00:00, 138.96it/s]


Training // Epoch 19 //  Total cost = -50.1669  Total cost1 = -51.4228  Total cost2 = 1.2559
****************************** Start simulation ******************************
****************************** Epoch 0 ******************************


100%|██████████| 50/50 [00:00<00:00, 4545.09it/s]
100%|██████████| 50/50 [00:00<00:00, 2500.03it/s]
100%|██████████| 50/50 [00:00<00:00, 2940.98it/s]
100%|██████████| 50/50 [00:00<00:00, 3030.48it/s]
100%|██████████| 50/50 [00:00<00:00, 2439.14it/s]
100%|██████████| 50/50 [00:00<00:00, 3125.55it/s]
100%|██████████| 50/50 [00:00<00:00, 2777.76it/s]
100%|██████████| 50/50 [00:00<00:00, 819.69it/s]
100%|██████████| 50/50 [00:00<00:00, 1886.84it/s]
100%|██████████| 50/50 [00:00<00:00, 2702.62it/s]
100%|██████████| 50/50 [00:00<00:00, 2222.10it/s]
100%|██████████| 50/50 [00:00<00:00, 3703.58it/s]
100%|██████████| 50/50 [00:00<00:00, 3333.47it/s]
100%|██████████| 50/50 [00:00<00:00, 4000.52it/s]
100%|██████████| 50/50 [00:00<00:00, 3846.08it/s]
100%|██████████| 50/50 [00:00<00:00, 3333.63it/s]
100%|██████████| 50/50 [00:00<00:00, 3225.69it/s]
100%|██████████| 50/50 [00:00<00:00, 3124.95it/s]
100%|██████████| 50/50 [00:00<00:00, 3333.79it/s]
100%|██████████| 50/50 [00:00<00:00, 3333.26it/s]


UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U2'), dtype('float64')) -> None

In [None]:
# Calculate the average of each corresponding element across lists
print(gini_coefficients)
average_gini_coefficients = np.mean(np.array(gini_coefficients), axis=0)

# Visualize the averaged Gini coefficients
plt.figure(figsize=(10, 6))
x_values = range(len(average_gini_coefficients))
plt.plot(x_values, average_gini_coefficients, marker='o', linestyle='-', color='b')
plt.title('Average Gini Coefficients Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Average Gini Coefficient')
plt.grid(True)
print(average_gini_coefficients)
plt.show()

[]


TypeError: object of type 'numpy.float64' has no len()

<Figure size 1000x600 with 0 Axes>