Mainstream Bias in Dynamic Recommendation

In [2]:
# Import dependencies
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow logs
import warnings; warnings.simplefilter('ignore')  # Ignore warnings for cleaner output
# import time
import numpy as np
import argparse
# import utility
from Simulation_basic import Simulation
import pickle
import pandas as pd
# from tqdm import tqdm
from math import log
from scipy.sparse import coo_matrix
# import seaborn as sns
import matplotlib.pyplot as plt
# from scipy.stats import skew
# from scipy.stats import mode
# from sklearn.neighbors import LocalOutlierFactor

In [3]:
# Parse command-line arguments
# args = parser.parse_args()

# Define the default values for arguments
default_args = {
    'run': 1,
    'iteration': 1000,
    'exp': 1,
    'cycle_itr': 50,
    'epoch': 20,
    'K': 20,
    'lr': 0.001,
    'reg': 1e-5,
    'hidden': 100,
    'neg': 5,
    'data': 'ml1m'
}

# Create the args object with default values
args = argparse.Namespace(**default_args)

In [4]:
# Load truth data and set experiment parameters
truth = np.load('./Data/' + args.data + '/truth.npy')
args.num_user = truth.shape[0]
args.num_item = truth.shape[1]
audience_size = np.sum(truth, axis=0)
item_sorted = np.argsort(audience_size)
truth_like = list(np.load('./Data/' + args.data + '/user_truth_like.npy', allow_pickle=True))

# Print total truth for reference
print('')
print('!' * 30 + ' Total truth ' + str(np.sum(truth)) + ' ' + '!' * 30)
print('')


!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Total truth 223869.0 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!



In [5]:
# Calculate mainstream scores (MS_similarity) using your code
with open('./Data/' + args.data + '/info.pkl', 'rb') as f:
    info = pickle.load(f)
    num_user = info['num_user']
    num_item = info['num_item']

train_df = pd.read_csv('./Data/' + args.data + '/train_df.csv')

# Calculate user popularity
pos_user_array = train_df['userId'].values
pos_item_array = train_df['itemId'].values
train_mat = coo_matrix((np.ones(len(pos_user_array)), (pos_user_array, pos_item_array)), shape=(num_user, num_item)).toarray()
user_pop = np.sum(train_mat, axis=1)

# Calculate standard deviation of user interactions
user_stddev = np.std(train_mat, axis=1)

# Save the user standard deviations to a file (adjust the path accordingly)
with open(f'./Data/{args.data}/user_stddev.npy', "wb") as f:
    np.save(f, user_stddev)

# Calculate Jaccard similarity matrix
Jaccard_mat = np.matmul(train_mat, train_mat.T)
deno = user_pop.reshape((-1, 1)) + user_pop.reshape((1, -1)) - Jaccard_mat + 1e-7
Jaccard_mat /= deno
Jaccard_mat = Jaccard_mat + np.eye(num_user) * -9999
Jaccard_mat = Jaccard_mat[np.where(Jaccard_mat > -1)].reshape((num_user, num_user - 1))

# Calculate Mainstream Similarity (MS_similarity) by taking the mean along axis 1
MS_similarity = np.mean(Jaccard_mat, axis=1)

# Save the MS similarity to a file (adjust the path accordingly)
with open(f'./Data/{args.data}/MS_similarity.npy', "wb") as f:
    np.save(f, MS_similarity)

In [7]:
# Initialize a list to store Gini coefficients after each epoch
gini_coefficients = []

# Run the experiment for a specified number of runs
for r in range(args.run):
    print('')
    print('#' * 100)
    print('#' * 100)
    print(' ' * 50 + ' Experiment run ' + str(r + 1) + ' ' * 50)
    print('#' * 100)
    print('#' * 100)

    # Initialize the simulation with provided arguments and data
    simulation = Simulation(args, truth, truth_like)
    init_popularity = simulation.initial_iterations()  # Perform initial iterations to gather feedback
    gini_coefficients.append(simulation.run_simulation())  # Run the main simulation


####################################################################################################
####################################################################################################
                                                   Experiment run 1                                                  
####################################################################################################
####################################################################################################
Namespace(K=20, cycle_itr=50, data='ml1m', epoch=20, exp=1, hidden=100, iteration=1000, lr=0.001, neg=5, num_item=3406, num_user=1000, reg=1e-05, run=1)
****************************** Start initial random iterations ******************************
---------- Iteration 1 ----------


  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:00<00:00, 5952.46it/s]

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Generate 484 records.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
****************************** Train MF until converge ******************************
Update bs to 58





******************************MF initialization done ******************************


100%|██████████| 50/50 [00:00<00:00, 113.90it/s]


Training // Epoch 0 //  Total cost = 34.6800  Total cost1 = 34.6394  Total cost2 = 0.0405


100%|██████████| 50/50 [00:00<00:00, 146.84it/s]


Training // Epoch 1 //  Total cost = 34.3144  Total cost1 = 34.2866  Total cost2 = 0.0277


100%|██████████| 50/50 [00:00<00:00, 160.77it/s]


Training // Epoch 2 //  Total cost = 33.8065  Total cost1 = 33.7711  Total cost2 = 0.0354


100%|██████████| 50/50 [00:00<00:00, 161.29it/s]


Training // Epoch 3 //  Total cost = 32.9997  Total cost1 = 32.9504  Total cost2 = 0.0493


100%|██████████| 50/50 [00:00<00:00, 156.49it/s]


Training // Epoch 4 //  Total cost = 31.7612  Total cost1 = 31.6920  Total cost2 = 0.0692


100%|██████████| 50/50 [00:00<00:00, 163.67it/s]


Training // Epoch 5 //  Total cost = 29.9883  Total cost1 = 29.8916  Total cost2 = 0.0967


100%|██████████| 50/50 [00:00<00:00, 160.00it/s]


Training // Epoch 6 //  Total cost = 27.6829  Total cost1 = 27.5503  Total cost2 = 0.1327


100%|██████████| 50/50 [00:00<00:00, 158.48it/s]


Training // Epoch 7 //  Total cost = 24.7509  Total cost1 = 24.5759  Total cost2 = 0.1750


100%|██████████| 50/50 [00:00<00:00, 152.44it/s]


Training // Epoch 8 //  Total cost = 21.1925  Total cost1 = 20.9652  Total cost2 = 0.2274


100%|██████████| 50/50 [00:00<00:00, 147.71it/s]


Training // Epoch 9 //  Total cost = 17.1571  Total cost1 = 16.8694  Total cost2 = 0.2877


100%|██████████| 50/50 [00:00<00:00, 165.84it/s]


Training // Epoch 10 //  Total cost = 12.6105  Total cost1 = 12.2552  Total cost2 = 0.3553


100%|██████████| 50/50 [00:00<00:00, 149.25it/s]


Training // Epoch 11 //  Total cost = 7.7291  Total cost1 = 7.2990  Total cost2 = 0.4301


100%|██████████| 50/50 [00:00<00:00, 157.73it/s]


Training // Epoch 12 //  Total cost = 2.3168  Total cost1 = 1.8036  Total cost2 = 0.5132


100%|██████████| 50/50 [00:00<00:00, 159.75it/s]


Training // Epoch 13 //  Total cost = -3.4213  Total cost1 = -4.0226  Total cost2 = 0.6013


100%|██████████| 50/50 [00:00<00:00, 145.56it/s]


Training // Epoch 14 //  Total cost = -9.8438  Total cost1 = -10.5423  Total cost2 = 0.6984


100%|██████████| 50/50 [00:00<00:00, 155.52it/s]


Training // Epoch 15 //  Total cost = -16.4409  Total cost1 = -17.2440  Total cost2 = 0.8030


100%|██████████| 50/50 [00:00<00:00, 158.73it/s]


Training // Epoch 16 //  Total cost = -23.1918  Total cost1 = -24.1047  Total cost2 = 0.9129


100%|██████████| 50/50 [00:00<00:00, 155.76it/s]


Training // Epoch 17 //  Total cost = -30.5370  Total cost1 = -31.5677  Total cost2 = 1.0307


100%|██████████| 50/50 [00:00<00:00, 161.81it/s]


Training // Epoch 18 //  Total cost = -38.3316  Total cost1 = -39.4842  Total cost2 = 1.1526


100%|██████████| 50/50 [00:00<00:00, 161.29it/s]


Training // Epoch 19 //  Total cost = -46.0649  Total cost1 = -47.3458  Total cost2 = 1.2809
****************************** Start simulation ******************************
****************************** Epoch 0 ******************************


100%|██████████| 50/50 [00:00<00:00, 4166.47it/s]
100%|██████████| 50/50 [00:00<00:00, 3030.04it/s]
100%|██████████| 50/50 [00:00<00:00, 4166.22it/s]
100%|██████████| 50/50 [00:00<00:00, 3030.65it/s]
100%|██████████| 50/50 [00:00<00:00, 2631.47it/s]
100%|██████████| 50/50 [00:00<00:00, 2856.84it/s]
100%|██████████| 50/50 [00:00<00:00, 3334.26it/s]
100%|██████████| 50/50 [00:00<00:00, 3333.47it/s]
100%|██████████| 50/50 [00:00<00:00, 3226.14it/s]
100%|██████████| 50/50 [00:00<00:00, 3333.15it/s]
100%|██████████| 50/50 [00:00<00:00, 4166.89it/s]
100%|██████████| 50/50 [00:00<00:00, 4762.90it/s]
100%|██████████| 50/50 [00:00<00:00, 4347.87it/s]
100%|██████████| 50/50 [00:00<00:00, 3333.36it/s]
100%|██████████| 50/50 [00:00<00:00, 4166.80it/s]
100%|██████████| 50/50 [00:00<00:00, 3571.38it/s]
100%|██████████| 50/50 [00:00<00:00, 3448.58it/s]
100%|██████████| 50/50 [00:00<00:00, 4167.30it/s]
100%|██████████| 50/50 [00:00<00:00, 3448.47it/s]
100%|██████████| 50/50 [00:00<00:00, 3571.08it/s]


IndexError: index 2678 is out of bounds for axis 0 with size 1000

In [None]:
# Calculate the average of each corresponding element across lists
print(gini_coefficients)
average_gini_coefficients = np.mean(np.array(gini_coefficients), axis=0)

# Visualize the averaged Gini coefficients
plt.figure(figsize=(10, 6))
x_values = range(len(average_gini_coefficients))
plt.plot(x_values, average_gini_coefficients, marker='o', linestyle='-', color='b')
plt.title('Average Gini Coefficients Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Average Gini Coefficient')
plt.grid(True)
print(average_gini_coefficients)
plt.show()

[]


TypeError: object of type 'numpy.float64' has no len()

<Figure size 1000x600 with 0 Axes>