In [1]:
##IMPORTS

import numpy as np
import pandas as pd
import re
import os

import pandas as pd
import numpy as np
from gmf import GMFEngine
from mlp import MLPEngine
from neumf import NeuMFEngine
from data import SampleGenerator

In [2]:
# def load_combined_data(base_path):
#     data = []
#     for i in range(1, 5):  # Loop through each of the combined_data files
#         file_path = os.path.join(base_path, f'combined_data_{i}.txt')
#         with open(file_path, 'r') as file:
#             movie_id = None
#             for line in file:
#                 line = line.strip()
#                 if line.endswith(":"):
#                     movie_id = int(line[:-1])
#                 else:
#                     customer_id, rating, date = line.split(",")
#                     data.append([movie_id, int(customer_id), int(rating), date])

#     df_sub = pd.DataFrame(data, columns=["MovieID", "CustomerID", "Rating", "Date"])
#     return df_sub

###CURRENTLY JUST USING 1 COMBINED DATASET

def load_combined_data(base_path):
    """Parse one ``combined_data`` text file into a ratings DataFrame.

    The file is read as a sequence of sections: a line ending in ``:`` holds a
    movie id, and every following ``customer_id,rating,date`` line is a rating
    for that movie. Blank lines are ignored.

    Parameters
    ----------
    base_path : str or path-like
        Path of the text file to read (a file, not a directory).

    Returns
    -------
    pandas.DataFrame
        One row per rating with columns ``MovieID``, ``CustomerID``,
        ``Rating`` and ``Date`` (``Date`` is kept as the raw string).

    Raises
    ------
    ValueError
        If a rating line appears before any ``<movie_id>:`` header, if a
        rating line does not have exactly three comma-separated fields, or
        if an id/rating field is not an integer.
    """
    # Column-wise accumulation: four flat lists are much lighter than one
    # small list object per rating row.
    movie_ids, customer_ids, ratings, dates = [], [], [], []
    with open(base_path, 'r') as file:
        movie_id = None
        for line_no, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            if line.endswith(":"):
                movie_id = int(line[:-1])
                continue
            if movie_id is None:
                # Without a header the row would silently get MovieID=None.
                raise ValueError(
                    "line {}: rating row found before any movie header".format(line_no)
                )
            customer_id, rating, date = line.split(",")
            movie_ids.append(movie_id)
            customer_ids.append(int(customer_id))
            ratings.append(int(rating))
            dates.append(date)

    df_sub = pd.DataFrame(
        {"MovieID": movie_ids, "CustomerID": customer_ids, "Rating": ratings, "Date": dates},
        columns=["MovieID", "CustomerID", "Rating", "Date"],
    )
    return df_sub


# Path of the ratings text file to load. Defaults to the original local path;
# set the COMBINED_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
base_path = os.environ.get(
    'COMBINED_DATA_PATH',
    '/Users/Ryan/Desktop/BT4222/combined_data_1.txt/combined_data_1.txt',
)

# Load the data and create the DataFrame
df = load_combined_data(base_path)

print(df.head())

   MovieID  CustomerID  Rating        Date
0        1     1488844       3  2005-09-06
1        1      822109       5  2005-05-13
2        1      885013       4  2005-10-19
3        1       30878       4  2005-12-26
4        1      823519       3  2004-05-03


In [3]:
#Converting date into timestamp
from datetime import datetime

# Convert 'Date' to a POSIX timestamp (float seconds since 1970-01-01) and
# rename the rating column. The epoch arithmetic is vectorised over the whole
# column instead of calling Timestamp.timestamp() once per row, which matters
# on a frame with tens of millions of rows.
df['timestamp'] = (pd.to_datetime(df['Date']) - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)
df = df.rename(columns={'Rating': 'rating'})

# Drop the original 'Date' column
df = df.drop(columns=['Date'])

# Display the DataFrame
print(df.head())

# print(type(df['timestamp'][0]))

   MovieID  CustomerID  rating     timestamp
0        1     1488844       3  1.125965e+09
1        1      822109       5  1.115942e+09
2        1      885013       4  1.129680e+09
3        1       30878       4  1.135555e+09
4        1      823519       3  1.083542e+09


In [4]:
### CONVERTING [CustomerID -> userId, MovieID -> itemsId] to match format for data loader
# NOTE: this cell overwrites `df` and keeps only userId/itemId/rating/timestamp
# at the end, so it can only be run once per load; re-run the loading cells
# before running it again.

# One row per distinct customer, in order of first appearance.
# (The former bare `.reindex()` call took no arguments and changed nothing.)
user_id = df[['CustomerID']].drop_duplicates()

print("initial user_id")
print(user_id.head())

# Contiguous 0-based ids, assigned in order of first appearance.
user_id['userId'] = np.arange(len(user_id)) 

print("final user_id")
print(user_id.head())

df = pd.merge(df, user_id, on=['CustomerID'], how='left')

print("initial df ")
print(df.head())

# Same mapping for movies: one row per distinct MovieID -> contiguous itemId.
item_id = df[['MovieID']].drop_duplicates()

print("initial item_id")
print(item_id.head())

item_id['itemId'] = np.arange(len(item_id))

print("final item_id")
print(item_id.head())

df = pd.merge(df, item_id, on=['MovieID'], how='left')

print("final df ")
print(df.head())

# Keep only the columns used from here on.
df = df[['userId', 'itemId', 'rating', 'timestamp']]

# Sanity checks: every row received an id and the ids span 0..n-1.
assert df['userId'].notna().all() and df['itemId'].notna().all(), "unmapped ids after merge"
assert df['userId'].max() == len(user_id) - 1, "userId range does not match number of users"
assert df['itemId'].max() == len(item_id) - 1, "itemId range does not match number of items"

print("final final df ")
print(df.head())

print('Range of userId is [{}, {}]'.format(df.userId.min(), df.userId.max()))
print('Range of itemId is [{}, {}]'.format(df.itemId.min(), df.itemId.max()))

initial user_id
   CustomerID
0     1488844
1      822109
2      885013
3       30878
4      823519
final user_id
   CustomerID  userId
0     1488844       0
1      822109       1
2      885013       2
3       30878       3
4      823519       4
initial df 
   MovieID  CustomerID  rating     timestamp  userId
0        1     1488844       3  1.125965e+09       0
1        1      822109       5  1.115942e+09       1
2        1      885013       4  1.129680e+09       2
3        1       30878       4  1.135555e+09       3
4        1      823519       3  1.083542e+09       4
initial item_id
      MovieID
0           1
547         2
692         3
2704        4
2846        5
final item_id
      MovieID  itemId
0           1       0
547         2       1
692         3       2
2704        4       3
2846        5       4
final df 
   MovieID  CustomerID  rating     timestamp  userId  itemId
0        1     1488844       3  1.125965e+09       0       0
1        1      822109       5  1.115942e+09  

In [5]:
### DOWNSIZING DATA (original dataset too big - Memory Error)

##Reduce by sampling a % of dataset
# df_sampled = df.sample(frac=0.1, random_state=42)

# print(df_sampled.head())

# print('Range of userId is [{}, {}]'.format(df_sampled.userId.min(), df_sampled.userId.max()))
# print('Range of itemId is [{}, {}]'.format(df_sampled.itemId.min(), df_sampled.itemId.max()))

##Reduce by decreasing number of unique users
desired_unique_users = 8000

# Get a subset of user_ids to keep (the first N distinct ids in row order)
subset_user_ids = df['userId'].unique()[:desired_unique_users]

# Filter the DataFrame to keep only rows with user_ids in the subset.
# Take an explicit copy so df_reduced is an independent frame rather than a
# slice of df; later code that assigns into it then does not write to (or
# warn about) a view of the full dataset.
df_reduced = df[df['userId'].isin(subset_user_ids)].copy()

print('Range of userId is [{}, {}]'.format(df_reduced.userId.min(), df_reduced.userId.max()))
print('Range of itemId is [{}, {}]'.format(df_reduced.itemId.min(), df_reduced.itemId.max()))

Range of userId is [0, 7999]
Range of itemId is [0, 4498]


In [6]:
###Comparing the length of the dataset

# Row counts: full frame first, then the user-reduced frame, one per line.
# (df_sampled is not built in this notebook, so its length is not printed.)
print(len(df), len(df_reduced), sep='\n')

# (min, max) of the ratings that survive the reduction.
rating_range = (
    df_reduced['rating'].min(),
    df_reduced['rating'].max(),
)

print(rating_range)

24053764
1152161
(1, 5)


In [13]:
# Configuration dictionary passed to MLPEngine.
# NOTE(review): the alias mentions "bz256" and "pretrain" while batch_size is
# 1024 and pretrain is False; the alias is only a label here, so it is left
# unchanged - confirm whether it should be renamed.
mlp_config = {'alias': 'mlp_factor8neg4_bz256_166432168_pretrain_reg_0.0000001',
              'num_epoch': 1,
              'batch_size': 1024,  # 1024,
              'optimizer': 'adam',
              'adam_lr': 1e-3,
              # Embedding table sizes are derived from the data actually fed to the
              # model (ids are 0-based, hence max + 1) instead of being hard-coded,
              # so changing the downsizing step cannot leave them out of sync.
              'num_users': int(df_reduced['userId'].max()) + 1,
              'num_items': int(df_reduced['itemId'].max()) + 1,
              'latent_dim': 8,
              'num_negative': 4,
              'layers': [16,64,32,16,8],  # layers[0] is the concat of latent user vector & latent item vector
              'l2_regularization': 0.0000001,  # MLP model is sensitive to hyper params
              'use_cuda': False,
              'device_id': 7,
              'pretrain': False,
              'pretrain_mf': 'checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
              'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'}

In [9]:
# Configuration dictionary intended for NeuMFEngine (not the engine currently selected).
# NOTE(review): 'pretrain' is True and points at existing GMF/MLP checkpoints;
# those checkpoints presumably need embedding sizes and layer shapes matching
# this config - confirm before enabling NeuMFEngine.
neumf_config = {'alias': 'pretrain_neumf_factor8neg4',
                'num_epoch': 200,
                'batch_size': 1024,
                'optimizer': 'adam',
                'adam_lr': 1e-3,
                # Previously hard-coded as 6040 / 3706, which does not match the
                # 0..7999 user ids and 0..4498 item ids in df_reduced. Derive the
                # sizes from the data instead (ids are 0-based, hence max + 1).
                'num_users': int(df_reduced['userId'].max()) + 1,
                'num_items': int(df_reduced['itemId'].max()) + 1,
                'latent_dim_mf': 8,
                'latent_dim_mlp': 8,
                'num_negative': 4,
                'layers': [16,32,16,8],  # layers[0] is the concat of latent user vector & latent item vector
                'l2_regularization': 0.01,
                'use_cuda': False,
                'device_id': 7,
                'pretrain': True,
                'pretrain_mf': 'checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
                'pretrain_mlp': 'checkpoints/{}'.format('mlp_factor8neg4_Epoch100_HR0.5606_NDCG0.2463.model'),
                'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'
                }

In [10]:
# DataLoader for training
# Build the sample generator from the user-reduced ratings frame
# (columns: userId, itemId, rating, timestamp).
sample_generator1 = SampleGenerator(ratings=df_reduced)
# Evaluation data exposed by the generator; it is passed to engine.evaluate()
# after each training epoch.
evaluate_data1 = sample_generator1.evaluate_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['rating'][ratings['rating'] > 0] = 1.0


(8000, 2)    userId                                   interacted_items
0       0  {0, 7, 4108, 16, 2068, 2071, 4122, 29, 4134, 4...
1       1  {0, 1541, 1797, 1809, 2579, 3859, 1306, 2593, ...
2       2  {0, 4097, 4, 2056, 1551, 4114, 1560, 3610, 413...
3       3  {0, 2050, 4, 17, 2071, 4122, 27, 29, 4135, 43,...
4       4  {0, 1026, 1541, 1542, 7, 3078, 1034, 2571, 16,...
-------
(8000, 3)    userId                                   interacted_items  \
0       0  {0, 7, 4108, 16, 2068, 2071, 4122, 29, 4134, 4...   
1       1  {0, 1541, 1797, 1809, 2579, 3859, 1306, 2593, ...   
2       2  {0, 4097, 4, 2056, 1551, 4114, 1560, 3610, 413...   
3       3  {0, 2050, 4, 17, 2071, 4122, 27, 29, 4135, 43,...   
4       4  {0, 1026, 1541, 1542, 7, 3078, 1034, 2571, 16,...   

                                      negative_items  
0  {1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 1...  
1  {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...  
2  {1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...  
3

In [14]:
# Specify the exact model
# Exactly one config/engine pair should be active; the others stay commented out.
# NOTE(review): gmf_config is not defined in the cells shown here - define it
# before enabling the GMF option.
# config = gmf_config
# engine = GMFEngine(config)
# Active choice: the MLP model built from mlp_config.
config = mlp_config
engine = MLPEngine(config)
# config = neumf_config
# engine = NeuMFEngine(config)

MLP(
  (embedding_user): Embedding(8000, 8)
  (embedding_item): Embedding(4499, 8)
  (fc_layers): ModuleList(
    (0): Linear(in_features=16, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=32, bias=True)
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): Linear(in_features=16, out_features=8, bias=True)
  )
  (affine_output): Linear(in_features=8, out_features=1, bias=True)
  (logistic): Sigmoid()
)


In [15]:
# Train for the configured number of epochs. Each epoch draws a fresh training
# loader from the sample generator, trains on it, evaluates on the held-out
# data, and saves a checkpoint labelled with the alias, epoch and metrics.
for epoch in range(config['num_epoch']):
    print(f'Epoch {epoch} starts !')
    print("-" * 80)

    train_loader = sample_generator1.instance_a_train_loader(
        config['num_negative'],
        config['batch_size'],
    )
    engine.train_an_epoch(train_loader, epoch_id=epoch)

    hit_ratio, ndcg = engine.evaluate(evaluate_data1, epoch_id=epoch)
    engine.save(config['alias'], epoch, hit_ratio, ndcg)

Epoch 0 starts !
--------------------------------------------------------------------------------
[Training Epoch 0] Batch 0, Loss 0.6435933113098145
[Training Epoch 0] Batch 1, Loss 0.6461410522460938
[Training Epoch 0] Batch 2, Loss 0.6417548656463623
[Training Epoch 0] Batch 3, Loss 0.6385498642921448
[Training Epoch 0] Batch 4, Loss 0.638254702091217
[Training Epoch 0] Batch 5, Loss 0.6344251036643982
[Training Epoch 0] Batch 6, Loss 0.6290842294692993
[Training Epoch 0] Batch 7, Loss 0.6231271028518677
[Training Epoch 0] Batch 8, Loss 0.6246495842933655
[Training Epoch 0] Batch 9, Loss 0.6205435991287231
[Training Epoch 0] Batch 10, Loss 0.6248477697372437
[Training Epoch 0] Batch 11, Loss 0.617735743522644
[Training Epoch 0] Batch 12, Loss 0.6113344430923462
[Training Epoch 0] Batch 13, Loss 0.611133873462677
[Training Epoch 0] Batch 14, Loss 0.6082634925842285
[Training Epoch 0] Batch 15, Loss 0.6095789670944214
[Training Epoch 0] Batch 16, Loss 0.5975694060325623
[Training Epoc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_in_top_k['ndcg'] = test_in_top_k['rank'].apply(lambda x: math.log(2) / math.log(1 + x)) # the rank starts from 1
