## IMPORTING MARTIN'S CODE

In [2]:
import numpy as np
import pandas as pd
import re
import os

In [3]:
# def load_combined_data(base_path):
#     data = []
#     for i in range(1, 5):  # Loop through each of the combined_data files
#         file_path = os.path.join(base_path, f'combined_data_{i}.txt')
#         with open(file_path, 'r') as file:
#             movie_id = None
#             for line in file:
#                 line = line.strip()
#                 if line.endswith(":"):
#                     movie_id = int(line[:-1])
#                 else:
#                     customer_id, rating, date = line.split(",")
#                     data.append([movie_id, int(customer_id), int(rating), date])

#     df_sub = pd.DataFrame(data, columns=["MovieID", "CustomerID", "Rating", "Date"])
#     return df_sub

def load_combined_data(base_path):
    data = []
    with open(base_path, 'r') as file:
        movie_id = None
        for line in file:
            line = line.strip()
            if line.endswith(":"):
                movie_id = int(line[:-1])
            else:
                customer_id, rating, date = line.split(",")
                data.append([movie_id, int(customer_id), int(rating), date])

    df_sub = pd.DataFrame(data, columns=["MovieID", "CustomerID", "Rating", "Date"])
    return df_sub

In [4]:
# Get the base directory path from the user
base_path = '/Users/Ryan/Desktop/BT4222/combined_data_1.txt/combined_data_1.txt'

# Load the data and create the DataFrame
df = load_combined_data(base_path)

In [None]:
print(df.head())

# print(type(df['Date'][0]))

In [5]:
#Converting date into timestamp
from datetime import datetime

# Convert 'Date' to timestamp and rename rating column
df['timestamp'] = pd.to_datetime(df['Date']).apply(lambda x: x.timestamp())
df = df.rename(columns={'Rating': 'rating'})

# Drop the original 'Date' column
df = df.drop(columns=['Date'])

# Display the DataFrame
print(df.head())

   MovieID  CustomerID  rating     timestamp
0        1     1488844       3  1.125965e+09
1        1      822109       5  1.115942e+09
2        1      885013       4  1.129680e+09
3        1       30878       4  1.135555e+09
4        1      823519       3  1.083542e+09


In [None]:
print(type(df['timestamp'][0]))

In [6]:
user_id = df[['CustomerID']].drop_duplicates().reindex()

print("initial user_id")
print(user_id.head())

user_id['userId'] = np.arange(len(user_id)) 

print("final user_id")
print(user_id.head())

df = pd.merge(df, user_id, on=['CustomerID'], how='left')

print("initial df ")
print(df.head())

item_id = df[['MovieID']].drop_duplicates()

print("initial item_id")
print(item_id.head())

item_id['itemId'] = np.arange(len(item_id))

print("final item_id")
print(item_id.head())

df = pd.merge(df, item_id, on=['MovieID'], how='left')

print("final df ")
print(df.head())

df = df[['userId', 'itemId', 'rating', 'timestamp']]

print("final final df ")
print(df.head())

print('Range of userId is [{}, {}]'.format(df.userId.min(), df.userId.max()))
print('Range of itemId is [{}, {}]'.format(df.itemId.min(), df.itemId.max()))

initial user_id
   CustomerID
0     1488844
1      822109
2      885013
3       30878
4      823519
final user_id
   CustomerID  userId
0     1488844       0
1      822109       1
2      885013       2
3       30878       3
4      823519       4
initial df 
   MovieID  CustomerID  rating     timestamp  userId
0        1     1488844       3  1.125965e+09       0
1        1      822109       5  1.115942e+09       1
2        1      885013       4  1.129680e+09       2
3        1       30878       4  1.135555e+09       3
4        1      823519       3  1.083542e+09       4
initial item_id
      MovieID
0           1
547         2
692         3
2704        4
2846        5
final item_id
      MovieID  itemId
0           1       0
547         2       1
692         3       2
2704        4       3
2846        5       4
final df 
   MovieID  CustomerID  rating     timestamp  userId  itemId
0        1     1488844       3  1.125965e+09       0       0
1        1      822109       5  1.115942e+09  

In [None]:
# df_sampled = df.sample(frac=0.1, random_state=42)

# print(df_sampled.head())

# print('Range of userId is [{}, {}]'.format(df_sampled.userId.min(), df_sampled.userId.max()))
# print('Range of itemId is [{}, {}]'.format(df_sampled.itemId.min(), df_sampled.itemId.max()))

In [7]:
# Assuming df is your original DataFrame with a 'user_id' column
# Replace 'n' with the desired number of unique user_ids to keep
desired_unique_users = 8000

# Get a subset of user_ids to keep
subset_user_ids = df['userId'].unique()[:desired_unique_users]

# Filter the DataFrame to keep only rows with user_ids in the subset
df_reduced = df[df['userId'].isin(subset_user_ids)]

print('Range of userId is [{}, {}]'.format(df_reduced.userId.min(), df_reduced.userId.max()))
print('Range of itemId is [{}, {}]'.format(df_reduced.itemId.min(), df_reduced.itemId.max()))

Range of userId is [0, 7999]
Range of itemId is [0, 4498]


In [8]:
print(len(df))
print(len(df_reduced))

24053764
1152161


# MODEL STUFF

In [9]:
import pandas as pd
import numpy as np
from gmf import GMFEngine
from mlp import MLPEngine
from neumf import NeuMFEngine
from data import SampleGenerator

In [17]:
gmf_config = {'alias': 'gmf_factor8neg4-implict',
              'num_epoch': 1,
              'batch_size': 1024,
              # 'optimizer': 'sgd',
              # 'sgd_lr': 1e-3,
              # 'sgd_momentum': 0.9,
              # 'optimizer': 'rmsprop',
              # 'rmsprop_lr': 1e-3,
              # 'rmsprop_alpha': 0.99,
              # 'rmsprop_momentum': 0,
              'optimizer': 'adam',
              'adam_lr': 1e-3,
              'num_users': 7985,
              'num_items': 4498,
              'latent_dim': 8,
              'num_negative': 4,
              'l2_regularization': 0, # 0.01
              'use_cuda': False,
              'device_id': 0,
              'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'}

In [None]:
mlp_config = {'alias': 'mlp_factor8neg4_bz256_166432168_pretrain_reg_0.0000001',
              'num_epoch': 200,
              'batch_size': 256,  # 1024,
              'optimizer': 'adam',
              'adam_lr': 1e-3,
              'num_users': 6040,
              'num_items': 3706,
              'latent_dim': 8,
              'num_negative': 4,
              'layers': [16,64,32,16,8],  # layers[0] is the concat of latent user vector & latent item vector
              'l2_regularization': 0.0000001,  # MLP model is sensitive to hyper params
              'use_cuda': False,
              'device_id': 7,
              'pretrain': True,
              'pretrain_mf': 'checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
              'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'}

In [None]:
neumf_config = {'alias': 'pretrain_neumf_factor8neg4',
                'num_epoch': 200,
                'batch_size': 1024,
                'optimizer': 'adam',
                'adam_lr': 1e-3,
                'num_users': 6040,
                'num_items': 3706,
                'latent_dim_mf': 8,
                'latent_dim_mlp': 8,
                'num_negative': 4,
                'layers': [16,32,16,8],  # layers[0] is the concat of latent user vector & latent item vector
                'l2_regularization': 0.01,
                'use_cuda': False,
                'device_id': 7,
                'pretrain': True,
                'pretrain_mf': 'checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
                'pretrain_mlp': 'checkpoints/{}'.format('mlp_factor8neg4_Epoch100_HR0.5606_NDCG0.2463.model'),
                'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'
                }

In [None]:
# Load Data
ml1m_dir = 'data/ml-1m/ratings.dat'
ml1m_rating = pd.read_csv(ml1m_dir, sep='::', header=None, names=['uid', 'mid', 'rating', 'timestamp'],  engine='python')

In [None]:
print(ml1m_rating.head())

In [None]:
print(type(ml1m_rating['timestamp'][0]))

In [None]:
# import pandas as pd

# # Create a sample DataFrame
# data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
# df = pd.DataFrame(data, index=['row1', 'row2', 'row3'])

# print("Original DataFrame:")
# print(df)

# # Reindex the DataFrame with a new set of indices
# new_index = ['row1', 'row2', 'row3', 'row4']
# df_reindexed = df.reindex(new_index)

# print("\nDataFrame after reindexing:")
# print(df_reindexed)


In [None]:
# Reindex

##Actually why is there a need to reindex?

user_id = ml1m_rating[['uid']].drop_duplicates().reindex()

print("initial user_id")
print(user_id.head())

user_id['userId'] = np.arange(len(user_id)) 

print("final user_id")
print(user_id.head())

ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left')

print("initial ml1m_rating ")
print(ml1m_rating.head())

item_id = ml1m_rating[['mid']].drop_duplicates()

print("initial user_id")
print(item_id.head())

item_id['itemId'] = np.arange(len(item_id))

print("final user_id")
print(item_id.head())

ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left')

print("final ml1m_rating ")
print(ml1m_rating.head())

ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]

print("final final ml1m_rating ")
print(ml1m_rating.head())

print('Range of userId is [{}, {}]'.format(ml1m_rating.userId.min(), ml1m_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(ml1m_rating.itemId.min(), ml1m_rating.itemId.max()))

In [None]:
print(len(ml1m_rating))

In [None]:
print(ml1m_rating.tail())

In [None]:
# DataLoader for training
sample_generator = SampleGenerator(ratings=ml1m_rating)
evaluate_data = sample_generator.evaluate_data

In [10]:
# DataLoader for training (MARTIN'S code)
sample_generator1 = SampleGenerator(ratings=df_reduced)
evaluate_data1 = sample_generator1.evaluate_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['rating'][ratings['rating'] > 0] = 1.0


7985
4499
----
7985
1836


In [None]:
print(evaluate_data1)

In [18]:
# Specify the exact model
config = gmf_config
engine = GMFEngine(config)
# config = mlp_config
# engine = MLPEngine(config)
# config = neumf_config
# engine = NeuMFEngine(config)

In [19]:
for epoch in range(config['num_epoch']):
    print('Epoch {} starts !'.format(epoch))
    print('-' * 80)
    train_loader = sample_generator1.instance_a_train_loader(config['num_negative'], config['batch_size'])
    engine.train_an_epoch(train_loader, epoch_id=epoch)
    hit_ratio, ndcg = engine.evaluate(evaluate_data, epoch_id=epoch)
    engine.save(config['alias'], epoch, hit_ratio, ndcg)

Epoch 0 starts !
--------------------------------------------------------------------------------


IndexError: index out of range in self