In [1]:
import os
import random
import pickle

import pandas as pd
import numpy as np

In [2]:
data_dir = 'ml-1m/'
rating_file = 'ratings.dat'

test_neg_num = 100

np.random.seed(2021)

In [3]:
raw_df = pd.read_csv(os.path.join(data_dir, rating_file),
                     sep='::',
                     names=['uid', 'iid', 'timestamp'],
                     usecols=[0, 1, 3],
                     engine='python')
raw_df

Unnamed: 0,uid,iid,timestamp
0,1,1193,978300760
1,1,661,978302109
2,1,914,978301968
3,1,3408,978300275
4,1,2355,978824291
...,...,...,...
1000204,6040,1091,956716541
1000205,6040,1094,956704887
1000206,6040,562,956704746
1000207,6040,1096,956715648


In [4]:
uids = raw_df['uid'].unique()
iids = np.sort(uids)
uids, len(uids)

(array([   1,    2,    3, ..., 6038, 6039, 6040]), 6040)

In [5]:
iids = raw_df['iid'].unique()
iids = np.sort(iids)
iids, len(iids)

(array([   1,    2,    3, ..., 3950, 3951, 3952]), 3706)

In [6]:
raw_df = raw_df.sort_values(['uid', 'iid'])
raw_df

Unnamed: 0,uid,iid,timestamp
40,1,1,978824268
25,1,48,978824351
39,1,150,978301777
44,1,260,978300760
23,1,527,978824195
...,...,...,...
1000120,6040,3683,960971696
1000178,6040,3703,964828575
1000183,6040,3735,960971654
1000191,6040,3751,964828782


In [7]:
def get_map(ids):
    id_map = {}
    new_id = 0
    for old_id in ids:
        if old_id not in id_map:
            id_map[old_id] = new_id
            new_id += 1
    return id_map

uid_map = get_map(uids)
iid_map = get_map(iids)

raw_df['uid'] = raw_df['uid'].map(uid_map)
raw_df['iid'] = raw_df['iid'].map(iid_map)

raw_df

Unnamed: 0,uid,iid,timestamp
40,0,0,978824268
25,0,47,978824351
39,0,144,978301777
44,0,253,978300760
23,0,513,978824195
...,...,...,...
1000120,6039,3441,960971696
1000178,6039,3461,964828575
1000183,6039,3493,960971654
1000191,6039,3508,964828782


In [12]:
# uids = raw_df['uid'].unique()
# iids = raw_df['iid'].unique()
# user_num = len(uids)
# item_num = len(iids)

# neg_dict = {}

# for uid in uids:
#     pos_iids = raw_df[raw_df['uid'] == uid]['iid'].to_list()
#     neg_iids = np.setdiff1d(iids, pos_iids)
#     np.random.shuffle(neg_iids)
#     neg_dict[uid] = neg_iids[0:test_neg_num-1].tolist()
    
len(neg_dict), len(neg_dict[0])

(6040, 99)

In [13]:
info_dict = {
    'user_num': len(uids),
    'item_num': len(iids),
}

info_dict

{'user_num': 6040, 'item_num': 3706}

In [14]:
latest_idxs = raw_df.groupby('uid')['timestamp'].idxmax()
pos_test_df = raw_df.iloc[latest_idxs]
pos_test_df

Unnamed: 0,uid,iid,timestamp
52,0,1154,978302091
98,1,575,978299773
191,2,3622,978298486
241,3,1106,978294199
442,4,31,978244962
...,...,...,...
999358,6035,822,956755171
999571,6036,2651,956709215
999731,6037,1094,956717204
999761,6038,846,956705486


In [15]:
pos_train_df = pd.concat([raw_df, pos_test_df, pos_test_df]).drop_duplicates(keep=False)
pos_train_df

Unnamed: 0,uid,iid,timestamp
40,0,0,978824268
25,0,47,978824351
39,0,144,978301777
44,0,253,978300760
23,0,513,978824195
...,...,...,...
1000120,6039,3441,960971696
1000178,6039,3461,964828575
1000183,6039,3493,960971654
1000191,6039,3508,964828782


In [16]:
pos_train_df = pos_train_df.reset_index(drop=True)
pos_train_df = pos_train_df[['uid', 'iid']]

pos_test_df = pos_test_df.reset_index(drop=True)
pos_test_df = pos_test_df[['uid', 'iid']]

pos_train_df, pos_test_df

(         uid   iid
 0          0     0
 1          0    47
 2          0   144
 3          0   253
 4          0   513
 ...      ...   ...
 994164  6039  3441
 994165  6039  3461
 994166  6039  3493
 994167  6039  3508
 994168  6039  3575
 
 [994169 rows x 2 columns],
        uid   iid
 0        0  1154
 1        1   575
 2        2  3622
 3        3  1106
 4        4    31
 ...    ...   ...
 6035  6035   822
 6036  6036  2651
 6037  6037  1094
 6038  6038   846
 6039  6039  1499
 
 [6040 rows x 2 columns])

In [17]:
with open(os.path.join(data_dir, 'neg.dict'), 'wb') as f:
    pickle.dump(neg_dict, f)
    
with open(os.path.join(data_dir, 'info.dict'), 'wb') as f:
    pickle.dump(info_dict, f)
    
pos_train_df.to_csv(os.path.join(data_dir, 'pos_train.csv'), index=False)
pos_test_df.to_csv(os.path.join(data_dir, 'pos_test.csv'), index=False)