In [1]:
import os
import random
import pickle

import pandas as pd
import numpy as np

In [2]:
# --- Input location ---------------------------------------------------------
rating_file = 'ratings.dat'
data_dir = 'ml-1m/'

# --- Preprocessing knobs ----------------------------------------------------
# Users with fewer than `min_itemcount` rows are discarded.
min_itemcount = 20
# Items with fewer than `min_usercount` rows are discarded.
min_usercount = 5
# Size parameter for the per-user negative sample drawn for evaluation.
test_neg_num = 100

# Fix NumPy's global RNG so the shuffles further down are reproducible.
np.random.seed(2021)

In [3]:
# Read the '::'-separated ratings file, keeping only the fields at positions
# 0, 1 and 3 and naming them uid / iid / timestamp (position 2 is skipped).
ratings_path = os.path.join(data_dir, rating_file)
raw_df = pd.read_csv(
    ratings_path,
    sep='::',
    engine='python',  # multi-character separators need the Python parser
    usecols=[0, 1, 3],
    names=['uid', 'iid', 'timestamp'],
)
raw_df

Unnamed: 0,uid,iid,timestamp
0,1,1193,978300760
1,1,661,978302109
2,1,914,978301968
3,1,3408,978300275
4,1,2355,978824291
...,...,...,...
1000204,6040,1091,956716541
1000205,6040,1094,956704887
1000206,6040,562,956704746
1000207,6040,1096,956715648


In [4]:
# NOTE: despite the `*_num` names, these hold the arrays of distinct ids at
# this point, not counts; the lengths are shown alongside them.
user_num, item_num = raw_df['uid'].unique(), raw_df['iid'].unique()
user_num, len(user_num), item_num, len(item_num)

(array([   1,    2,    3, ..., 6038, 6039, 6040]),
 6040,
 array([1193,  661,  914, ..., 2845, 3607, 2909]),
 3706)

In [5]:
# Count rows per user and keep only users with at least `min_itemcount` rows.
user_group = raw_df.groupby('uid', as_index=False)
user_itemcount = user_group.size()
has_enough_items = user_itemcount['size'] >= min_itemcount
active_users = user_itemcount.loc[has_enough_items, 'uid']
raw_df = raw_df.loc[raw_df['uid'].isin(active_users)]
raw_df

Unnamed: 0,uid,iid,timestamp
0,1,1193,978300760
1,1,661,978302109
2,1,914,978301968
3,1,3408,978300275
4,1,2355,978824291
...,...,...,...
1000204,6040,1091,956716541
1000205,6040,1094,956704887
1000206,6040,562,956704746
1000207,6040,1096,956715648


In [6]:
# Count rows per item (after the user filter) and keep only items that still
# have at least `min_usercount` rows.
item_group = raw_df.groupby('iid', as_index=False)
item_usercount = item_group.size()
active_items = item_usercount[item_usercount['size'] >= min_usercount]['iid']
raw_df = raw_df[raw_df['iid'].isin(active_items)]
raw_df

Unnamed: 0,uid,iid,timestamp
0,1,1193,978300760
1,1,661,978302109
2,1,914,978301968
3,1,3408,978300275
4,1,2355,978824291
...,...,...,...
1000204,6040,1091,956716541
1000205,6040,1094,956704887
1000206,6040,562,956704746
1000207,6040,1096,956715648


In [7]:
# Order rows by user first, then by item within each user.
sort_keys = ['uid', 'iid']
raw_df = raw_df.sort_values(by=sort_keys)
raw_df

Unnamed: 0,uid,iid,timestamp
40,1,1,978824268
25,1,48,978824351
39,1,150,978301777
44,1,260,978300760
23,1,527,978824195
...,...,...,...
1000120,6040,3683,960971696
1000178,6040,3703,964828575
1000183,6040,3735,960971654
1000191,6040,3751,964828782


In [8]:
# Distinct original user ids, in order of first appearance in the sorted frame.
uids = pd.unique(raw_df['uid'])
(uids, len(uids))

(array([   1,    2,    3, ..., 6038, 6039, 6040]), 6040)

In [9]:
# Distinct original item ids, in order of first appearance in the sorted frame.
iids = pd.unique(raw_df['iid'])
(iids, len(iids))

(array([   1,   48,  150, ..., 3533, 2777, 3443]), 3416)

In [10]:
def get_map(ids):
    """Build a dense re-indexing for a sequence of ids.

    Args:
        ids: iterable of hashable ids; repeated values are allowed.

    Returns:
        dict mapping each distinct id to a consecutive integer starting at 0,
        numbered in order of first appearance in ``ids``.
    """
    # dict.fromkeys keeps the first occurrence of every id and preserves order.
    first_seen = dict.fromkeys(ids)
    return {old_id: new_id for new_id, old_id in enumerate(first_seen)}

# Replace the original user/item ids with dense 0-based ids, then renumber the
# row index from 0.
uid_map, iid_map = get_map(uids), get_map(iids)

raw_df = raw_df.assign(
    uid=raw_df['uid'].map(uid_map),
    iid=raw_df['iid'].map(iid_map),
).reset_index(drop=True)
raw_df

Unnamed: 0,uid,iid,timestamp
0,0,0,978824268
1,0,1,978824351
2,0,2,978301777
3,0,3,978300760
4,0,4,978824195
...,...,...,...
999606,6039,1456,960971696
999607,6039,780,964828575
999608,6039,172,960971654
999609,6039,546,964828782


In [11]:
# Recompute the id arrays on the re-indexed frame and record the totals.
uids = raw_df['uid'].unique()
iids = raw_df['iid'].unique()
user_num = len(uids)
item_num = len(iids)

# Collect every user's interacted items in a single pass over the frame,
# instead of re-scanning all rows with a boolean mask once per user.
pos_iids_by_uid = raw_df.groupby('uid')['iid'].apply(list).to_dict()

# For each user, draw items the user never interacted with.
neg_dict = {}
for uid in uids:
    pos_iids = pos_iids_by_uid[uid]
    neg_iids = np.setdiff1d(iids, pos_iids)
    np.random.shuffle(neg_iids)
    # Keeps test_neg_num - 1 negatives per user -- presumably so that, with the
    # single held-out positive, each user has test_neg_num candidates; confirm
    # against the evaluation code.
    neg_dict[uid] = neg_iids[0:test_neg_num-1].tolist()

len(neg_dict), len(neg_dict[0])

(6040, 99)

In [22]:
# Dataset-level sizes that get pickled next to the splits.
info_dict = dict(user_num=len(uids), item_num=len(iids))

info_dict

{'user_num': 6040, 'item_num': 3416}

In [23]:
# Summary statistics of the re-indexed frame (sanity check on id ranges).
raw_df.describe()

Unnamed: 0,uid,iid,timestamp
count,999611.0,999611.0,999611.0
mean,3023.576537,869.26222,972240900.0
std,1728.436705,734.612449,12148270.0
min,0.0,0.0,956703900.0
25%,1505.0,253.0,965302500.0
50%,3069.0,673.0,973017000.0
75%,4476.0,1286.0,975220800.0
max,6039.0,3415.0,1046455000.0


In [13]:
# Hold out one row per user: the one carrying that user's largest timestamp.
timestamps_by_user = raw_df.groupby('uid')['timestamp']
latest_idxs = timestamps_by_user.idxmax()  # row label of each user's max timestamp
pos_test_df = raw_df.loc[latest_idxs.to_numpy()]
pos_test_df

Unnamed: 0,uid,iid,timestamp
1,0,1,978824351
66,1,66,978300174
215,2,193,978298504
235,3,208,978294282
276,4,238,978246585
...,...,...,...
998460,6035,2012,956755196
998939,6036,571,956801840
999133,6037,1537,956717204
999169,6038,393,956758029


In [14]:
# Training positives are every row except the per-user held-out one.
# Dropping by row label removes exactly the rows picked by `latest_idxs`.
# Unlike a concat + drop_duplicates(keep=False) anti-join, it never discards
# rows that merely share values, and it does not depend on which columns
# pos_test_df currently carries.
pos_train_df = raw_df.drop(index=latest_idxs.to_numpy())
pos_train_df

Unnamed: 0,uid,iid,timestamp
0,0,0,978824268
2,0,2,978301777
3,0,3,978300760
4,0,4,978824195
5,0,5,978302149
...,...,...,...
999606,6039,1456,960971696
999607,6039,780,964828575
999608,6039,172,960971654
999609,6039,546,964828782


In [15]:
# Renumber the held-out rows from 0 and keep only the (uid, iid) pair.
pos_test_df = pos_test_df.reset_index(drop=True).loc[:, ['uid', 'iid']]
pos_test_df

Unnamed: 0,uid,iid
0,0,1
1,1,66
2,2,193
3,3,208
4,4,238
...,...,...
6035,6035,2012
6036,6036,571
6037,6037,1537
6038,6038,393


In [16]:
# Renumber the training rows from 0 and keep only the (uid, iid) pair.
pos_train_df = pos_train_df.reset_index(drop=True).loc[:, ['uid', 'iid']]
pos_train_df

Unnamed: 0,uid,iid
0,0,0
1,0,2
2,0,3
3,0,4
4,0,5
...,...,...
993566,6039,1456
993567,6039,780
993568,6039,172
993569,6039,546


In [17]:
# Training pairs as a 2-column array: column 0 is uid, column 1 is iid.
pos_train_arr = pos_train_df.to_numpy()
pos_train_arr

array([[   0,    0],
       [   0,    2],
       [   0,    3],
       ...,
       [6039,  172],
       [6039,  546],
       [6039, 2485]])

In [18]:
# Dense user x item matrix with 1.0 wherever a training interaction exists.
inter_mat = np.zeros((user_num, item_num), dtype=np.float32)
# Set all (uid, iid) cells in one vectorized assignment instead of looping
# over the training pairs row by row in Python.
inter_mat[pos_train_arr[:, 0], pos_train_arr[:, 1]] = 1.0

len(inter_mat)

6040

In [19]:
# Persist every artifact into data_dir.

# Pickled dictionaries: sampled negatives and dataset sizes.
for pickle_name, payload in (('neg.dict', neg_dict), ('info.dict', info_dict)):
    with open(os.path.join(data_dir, pickle_name), 'wb') as f:
        pickle.dump(payload, f)

# Train / test positive pairs as CSV, without the row index.
for csv_name, frame in (('pos_train.csv', pos_train_df), ('pos_test.csv', pos_test_df)):
    frame.to_csv(os.path.join(data_dir, csv_name), index=False)

# Dense training interaction matrix.
np.save(os.path.join(data_dir, 'inter_mat.npy'), inter_mat)