In [1]:
import os
import random
import pickle

import pandas as pd
import numpy as np
import scipy.sparse as sp

In [2]:
data_dir = 'ml-1m/'
rating_file = 'ratings.dat'

min_itemcount = 20
min_usercount = 5
test_neg_num = 100

np.random.seed(2021)

In [3]:
raw_df = pd.read_csv(os.path.join(data_dir, rating_file),
                     sep='::',
                     names=['uid', 'iid', 'timestamp'],
                     usecols=[0, 1, 3],
                     engine='python')
raw_df

Unnamed: 0,uid,iid,timestamp
0,1,1193,978300760
1,1,661,978302109
2,1,914,978301968
3,1,3408,978300275
4,1,2355,978824291
...,...,...,...
1000204,6040,1091,956716541
1000205,6040,1094,956704887
1000206,6040,562,956704746
1000207,6040,1096,956715648


In [4]:
user_group = raw_df.groupby('uid', as_index=False)
user_itemcount = user_group.size()
active_users = user_itemcount.index[user_itemcount['size'] >= min_itemcount]
raw_df = raw_df[raw_df['uid'].isin(active_users)]
raw_df

Unnamed: 0,uid,iid,timestamp
0,1,1193,978300760
1,1,661,978302109
2,1,914,978301968
3,1,3408,978300275
4,1,2355,978824291
...,...,...,...
999863,6039,1081,956705989
999864,6039,1083,956706051
999865,6039,1086,956706182
999866,6039,1088,956706019


In [5]:
item_group = raw_df.groupby('iid', as_index=False)
item_usercount = item_group.size()
active_items = item_usercount.index[item_usercount['size'] >= min_usercount]
raw_df = raw_df[raw_df['uid'].isin(active_items)]
raw_df

Unnamed: 0,uid,iid,timestamp
0,1,1193,978300760
1,1,661,978302109
2,1,914,978301968
3,1,3408,978300275
4,1,2355,978824291
...,...,...,...
611527,3705,2985,966286629
611528,3705,1246,966282465
611529,3705,3788,966281609
611530,3705,2986,966286629


In [6]:
raw_df = raw_df.sort_values(['uid', 'iid'])
raw_df

Unnamed: 0,uid,iid,timestamp
40,1,1,978824268
25,1,48,978824351
39,1,150,978301777
44,1,260,978300760
23,1,527,978824195
...,...,...,...
611045,3705,3811,966281767
611046,3705,3813,966284183
611047,3705,3814,966284831
611101,3705,3871,966286947


In [7]:
uids = raw_df['uid'].unique()
uids, len(uids)

(array([   1,    2,    3, ..., 3703, 3704, 3705]), 3415)

In [8]:
iids = raw_df['iid'].unique()
iids, len(iids)

(array([   1,   48,  150, ..., 3376,  311, 1905]), 3636)

In [9]:
def get_map(ids):
    id_map = {}
    new_id = 0
    for old_id in ids:
        if old_id not in id_map:
            id_map[old_id] = new_id
            new_id += 1
    return id_map

uid_map = get_map(uids)
iid_map = get_map(iids)

raw_df['uid'] = raw_df['uid'].map(uid_map)
raw_df['iid'] = raw_df['iid'].map(iid_map)

raw_df = raw_df.reset_index(drop=True)
raw_df

Unnamed: 0,uid,iid,timestamp
0,0,0,978824268
1,0,1,978824351
2,0,2,978301777
3,0,3,978300760
4,0,4,978824195
...,...,...,...
565195,3414,2049,966281767
565196,3414,2494,966284183
565197,3414,1800,966284831
565198,3414,1647,966286947


In [10]:
uids = raw_df['uid'].unique()
iids = raw_df['iid'].unique()
user_num = len(uids)
item_num = len(iids)

neg_dict = {}
for uid in uids:
    pos_iids = raw_df[raw_df['uid'] == uid]['iid'].to_list()
    neg_iids = np.setdiff1d(iids, pos_iids)
    np.random.shuffle(neg_iids)
    neg_dict[uid] = neg_iids[0:test_neg_num-1].tolist()
    
len(neg_dict), len(neg_dict[0])

(3415, 99)

In [11]:
info_dict = {
    'user_num': len(uids),
    'item_num': len(iids),
}

info_dict

{'user_num': 3415, 'item_num': 3636}

In [12]:
latest_idxs = raw_df.groupby('uid')['timestamp'].idxmax()
pos_test_df = raw_df.loc[latest_idxs.values]
pos_test_df

Unnamed: 0,uid,iid,timestamp
1,0,1,978824351
66,1,66,978300174
215,2,193,978298504
235,3,208,978294282
276,4,238,978246585
...,...,...,...
564017,3410,4,1031075692
564132,3411,158,966290245
564143,3412,570,966285533
564202,3413,59,983466920


In [13]:
pos_train_df = pd.concat([raw_df, pos_test_df, pos_test_df]).drop_duplicates(keep=False)
pos_train_df

Unnamed: 0,uid,iid,timestamp
0,0,0,978824268
2,0,2,978301777
3,0,3,978300760
4,0,4,978824195
5,0,5,978302149
...,...,...,...
565195,3414,2049,966281767
565196,3414,2494,966284183
565197,3414,1800,966284831
565198,3414,1647,966286947


In [14]:
pos_test_df = pos_test_df.reset_index(drop=True)
pos_test_df = pos_test_df[['uid', 'iid']]
pos_test_df

Unnamed: 0,uid,iid
0,0,1
1,1,66
2,2,193
3,3,208
4,4,238
...,...,...
3410,3410,4
3411,3411,158
3412,3412,570
3413,3413,59


In [15]:
pos_train_df = pos_train_df.reset_index(drop=True)
pos_train_df = pos_train_df[['uid', 'iid']]
pos_train_df

Unnamed: 0,uid,iid
0,0,0
1,0,2
2,0,3
3,0,4
4,0,5
...,...,...
561780,3414,2049
561781,3414,2494
561782,3414,1800
561783,3414,1647


In [23]:
pos_train_arr = pos_train_df.to_numpy()
pos_train_arr

(array([[   1,    0],
        [   1,    2],
        [   0,    3],
        ...,
        [3414, 1800],
        [3414, 1647],
        [3414, 1801]]),
 3415,
 3636)

In [None]:
inter_sp_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
for arr in pos_train_arr:
    print(arr)
    inter_sp_mat[arr[0], arr[1]] = 1.0

len(inter_sp_mat)

In [None]:
with open(os.path.join(data_dir, 'neg.dict'), 'wb') as f:
    pickle.dump(neg_dict, f)
    
with open(os.path.join(data_dir, 'info.dict'), 'wb') as f:
    pickle.dump(info_dict, f)
    
pos_train_df.to_csv(os.path.join(data_dir, 'pos_train.csv'), index=False)
pos_test_df.to_csv(os.path.join(data_dir, 'pos_test.csv'), index=False)