# Train/Validation/Test data splitting based on the 5-core interaction graph generated from `rating2inter.ipynb`.
- Based on generated interactions, perform data splitting


In [1]:
import os, csv
import pandas as pd

In [2]:
os.chdir('data')
os.getcwd()

'/root/preprocessing/data'

## Directly load the existing interactions, Load interactions.

In [3]:
rslt_file = 'microlens.inter'
df = pd.read_csv(rslt_file, sep='\t')
print(f'shape: {df.shape}')
df[:4]

shape: (719405, 4)


Unnamed: 0,userID,itemID,timestamp,x_label
0,0,0,1583378629552,0
1,1,0,1583436719018,0
2,2,0,1584083806481,0
3,3,0,1584412681021,0


In [4]:
import random
import numpy as np

In [5]:

df = df.sample(frac=1).reset_index(drop=True)

df.sort_values(by=['userID'], inplace=True)
df[:20]

Unnamed: 0,userID,itemID,timestamp,x_label
158882,0,0,1583378629552,0
484357,0,17741,1662694576599,2
92443,0,16123,1661660511750,2
618621,0,14805,1660439827039,0
241955,0,10209,1658197898237,0
47900,0,13185,1662714477582,2
519954,0,5412,1655642536796,0
123844,1,0,1583436719018,0
70595,1,1162,1649991914832,0
519448,1,2513,1658009084817,0


In [6]:
uid_field, iid_field = 'userID', 'itemID'

uid_freq = df.groupby(uid_field)[iid_field]
u_i_dict = {}
for u, u_ls in uid_freq:
    u_i_dict[u] = list(u_ls)
u_i_dict

{0: [0, 17741, 16123, 14805, 10209, 13185, 5412],
 1: [0, 1162, 2513, 9535, 6196],
 2: [14037, 0, 19340, 12827, 18346, 6905, 11904],
 3: [15698, 0, 16676, 16062, 19618, 8732, 1516, 17855, 18701, 2521],
 4: [16835, 0, 10109, 7090, 592, 6341],
 5: [6477, 6472, 14903, 16261, 18596, 0, 11823, 14021, 16538, 15643, 18036],
 6: [14643, 593, 4946, 17593, 17547, 10760, 0, 4334],
 7: [14789, 18114, 8659, 8482, 0, 18267],
 8: [12365, 0, 17126, 13436, 2513, 10306, 10624],
 9: [10459, 11869, 0, 12558, 11347],
 10: [13897, 7539, 7698, 8145, 9628, 8695, 706, 0],
 11: [10866, 13121, 0, 12634, 1272],
 12: [1, 406, 2, 301, 1182, 931, 939],
 13: [7804, 1274, 4668, 1889, 2, 4932, 3522, 1, 5],
 14: [17692, 1, 11805, 4, 18031],
 15: [1616, 2127, 6466, 7439, 1, 8898],
 16: [210,
  10249,
  12492,
  14820,
  9451,
  16584,
  814,
  7327,
  455,
  635,
  14534,
  11127,
  11621,
  3102,
  14443,
  13466,
  5508,
  12180,
  66,
  1,
  16863,
  5751,
  4,
  567,
  8764,
  7839,
  7534],
 17: [17017,
  4,
  6587,

In [7]:
new_label = []
u_ids_sorted = sorted(u_i_dict.keys())

for u in u_ids_sorted:
    items = u_i_dict[u]
    n_items = len(items)
    if n_items < 10:
        tmp_ls = [0] * (n_items - 2) + [1] + [2]
    else:
        val_test_len = int(n_items * 0.2)
        train_len = n_items - val_test_len
        val_len = val_test_len // 2
        test_len = val_test_len - val_len
        tmp_ls = [0] * train_len + [1] * val_len + [2] * test_len
    new_label.extend(tmp_ls)

new_label[:100]

[0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1]

In [8]:
df['x_label'] = new_label
df[:20]

Unnamed: 0,userID,itemID,timestamp,x_label
158882,0,0,1583378629552,0
484357,0,17741,1662694576599,0
92443,0,16123,1661660511750,0
618621,0,14805,1660439827039,0
241955,0,10209,1658197898237,0
47900,0,13185,1662714477582,1
519954,0,5412,1655642536796,2
123844,1,0,1583436719018,0
70595,1,1162,1649991914832,0
519448,1,2513,1658009084817,0


In [9]:
rslt_file[:-6]

'microlens'

In [10]:
new_labeled_file = rslt_file[:-6] + '-v4.inter'
df.to_csv(os.path.join('./', new_labeled_file), sep='\t', index=False)
print('done!!!')

done!!!


## Reload

In [11]:
indexed_df = pd.read_csv(new_labeled_file, sep='\t')
print(f'shape: {indexed_df.shape}')
indexed_df[:20]

shape: (719405, 4)


Unnamed: 0,userID,itemID,timestamp,x_label
0,0,0,1583378629552,0
1,0,17741,1662694576599,0
2,0,16123,1661660511750,0
3,0,14805,1660439827039,0
4,0,10209,1658197898237,0
5,0,13185,1662714477582,1
6,0,5412,1655642536796,2
7,1,0,1583436719018,0
8,1,1162,1649991914832,0
9,1,2513,1658009084817,0


In [12]:
u_id_str, i_id_str = 'userID', 'itemID'
u_uni = indexed_df[u_id_str].unique()
c_uni = indexed_df[i_id_str].unique()

print(f'# of unique learners: {len(u_uni)}')
print(f'# of unique courses: {len(c_uni)}')

print('min/max of unique learners: {0}/{1}'.format(min(u_uni), max(u_uni)))
print('min/max of unique courses: {0}/{1}'.format(min(c_uni), max(c_uni)))


# of unique learners: 100000
# of unique courses: 19738
min/max of unique learners: 0/99999
min/max of unique courses: 0/19737
