# Rating to inter
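- The user–item interaction graph is extracted from the `MicroLens-100k_pairs.csv` file; after a k-core filter is applied (described as 5-core, although the thresholds set in the code below, `min_u_num` and `min_i_num`, are both 1), the IDs are re-indexed.
- The dataset is available at: https://recsys.westlake.edu.cn/MicroLens-100k-Dataset/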

In [1]:
import os, csv
import pandas as pd

In [2]:
# Make `data` (relative to the current working directory) the working
# directory, so the relative file names used by later cells resolve inside it.
# NOTE: the path is relative, so running this cell a second time in the same
# kernel would look for `data/data`.
os.chdir('data')
# Display the resulting working directory as the cell output.
os.getcwd()

'/root/preprocessing/data'

In [3]:
# The file is read with header=None, so every line is treated as data and the
# column names are supplied explicitly.
df = pd.read_csv(
    'MicroLens-100k_pairs.csv',
    header=None,
    names=['userID', 'itemID', 'timestamp'],
)
print(f'shape: {df.shape}')
df.head(5)

shape: (719405, 3)


Unnamed: 0,userID,itemID,timestamp
0,36121,9580,1583378629552
1,26572,9580,1583436719018
2,94805,9580,1584083806481
3,37550,9580,1584412681021
4,89825,9580,1584649439020


In [4]:
# Number of distinct, non-null item IDs in the raw interactions.
unique_items_count = len(df['itemID'].dropna().unique())
unique_items_count

19738

In [5]:
# Column-name aliases shared by the cells below.
# NOTE(review): `k_core` is assigned here but no visible cell reads it.
k_core = 5
learner_id = 'userID'
course_id = 'itemID'
tmstmp_str = 'timestamp'

# Drop rows missing any of the three fields, then rows that repeat an earlier
# (user, item, timestamp) combination. Both calls modify `df` in place.
df.dropna(inplace=True, subset=[learner_id, course_id, tmstmp_str])
df.drop_duplicates(inplace=True, subset=[learner_id, course_id, tmstmp_str])
print(f'After dropped: {df.shape}')
df.head(3)

After dropped: (719405, 3)


Unnamed: 0,userID,itemID,timestamp
0,36121,9580,1583378629552
1,26572,9580,1583436719018
2,94805,9580,1584083806481


In [6]:
from collections import Counter
import numpy as np

# Minimum number of interactions a user / an item needs to survive filtering.
# NOTE(review): every ID present in the data occurs at least once, so with both
# thresholds at 1 the filter keeps every row -- confirm this is intended.
min_u_num = 1
min_i_num = 1

def get_illegal_ids_by_inter_num(df, field, max_num=None, min_num=None):
    """Return the IDs in ``df[field]`` whose row count falls outside the bounds.

    Args:
        df: DataFrame with one row per interaction.
        field: Name of the column whose values are counted. ``None`` disables
            the check.
        max_num: Largest allowed number of rows per ID (inclusive), or ``None``
            for no upper bound.
        min_num: Smallest allowed number of rows per ID (inclusive), or ``None``
            for no lower bound.

    Returns:
        set: IDs that occur fewer than ``min_num`` or more than ``max_num``
        times. Empty when ``field`` is ``None`` or when neither bound is given.
    """
    if field is None or (max_num is None and min_num is None):
        return set()

    # Compare against None explicitly: `bound or default` would also discard a
    # legitimate bound of 0 and silently disable the check.
    upper = np.inf if max_num is None else max_num
    lower = -1 if min_num is None else min_num

    # Count how many rows each ID appears in.
    inter_num = Counter(df[field].values)
    ids = {id_ for id_, count in inter_num.items() if count < lower or count > upper}

    # Report how many IDs violate the constraints.
    print(f'{len(ids)} illegal_ids_by_inter_num, field={field}')

    return ids

def filter_by_k_core(df):
    """Repeatedly drop, in place, the rows of under-represented users and items.

    Each pass collects the users with fewer than ``min_u_num`` rows and the
    items with fewer than ``min_i_num`` rows, then removes every row that
    references one of them. Removing rows can push other IDs below their
    threshold, so the passes repeat until no ID is flagged.

    The column names (``learner_id``, ``course_id``) and thresholds
    (``min_u_num``, ``min_i_num``) are read from module-level variables.

    Args:
        df: Interaction DataFrame; it is modified in place.

    Returns:
        None.
    """
    while True:
        # IDs that currently violate the minimum-interaction thresholds.
        ban_users = get_illegal_ids_by_inter_num(df, field=learner_id, max_num=None, min_num=min_u_num)
        ban_items = get_illegal_ids_by_inter_num(df, field=course_id, max_num=None, min_num=min_i_num)

        # Nothing left to remove: the frame is stable.
        if not ban_users and not ban_items:
            return

        # Boolean mask over the rows that reference a banned user or item.
        dropped_inter = pd.Series(False, index=df.index)
        if learner_id:
            dropped_inter |= df[learner_id].isin(ban_users)
        if course_id:
            dropped_inter |= df[course_id].isin(ban_items)

        # Report the number of flagged rows (the mask's sum); the mask's length
        # would be the total row count, not the number being dropped.
        print(f'{dropped_inter.sum()} dropped interactions')

        # Remove the flagged rows from the caller's DataFrame.
        df.drop(df.index[dropped_inter], inplace=True)




## k-core

In [7]:
# Filter `df` in place, then report the remaining shape (printed under both labels).
filter_by_k_core(df)
print(f'k-core shape: {df.shape}')
print(f'shape after k-core: {df.shape}')
# Preview the first two remaining rows.
df.head(2)

0 illegal_ids_by_inter_num, field=userID
0 illegal_ids_by_inter_num, field=itemID
k-core shape: (719405, 3)
shape after k-core: (719405, 3)


Unnamed: 0,userID,itemID,timestamp
0,36121,9580,1583378629552
1,26572,9580,1583436719018


## Re-index

In [8]:
# Discard the old row labels and renumber the rows 0..len(df)-1 in place.
df.index = pd.RangeIndex(len(df))

In [9]:

# Output file names for the original-ID -> new-index lookup tables.
i_mapping_file = 'i_id_mapping.csv'
u_mapping_file = 'u_id_mapping.csv'

# Split proportions, consumed by the splitting cell further down.
splitting = [0.7, 0.1, 0.2]
uid_field, iid_field = learner_id, course_id

# Distinct IDs, in order of first appearance.
uni_users = df[uid_field].unique()
uni_items = df[iid_field].unique()

# Number the distinct IDs consecutively, starting from 0.
u_id_map = dict(zip(uni_users, range(len(uni_users))))
i_id_map = dict(zip(uni_items, range(len(uni_items))))

# Replace the original IDs with their new indices, stored as integers.
df[uid_field] = df[uid_field].map(u_id_map).astype(int)
df[iid_field] = df[iid_field].map(i_id_map).astype(int)

# Write both lookup tables (original ID, new index) as tab-separated files.
rslt_dir = './'
u_df = pd.DataFrame({'user_id': list(u_id_map.keys()), 'userID': list(u_id_map.values())})
i_df = pd.DataFrame({'asin': list(i_id_map.keys()), 'itemID': list(i_id_map.values())})

u_df.to_csv(os.path.join(rslt_dir, u_mapping_file), sep='\t', index=False)
i_df.to_csv(os.path.join(rslt_dir, i_mapping_file), sep='\t', index=False)
print('mapping dumped...')

mapping dumped...


In [10]:

# =========2. splitting
print('splitting ...')
tot_ratio = sum(splitting)
# Keep the positive proportions only, each divided by the overall total.
ratios = [part / tot_ratio for part in splitting if part > .0]
# Cumulative boundaries between consecutive splits; the last cumulative value is dropped.
split_ratios = np.cumsum(ratios)[:-1]

split_ratios

splitting ...


array([0.7, 0.8])

In [11]:
ts_id = 'timestamp'

# Timestamp values at the cumulative split ratios; they are the cut points
# between consecutive splits.
split_timestamps = list(np.quantile(df[ts_id], split_ratios))

# Partition rows by timestamp. A row whose timestamp equals a cut point goes to
# the later split.
df_train = df.loc[df[ts_id] < split_timestamps[0]].copy()
df_val = df.loc[(split_timestamps[0] <= df[ts_id]) & (df[ts_id] < split_timestamps[1])].copy()
df_test = df.loc[split_timestamps[1] <= df[ts_id]].copy()

# x_label records which split a row belongs to: 0 = train, 1 = validation, 2 = test.
x_label, rslt_file = 'x_label', 'microlens.inter'
df_train[x_label] = 0
df_val[x_label] = 1
df_test[x_label] = 2

# Stack the three splits and fix the column order of the output file.
temp_df = pd.concat([df_train, df_val, df_test])
temp_df = temp_df[[learner_id, course_id, ts_id, x_label]]
print(f'columns: {temp_df.columns}')

# Every input row must land in exactly one split.
assert len(temp_df) == len(df), 'split lost or duplicated rows'

temp_df.to_csv(os.path.join(rslt_dir, rslt_file), sep='\t', index=False)
print('done!')

columns: Index(['userID', 'itemID', 'timestamp', 'x_label'], dtype='object')
done!


## Reload

In [12]:
# Read the interaction file back from the same location it was written to
# (rslt_dir joined with rslt_file), so the check still works if rslt_dir changes.
indexed_df = pd.read_csv(os.path.join(rslt_dir, rslt_file), sep='\t')
print(f'shape: {indexed_df.shape}')
indexed_df.head(4)

shape: (719405, 4)


Unnamed: 0,userID,itemID,timestamp,x_label
0,0,0,1583378629552,0
1,1,0,1583436719018,0
2,2,0,1584083806481,0
3,3,0,1584412681021,0


In [13]:
# Distinct user / item indices found in the reloaded file.
u_uni = indexed_df[learner_id].unique()
c_uni = indexed_df[course_id].unique()

print(f'# of unique learners: {len(u_uni)}')
print(f'# of unique courses: {len(c_uni)}')

# Smallest and largest index on each side.
print('min/max of unique learners: {0}/{1}'.format(u_uni.min(), u_uni.max()))
print('min/max of unique courses: {0}/{1}'.format(c_uni.min(), c_uni.max()))


# of unique learners: 100000
# of unique courses: 19738
min/max of unique learners: 0/99999
min/max of unique courses: 0/19737
