In [1]:
# import required packages
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

In [2]:
# config
ds_name = 'Epinion'
is_implicit = False

In [3]:
# loading the Book-Crossing dataset
dataset = pd.read_csv(f"datasets/{ds_name}/{ds_name}_train.txt", sep="\t", names=['uid', 'iid', 'count'])

In [4]:
dataset.head(5)

Unnamed: 0,uid,iid,count
0,0,0,2
1,0,90,5
2,0,120,4
3,0,131,5
4,0,147,1


In [5]:
# get user distribution
user_dist = dataset['uid'].value_counts()
num_users = len(user_dist)
print('No. users: ' + str(num_users))
print('Mean items per user: ' + str(user_dist.mean()))
print('Min items per user: ' + str(user_dist.min()))
print('Max items per user: ' + str(user_dist.max()))

No. users: 2677
Mean items per user: 27.08106088905491
Min items per user: 8
Max items per user: 241


In [6]:
def read_item_popularity(is_implicit=False):
    items_freq = dict()
    for eachline in dataset.itertuples(index=True):
        iid, count = int(eachline.iid), int(eachline.count)
        if not is_implicit:
            count = 1
        if iid in items_freq.keys():
            items_freq[iid] += count
        else:
            items_freq[iid] = count
    return items_freq

In [7]:
item_freq = read_item_popularity(is_implicit=True)
num_items = len(item_freq)
print('No. items: ' + str(num_items))

No. items: 2060


In [8]:
# get top items
top_fraction = 0.2
num_top = int(top_fraction * num_items)
sorted_item_freq = {k: v for k, v in sorted(item_freq.items(), key=lambda item: item[1], reverse=True)}
short_heads = list(sorted_item_freq.keys())[:num_top] # top pop items
long_tails = list(sorted_item_freq.keys())[num_top:]
print('No. top items: ' + str(num_top))

No. top items: 412


### Writing the list of items

In [9]:
# Files to save users ids based on the number of checkins
shorthead_items = open(f'datasets/{ds_name}/groups/items/020/shorthead_items.txt', 'w')
longtail_items = open(f'datasets/{ds_name}/groups/items/020/longtail_items.txt', 'w')

for iid in short_heads:
    shorthead_items.write(str(iid) + '\n')
shorthead_items.close()

for iid in long_tails:
    longtail_items.write(str(iid) + '\n')
longtail_items.close()

In [10]:
user_interactions = defaultdict(set)
for eachline in tqdm(dataset.itertuples(index=True)):
    uid, iid, count = int(eachline.uid), int(eachline.iid), int(eachline.count)
    uid, iid, count = int(uid), int(iid), int(count)
    if uid in user_interactions.keys():
        user_interactions[uid].add(iid)
    else:
        user_interactions[uid] = {iid}

72496it [00:00, 262135.86it/s]


In [130]:
# user_profile_ratio = {}

# for user, items in tqdm(user_interactions.items()):
#     if user not in user_profile_ratio.keys():
#         user_profile_ratio[user] = (len(set(items) & set(short_heads))) / len(set(items))

100%|██████████| 2448/2448 [00:00<00:00, 57665.00it/s]


In [None]:
user_profile_pop = {}

for user, items in tqdm(user_interactions.items()):
    if user not in user_profile_pop.keys():
        user_profile_pop[user] = (len(set(items) & set(short_heads)))

In [131]:
sorted_user_profile_ratio = {k: v for k, v in sorted(user_profile_ratio.items(), key=lambda item: item[1], reverse=True)}

In [132]:
num_user = len(sorted_user_profile_ratio)
num_top_users = int(top_fraction * num_users)
num_top_users, num_user

(489, 2448)

In [133]:
# Files to save users ids based on the number of checkins
inactive_users = open(f'datasets/{ds_name}/groups/users/20/inactive_ids.txt', 'w')
active_users = open(f'datasets/{ds_name}/groups/users/20/active_ids.txt', 'w')

In [134]:
for uid in list(sorted_user_profile_ratio.keys())[:num_top_users]:
    active_users.write(str(uid) + '\n')
active_users.close()

In [135]:
for uid in list(sorted_user_profile_ratio.keys())[num_top_users:]:
    inactive_users.write(str(uid) + '\n')
inactive_users.close()