In [1]:
# import required packages
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

In [2]:
# config
ds_name = 'Epinion'
is_implicit = False

In [3]:
# loading the Book-Crossing dataset
dataset = pd.read_csv(f"datasets/{ds_name}/{ds_name}_train.txt", sep="\t", names=['uid', 'iid', 'count'])

In [4]:
dataset.head(5)

Unnamed: 0,uid,iid,count
0,0,0,2
1,0,90,5
2,0,120,4
3,0,131,5
4,0,147,1


In [5]:
# get user distribution
user_dist = dataset['uid'].value_counts()
num_users = len(user_dist)
print('No. users: ' + str(num_users))
print('Mean items per user: ' + str(user_dist.mean()))
print('Min items per user: ' + str(user_dist.min()))
print('Max items per user: ' + str(user_dist.max()))

No. users: 2677
Mean items per user: 27.08106088905491
Min items per user: 8
Max items per user: 241


In [6]:
def read_item_popularity(is_implicit=False):
    items_freq = dict()
    for eachline in dataset.itertuples(index=True):
        iid, count = int(eachline.iid), int(eachline.count)
        if not is_implicit:
            count = 1
        if iid in items_freq.keys():
            items_freq[iid] += count
        else:
            items_freq[iid] = count
    return items_freq

In [7]:
item_freq = read_item_popularity(is_implicit=True)
num_items = len(item_freq)
print('No. items: ' + str(num_items))

No. items: 2060


In [8]:
# get top items
top_fraction = 0.2
num_top = int(top_fraction * num_items)
sorted_item_freq = {k: v for k, v in sorted(item_freq.items(), key=lambda item: item[1], reverse=True)}
short_heads = list(sorted_item_freq.keys())[:num_top] # top pop items
long_tails = list(sorted_item_freq.keys())[num_top:]
print('No. top items: ' + str(num_top))

No. top items: 412


### Writing the list of items

In [9]:
# Files to save users ids based on the number of checkins
shorthead_items = open(f'datasets/{ds_name}/groups/items/020/shorthead_items.txt', 'w')
longtail_items = open(f'datasets/{ds_name}/groups/items/020/longtail_items.txt', 'w')

for iid in short_heads:
    shorthead_items.write(str(iid) + '\n')
shorthead_items.close()

for iid in long_tails:
    longtail_items.write(str(iid) + '\n')
longtail_items.close()

In [27]:
user_profile_pop_df = pd.DataFrame(columns=['uid', 'pop_count', 'profile_size'])

In [10]:
user_interactions = defaultdict(list)
for eachline in tqdm(dataset.itertuples(index=True)):
    uid, iid, count = int(eachline.uid), int(eachline.iid), int(eachline.count)
    uid, iid, count = int(uid), int(iid), int(count)
    if is_implicit:
        for i in range(count):
            if uid in user_interactions.keys():
                user_interactions[uid].append(iid)
            else:
                user_interactions[uid] = [iid]
    else:
        if uid in user_interactions.keys():
            user_interactions[uid].append(iid)
        else:
            user_interactions[uid] = [iid]

72496it [00:00, 242525.72it/s]


In [130]:
# user_profile_ratio = {}

# for user, items in tqdm(user_interactions.items()):
#     if user not in user_profile_ratio.keys():
#         user_profile_ratio[user] = (len(set(items) & set(short_heads))) / len(set(items))

100%|██████████| 2448/2448 [00:00<00:00, 57665.00it/s]


In [29]:
user_profile_pop = {}

for user, items in tqdm(user_interactions.items()):
    if user not in user_profile_pop.keys():
        pop_count = len(set(items) & set(short_heads))
        user_profile_pop[user] = len(set(items) & set(short_heads))
        user_profile_pop_df = user_profile_pop_df.append({'uid': user, 'pop_count': pop_count, 'profile_size': len(user_interactions[user])}, ignore_index=True)

100%|██████████| 2677/2677 [00:05<00:00, 465.55it/s]


In [21]:
len(user_interactions[0])
len(set(user_interactions[0]) & set(short_heads))

11

In [32]:
user_profile_pop_df.head()

Unnamed: 0,uid,pop_count,profile_size
211,211,96,207
256,256,94,241
14,14,83,182
448,448,76,208
376,376,75,225


In [31]:
user_profile_pop_df.sort_values(['pop_count', 'profile_size'], ascending = (False, False), inplace=True)

In [38]:
# advantaged_users

In [12]:
# sorted_user_profile_ratio = {k: v for k, v in sorted(user_profile_pop.items(), key=lambda item: item[1], reverse=True)}

In [39]:
num_user = user_profile_pop_df.shape[0]
num_top_users = int(top_fraction * num_users)
num_top_users, num_user

(535, 2677)

In [41]:
advantaged_users = user_profile_pop_df.head(num_top_users)
disadvantaged_users = user_profile_pop_df.iloc[num_top_users:user_profile_pop_df.shape[0]]

In [50]:
type(advantaged_users)

pandas.core.frame.DataFrame

In [52]:
disadvantaged_users

Unnamed: 0,uid,pop_count,profile_size
890,890,16,44
925,925,16,44
1696,1696,16,43
1003,1003,16,42
766,766,16,41
...,...,...,...
882,882,1,12
1603,1603,1,11
1872,1872,1,11
583,583,0,15


In [46]:
# Files to save users ids based on the number of checkins
inactive_users = open(f'datasets/{ds_name}/groups/users/2/inactive_ids.txt', 'w')
active_users = open(f'datasets/{ds_name}/groups/users/2/active_ids.txt', 'w')

In [47]:
for eachline in advantaged_users.itertuples(index=True):
    active_users.write(str(eachline.uid) + '\n')
active_users.close()

In [51]:
for eachline in disadvantaged_users.itertuples(index=True):
    inactive_users.write(str(eachline.uid) + '\n')
inactive_users.close()