# User and Item Grouping

In [1]:
# import required packages
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

## 1. Setup

In [2]:
# config
ds_name = 'AmazonToy'  # dataset folder / file prefix under datasets/
is_implicit = False  # True -> implicit-feedback reader, False -> explicit-feedback reader
item_top_fraction = 0.2  # fraction of the most popular items treated as short-head
# Fraction of users treated as advantaged. It goes together with user_grouping_method:
#   (1) 'interactions'        -> top 5% of users  (user_top_fraction = 0.05)
#   (2) 'popular_consumption' -> top 20% of users (user_top_fraction = 0.2)
user_top_fraction = 0.05 # 0.05 or 0.2
# User grouping method: either 'interactions' or 'popular_consumption'
user_grouping_method = 'interactions'

## 2. Load Train Data

In [3]:
# Read the training split of the configured dataset; the file is tab-separated
# and the column names are supplied here.
train_path = f"datasets/{ds_name}/{ds_name}_train.txt"
dataset = pd.read_csv(train_path, sep="\t", names=['uid', 'iid', 'count'])

In [4]:
# Preview the first five training rows
dataset.head(n=5)

Unnamed: 0,uid,iid,count
0,0,0,4
1,0,3,3
2,0,41,5
3,0,78,5
4,0,118,5


### Characteristics

In [5]:
# Number of training rows per user id (one entry per distinct uid)
user_dist = dataset['uid'].value_counts()
num_users = len(user_dist)

# Summary statistics of the per-user row counts
print(f'No. users: {num_users!s}')
print(f'Mean items per user: {user_dist.mean()!s}')
print(f'Min items per user: {user_dist.min()!s}')
print(f'Max items per user: {user_dist.max()!s}')

No. users: 2170
Mean items per user: 10.596774193548388
Min items per user: 1
Max items per user: 74


## 3. Read User and Item Profile
Read the interaction profiles of users and items.

In [6]:
# Maps each user id to the list of item ids in that user's profile.
# It is filled as a side effect of the read_*_item_popularity() functions
# defined in the following cells.
user_interactions = defaultdict(list)

In [7]:
# Read the popularity of items from an implicit-feedback dataset
def read_implicit_item_popularity():
    """Compute item popularity and user profiles from implicit feedback.

    Reads the module-level ``dataset`` frame (columns ``uid``, ``iid`` and
    ``count``). As a side effect the module-level ``user_interactions``
    mapping is emptied and rebuilt: for every row the item id is stored
    ``count`` times in the profile of the row's user.

    Returns:
        dict: item id -> sum of ``count`` over all rows of that item.
    """
    print("Reading implicit item popularity ...")
    # Start from an empty mapping so that calling this function again (e.g.
    # re-running the cell) does not append every profile a second time.
    user_interactions.clear()
    items_freq = dict()
    rows = zip(dataset['uid'], dataset['iid'], dataset['count'])
    for uid, iid, count in tqdm(rows, total=len(dataset)):
        uid, iid, count = int(uid), int(iid), int(count)
        # The item appears once per recorded interaction; rows with a
        # non-positive count add nothing and do not create a profile entry.
        if count > 0:
            user_interactions.setdefault(uid, []).extend([iid] * count)
        # Accumulate the interaction frequency (implicit) of the item
        items_freq[iid] = items_freq.get(iid, 0) + count
    return items_freq

In [8]:
# Read the popularity of items from an explicit-feedback dataset
def read_explicit_item_popularity():
    """Compute item popularity and user profiles from explicit feedback.

    Reads the ``uid`` and ``iid`` columns of the module-level ``dataset``
    frame; the ``count`` column is not used here, every row counts as one
    interaction. As a side effect the module-level ``user_interactions``
    mapping is emptied and rebuilt with one item id per row.

    Returns:
        dict: item id -> number of rows in which the item occurs.
    """
    print("Reading explicit item popularity ...")
    # Start from an empty mapping so that calling this function again (e.g.
    # re-running the cell) does not append every profile a second time.
    user_interactions.clear()
    items_freq = dict()
    rows = zip(dataset['uid'], dataset['iid'])
    for uid, iid in tqdm(rows, total=len(dataset)):
        uid, iid = int(uid), int(iid)
        # Add the item to the profile of its user
        user_interactions.setdefault(uid, []).append(iid)
        # Each row contributes a single interaction to the item
        items_freq[iid] = items_freq.get(iid, 0) + 1
    return items_freq

In [9]:
# Choose the reader matching the configured feedback type (implicit when
# is_implicit is truthy, explicit otherwise) and build the item popularity map.
popularity_reader = read_implicit_item_popularity if is_implicit else read_explicit_item_popularity
item_freq = popularity_reader()
num_items = len(item_freq)
print(f'No. items: {num_items!s}')

22995it [00:00, 348107.37it/s]

Reading explicit item popularity ...
No. items: 1733





In [10]:
# Size of the short-head group; int() drops any fractional part
num_top = int(item_top_fraction * num_items)
# Rank the items by the number of interactions received, most popular first
# (the dict keeps that order)
sorted_item_freq = dict(sorted(item_freq.items(), key=lambda pair: pair[1], reverse=True))
ranked_item_ids = list(sorted_item_freq)
short_heads = ranked_item_ids[:num_top]  # most popular items, as set by item_top_fraction
long_tails = ranked_item_ids[num_top:]  # all remaining (long-tail) items
print(f'No. top items: {num_top!s}')

No. top items: 346


In [None]:
# Save the item ids of each popularity group (by no. of interactions), one id per line.
# Context managers make sure each file is closed even if a write fails.
# NOTE(review): the '020' folder presumably mirrors item_top_fraction = 0.2 — confirm
# and keep the two in sync when the fraction changes.
with open(f'datasets/{ds_name}/groups/items/020/shorthead_items.txt', 'w') as shorthead_items:
    shorthead_items.writelines(f'{iid!s}\n' for iid in short_heads)

with open(f'datasets/{ds_name}/groups/items/020/longtail_items.txt', 'w') as longtail_items:
    longtail_items.writelines(f'{iid!s}\n' for iid in long_tails)

## 4. User Grouping

In [11]:
# Per-user summary table, one row per user: the user id, the number of distinct
# short-head items in the profile, and the profile length.
# It starts empty and is filled by the following cell.
user_profile_pop_df = pd.DataFrame(columns=['uid', 'pop_count', 'profile_size'])

In [12]:
# Build the per-user popularity summary.
# The rows are collected in a list and turned into a frame once:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row
# is quadratic. Rebuilding the frame here also keeps the cell idempotent.
short_head_set = set(short_heads)  # built once instead of once per user

user_profile_pop = {}  # user id -> number of distinct short-head items in the profile
profile_rows = []

for user, items in tqdm(user_interactions.items()):
    pop_count = len(short_head_set.intersection(items))
    user_profile_pop[user] = pop_count
    profile_rows.append({'uid': user, 'pop_count': pop_count, 'profile_size': len(items)})

user_profile_pop_df = pd.DataFrame(profile_rows, columns=['uid', 'pop_count', 'profile_size'])

100%|██████████| 2170/2170 [00:04<00:00, 483.50it/s]


In [13]:
# Preview the first five rows of the per-user summary
user_profile_pop_df.head(n=5)

Unnamed: 0,uid,pop_count,profile_size
0,0,22,37
1,1,6,12
2,2,0,3
3,3,12,13
4,4,1,6


In [14]:
# Order the users so that the ones to be treated as advantaged come first:
#   - popular_consumption: most short-head items first, ties broken by profile size
#   - interactions: largest profiles first
if user_grouping_method == "popular_consumption":
    user_profile_pop_df = user_profile_pop_df.sort_values(['pop_count', 'profile_size'], ascending=[False, False])
elif user_grouping_method == "interactions":
    user_profile_pop_df = user_profile_pop_df.sort_values(['profile_size'], ascending=False)
else:
    # Fail loudly: an unknown method would otherwise leave the table unsorted
    # and the groups below would be cut from an arbitrary order.
    raise ValueError(
        f"Unknown user_grouping_method {user_grouping_method!r}; "
        "expected 'interactions' or 'popular_consumption'"
    )

In [15]:
# Number of users in the ordered profile table
num_user = user_profile_pop_df.shape[0]
# Size of the advantaged group, taken from the table that is actually split
# below (not from a count computed in an earlier cell); int() drops any
# fractional part
num_top_users = int(user_top_fraction * num_user)
num_top_users, num_user

(108, 2170)

In [16]:
# Split the ordered table: the first num_top_users rows are the advantaged
# users, every remaining row is a disadvantaged user.
advantaged_users = user_profile_pop_df.iloc[:num_top_users]
disadvantaged_users = user_profile_pop_df.iloc[num_top_users:]

In [17]:
# Save the user ids of each group, one id per line.
# Opening, writing and closing happen in a single cell with context managers,
# so no file handle stays open if a step fails or a cell is skipped.
# NOTE(review): the 'users/2' folder and the 'XXX' suffix look like placeholders
# for the grouping setup — confirm the intended output names.
with open(f'datasets/{ds_name}/groups/users/2/active_idsXXX.txt', 'w') as active_users:
    active_users.writelines(f'{uid!s}\n' for uid in advantaged_users['uid'])

with open(f'datasets/{ds_name}/groups/users/2/inactive_idsXXX.txt', 'w') as inactive_users:
    inactive_users.writelines(f'{uid!s}\n' for uid in disadvantaged_users['uid'])