# User Grouping Methods

In [1]:
# import required packages
import os
from collections import defaultdict

import pandas as pd
from tqdm import tqdm

## 1. Setup

In [2]:
# config
ds_name = 'MovieLensSmall'
is_implicit = False
item_top_fraction = 0.2
user_top_fraction = 0.2 # 0.05 or 0.2
user_bottom_fraction = 0.2 # 0.05 or 0.2
# User grouping method: users can be grouped by (1) their number of
# interactions or (2) their consumption of popular items.
#   - 'interactions': the top 5% of users (user_top_fraction = 0.05) are
#     considered advantaged.
#   - 'popular_consumption': the top 20% of users (user_top_fraction = 0.2) are
#     considered advantaged.
# NOTE(review): the values above pair 'interactions' with a fraction of 0.2,
# which differs from the 0.05 suggested here - confirm this is intended.
user_grouping_method = 'interactions'  # either 'interactions' or 'popular_consumption'

In [3]:
# Validate the configured user fractions before any grouping happens.
# Each fraction must lie in [0, 1]; if together they exceed 1 the top and
# bottom user groups would overlap, so fail loudly instead of continuing.
if not (0 <= user_top_fraction <= 1 and 0 <= user_bottom_fraction <= 1):
    raise ValueError(
        f"user_top_fraction ({user_top_fraction}) and user_bottom_fraction "
        f"({user_bottom_fraction}) must both be between 0 and 1."
    )
# small tolerance so floating-point round-off on a sum of exactly 1 does not raise
if user_top_fraction + user_bottom_fraction > 1 + 1e-9:
    raise ValueError(
        f"user_top_fraction + user_bottom_fraction = "
        f"{user_top_fraction + user_bottom_fraction} exceeds 1; "
        "the top and bottom user groups would overlap."
    )
if user_top_fraction + user_bottom_fraction < 1:
    print("The precentages are coorect!")

The precentages are coorect!


## 2. Load Train Data

In [4]:
# Read the tab-separated training interactions of the configured dataset.
# The file has no header row, so the column names are supplied explicitly.
dataset = pd.read_csv(
    f"datasets/{ds_name}/{ds_name}_train.txt",
    sep="\t",
    names=['uid', 'iid', 'count'],
)

In [5]:
# Preview the first five rows of the loaded training data
dataset.head(5)

Unnamed: 0,uid,iid,count
0,0,0,5
1,0,5,5
2,0,7,3
3,0,8,4
4,0,12,4


### Characteristics

In [6]:
# Number of training rows per user (one entry per distinct uid)
user_dist = dataset['uid'].value_counts()
num_users = len(user_dist)

# Summary statistics of the per-user profile sizes
print(f'No. users: {num_users}')
print(f'Mean items per user: {user_dist.mean()}')
print(f'Min items per user: {user_dist.min()}')
print(f'Max items per user: {user_dist.max()}')

No. users: 610
Mean items per user: 132.24262295081968
Min items per user: 13
Max items per user: 2160


## 3. Read User and Item Profile
Read user and item interactions profile

In [7]:
# A dictionary mapping each user id to the list of item ids in that user's profile.
# It starts empty; missing keys default to an empty list (defaultdict(list)).
user_interactions = defaultdict(list)

In [8]:
# Read the popularity of items from an implicit-feedback dataset
def read_implicit_item_popularity():
    """Compute item popularity and user profiles from implicit feedback.

    Iterates over the module-level ``dataset`` frame (columns ``uid``,
    ``iid`` and ``count``). For every row, ``count`` is added to the
    popularity of ``iid`` and ``iid`` is appended ``count`` times to the
    profile of ``uid``.

    Side effects:
        Rebuilds the module-level ``user_interactions`` mapping in place. It
        is emptied first, so calling this function again (e.g. re-running the
        cell) does not duplicate profile entries.

    Returns:
        dict: item id -> summed interaction count over all rows.
    """
    print("Reading implicit item popularity ...")
    # Start from a clean mapping so repeated calls are idempotent.
    user_interactions.clear()
    items_freq = dict()
    rows = zip(dataset['uid'], dataset['iid'], dataset['count'])
    for uid, iid, count in tqdm(rows, total=len(dataset)):
        uid, iid, count = int(uid), int(iid), int(count)
        # In implicit feedback an item is repeated once per interaction.
        # Rows with a non-positive count add nothing to the user's profile.
        if count > 0:
            user_interactions.setdefault(uid, []).extend([iid] * count)
        # Accumulate the interaction frequency (implicit) of the item
        items_freq[iid] = items_freq.get(iid, 0) + count
    return items_freq

In [9]:
# Read the popularity of items from an explicit-feedback dataset
def read_explicit_item_popularity():
    """Compute item popularity and user profiles from explicit feedback.

    Iterates over the module-level ``dataset`` frame (columns ``uid``,
    ``iid`` and ``count``). Every row counts as exactly one interaction: the
    popularity of ``iid`` is increased by one and ``iid`` is appended once to
    the profile of ``uid``. The value in ``count`` is converted to ``int`` but
    otherwise not used.

    Side effects:
        Rebuilds the module-level ``user_interactions`` mapping in place. It
        is emptied first, so calling this function again (e.g. re-running the
        cell) does not duplicate profile entries.

    Returns:
        dict: item id -> number of rows that reference the item.
    """
    print("Reading explicit item popularity ...")
    # Start from a clean mapping so repeated calls are idempotent.
    user_interactions.clear()
    items_freq = dict()
    rows = zip(dataset['uid'], dataset['iid'], dataset['count'])
    for uid, iid, count in tqdm(rows, total=len(dataset)):
        # int(count) is kept so malformed values still fail here, as before.
        uid, iid, count = int(uid), int(iid), int(count)
        # Add the item to the user's profile
        user_interactions.setdefault(uid, []).append(iid)
        # Count one interaction for the item
        items_freq[iid] = items_freq.get(iid, 0) + 1
    return items_freq

In [10]:
# Pick the reader that matches the feedback type: implicit feedback sums the
# interaction counts, explicit feedback counts one interaction per row.
item_freq = (
    read_implicit_item_popularity()
    if is_implicit
    else read_explicit_item_popularity()
)
num_items = len(item_freq)
print(f'No. items: {num_items}')

29578it [00:00, 295769.00it/s]

Reading explicit item popularity ...


80668it [00:00, 330684.16it/s]

No. items: 8974





## 4. Item Grouping

In [11]:
# Number of items that form the short head, as a fraction of all items
num_top = int(item_top_fraction * num_items)
# Items ordered from most to least received interactions
sorted_item_freq = dict(sorted(item_freq.items(), key=lambda pair: pair[1], reverse=True))
# Short head: the num_top most popular items
short_heads = list(sorted_item_freq)[:num_top]
# Long tail: every remaining item
long_tails = list(sorted_item_freq)[num_top:]
print(f'No. top items: {num_top}')

No. top items: 1794


In [12]:
# Files to save item ids based on the popularity of items (no. of interactions)
# shorthead_items = open(f'datasets/{ds_name}/groups/items/020/shorthead_items.txt', 'w')
# longtail_items = open(f'datasets/{ds_name}/groups/items/020/longtail_items.txt', 'w')

# for iid in short_heads:
#     shorthead_items.write(str(iid) + '\n')
# shorthead_items.close()

# for iid in long_tails:
#     longtail_items.write(str(iid) + '\n')
# longtail_items.close()

## 5. User Grouping

In [13]:
# Empty frame with one column each for the user id, the number of popular
# (short-head) items in the user's profile, and the profile size.
user_profile_pop_df = pd.DataFrame(columns=['uid', 'pop_count', 'profile_size'])

In [14]:
# For every user, count how many distinct short-head (popular) items appear in
# the profile and record the profile size (number of stored interactions).
short_head_set = set(short_heads)  # built once instead of twice per user
user_profile_pop = {}
profile_records = []

for user, items in tqdm(user_interactions.items()):
    pop_count = len(set(items) & short_head_set)
    user_profile_pop[user] = pop_count
    profile_records.append({'uid': user, 'pop_count': pop_count, 'profile_size': len(items)})

# Build the frame in one step: DataFrame.append was removed in pandas 2.0 and
# growing a frame row by row is quadratic. Recreating the frame here also keeps
# the cell idempotent (re-running it no longer stacks duplicate rows).
user_profile_pop_df = pd.DataFrame(profile_records, columns=['uid', 'pop_count', 'profile_size'])

100%|██████████| 610/610 [00:01<00:00, 445.49it/s]


In [15]:
# Preview the first rows of the per-user popularity/profile-size table
user_profile_pop_df.head()

Unnamed: 0,uid,pop_count,profile_size
0,0,370,588
1,1,91,96
2,2,755,1674
3,3,26,27
4,4,21,23


In [16]:
# Order the users according to the chosen grouping method so that the top
# users come first:
#   - popular_consumption: by number of popular items, ties broken by profile size
#   - interactions: by profile size
if user_grouping_method == "popular_consumption":
    user_profile_pop_df = user_profile_pop_df.sort_values(['pop_count', 'profile_size'], ascending=[False, False])
elif user_grouping_method == "interactions":
    user_profile_pop_df = user_profile_pop_df.sort_values(['profile_size'], ascending=False)
else:
    # An unknown method would otherwise leave the users unsorted and make the
    # top/middle/bottom split below meaningless.
    raise ValueError(
        f"Unknown user_grouping_method {user_grouping_method!r}; "
        "expected 'popular_consumption' or 'interactions'."
    )

In [17]:
# Group sizes are derived from the number of rows in the frame that is split
# into groups, so the counts always match the frame being sliced.
num_user = user_profile_pop_df.shape[0]
num_top_users = int(user_top_fraction * num_user) # no. of top users based on user_top_fraction
num_bottom_users = int(user_bottom_fraction * num_user) # no. of bottom users based on user_bottom_fraction
print(f"No. of users: {num_user}, no. of top users {num_top_users}, and no. of bottom users {num_bottom_users}.")

No. of users: 610, no. of top users 122, and no. of bottom users 122.


In [18]:
# Split the ordered users into three consecutive slices:
# the first num_top_users rows, the last num_bottom_users rows, and the rest.
top_users = user_profile_pop_df.head(n=num_top_users)
middle_users = user_profile_pop_df.iloc[num_top_users:len(user_profile_pop_df) - num_bottom_users]
bottom_users = user_profile_pop_df.tail(n=num_bottom_users)

In [20]:
def _total_profile_size(users_df):
    """Return the summed profile length of all users listed in ``users_df``.

    The ``uid`` column is read by name; each profile length is looked up in
    the module-level ``user_interactions`` mapping.
    """
    # Access the column by label: integer keys on a label-indexed row Series
    # (row[0]) rely on a positional fallback that pandas has deprecated.
    return sum(len(user_interactions[int(uid)]) for uid in users_df['uid'])


len_top_profile = _total_profile_size(top_users)
len_middle_profile = _total_profile_size(middle_users)
len_bottom_profile = _total_profile_size(bottom_users)

print(f"No. of TOP users: {top_users.shape[0]} - Sum of TOP users profile size: {len_top_profile}")
print(f"No. of MIDDLE users: {middle_users.shape[0]} - Sum of MIDDLE users profile size: {len_middle_profile}")
print(f"No. of BOTTOM users: {bottom_users.shape[0]} - Sum of BOTTOM users profile size: {len_bottom_profile}")

No. of TOP users: 122 - Sum of TOP users profile size: 52763
No. of MIDDLE users: 366 - Sum of MIDDLE users profile size: 25517
No. of BOTTOM users: 122 - Sum of BOTTOM users profile size: 2388


In [21]:
# Save the user ids of each group to its own text file, one id per line.
# top users (advantaged) -> popular_users.txt
# middle users           -> diverse_users.txt
# bottom users           -> unpopular_users.txt
user_groups_dir = f'datasets/{ds_name}/user-groups/2'
# Create the output directory if it is missing so that open() does not fail
os.makedirs(user_groups_dir, exist_ok=True)

for group_df, file_name in (
    (top_users, 'popular_users.txt'),
    (middle_users, 'diverse_users.txt'),
    (bottom_users, 'unpopular_users.txt'),
):
    # `with` closes the file even if writing raises, so no handle is left
    # open across cells.
    with open(f'{user_groups_dir}/{file_name}', 'w') as group_file:
        for uid in group_df['uid']:
            group_file.write(str(uid) + '\n')