# User Groups

Creating user groups according the popularity of items in their profile

In [1]:
# import required packages
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

In [2]:
# config
ds_name = 'MovieLens100K'

In [3]:
# loading the Book-Crossing dataset
dataset = pd.read_csv(f"datasets/{ds_name}/{ds_name}_train.txt", sep="\t", names=['uid', 'iid', 'rating'])

In [4]:
dataset

Unnamed: 0,uid,iid,rating
0,0,0,4
1,0,4,5
2,0,18,4
3,0,22,3
4,0,24,5
...,...,...,...
69495,942,351,3
69496,942,368,4
69497,942,638,4
69498,942,838,1


In [5]:
# get user distribution
user_dist = dataset['uid'].value_counts()
num_users = len(user_dist)
print('No. users: ' + str(num_users))
print('Mean items per user: ' + str(user_dist.mean()))
print('Min items per user: ' + str(user_dist.min()))
print('Max items per user: ' + str(user_dist.max()))

No. users: 943
Mean items per user: 73.70095440084836
Min items per user: 10
Max items per user: 461


In [6]:
# get item distribution
item_dist = dataset['iid'].value_counts()
num_items = len(item_dist)
print('No. items: ' + str(num_items))

No. items: 1349


In [7]:
item_dist

29      411
11      356
233     352
113     338
241     333
       ... 
1333      2
1348      2
1343      2
1309      1
1345      1
Name: iid, Length: 1349, dtype: int64

In [8]:
# get top items
top_fraction = 0.2
num_top = int(top_fraction * num_items)
top_item_dist = item_dist[:num_top]
print('No. top items: ' + str(num_top))

No. top items: 269


In [9]:
top_item_set = set(top_item_dist.keys())

In [10]:
user_items = defaultdict(set)
for eachline in tqdm(dataset.iterrows()):
    uid, iid, rating = eachline[1][0], eachline[1][1], eachline[1][2] 
    uid, iid = int(uid), int(iid)
    if uid in user_items.keys():
        user_items[uid].add(iid)
    else:
        user_items[uid] = {iid}

69500it [00:08, 8142.88it/s]


In [11]:
user_pop_item_ratio = {}

for user, items in tqdm(user_items.items()):
    if user not in user_pop_item_ratio.keys():
        user_pop_item_ratio[user] = (len(set(items) & set(top_item_set))) / len(set(items))

100%|██████████| 943/943 [00:00<00:00, 71084.79it/s]


In [12]:
sorted_user_pop_item_ratio = {k: v for k, v in sorted(user_pop_item_ratio.items(), key=lambda item: item[1], reverse=True)}

In [13]:
num_user = len(sorted_user_pop_item_ratio)
num_top_users = int(top_fraction * num_users)
num_top_users

188

In [14]:
# Files to save users ids based on the number of checkins
inactive_users = open(f'datasets/{ds_name}/groups/users/inactive_ids.txt', 'w')
active_users = open(f'datasets/{ds_name}/groups/users/active_ids.txt', 'w')

8

In [15]:
for uid in list(sorted_user_pop_item_ratio.keys())[:num_top_users]:
    active_users.write(str(uid) + '\n')
active_users.close()

In [16]:
for uid in list(sorted_user_pop_item_ratio.keys())[num_top_users:]:
    inactive_users.write(str(uid) + '\n')
inactive_users.close()