In [1]:
import json
import random
from collections import defaultdict

In [2]:
# Root directory of the original datasets. The dataset file names are appended
# directly to this prefix when the files are opened, so keep the trailing slash.
root_path = "original/"

In [4]:
# Per-user set of visited locations; the loading cell stores
# (documentId, rating) tuples in each set.
user_to_location = defaultdict(set)

# Users grouped by their gender attribute.
male_users, female_users = set(), set()

# Users grouped by their age attribute (above the threshold vs. not).
aged_users, non_aged_users = set(), set()

# Age threshold used to split users on the age attribute.
age_thr = 30

# Users missing at least one attribute.
non_att_users = set()

# Ids of every user seen in the datasets.
all_users_ids = set()

In [5]:
# Load every request of the datasets into the user-level structures:
# all_users_ids, the gender/age groups, non_att_users and user_to_location.
# Each non-blank line of a dataset file is parsed as one JSON request.

# Gender spellings are compared case-insensitively after stripping whitespace.
male_labels = {'male', 'm'}
female_labels = {'female', 'f'}

for dataset_file in ['TREC2016/Phase1_requests.json', 'TREC2015/batch_requests.json']:
    with open(root_path + dataset_file, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at the end of the file)
                continue
            person = json.loads(line)['body']['person']
            user_id = person['id']
            all_users_ids.add(user_id)

            # user gender
            if 'gender' in person:
                user_gender = person['gender']
                gender_label = user_gender.strip().lower() if isinstance(user_gender, str) else None
                if gender_label in male_labels:
                    male_users.add(user_id)
                elif gender_label in female_labels:
                    female_users.add(user_id)
                else:
                    # unrecognised value: report it; the user joins neither gender group
                    print(user_gender)
            else:
                # these users do not have a gender attribute
                print("> Not gen att. for: ", user_id)
                non_att_users.add(user_id)

            # user age
            try:
                user_age = int(person['age'])
            except (KeyError, TypeError, ValueError):
                # age is missing, null or not convertible to an integer
                print("> Not age att. for: ", user_id)
                non_att_users.add(user_id)
            else:
                if user_age > age_thr:
                    aged_users.add(user_id)
                else:
                    non_aged_users.add(user_id)

            # every rated document of the user, stored as (documentId, rating)
            for preference in person['preferences']:
                user_to_location[user_id].add((preference['documentId'], preference['rating']))

> Not gen att. for:  7732
> Not age att. for:  7732
> Not gen att. for:  1502
> Not age att. for:  1502
> Not gen att. for:  11726
> Not age att. for:  11726
> Not age att. for:  16176
> Not gen att. for:  2841
> Not age att. for:  2841
> Not gen att. for:  808
> Not age att. for:  808
> Not gen att. for:  8838
> Not age att. for:  8838
> Not gen att. for:  4177
> Not age att. for:  4177
> Not gen att. for:  5322
> Not age att. for:  5322
> Not gen att. for:  6753
> Not age att. for:  6753


In [6]:
# Sanity check: users that ended up in both the male and the female group.
both_gender_users = male_users.intersection(female_users)
print(both_gender_users)

# Users of each gender group that are also flagged as missing an attribute.
for gender_group in (male_users, female_users):
    print(gender_group & non_att_users)

{'A1I6J7F9S59DB0', 'A3D6RK0BTXYEO2', 'A27BSYPO6JCB4Q'}
set()
{16176}


In [7]:
# Sanity check: users that ended up on both sides of the age threshold.
both_aged_users = aged_users.intersection(non_aged_users)
print(both_aged_users)

# Users of each age group that are also flagged as missing an attribute.
for age_group in (aged_users, non_aged_users):
    print(age_group & non_att_users)

{'A2UVZ2TKPH9KYW', 'A3D6RK0BTXYEO2', 'A1LI3JQS2VBOXC', 'A3GJ2E1A1H5DM7'}
set()
set()


In [11]:
# the number of all users in the system
len(all_users_ids)

453

In [9]:
# sizes of the male and female groups, and their sum (a user present in both
# groups is counted twice in the sum)
len(male_users), len(female_users), len(male_users) + len(female_users)

(248, 199, 447)

In [12]:
# sizes of the two age groups: users above age_thr, and users at or below it
len(aged_users), len(non_aged_users)

(226, 221)

In [14]:
# Write the ids of the male and female users to separate files, one id per line.
# Users with conflicting gender or age records, or flagged as missing an
# attribute, are left out.
excluded_users = both_gender_users | both_aged_users | non_att_users

for ids_path, gender_group in [
    ('TRECx1516/TRECx1516_male_ids.txt', male_users),
    ('TRECx1516/TRECx1516_female_ids.txt', female_users),
]:
    # `with` closes the file even if a write fails
    with open(ids_path, 'w') as ids_file:
        # sort on the string form (the ids mix ints and strings) so the file
        # content is identical from run to run
        for user_id in sorted(gender_group - excluded_users, key=str):
            ids_file.write(f"{user_id}\n")

In [15]:
# Write the ids of the aged and non-aged users to separate files, one id per line.
# Users with conflicting gender or age records, or flagged as missing an
# attribute, are left out.
excluded_users = both_gender_users | both_aged_users | non_att_users

for ids_path, age_group in [
    ('TRECx1516/TRECx1516_aged_ids.txt', aged_users),
    ('TRECx1516/TRECx1516_non_aged_ids.txt', non_aged_users),
]:
    # `with` closes the file even if a write fails
    with open(ids_path, 'w') as ids_file:
        # sort on the string form (the ids mix ints and strings) so the file
        # content is identical from run to run
        for user_id in sorted(age_group - excluded_users, key=str):
            ids_file.write(f"{user_id}\n")

In [17]:
# Split the interactions of every user into train/test files w.r.t. the split
# ratio, and also write the complete set of kept interactions to a third file.
split_ratio = 0.7
# fixed seed so that the split is the same on every run
split_seed = 42

# Users with conflicting gender or age records, or flagged as missing an
# attribute, are left out of all three files.
excluded_users = both_gender_users | both_aged_users | non_att_users
rng = random.Random(split_seed)

# `with` closes the three files even if a write fails
with open('TRECx1516/TRECx1516_all.txt', 'w') as data_all, \
        open('TRECx1516/TRECx1516_train.txt', 'w') as data_train, \
        open('TRECx1516/TRECx1516_test.txt', 'w') as data_test:
    for uid, lids_ratings in user_to_location.items():
        if uid in excluded_users:
            continue
        # The interactions are held in a set, whose iteration order is not
        # guaranteed to be the same across interpreter runs (string hashing is
        # randomised by default). Put them in a canonical order first, then
        # shuffle with the seeded generator to get a reproducible random split.
        interactions = sorted(lids_ratings, key=lambda pair: (str(pair[0]), str(pair[1])))
        rng.shuffle(interactions)

        # interactions at positions below this bound go to train, the rest to test
        train_rate = len(interactions) * split_ratio
        for position, (lid, rating) in enumerate(interactions):
            record = f"{uid}\t{lid}\t{rating}\n"
            data_all.write(record)
            if position >= train_rate:
                data_test.write(record)
            else:
                data_train.write(record)