In [6]:
import itertools as it
import pandas as pd
import requests as rq
from IPython.display import display

In [7]:
# Parameters
# Every value a reader may want to tune for a run is defined in this cell.

# Path of the users file; it is loaded with pd.read_json(..., orient="table").
users = "../data/users.json"
# Host name used to build the recommendation request URLs.
host = "localhost"

# Label stored in the "hash_function" column of the results and embedded in
# the output file names below.
hash_function = "murmur2"
# Partition id -> port on `host` that is queried for that partition.
partition_port = {"0": "5000", "1": "5001"}
# Candidate values for the `limit`, `walks` and `walk_length` query
# parameters; one request configuration is built per combination.
limits = [10]
walks = [1000]
walk_length = [100]

# Enables the cell that merges the per-partition results of each user.
should_merge = True
# Enables the cell that keeps, per user, only the partition with the highest top hit.
should_take_best_partition = True

# Destination files for the raw, merged and best-partition results.
output_recommendations = f"../data/{hash_function}_recommendations.json"
output_merge_recommendations = f"../data/{hash_function}_merge_recommendations.json"
output_best_partition_recommendations = f"../data/{hash_function}_best_partition_recommendations.json"

In [8]:
# Load the user table that drives the requests
user_df = pd.read_json(users, orient="table")

# One axis per request parameter; the cross product of the axes yields every
# (hash_function, (partition, port), walks, walk_length, limit) combination.
parameter_axes = ([hash_function], partition_port.items(), walks, walk_length, limits)
configurations = [*it.product(*parameter_axes)]
configurations


[('murmur2', ('0', '5000'), 1000, 100, 10),
 ('murmur2', ('1', '5001'), 1000, 100, 10)]

In [11]:
def get_recommendations(user_df, walks, walk_length, limit, partition_port, timeout=None, max_attempts=10):
    """Request salsa recommendations for every user from one partition's endpoint.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Frame with a ``user_id`` column; one request is issued per row.
    walks, walk_length, limit
        Forwarded unchanged as the ``walks``, ``walk_length`` and ``limit``
        query parameters of the request.
    partition_port : sequence
        ``(partition, port)`` pair: element 0 is stored in the ``partition``
        column of the result, element 1 is the port that is queried.
    timeout : float or tuple, optional
        Passed to ``requests.get``. The default ``None`` keeps the previous
        behaviour of waiting indefinitely for a response; when set, a
        timeout raised by ``requests`` propagates to the caller.
    max_attempts : int, optional
        Total number of requests sent per user (first try included) before
        giving up. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        Columns ``hash_function``, ``partition``, ``user``,
        ``recommendations`` and ``hits``. A user whose request never returned
        status 200 gets empty ``recommendations`` and ``hits`` lists.

    Notes
    -----
    ``host`` and ``hash_function`` are read from the notebook's global
    parameters, not from the arguments.
    """
    rows = []
    partition = partition_port[0]
    port = partition_port[1]
    print(f"gathering recommendations from port: {port} and partition {partition} hash_function: {hash_function}")

    # Iterate the column directly: DataFrame.iterrows() builds one Series per
    # row and does not preserve dtypes, so 64-bit ids could be upcast to float
    # (and lose precision) if the frame also holds other numeric columns.
    for user in user_df["user_id"]:
        url = f"http://{host}:{port}/recommendation/salsa/{user}?walks={walks}&walk_length={walk_length}&limit={limit}"
        response = rq.get(url, timeout=timeout)

        # Bounded retry: at most `max_attempts - 1` additional requests.
        for retry in range(1, max_attempts):
            if response.status_code == 200:
                break
            print(f"Got response code {response.status_code} trying again {retry}/{max_attempts}...")
            response = rq.get(url, timeout=timeout)

        # A request that never succeeded contributes an empty result row.
        response_body = response.json() if response.status_code == 200 else []
        recommendations = [res['id'] for res in response_body]
        hits = [res['hit'] for res in response_body]
        rows.append((hash_function, partition, user, recommendations, hits))

    return pd.DataFrame(rows, columns=["hash_function", "partition", "user", "recommendations", "hits"])

In [10]:
# Perform API requests
# The loop variables are prefixed with `run_` on purpose: unpacking into
# `hash_function`, `partition_port`, `walks` and `walk_length` would overwrite
# the notebook parameters of the same name (e.g. `partition_port` would become
# a tuple and `walks` an int), so the configuration cell could not be re-run.
# `run_hash_function` is unpacked but unused: every configuration carries the
# global `hash_function`, which get_recommendations reads directly.
result_dfs = []

for (run_hash_function, run_partition_port, run_walks, run_walk_length, limit) in configurations:
    result = get_recommendations(user_df, run_walks, run_walk_length, limit, run_partition_port)
    result_dfs.append(result)

# One frame holding the rows of all partitions (per-partition row index is kept).
result_df = pd.concat(result_dfs)
display(result_df[["user", "recommendations", "hits", "partition"]])
result_df.to_json(output_recommendations, index=False, orient="table")

gathering recommendations from port: 5000 and partition 0 hash_function: murmur2
gathering recommendations from port: 5001 and partition 1 hash_function: murmur2


Unnamed: 0,user,recommendations,hits,partition
0,1046085619221331968,"[1336366193628700673, 765265513366839297, 1368...","[140, 127, 108, 102, 97, 97, 91, 87, 79, 75]",0
1,51353013,"[1367126296812285953, 1366503802648502275, 136...","[403, 224, 212, 209, 172, 169, 154, 146, 135, ...",0
2,1673222870,"[1342934467791486978, 1367011871459594240, 136...","[2154, 1357, 1147, 920, 892, 795, 756, 746, 73...",0
3,89242811,"[1336681326414614534, 1367305184628649985, 133...","[158, 125, 102, 97, 95, 88, 86, 85, 82, 80]",0
4,2996770824,"[1367359188159012864, 1367408947208986632, 136...","[576, 560, 549, 521, 512, 445, 407, 406, 386, ...",0
...,...,...,...,...
495,823920456008863749,"[1364557463136002050, 1365356146236661765, 135...","[265, 228, 217, 213, 211, 207, 203, 196, 182, ...",1
496,1333870645390815238,"[1349174763260932101, 1365987740504924161, 136...","[655, 530, 522, 481, 475, 458, 457, 453, 436, ...",1
497,1365992959380840449,[],[],1
498,1039973371516215296,[],[],1


In [1]:
def merge_sort_rec_hit(row, limit=10):
    """Return the ``limit`` recommendations with the most hits from a merged row.

    Parameters
    ----------
    row : mapping
        Must provide ``row["recommendations"]`` and ``row["hits"]``: two
        parallel sequences holding the concatenated results of all partitions.
    limit : int, optional
        Maximum number of recommendations to keep. Defaults to 10.

    Returns
    -------
    tuple of (list, list)
        Recommendation ids and their hit counts, ordered by hits descending.
        Entries with equal hits keep their original relative order.

    Notes
    -----
    The row is always ranked. A length equal to ``limit`` does not prove the
    input is a single, already sorted partition result: two partitions
    contributing e.g. 5 entries each would otherwise be returned unsorted.
    """
    # A recommendation id that occurs more than once is kept once, with the
    # hit count of its last occurrence.
    # NOTE(review): confirm whether duplicates should rather be summed or
    # take the maximum hit count.
    hits_by_rec = dict(zip(row["recommendations"], row["hits"]))
    ranked = sorted(hits_by_rec.items(), key=lambda item: item[1], reverse=True)[:limit]
    return [rec for rec, _hit in ranked], [hit for _rec, hit in ranked]

In [7]:
# Optional reload of the saved raw results instead of using the in-memory frame:
# result_df = pd.read_json(output_recommendations, orient="table")

# Combine the results of all partitions into one row per user
if should_merge:
    # "sum" over list-valued columns concatenates the per-partition lists.
    list_concat = {'recommendations': 'sum', 'hits': 'sum'}
    merge_df = result_df.groupby("user", as_index=False).agg(list_concat)

    merge_df["hash_function"] = hash_function
    merge_df["partition"] = "merged"
    merge_df["len_rec"] = merge_df.apply(lambda merged_row: len(merged_row["recommendations"]), axis=1)

    pd.set_option('display.max_colwidth', None)

    # Replace the concatenated lists with the re-ranked recommendations/hits.
    ranked = merge_df.apply(merge_sort_rec_hit, axis=1, result_type="expand")
    merge_df[["recommendations", "hits"]] = ranked

    # Persist the merged results
    merge_df.to_json(output_merge_recommendations, index=False, orient="table")

In [15]:
# result_df = pd.read_json(output_recommendations, orient="table")
# For every user keep only the rows of the partition whose first (top) hit
# count is the highest among that user's partitions.
if should_take_best_partition:
    res = []
    for group_name, df_group in result_df.groupby("user"):
        highest_hit = 0
        # Fall back to the user's first partition. The previous default was
        # the integer 0, which never equals the string partition ids, so a
        # user without hits in any partition was silently dropped.
        partition_to_take = df_group["partition"].iloc[0]
        for _idx, row in df_group.iterrows():
            # A partition that returned nothing cannot be the best one.
            if len(row["hits"]) == 0:
                continue
            # Only the first hit count of each partition is compared.
            hit = row["hits"][0]
            # Strict comparison: on a tie the earlier partition is kept.
            if hit > highest_hit:
                highest_hit = hit
                partition_to_take = row["partition"]
        res.append(df_group[df_group["partition"] == partition_to_take])

    best_partition_df = pd.concat(res, ignore_index=True)
    best_partition_df["partition"] = "merged 2"
    best_partition_df.to_json(output_best_partition_recommendations, index=False, orient="table")


KeyboardInterrupt: 