In [1]:
import itertools as it
from timeit import default_timer as timer

import pandas as pd
import requests as rq
from IPython.display import display


In [8]:
# Parameters
# Path to the user table; it is loaded with pd.read_json(..., orient="table").
users = "../data/users.json"
# Host name used to build every request URL.
host = "localhost"

# Label of the hash function under test; it tags result rows and names the output files below.
hash_function = "murmur2"
# Maps partition id -> port that is queried for that partition.
partition_port = {"0": "5001", "1": "5002"}

# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
repeat_for_user = 10

# Request parameter grids; their cross product (per partition) becomes `configurations`.
limits = [10]
walks = [1000]
walk_length = [100]

# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
should_repeat_for_user = True


# Switches for the optional post-processing cells (each cell is guarded by `if <flag>:`).
union_results = True
highest_hit = True
most_interactions = True


# Output locations; every file name is prefixed with the hash function label.
output_latency = f"../output/{hash_function}_latency.csv"
output_recommendations = f"../data/{hash_function}_recommendations.json"
output_union_results = f"../data/{hash_function}_union_results.json"
output_highest_hit = f"../data/{hash_function}_highest_hit.json"
output_most_interactions = f"../data/{hash_function}_most_interactions.json"

In [9]:
# Load the user table from disk
user_df = pd.read_json(users, orient="table")

# Enumerate every (hash function, partition entry, walks, walk length, limit)
# combination; the rightmost grid varies fastest.
configurations = [
    (hash_function, partition_entry, n_walks, n_steps, top_k)
    for partition_entry in partition_port.items()
    for n_walks in walks
    for n_steps in walk_length
    for top_k in limits
]
configurations


[('murmur2', ('0', '5001'), 1000, 100, 10),
 ('murmur2', ('1', '5002'), 1000, 100, 10)]

In [10]:
def get_recommendations(user_df, walks, walk_length, limit, partition_port):
    """Fetch recommendations for every user from one partition's service.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Must contain a ``user_id`` column; one request is issued per entry.
    walks, walk_length, limit
        Passed through verbatim as the ``walks``, ``walk_length`` and ``limit``
        query parameters of the ``/recommendation/salsa/<user>`` endpoint.
    partition_port : sequence
        ``(partition, port)`` pair identifying the service to query.

    Returns
    -------
    pandas.DataFrame
        One row per user with columns ``hash_function``, ``partition``,
        ``user``, ``recommendations`` (the ``id`` of each response item) and
        ``hits`` (the ``hit`` of each response item). Users whose request never
        returned HTTP 200 get empty lists.

    Notes
    -----
    Reads the module-level ``host`` and ``hash_function`` values.
    """
    partition, port = partition_port[0], partition_port[1]
    print(f"gathering recommendations from port: {port} and partition {partition} hash_function: {hash_function}")

    rows = []
    # Iterate the column directly instead of DataFrame.iterrows(): iterrows()
    # builds one Series per row and coerces all columns to a common dtype, which
    # can turn an integer user_id into a float and corrupt the URL ("123.0").
    for user in user_df["user_id"]:
        url = f"http://{host}:{port}/recommendation/salsa/{user}?walks={walks}&walk_length={walk_length}&limit={limit}"
        response = rq.get(url)
        counter = 0
        # One initial attempt plus up to nine retries (ten requests in total).
        while response.status_code != 200 and counter != 9:
            print(f"Got response code {response.status_code} trying again {counter + 1}/10...")
            response = rq.get(url)
            counter += 1

        # A request that still fails after all retries yields an empty result.
        response_body = response.json() if response.status_code == 200 else []
        recommendations = [res['id'] for res in response_body]
        hits = [res['hit'] for res in response_body]
        rows.append((hash_function, partition, user, recommendations, hits))

    return pd.DataFrame(rows, columns=["hash_function", "partition", "user", "recommendations", "hits"])

In [11]:
# Perform API requests
result_dfs = []
latency_rows = []

# The loop variables carry a `cfg_` prefix on purpose: reusing the names
# `hash_function`, `partition_port`, `walks` and `walk_length` here would rebind
# the configuration globals (e.g. `partition_port` would become a tuple), so the
# configuration cell could no longer be re-run without restarting the kernel.
for (cfg_hash_function, cfg_partition_port, cfg_walks, cfg_walk_length, cfg_limit) in configurations:
    start = timer()
    result = get_recommendations(user_df, cfg_walks, cfg_walk_length, cfg_limit, cfg_partition_port)
    end = timer()
    partition_number = cfg_partition_port[0]
    latency = end - start
    latency_rows.append((cfg_hash_function, partition_number, "recommendation fetch", latency))
    result_dfs.append(result)


# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each per-partition index.
result_df = pd.concat(result_dfs, ignore_index=True)
display(result_df[["user", "recommendations", "hits", "partition"]])
result_df.to_json(output_recommendations, index=False, orient="table")

gathering recommendations from port: 5001 and partition 0 hash_function: murmur2
gathering recommendations from port: 5002 and partition 1 hash_function: murmur2


KeyboardInterrupt: 

In [None]:
def merge_sort_rec_hit(row, limit=10):
    """Reduce a user's concatenated partition results to the top ``limit`` items.

    Parameters
    ----------
    row : mapping
        Must provide ``"recommendations"`` and ``"hits"`` as parallel sequences
        (``hits[i]`` belongs to ``recommendations[i]``).
    limit : int, optional
        Maximum number of recommendations to keep (default 10).

    Returns
    -------
    tuple[list, list]
        ``(recommendations, hits)`` ordered by hit count, highest first, with
        duplicate recommendation ids removed and at most ``limit`` entries.

    Notes
    -----
    Every row goes through the same de-duplicate / sort / truncate path. The
    earlier shortcut that returned rows of exactly ten items untouched could
    hand back unsorted results that still contained duplicate ids.
    """
    # dict(zip(...)) collapses duplicate ids; the hit count of the LAST
    # occurrence wins.
    # NOTE(review): confirm whether duplicates should instead be summed or take
    # the maximum hit count.
    hits_by_rec = dict(zip(row["recommendations"], row["hits"]))
    # sorted() is stable, so ties keep their first-seen order.
    top = sorted(hits_by_rec.items(), key=lambda item: item[1], reverse=True)[:limit]
    return [rec for rec, _ in top], [hit for _, hit in top]

In [None]:
# Merge partition results
if union_results:
    start = timer()
    # 'sum' on list-valued columns concatenates each user's per-partition lists.
    merge_df = result_df.groupby("user", as_index=False).agg({'recommendations': 'sum', 'hits': 'sum'})
    merge_df["hash_function"] = hash_function
    merge_df["partition"] = "union results"
    # Length of the concatenated list, recorded before merge_sort_rec_hit trims it.
    merge_df["len_rec"] = merge_df["recommendations"].map(len)
    pd.set_option('display.max_colwidth', None)
    merge_df[["recommendations", "hits"]] = merge_df.apply(merge_sort_rec_hit, axis=1,
                                                           result_type="expand")

    end = timer()
    latency = end - start
    # Label this measurement as the union merge; it was previously recorded as
    # "highest hit", colliding with the row written by the highest-hit cell.
    latency_rows.append((hash_function, -1, "union results", latency))
    # Save results
    merge_df.to_json(output_union_results, index=False, orient="table")

In [None]:
# For every user keep only the rows of the partition whose first (top) hit
# count is the highest among all partitions.
if highest_hit:
    start = timer()
    res = []
    for _user, df_group in result_df.groupby("user"):
        highest_hit_count = 0
        # None means "no partition returned a positive hit for this user".
        partition_to_take = None
        for partition, hits in zip(df_group["partition"], df_group["hits"]):
            if len(hits) == 0:
                continue
            # NOTE(review): only the first hit is inspected — this assumes the
            # service returns hits sorted in descending order; confirm.
            if hits[0] > highest_hit_count:
                highest_hit_count = hits[0]
                partition_to_take = partition
        if partition_to_take is None:
            # Users without any hit contribute no rows; an empty slice is still
            # appended so pd.concat always receives one frame per user.
            res.append(df_group.iloc[0:0])
        else:
            res.append(df_group[df_group["partition"] == partition_to_take])

    best_partition_df = pd.concat(res, ignore_index=True)
    best_partition_df["partition"] = "highest hit"
    end = timer()
    latency = end - start
    latency_rows.append((hash_function, -1, "highest hit", latency))
    best_partition_df.to_json(output_highest_hit, index=False, orient="table")


In [None]:
def get_degree(user_df, port):
    """Query the degree of every user from the service listening on ``port``.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Must contain a ``user_id`` column; one request is issued per entry.
    port
        Port of the service; combined with the module-level ``host``.

    Returns
    -------
    list
        One entry per user, in ``user_df`` order: the decoded JSON body of the
        ``/status/degree/left-index/<user>`` response, or ``0`` when the request
        never returned HTTP 200.
    """
    degrees = []
    # Iterate the column directly instead of DataFrame.iterrows(): iterrows()
    # builds one Series per row and coerces all columns to a common dtype, which
    # can turn an integer user_id into a float and corrupt the URL ("123.0").
    for user in user_df["user_id"]:
        url = f"http://{host}:{port}/status/degree/left-index/{user}"
        response = rq.get(url)
        counter = 0
        # One initial attempt plus up to nine retries (ten requests in total).
        while response.status_code != 200 and counter != 9:
            print(f"Got response code {response.status_code} trying again {counter + 1}/10...")
            response = rq.get(url)
            counter += 1
        degree = response.json() if response.status_code == 200 else 0
        degrees.append(degree)
    return degrees

In [12]:
# For every user keep only the rows of the partition that reports the highest
# degree for that user.
if most_interactions:
    start = timer()
    degrees = []
    # Underscore-prefixed throwaway names keep this loop from rebinding the
    # configuration globals (`hash_function`, `partition_port`, `walks`, ...).
    # Degrees are collected in the same configuration/user order in which
    # result_df was built, so the positional assignment below lines up.
    for _hash, (_partition, port), _walks, _walk_length, _limit in configurations:
        degrees.extend(get_degree(user_df, port))
    result_df["degree"] = degrees
    # Re-export the recommendations file so that it includes the degree column.
    result_df.to_json(output_recommendations, index=False, orient="table")
    res = []
    for _user, df_group in result_df.groupby("user"):
        highest_degree = 0
        # None means "no partition reported a positive degree for this user".
        partition_to_take = None
        for partition, degree in zip(df_group["partition"], df_group["degree"]):
            if degree > highest_degree:
                highest_degree = degree
                partition_to_take = partition
        if partition_to_take is None:
            # Users without a positive degree contribute no rows; an empty slice
            # is still appended so pd.concat always receives one frame per user.
            res.append(df_group.iloc[0:0])
        else:
            res.append(df_group[df_group["partition"] == partition_to_take])

    best_partition_df = pd.concat(res, ignore_index=True)
    best_partition_df["partition"] = "most interactions"
    end = timer()
    latency = end - start
    latency_rows.append((hash_function, -1, "most interactions", latency))
    best_partition_df.to_json(output_most_interactions, index=False, orient="table")


NameError: name 'user_df' is not defined

In [None]:
# Collect all timing measurements into one table and write it out as CSV.
latency_columns = ["hash_function", "partition", "functionality", "latency"]
latency_df = pd.DataFrame(data=latency_rows, columns=latency_columns)

latency_df.to_csv(path_or_buf=output_latency, index=False)
