In [17]:
import itertools as it
import pandas as pd
import requests as rq
from IPython.display import display

In [18]:
# Parameters
# Path of the users file; it is loaded with pd.read_json(..., orient="table").
users = "../data/users.json"
# Host name inserted into every request URL.
host = "localhost"

# Label stored in the result frames and embedded in the output file names below.
hash_function = "murmur2"
# Maps a partition id to the port of the service that is queried for that partition.
partition_port = {"0": "5000", "1": "5001"}

# Number of requests issued per user by get_recommendations_on_repeat.
repeat_for_user = 10

# Value lists that are crossed into request configurations; each value is sent
# as the `limit`, `walks` and `walk_length` query parameter respectively.
limits = [10]
walks = [1000]
walk_length = [100]

# NOTE(review): no visible cell reads this flag; the fetch cell always calls
# get_recommendations. Presumably it was meant to select
# get_recommendations_on_repeat -- confirm before relying on it.
should_repeat_for_user = True

# Switches for the three merge cells: (1) concatenate and re-rank the
# per-partition lists, (2) keep the partition whose first hit count is
# highest, (3) keep the partition reporting the highest degree.
should_merge_1 = True
should_merge_2 = True
should_merge_3 = True

# Output files (JSON, orient="table"), one per result kind.
output_recommendations = f"../data/{hash_function}_recommendations.json"
output_merge_recommendations = f"../data/{hash_function}_merge_recommendations.json"
output_best_partition_recommendations = f"../data/{hash_function}_best_partition_recommendations.json"
output_highest_degree_recommendations = f"../data/{hash_function}_highest_degree_recommendations.json"

In [19]:
# Load the users table that drives every request below
user_df = pd.read_json(users, orient="table")

# One axis per configuration field, in the order the tuples are unpacked later:
# (hash_function, (partition, port), walks, walk_length, limit)
config_axes = (
    [hash_function],
    partition_port.items(),
    walks,
    walk_length,
    limits,
)
configurations = list(it.product(*config_axes))
configurations


[('murmur2', ('0', '5000'), 1000, 100, 10),
 ('murmur2', ('1', '5001'), 1000, 100, 10)]

In [20]:
def get_recommendations(user_df, walks, walk_length, limit, partition_port):
    """Request one recommendation list per user from a single partition.

    Parameters
    ----------
    user_df : DataFrame with a ``user_id`` column.
    walks, walk_length, limit : values sent as the query parameters of the
        same names.
    partition_port : sequence whose first item is the partition id and whose
        second item is the port to query.

    Returns
    -------
    DataFrame with the columns ``hash_function``, ``partition``, ``user``,
    ``recommendations`` and ``hits``; the last two are lists built from the
    ``id`` and ``hit`` fields of the response items. A user whose request
    never returns status 200 gets two empty lists.
    """
    partition, port = partition_port[0], partition_port[1]
    print(f"gathering recommendations from port: {port} and partition {partition} hash_function: {hash_function}")

    records = []
    for _, user_row in user_df.iterrows():
        user = user_row["user_id"]
        url = f"http://{host}:{port}/recommendation/salsa/{user}?walks={walks}&walk_length={walk_length}&limit={limit}"

        # One initial request plus at most nine retries on a non-200 status.
        response = rq.get(url)
        for counter in range(9):
            if response.status_code == 200:
                break
            print(f"Got response code {response.status_code} trying again {counter + 1}/10...")
            response = rq.get(url)

        if response.status_code == 200:
            response_body = response.json()
        else:
            response_body = []

        recommendations = []
        hits = []
        for res in response_body:
            recommendations.append(res['id'])
            hits.append(res['hit'])
        records.append((hash_function, partition, user, recommendations, hits))

    column_names = ["hash_function", "partition", "user", "recommendations", "hits"]
    return pd.DataFrame(records, columns=column_names)

In [None]:
def get_recommendations_on_repeat(user_df, walks, walk_length, limit, partition_port):
    """Query each user ``repeat_for_user`` times and keep the most frequent ids.

    Parameters
    ----------
    user_df : DataFrame with a ``user_id`` column.
    walks, walk_length, limit : values sent as the query parameters of the
        same names; ``limit`` is also the number of ids kept per user.
    partition_port : sequence whose first item is the partition id and whose
        second item is the port to query.

    Returns
    -------
    DataFrame with the columns ``hash_function``, ``partition``, ``user``,
    ``recommendations`` and ``hits``. Here ``hits`` holds how often each kept
    id occurred across the repeated responses, not the ``hit`` field of the
    response items.
    """
    rows = []
    partition = partition_port[0]
    port = partition_port[1]

    print(f"gathering recommendations from port: {port} and partition {partition} hash_function: {hash_function}")

    for _, row in user_df.iterrows():
        user = row["user_id"]
        # The URL does not change between repeats, so build it once per user.
        url = f"http://{host}:{port}/recommendation/salsa/{user}?walks={walks}&walk_length={walk_length}&limit={limit}"

        # Collect ids in a plain list; Series.append was removed in pandas 2.0.
        recommended_ids = []
        for _ in range(repeat_for_user):
            response = rq.get(url)
            counter = 0
            # One initial request plus at most nine retries on a non-200 status.
            while response.status_code != 200 and counter != 9:
                print(f"Got response code {response.status_code} trying again {counter + 1}/10...")
                response = rq.get(url)
                counter += 1

            response_body = response.json() if response.status_code == 200 else []
            recommended_ids.extend(res['id'] for res in response_body)

        # An explicit dtype keeps the empty case (no successful response) well defined.
        id_counts = pd.Series(recommended_ids, dtype=object).value_counts().nlargest(limit)
        hits = id_counts.tolist()
        most_frequent_recommendations = id_counts.index.to_list()
        rows.append((hash_function, partition, user, most_frequent_recommendations, hits))

    return pd.DataFrame(rows, columns=["hash_function", "partition", "user", "recommendations", "hits"])

In [21]:
# Perform API requests
result_dfs = []

# The loop targets carry a `config_` prefix on purpose: reusing the names
# `partition_port`, `walks` and `walk_length` would overwrite the parameter-cell
# globals (dict/list -> tuple/int) and break a re-run of the configuration cell.
# The first tuple field is the hash function; get_recommendations reads the
# global `hash_function`, which holds the same value, so it is not rebound here.
for (_, config_partition_port, config_walks, config_walk_length, config_limit) in configurations:
    result = get_recommendations(
        user_df, config_walks, config_walk_length, config_limit, config_partition_port
    )
    result_dfs.append(result)

# NOTE(review): `should_repeat_for_user` is not consulted here; presumably it
# was meant to switch to get_recommendations_on_repeat -- confirm.
result_df = pd.concat(result_dfs)
display(result_df[["user", "recommendations", "hits", "partition"]])
result_df.to_json(output_recommendations, index=False, orient="table")

gathering recommendations from port: 5000 and partition 0 hash_function: murmur2


KeyboardInterrupt: 

In [None]:
def merge_sort_rec_hit(row, limit=10):
    """Rank a row's merged recommendations by hit count and keep the top ones.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) with the parallel sequences
        ``row["recommendations"]`` and ``row["hits"]``.
    limit : maximum number of recommendations to keep (default 10).

    Returns
    -------
    tuple ``(recommendations, hits)`` of two lists, ordered by descending hit
    count; entries with equal hits keep their input order.

    The lists are always ranked. The previous shortcut for exactly ten entries
    returned the input untouched, which left a merge of several partitions
    unsorted whenever their lists happened to add up to ten.
    """
    # dict(zip(...)) collapses ids that occur more than once: the id keeps its
    # first position and the last hit count wins.
    # NOTE(review): summing the hit counts of duplicate ids may be what a merge
    # should do -- confirm.
    hits_by_recommendation = dict(zip(row["recommendations"], row["hits"]))
    ranked = sorted(hits_by_recommendation.items(), key=lambda item: item[1], reverse=True)[:limit]
    return [rec for rec, _ in ranked], [hit for _, hit in ranked]

In [None]:
# Merge partition results
if should_merge_1:
    # "sum" over list-valued columns concatenates each user's per-partition lists.
    per_user = result_df.groupby("user", as_index=False)
    merge_df = per_user.agg({'recommendations': 'sum', 'hits': 'sum'})
    merge_df = merge_df.assign(hash_function=hash_function, partition="merge")
    merge_df["len_rec"] = merge_df.apply(lambda merged: len(merged["recommendations"]), axis=1)
    pd.set_option('display.max_colwidth', None)

    # Re-rank every merged list; the returned pair is expanded into two columns.
    ranked = merge_df.apply(merge_sort_rec_hit, axis=1, result_type="expand")
    merge_df[["recommendations", "hits"]] = ranked

    # Save results
    merge_df.to_json(output_merge_recommendations, index=False, orient="table")

In [None]:
# Merge 2: for every user keep the rows of the partition whose first hit count
# is the highest.
if should_merge_2:
    res = []
    for _, df_group in result_df.groupby("user"):
        highest_hit = 0
        # None means "no partition returned a positive hit"; partition ids are
        # strings, so the former int sentinel 0 never matched one either.
        partition_to_take = None
        for _, row in df_group.iterrows():
            if len(row["hits"]) == 0:
                continue
            # NOTE(review): assumes the first entry carries the largest hit
            # count, i.e. the response is ordered by hits -- confirm.
            hit = row["hits"][0]
            if hit > highest_hit:
                highest_hit = hit
                partition_to_take = row["partition"]
        if partition_to_take is None:
            # No usable partition: the user is left out of the output, as before.
            continue
        res.append(df_group[df_group["partition"] == partition_to_take])

    # pd.concat raises on an empty list, so fall back to an empty frame that
    # still has result_df's columns.
    best_partition_df = pd.concat(res, ignore_index=True) if res else result_df.iloc[0:0].copy()
    best_partition_df["partition"] = "merge 2"
    best_partition_df.to_json(output_best_partition_recommendations, index=False, orient="table")


In [None]:
def get_degree(user_df, port):
    """Fetch the left-index degree of every user from the service on ``port``.

    Parameters
    ----------
    user_df : DataFrame with a ``user_id`` column.
    port : port of the service to query.

    Returns
    -------
    list with one entry per row of ``user_df``, in row order: the decoded JSON
    body of the response, or 0 when no request returned status 200.
    """
    collected = []
    for _, user_row in user_df.iterrows():
        user = user_row["user_id"]
        url = f"http://{host}:{port}/status/degree/left-index/{user}"

        # One initial request plus at most nine retries on a non-200 status.
        response = rq.get(url)
        for counter in range(9):
            if response.status_code == 200:
                break
            print(f"Got response code {response.status_code} trying again {counter + 1}/10...")
            response = rq.get(url)

        if response.status_code == 200:
            collected.append(response.json())
        else:
            collected.append(0)
    return collected

In [12]:
# Merge 3: for every user keep the rows of the partition that reports the
# highest degree for that user.
if should_merge_3:
    degrees = []
    # Read the port from each configuration without unpacking into
    # `partition_port`, `walks`, ...: that would overwrite the parameter-cell
    # globals of the same names.
    for config in configurations:
        config_port = config[1][1]
        degrees.extend(get_degree(user_df, config_port))
    # Positional assignment: relies on result_df holding one row per
    # (configuration, user) in the same order as the loop above.
    result_df["degree"] = degrees
    # Rewrites the recommendations file, now including the degree column.
    result_df.to_json(output_recommendations, index=False, orient="table")

    res = []
    for _, df_group in result_df.groupby("user"):
        highest_degree = 0
        # None means "no partition reported a positive degree"; partition ids
        # are strings, so the former int sentinel 0 never matched one either.
        partition_to_take = None
        for _, row in df_group.iterrows():
            degree = row["degree"]
            if degree > highest_degree:
                highest_degree = degree
                partition_to_take = row["partition"]
        if partition_to_take is None:
            # No usable partition: the user is left out of the output, as before.
            continue
        res.append(df_group[df_group["partition"] == partition_to_take])

    # pd.concat raises on an empty list, so fall back to an empty frame that
    # still has result_df's columns.
    best_partition_df = pd.concat(res, ignore_index=True) if res else result_df.iloc[0:0].copy()
    best_partition_df["partition"] = "merge 3"
    best_partition_df.to_json(output_highest_degree_recommendations, index=False, orient="table")


NameError: name 'user_df' is not defined