## Independend Unbiased Space Saving (I-USS) and Coordinated Unbiased Space Saving (C-USS)

Final Project

Algorithmic Machine Learning for Data Science, Fall 2023


Suemy Inagaki (si2324)
Felipe de Oliveira (fd2264)

In [None]:
!pip install surpyval
import random
from surpyval import Weibull
import matplotlib.pyplot as plt
import numpy as np
import time

In [None]:
def hash_kwise_kv(keys,use_seed= False, seed = 12345, dimension_num=1, k_wise=4, PRIME=2147483587):
    #receives a streaming os keys and returns a streaming of hashes from 0 to 1
    if(use_seed):
      np.random.seed(seed)
    hash_parameters = np.random.randint(1, PRIME, (dimension_num, k_wise))
    hash_kwise = 0
    for exp in range(k_wise):
        hash_kwise += np.dot(np.transpose(np.array([keys])**exp), np.array([np.transpose(hash_parameters[:, exp])]))
    hash_kwise = np.mod(hash_kwise, PRIME)/PRIME
    hashes = hash_kwise.reshape(hash_kwise.shape[0],)
    return hashes

def unbiased_space_saving(streaming, n, m):
  sketch = {}
  x_min = streaming[0]
  N_min = 1
  for i in range(n):
    key = streaming[i]
    if(key in sketch.keys()):
      sketch[key] += 1
    else:
      if(len(sketch) <= m):
        sketch[key] = 1
      else:
        a = random.uniform(0, 1)
        p = 1/(N_min + 1)
        if (a < p):
          sketch[x_min] += 1
          N_min = sketch[x_min]
          sketch[key] = sketch[x_min]
          del sketch[x_min]
          x_min = key
  return sketch



def unbiased_space_saving_coordinated(streaming, n, m):
  sketch = {}
  x_min = streaming[0]
  N_min = 1
  hashes = hash_kwise_kv(streaming, use_seed = True)
  for i in range(n):
    key = streaming[i]
    if(key in sketch.keys()):
      sketch[key] += 1
    else:
      if(len(sketch) <= m):
        sketch[key] = 1
      else:
        a = hashes[i]
        p = 1/(N_min + 1)
        if (a < p):
          sketch[x_min] += 1
          N_min = sketch[x_min]
          sketch[key] = sketch[x_min]
          del sketch[x_min]
          x_min = key
  return sketch

def Priority_Sampling_Chris(aggregated_streaming,m):
  import math
  keys = list(aggregated_streaming.keys())
  values = list(aggregated_streaming.values())
  hashes = hash_kwise_kv(keys)
  norm_2 = np.sum(np.array(values)**2)
  k_v = {}
  #pick the m smallest ranks
  Ranks = [hashes[i]/(values[i]**2) for i in range(len(keys))]

  if(len(keys)<m+1):
    Threshold = math.inf
    index_to_store = range(len(keys))
  else:
    index_to_store = np.argsort(Ranks)[0:m+1]
    Threshold = Ranks[index_to_store[m]]


  for i in index_to_store:
      k_v[keys[i]] = values[i]
  return {"kv":k_v, "threshold" :Threshold }


def threshold_sampling_Chris(aggregated_streaming, m):
  keys = list(aggregated_streaming.keys())
  values = list(aggregated_streaming.values())
  hashes = hash_kwise_kv(keys)
  norm_2 = np.sum(np.array(values)**2)
  k_v = {}
  for i in range(len(keys)):
    threshold = m*values[i]**2/norm_2
    if(hashes[i]<= threshold):
      k_v[keys[i]] = values[i]
  return {"kv":k_v, "threshold" :m/norm_2 }

def inner_product_from_threshold(sketch1, sketch2, return_inner_vector=False):
  key_intersection = list(set(sketch1["kv"].keys()) & set(sketch2["kv"].keys()))
  inner_product_estimate = {}
  for key in key_intersection:
    probability = np.min([1,sketch1["threshold"]*sketch1["kv"][key]**2, sketch2["threshold"]*sketch2["kv"][key]**2 ])
    inner_product_estimate[key]  = sketch1["kv"][key] *sketch2["kv"][key] / probability
  if(return_inner_vector):
    return  np.sum(list(inner_product_estimate.values())), inner_product_estimate
  else:
    return np.sum(list(inner_product_estimate.values()))

def agms(streaming1, streaming2, number_estimates, k_wise =4):
  #receives a pair of streamings and returns an estimate of the inner product between both
  PRIME = 2147483587
  hash_parameters = np.random.randint(1, PRIME, (number_estimates, k_wise))
  hash_kwise = 0
  for exp in range(k_wise):
      hash_kwise += np.dot(np.transpose(np.array([streaming1])**exp), np.array([np.transpose(hash_parameters[:, exp])]))
  hash_kwise = 2*(np.mod(np.mod(hash_kwise, PRIME),2)-1/2) # putting in +-1 instead of in Zp
  estimates1 = np.sum(hash_kwise,axis=0)

  hash_kwise = 0
  for exp in range(k_wise):
      hash_kwise += np.dot(np.transpose(np.array([streaming2])**exp), np.array([np.transpose(hash_parameters[:, exp])]))
  hash_kwise = 2*(np.mod(np.mod(hash_kwise, PRIME),2)-1/2) # putting in +-1 instead of in Zp
  estimates2 = np.sum(hash_kwise,axis=0)

  inner_product_estimate = np.mean([estimates1[i]*estimates2[i] for i in range(number_estimates)]  )

  return inner_product_estimate


def fast_agms(streaming1, streaming2, number_estimates, k_wise =4):
  #receives a pair of streamings and returns an estimate of the inner product between both
  PRIME = 2147483587
  hash_4_parameters = np.random.randint(1, PRIME, (1, k_wise))
  hash_2_parameters = np.random.randint(1, PRIME, (1, 2))
  hash_kwise_2 = 0
  hash_kwise_4 = 0
  sketch_1 = np.zeros(number_estimates)
  sketch_2 = np.zeros(number_estimates)

  #Hash for the position of the sketch
  for exp in range(2):
      hash_kwise_2 += np.dot(np.transpose(np.array([streaming1])**exp), np.array([np.transpose(hash_2_parameters[:, exp])]))
  hash_kwise_2 = np.mod(np.mod(hash_kwise_2, PRIME),number_estimates)# putting in 0 to m-1
  hash_kwise_2 = hash_kwise_2.reshape(hash_kwise_2.shape[0],)

  #Hash for +/- 1
  for exp in range(k_wise):
      hash_kwise_4 += np.dot(np.transpose(np.array([streaming1])**exp), np.array([np.transpose(hash_4_parameters[:, exp])]))
  hash_kwise_4 = 2*(np.mod(np.mod(hash_kwise_4, PRIME),2)-1/2) # putting in +-1 instead of in Zp
  hash_kwise_4 = hash_kwise_4.reshape(hash_kwise_4.shape[0],)


  for i in range(len(streaming1)):
    sketch_1[hash_kwise_2[i]] += hash_kwise_4[i]

  hash_kwise_2 = 0
  hash_kwise_4 = 0

  #Hash for the position of the sketch
  for exp in range(2):
      hash_kwise_2 += np.dot(np.transpose(np.array([streaming2])**exp), np.array([np.transpose(hash_2_parameters[:, exp])]))
  hash_kwise_2 = np.mod(np.mod(hash_kwise_2, PRIME),number_estimates)# putting in 0 to m-1
  hash_kwise_2 = hash_kwise_2.reshape(hash_kwise_2.shape[0],)

  #Hash for +/- 1
  for exp in range(k_wise):
      hash_kwise_4 += np.dot(np.transpose(np.array([streaming2])**exp), np.array([np.transpose(hash_4_parameters[:, exp])]))
  hash_kwise_4 = 2*(np.mod(np.mod(hash_kwise_4, PRIME),2)-1/2) # putting in +-1 instead of in Zp
  hash_kwise_4 = hash_kwise_4.reshape(hash_kwise_4.shape[0],)


  for i in range(len(streaming2)):
    sketch_2[hash_kwise_2[i]] += hash_kwise_4[i]

  inner_product_estimate = np.sum([sketch_2[i]*sketch_1[i] for i in range(number_estimates)]  )

  return inner_product_estimate

def get_inner_vector(d1, d2):
  keys = [k for k in d1.keys() if k in d2.keys()]
  v_s = {}
  keys.sort()
  for k in keys:
    v_s[k] = d1[k]*d2[k]
  return v_s

def get_answer(streaming, n):
  keys, frequencies = np.unique(streaming, return_counts=True)
  a = zip(keys, frequencies)
  di = dict(a)
  return di

def fill_inner_vector(inner_vector_sketch, answer):
  keys = [i for i in answer.keys() if i not in inner_vector_sketch.keys()]
  for k in keys:
    inner_vector_sketch[k] = 0

  sorted_result = dict(sorted(inner_vector_sketch.items(),  key=lambda x:x[0]))
  return sorted_result

def get_inner_product_from_estimates(estimate1, estimate2):
  sum = 0
  for i in range(len(estimate1)):
    sum += estimate1[i]*estimate2[i]
  return sum/len(estimate1)


def calculate_inner_product(d):
  values = list(d.values())
  sum = 0
  for v in values:
    sum += v
  return sum

def generate_weibull(alpha, beta, n):
  rng = np.random.default_rng()
  streaming = rng.weibull(beta, n)
  streaming = [round(alpha*i) for i in streaming]
  return streaming

def generate_truncated_weibull(alpha, beta, a, b, n):
  model = Weibull.from_params((alpha, beta))
  streaming = model.random(n, a=a, b=b)
  streaming = [round(i) for i in streaming]
  return streaming

def generate_zipf(alpha, n):
  s = np.random.default_rng().zipf(alpha, size=n)
  return s

def generate_geometric(p, n):
  streaming = np.random.geometric(p, n)
  return streaming

def get_streamings(n):
  data = []
  # Weibull:
  alpha = 500000
  betas = [0.32, 0.15]
  a, b = 1, 1000
  for beta in betas:
    s1 = generate_weibull(alpha, beta, n)
    s2 = generate_weibull(alpha, beta, n)
    o = {
       "params": {
        "alpha": alpha,
        "beta": beta,
        "generator": "weibull"
      },
       "s1": s1,
       "s2": s2
    }
    data.append(o)
    s1 = generate_truncated_weibull(alpha, beta, a, b, n)
    s2 = generate_truncated_weibull(alpha, beta, a, b, n)
    o = {
       "params": {
        "alpha": alpha,
        "beta": beta,
        "a": a,
        "b": b,
        "generator": "truncated_weibull"
      },
       "s1": s1,
       "s2": s2
    }
    data.append(o)
  s1 = generate_geometric(0.03, n)
  s2 = generate_geometric(0.03, n)
  o = {
      "params": {
      "p": 0.03,
      "generator": "geometric"
    },
      "s1": s1,
      "s2": s2
  }
  data.append(o)
  alphas = [1.1, 2, 3, 4]
  for aa in alphas:
    s1 = generate_zipf(aa, n)
    s2 = generate_zipf(aa, n)
    o = {
        "params": {
        "alpha": aa,
        "generator": "zipfian"
      },
        "s1": s1,
        "s2": s2
    }
    data.append(o)
  return data


In [None]:
m = [100, 200, 300]

final_results_unbiased = []
final_results_unbiased_coordinated = []
final_results_agms = []
final_results_threshold = []
final_results_priority = []

q = 100
n = 400000
data = get_streamings(n)
for t in range(q):
  iteration_result_unbiased = {}
  iteration_result_unbiased_coordinated = {}
  iteration_result_agms = {}
  iteration_result_threshold = {}
  iteration_result_priority = {}
  print(t)
  for d in data:
    streaming1 = d.get("s1")
    streaming2 = d.get("s1")
    for size in m:
      t1 = time.time()
      sketch1 = unbiased_space_saving(streaming1, n, size)
      sketch2 = unbiased_space_saving(streaming2, n, size)
      t2 = time.time()
      answer1 = get_answer(streaming1, n)
      answer2 = get_answer(streaming2, n)
      v = get_inner_vector(sketch1, sketch2)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      params = list(d.get("params").values())
      params.append(size)
      key = ""
      for p in params:
        key+=f"{p}-"
      key = key[:len(key)-1]
      iteration_result_unbiased[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      est = fast_agms(streaming1, streaming2, size, 4)
      t2 = time.time()
      iteration_result_agms[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      sketch11 = unbiased_space_saving_coordinated(streaming1, n, size)
      sketch22 = unbiased_space_saving_coordinated(streaming2, n, size)
      t2 = time.time()
      v = get_inner_vector(sketch11, sketch22)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      iteration_result_unbiased_coordinated[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      ag1 = get_answer(streaming1, n)
      ag2 = get_answer(streaming2, n)
      sketch111 = threshold_sampling_Chris(ag1, size)
      sketch222 = threshold_sampling_Chris(ag2, size)
      t2 = time.time()
      est = inner_product_from_threshold(sketch111, sketch222)

      iteration_result_threshold[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      sketch1111 = Priority_Sampling_Chris(ag1, size)
      sketch2222 = Priority_Sampling_Chris(ag2, size)
      t2 = time.time()
      est = inner_product_from_threshold(sketch1111, sketch2222)

      iteration_result_priority[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }
  final_results_unbiased.append(iteration_result_unbiased)
  final_results_unbiased_coordinated.append(iteration_result_unbiased_coordinated)
  final_results_agms.append(iteration_result_agms)
  final_results_threshold.append(iteration_result_threshold)
  final_results_priority.append(iteration_result_priority)


In [None]:
keys = list(final_results_unbiased[0].keys())
print("Unbiased Space Saving")
mean_final_results_unbiased = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_unbiased.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving
distribution: 500000-0.32-weibull-100
100 iteractions
Mean Absolute Error: 11848363.0
Mean Relative Error: 0.3351454904936491
Mean Duration: 0.3 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-200
100 iteractions
Mean Absolute Error: 6425203.0
Mean Relative Error: 0.18174475334324802
Mean Duration: 0.3 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-300
100 iteractions
Mean Absolute Error: 6193519.0
Mean Relative Error: 0.1751912870273082
Mean Duration: 0.31 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-100
100 iteractions
Mean Absolute Error: 337511949.0
Mean Relative Error: 0.5365029791663072
Mean Duration: 0.27 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-200
100 iteractions
Mean Absolute Error: 216068625.0
Mean Relative Error: 0.3434588356362688
Mean Duration: 0.26 seconds

In [None]:
keys = list(final_results_unbiased_coordinated[0].keys())
print("Unbiased Space Saving Coordinated")
mean_final_results_unbiased_coordinated = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased_coordinated:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_unbiased_coordinated.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving Coordinated
distribution: 500000-0.32-weibull-100
100 iteractions
Mean Absolute Error: 11848363.0
Mean Relative Error: 0.3351454904936491
Mean Duration: 0.44 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-200
100 iteractions
Mean Absolute Error: 6425203.0
Mean Relative Error: 0.18174475334324802
Mean Duration: 0.43 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-300
100 iteractions
Mean Absolute Error: 6186009.76
Mean Relative Error: 0.17497887895684022
Mean Duration: 0.45 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-100
100 iteractions
Mean Absolute Error: 337511949.0
Mean Relative Error: 0.5365029791663072
Mean Duration: 0.41 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-200
100 iteractions
Mean Absolute Error: 216068625.0
Mean Relative Error: 0.3434588356362688
Mean Durati

In [None]:
keys = list(final_results_agms[0].keys())
print("Fast AGMS")
mean_final_results_agms = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_agms:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_agms.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Fast AGMS
distribution: 500000-0.32-weibull-100
100 iteractions
Mean Absolute Error: 1773022.42
Mean Relative Error: 0.05015211541097598
Mean Duration: 0.57 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-200
100 iteractions
Mean Absolute Error: 1441404.9
Mean Relative Error: 0.040771906820414754
Mean Duration: 0.58 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-300
100 iteractions
Mean Absolute Error: 936094.38
Mean Relative Error: 0.026478578528818613
Mean Duration: 0.6 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-100
100 iteractions
Mean Absolute Error: 45979949.8
Mean Relative Error: 0.0730889087711002
Mean Duration: 0.61 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-200
100 iteractions
Mean Absolute Error: 39996602.56
Mean Relative Error: 0.0635778866305285
Mean Duration: 0.56 seconds
-------

In [None]:
keys = list(final_results_threshold[0].keys())
print("Threshold Sampling")
mean_final_results_threshold = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_threshold:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_threshold.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Threshold Sampling
distribution: 500000-0.32-weibull-100
100 iteractions
Mean Absolute Error: 3943277.180000005
Mean Relative Error: 0.11154043513382535
Mean Duration: 0.63 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-200
100 iteractions
Mean Absolute Error: 3296373.6957000066
Mean Relative Error: 0.09324197605152226
Mean Duration: 0.65 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-300
100 iteractions
Mean Absolute Error: 3008157.2603999986
Mean Relative Error: 0.0850894204135027
Mean Duration: 0.63 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-100
100 iteractions
Mean Absolute Error: 148491903.89360008
Mean Relative Error: 0.2360400840830476
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-200
100 iteractions
Mean Absolute Error: 111176056.36400002
Mean Relative Error: 0.17672

In [None]:
keys = list(final_results_priority[0].keys())
print("Priority Sampling")
mean_final_results_priority = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_priority:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_priority.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")


Priority Sampling
distribution: 500000-0.32-weibull-100
100 iteractions
Mean Absolute Error: 2183326.961754061
Mean Relative Error: 0.06175808299467816
Mean Duration: 0.53 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-200
100 iteractions
Mean Absolute Error: 1701241.065504369
Mean Relative Error: 0.048121691692464275
Mean Duration: 0.55 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-300
100 iteractions
Mean Absolute Error: 1485225.048863629
Mean Relative Error: 0.04201141351719691
Mean Duration: 0.53 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-100
100 iteractions
Mean Absolute Error: 89864340.42503187
Mean Relative Error: 0.1428467540236472
Mean Duration: 0.0 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-200
100 iteractions
Mean Absolute Error: 54112507.53654331
Mean Relative Error: 0.086016277

In [None]:
import pandas as pd
methods = ["Priority Sampling", "Threshold Sampling", "Unbiased Space Saving", "Unbiased Space Saving Coordinated", "Fast AGMS"]
datas = [mean_final_results_priority, mean_final_results_threshold, mean_final_results_unbiased, mean_final_results_unbiased_coordinated, mean_final_results_agms]
datasets = [d[0] for d in mean_final_results_priority]

di = {}
for i in range(len(methods)):
  di[methods[i]] = [d[2] for d in datas[i]]

di_duration = {}
for i in range(len(methods)):
  di_duration[methods[i]] = [d[3] for d in datas[i]]

df = pd.DataFrame(di, index=datasets)
df_2 = pd.DataFrame(di_duration, index=datasets)

df.to_csv("relative_errors.csv")
df_2.to_csv("durations.csv")



In [None]:
def get_weibull_multiple_params(n):
  data = []
  # Weibull:
  alpha = 500000
  betas = [0.0, 0.05, 0.1, 0.15, 0.2, 0.32, 0.4, 0.6, 0.8, 1]
  a, b = 1, 1000
  for beta in betas:
    s1 = generate_weibull(alpha, beta, n)
    s2 = generate_weibull(alpha, beta, n)
    o = {
       "params": {
        "alpha": alpha,
        "beta": beta,
        "generator": "weibull"
      },
       "s1": s1,
       "s2": s2
    }
    data.append(o)
  return data

In [None]:
m = [100, 200, 300]

final_results_unbiased = []
final_results_unbiased_coordinated = []
final_results_agms = []
final_results_threshold = []
final_results_priority = []

q = 100
n = 400000
data = get_weibull_multiple_params(n)
for t in range(q):
  iteration_result_unbiased = {}
  iteration_result_unbiased_coordinated = {}
  iteration_result_agms = {}
  iteration_result_threshold = {}
  iteration_result_priority = {}
  print(t)
  for d in data:
    streaming1 = d.get("s1")
    streaming2 = d.get("s1")
    for size in m:
      t1 = time.time()
      sketch1 = unbiased_space_saving(streaming1, n, size)
      sketch2 = unbiased_space_saving(streaming2, n, size)
      t2 = time.time()
      answer1 = get_answer(streaming1, n)
      answer2 = get_answer(streaming2, n)
      v = get_inner_vector(sketch1, sketch2)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      params = list(d.get("params").values())
      params.append(size)
      key = ""
      for p in params:
        key+=f"{p}-"
      key = key[:len(key)-1]
      iteration_result_unbiased[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      est = fast_agms(streaming1, streaming2, size, 4)
      t2 = time.time()
      iteration_result_agms[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      sketch11 = unbiased_space_saving_coordinated(streaming1, n, size)
      sketch22 = unbiased_space_saving_coordinated(streaming2, n, size)
      t2 = time.time()
      v = get_inner_vector(sketch11, sketch22)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      iteration_result_unbiased_coordinated[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      ag1 = get_answer(streaming1, n)
      ag2 = get_answer(streaming2, n)
      sketch111 = threshold_sampling_Chris(ag1, size)
      sketch222 = threshold_sampling_Chris(ag2, size)
      t2 = time.time()
      est = inner_product_from_threshold(sketch111, sketch222)

      iteration_result_threshold[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      sketch1111 = Priority_Sampling_Chris(ag1, size)
      sketch2222 = Priority_Sampling_Chris(ag2, size)
      t2 = time.time()
      est = inner_product_from_threshold(sketch1111, sketch2222)

      iteration_result_priority[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }
  final_results_unbiased.append(iteration_result_unbiased)
  final_results_unbiased_coordinated.append(iteration_result_unbiased_coordinated)
  final_results_agms.append(iteration_result_agms)
  final_results_threshold.append(iteration_result_threshold)
  final_results_priority.append(iteration_result_priority)

In [None]:
keys = list(final_results_unbiased[0].keys())
print("Unbiased Space Saving")
mean_final_results_unbiased = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_unbiased.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving
distribution: 500000-0.0-weibull-100
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.12 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-200
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.13 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-300
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.12 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-100
100 iteractions
Mean Absolute Error: 6264532.0
Mean Relative Error: 0.0002514636683861838
Mean Duration: 0.26 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-200
100 iteractions
Mean Absolute Error: 4279544.0
Mean Relative Error: 0.00017178455362030001
Mean Duration: 0.26 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-300
100 iteraction

In [None]:
keys = list(final_results_unbiased_coordinated[0].keys())
print("Unbiased Space Saving Coordinated")
mean_final_results_unbiased_coordinated = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased_coordinated:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_unbiased_coordinated.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving Coordinated
distribution: 500000-0.0-weibull-100
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.33 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-200
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.35 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-300
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.34 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-100
100 iteractions
Mean Absolute Error: 6264532.0
Mean Relative Error: 0.0002514636683861838
Mean Duration: 1.69 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-200
100 iteractions
Mean Absolute Error: 4279544.0
Mean Relative Error: 0.00017178455362030001
Mean Duration: 1.68 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-300
10

In [None]:
keys = list(final_results_agms[0].keys())
print("Fast AGMS")
mean_final_results_agms = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_agms:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_agms.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Fast AGMS
distribution: 500000-0.0-weibull-100
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.64 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-200
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.6 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-300
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.6 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-100
100 iteractions
Mean Absolute Error: 53406820.5
Mean Relative Error: 0.0021437954183604556
Mean Duration: 2.43 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-200
100 iteractions
Mean Absolute Error: 94101591.44
Mean Relative Error: 0.0037773183031837537
Mean Duration: 2.46 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-300
100 iteractions
Mean Absol

In [None]:
keys = list(final_results_threshold[0].keys())
print("Threshold Sampling")
mean_final_results_threshold = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_threshold:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_threshold.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Threshold Sampling
distribution: 500000-0.0-weibull-100
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.05 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-200
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.05 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-300
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.05 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-100
100 iteractions
Mean Absolute Error: 76442375.3764
Mean Relative Error: 0.003068462278156334
Mean Duration: 1.64 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-200
100 iteractions
Mean Absolute Error: 73279255.03479996
Mean Relative Error: 0.0029414919243221285
Mean Duration: 1.65 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-300
100 ite

In [None]:
keys = list(final_results_priority[0].keys())
print("Priority Sampling")
mean_final_results_priority = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_priority:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_priority.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Priority Sampling
distribution: 500000-0.0-weibull-100
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.0 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-200
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.0 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-300
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.0 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-100
100 iteractions
Mean Absolute Error: 874567.2084947205
Mean Relative Error: 3.510587518722023e-05
Mean Duration: 1.49 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-200
100 iteractions
Mean Absolute Error: 562320.7666450882
Mean Relative Error: 2.2572036153747036e-05
Mean Duration: 1.53 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-300
100 i

In [None]:
import pandas as pd
methods = ["Priority Sampling", "Threshold Sampling", "Unbiased Space Saving", "Unbiased Space Saving Coordinated", "Fast AGMS"]
datas = [mean_final_results_priority, mean_final_results_threshold, mean_final_results_unbiased, mean_final_results_unbiased_coordinated, mean_final_results_agms]
datasets = [d[0] for d in mean_final_results_priority]

di = {}
for i in range(len(methods)):
  di[methods[i]] = [d[2] for d in datas[i]]

di_duration = {}
for i in range(len(methods)):
  di_duration[methods[i]] = [d[3] for d in datas[i]]

df = pd.DataFrame(di, index=datasets)
df_2 = pd.DataFrame(di_duration, index=datasets)

df.to_csv("relative_errors_weibull.csv")
df_2.to_csv("durations_weibull.csv")

In [None]:
# variar o k de 1 a 20
def shift(s1, k, n):
  new_s = []
  shifted = {}
  f = get_answer(s1, n)
  sorted_d = sorted(f.items(), key=lambda x: x[1], reverse=True) #sorted[i][0] = key, sorted[i][1] = frequency
  for i in range(len(sorted_d)):
    new_index = i + k
    if new_index < len(sorted_d):
      shifted[sorted_d[i][0]] = sorted_d[new_index][0]
    else:
      dif = new_index - len(sorted_d)
      shifted[sorted_d[i][0]] = sorted_d[dif][0]

  for e in s1:
    new_s.append(shifted[e])
  return new_s

def get_weibull_shifted(n):
  data = []
  # Weibull:
  alpha = 500000
  beta = 0.15
  for k in range(1, 20):
    s1 = generate_weibull(alpha, beta, n)
    s2 = shift(s1, k, n)
    o = {
       "params": {
        "alpha": alpha,
        "beta": beta,
        "k": k,
        "generator": "weibull"
      },
       "s1": s1,
       "s2": s2
    }
    data.append(o)
  return data

m = [100, 200, 300]

final_results_unbiased = []
final_results_unbiased_coordinated = []
final_results_agms = []
final_results_threshold = []
final_results_priority = []

q = 100
n = 40000
data = get_weibull_shifted(n)
for t in range(q):
  iteration_result_unbiased = {}
  iteration_result_unbiased_coordinated = {}
  iteration_result_agms = {}
  iteration_result_threshold = {}
  iteration_result_priority = {}
  print(t)
  for d in data:
    streaming1 = d.get("s1")
    streaming2 = d.get("s1")
    for size in m:
      t1 = time.time()
      sketch1 = unbiased_space_saving(streaming1, n, size)
      sketch2 = unbiased_space_saving(streaming2, n, size)
      t2 = time.time()
      answer1 = get_answer(streaming1, n)
      answer2 = get_answer(streaming2, n)
      v = get_inner_vector(sketch1, sketch2)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      params = list(d.get("params").values())
      params.append(size)
      key = ""
      for p in params:
        key+=f"{p}-"
      key = key[:len(key)-1]
      iteration_result_unbiased[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      est = fast_agms(streaming1, streaming2, size, 4)
      t2 = time.time()
      iteration_result_agms[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      sketch11 = unbiased_space_saving_coordinated(streaming1, n, size)
      sketch22 = unbiased_space_saving_coordinated(streaming2, n, size)
      t2 = time.time()
      v = get_inner_vector(sketch11, sketch22)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      iteration_result_unbiased_coordinated[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      ag1 = get_answer(streaming1, n)
      ag2 = get_answer(streaming2, n)
      sketch111 = threshold_sampling_Chris(ag1, size)
      sketch222 = threshold_sampling_Chris(ag2, size)
      t2 = time.time()
      est = inner_product_from_threshold(sketch111, sketch222)

      iteration_result_threshold[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      sketch1111 = Priority_Sampling_Chris(ag1, size)
      sketch2222 = Priority_Sampling_Chris(ag2, size)
      t2 = time.time()
      est = inner_product_from_threshold(sketch1111, sketch2222)

      iteration_result_priority[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }
  final_results_unbiased.append(iteration_result_unbiased)
  final_results_unbiased_coordinated.append(iteration_result_unbiased_coordinated)
  final_results_agms.append(iteration_result_agms)
  final_results_threshold.append(iteration_result_threshold)
  final_results_priority.append(iteration_result_priority)

In [None]:
keys = list(final_results_unbiased[0].keys())
print("Unbiased Space Saving")
mean_final_results_unbiased = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_unbiased.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving
distribution: 500000-0.15-1-weibull-100
100 iteractions
Mean Absolute Error: 339809.0
Mean Relative Error: 0.013728483958986262
Mean Duration: 0.05 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-200
100 iteractions
Mean Absolute Error: 283550.64
Mean Relative Error: 0.011455613043799006
Mean Duration: 0.05 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-300
100 iteractions
Mean Absolute Error: 263239.0
Mean Relative Error: 0.010635010811601788
Mean Duration: 0.05 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-100
100 iteractions
Mean Absolute Error: 412781.0
Mean Relative Error: 0.018235730281841618
Mean Duration: 0.05 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-200
100 iteractions
Mean Absolute Error: 341545.0
Mean Relative Error: 0.01508868503906813
Mean Duration: 0.05 seconds
----------------------

In [None]:
keys = list(final_results_unbiased_coordinated[0].keys())
print("Unbiased Space Saving Coordinated")
mean_final_results_unbiased_coordinated = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased_coordinated:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_unbiased_coordinated.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving Coordinated
distribution: 500000-0.15-1-weibull-100
100 iteractions
Mean Absolute Error: 339809.0
Mean Relative Error: 0.013728483958986262
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-200
100 iteractions
Mean Absolute Error: 284882.0
Mean Relative Error: 0.011509400772798639
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-300
100 iteractions
Mean Absolute Error: 263239.0
Mean Relative Error: 0.010635010811601788
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-100
100 iteractions
Mean Absolute Error: 412031.51
Mean Relative Error: 0.018202619510054793
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-200
100 iteractions
Mean Absolute Error: 341545.0
Mean Relative Error: 0.01508868503906813
Mean Duration: 0.08 seconds
----------

In [None]:
keys = list(final_results_agms[0].keys())
print("Fast AGMS")
mean_final_results_agms = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_agms:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_agms.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Fast AGMS
distribution: 500000-0.15-1-weibull-100
100 iteractions
Mean Absolute Error: 267242.12
Mean Relative Error: 0.010796739219930875
Mean Duration: 0.12 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-200
100 iteractions
Mean Absolute Error: 337147.04
Mean Relative Error: 0.013620939205435136
Mean Duration: 0.12 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-300
100 iteractions
Mean Absolute Error: 196664.42
Mean Relative Error: 0.007945358525740466
Mean Duration: 0.12 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-100
100 iteractions
Mean Absolute Error: 395777.52
Mean Relative Error: 0.017484555021515434
Mean Duration: 0.11 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-200
100 iteractions
Mean Absolute Error: 268077.8
Mean Relative Error: 0.011843070430444892
Mean Duration: 0.12 seconds
------------------------------

In [None]:
keys = list(final_results_threshold[0].keys())
print("Threshold Sampling")
mean_final_results_threshold = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_threshold:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_threshold.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Threshold Sampling
distribution: 500000-0.15-1-weibull-100
100 iteractions
Mean Absolute Error: 397541.56779999955
Mean Relative Error: 0.016060913738519467
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-200
100 iteractions
Mean Absolute Error: 294801.93829999975
Mean Relative Error: 0.011910172129136122
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-300
100 iteractions
Mean Absolute Error: 267987.148133333
Mean Relative Error: 0.010826838795802764
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-100
100 iteractions
Mean Absolute Error: 365981.9848000005
Mean Relative Error: 0.01616825571628991
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-200
100 iteractions
Mean Absolute Error: 287664.4888000002
Mean Relative Error: 0.012708366008659898
Mean Dur

In [None]:
keys = list(final_results_priority[0].keys())
print("Priority Sampling")
mean_final_results_priority = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_priority:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_priority.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Priority Sampling
distribution: 500000-0.15-1-weibull-100
100 iteractions
Mean Absolute Error: 60467.05963693447
Mean Relative Error: 0.0024429048620628726
Mean Duration: 0.06 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-200
100 iteractions
Mean Absolute Error: 48606.66982729193
Mean Relative Error: 0.001963738120602221
Mean Duration: 0.06 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-300
100 iteractions
Mean Absolute Error: 43074.97729455121
Mean Relative Error: 0.0017402544806698614
Mean Duration: 0.06 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-100
100 iteractions
Mean Absolute Error: 60554.97627837114
Mean Relative Error: 0.0026751817904304975
Mean Duration: 0.06 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-200
100 iteractions
Mean Absolute Error: 48763.27936804846
Mean Relative Error: 0.0021542513105346967
Mean 

In [None]:
import pandas as pd
methods = ["Priority Sampling", "Threshold Sampling", "Unbiased Space Saving", "Unbiased Space Saving Coordinated", "Fast AGMS"]
datas = [mean_final_results_priority, mean_final_results_threshold, mean_final_results_unbiased, mean_final_results_unbiased_coordinated, mean_final_results_agms]
datasets = [d[0] for d in mean_final_results_priority]

di = {}
for i in range(len(methods)):
  di[methods[i]] = [d[2] for d in datas[i]]

di_duration = {}
for i in range(len(methods)):
  di_duration[methods[i]] = [d[3] for d in datas[i]]

df = pd.DataFrame(di, index=datasets)
df_2 = pd.DataFrame(di_duration, index=datasets)

df.to_csv("relative_errors_weibull_shifted.csv")
df_2.to_csv("durations_weibull_shifted.csv")

In [None]:
def get_weibull_sorted(n):
  data = []
  # Weibull:
  alpha = 500000
  beta = 0.15
  s1 = generate_weibull(alpha, beta, n)
  s1.sort()
  s2 = generate_weibull(alpha, beta, n)
  s2.sort()
  o = {
      "params": {
      "alpha": alpha,
      "beta": beta,
      "generator": "weibull"
    },
      "s1": s1,
      "s2": s2
  }
  data.append(o)
  return data

m = [100, 200, 300]

final_results_unbiased = []
final_results_unbiased_coordinated = []
final_results_agms = []
final_results_threshold = []
final_results_priority = []

q = 100
n = 40000
data = get_weibull_sorted(n)
for t in range(q):
  iteration_result_unbiased = {}
  iteration_result_unbiased_coordinated = {}
  iteration_result_agms = {}
  iteration_result_threshold = {}
  iteration_result_priority = {}
  print(t)
  for d in data:
    streaming1 = d.get("s1")
    streaming2 = d.get("s1")
    for size in m:
      t1 = time.time()
      sketch1 = unbiased_space_saving(streaming1, n, size)
      sketch2 = unbiased_space_saving(streaming2, n, size)
      t2 = time.time()
      answer1 = get_answer(streaming1, n)
      answer2 = get_answer(streaming2, n)
      v = get_inner_vector(sketch1, sketch2)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      params = list(d.get("params").values())
      params.append(size)
      key = ""
      for p in params:
        key+=f"{p}-"
      key = key[:len(key)-1]
      iteration_result_unbiased[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      est = fast_agms(streaming1, streaming2, size, 4)
      t2 = time.time()
      iteration_result_agms[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      sketch11 = unbiased_space_saving_coordinated(streaming1, n, size)
      sketch22 = unbiased_space_saving_coordinated(streaming2, n, size)
      t2 = time.time()
      v = get_inner_vector(sketch11, sketch22)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      iteration_result_unbiased_coordinated[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      ag1 = get_answer(streaming1, n)
      ag2 = get_answer(streaming2, n)
      sketch111 = threshold_sampling_Chris(ag1, size)
      sketch222 = threshold_sampling_Chris(ag2, size)
      t2 = time.time()
      est = inner_product_from_threshold(sketch111, sketch222)

      iteration_result_threshold[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

      t1 = time.time()
      sketch1111 = Priority_Sampling_Chris(ag1, size)
      sketch2222 = Priority_Sampling_Chris(ag2, size)
      t2 = time.time()
      est = inner_product_from_threshold(sketch1111, sketch2222)

      iteration_result_priority[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }
  final_results_unbiased.append(iteration_result_unbiased)
  final_results_unbiased_coordinated.append(iteration_result_unbiased_coordinated)
  final_results_agms.append(iteration_result_agms)
  final_results_threshold.append(iteration_result_threshold)
  final_results_priority.append(iteration_result_priority)

In [None]:
keys = list(final_results_unbiased[0].keys())
print("Unbiased Space Saving")
mean_final_results_unbiased = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_unbiased.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving
distribution: 500000-0.15-weibull-100
100 iteractions
Mean Absolute Error: 23079634.0
Mean Relative Error: 0.9525340779835679
Mean Duration: 0.05 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-200
100 iteractions
Mean Absolute Error: 23070692.0
Mean Relative Error: 0.9521650270824434
Mean Duration: 0.05 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-300
100 iteractions
Mean Absolute Error: 23067463.0
Mean Relative Error: 0.9520317609943484
Mean Duration: 0.05 seconds
-------------------------------------------


In [None]:
keys = list(final_results_unbiased_coordinated[0].keys())
print("Unbiased Space Saving Coordinated")
mean_final_results_unbiased_coordinated = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased_coordinated:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_unbiased_coordinated.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving Coordinated
distribution: 500000-0.15-weibull-100
100 iteractions
Mean Absolute Error: 23079634.0
Mean Relative Error: 0.9525340779835679
Mean Duration: 0.09 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-200
100 iteractions
Mean Absolute Error: 23070692.0
Mean Relative Error: 0.9521650270824434
Mean Duration: 0.09 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-300
100 iteractions
Mean Absolute Error: 23067463.0
Mean Relative Error: 0.9520317609943484
Mean Duration: 0.08 seconds
-------------------------------------------


In [None]:
keys = list(final_results_agms[0].keys())
print("Fast AGMS")
mean_final_results_agms = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_agms:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_agms.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Fast AGMS
distribution: 500000-0.15-weibull-100
100 iteractions
Mean Absolute Error: 542388.16
Mean Relative Error: 0.02238524258637739
Mean Duration: 0.12 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-200
100 iteractions
Mean Absolute Error: 397794.02
Mean Relative Error: 0.016417606971933646
Mean Duration: 0.12 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-300
100 iteractions
Mean Absolute Error: 112244.6
Mean Relative Error: 0.004632517420754347
Mean Duration: 0.11 seconds
-------------------------------------------


In [None]:
keys = list(final_results_threshold[0].keys())
print("Threshold Sampling")
mean_final_results_threshold = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_threshold:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_threshold.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Threshold Sampling
distribution: 500000-0.15-weibull-100
100 iteractions
Mean Absolute Error: 374271.7640000002
Mean Relative Error: 0.015446805163245831
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-200
100 iteractions
Mean Absolute Error: 286554.3539999991
Mean Relative Error: 0.011826564813790635
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-300
100 iteractions
Mean Absolute Error: 249805.94533333197
Mean Relative Error: 0.010309898147123955
Mean Duration: 0.07 seconds
-------------------------------------------


In [None]:
keys = list(final_results_priority[0].keys())
print("Priority Sampling")
mean_final_results_priority = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_priority:
      absolute_error += result.get(k).get("absolute_error")
      relative_error += result.get(k).get("relative_error")
      duration += result.get(k).get("duration")
  mean_final_results_priority.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Priority Sampling
distribution: 500000-0.15-weibull-100
100 iteractions
Mean Absolute Error: 60337.12377492651
Mean Relative Error: 0.002490211350974196
Mean Duration: 0.06 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-200
100 iteractions
Mean Absolute Error: 48417.19288244799
Mean Relative Error: 0.0019982563926635544
Mean Duration: 0.06 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-300
100 iteractions
Mean Absolute Error: 43165.18692641869
Mean Relative Error: 0.0017814975545082114
Mean Duration: 0.06 seconds
-------------------------------------------


In [None]:
import pandas as pd
methods = ["Priority Sampling", "Threshold Sampling", "Unbiased Space Saving", "Unbiased Space Saving Coordinated", "Fast AGMS"]
datas = [mean_final_results_priority, mean_final_results_threshold, mean_final_results_unbiased, mean_final_results_unbiased_coordinated, mean_final_results_agms]
datasets = [d[0] for d in mean_final_results_priority]

di = {}
for i in range(len(methods)):
  di[methods[i]] = [d[2] for d in datas[i]]

di_duration = {}
for i in range(len(methods)):
  di_duration[methods[i]] = [d[3] for d in datas[i]]

df = pd.DataFrame(di, index=datasets)
df_2 = pd.DataFrame(di_duration, index=datasets)

df.to_csv("relative_errors_weibull_sorted.csv")
df_2.to_csv("durations_weibull_sorted.csv")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

n = 40000
alpha = 5000000
streaming0 = generate_weibull(alpha, 0.15, n)
streaming1 = generate_weibull(alpha, 0.32, n)
streaming2 = generate_geometric(0.03, n)
streaming3 = generate_truncated_weibull(alpha, 0.15, 1, 1000, n)
streaming4 = generate_truncated_weibull(alpha, 0.32, 1, 1000, n)
streaming5 = generate_zipf(1.1, n)
streaming6 = generate_zipf(2, n)
streaming7 = generate_zipf(3, n)
streaming8 = generate_zipf(4, n)



def generate_histogram(streaming, name1, name2, title):
  plt.hist(streaming, bins = 1800, range=[1,1000],
          color = 'darkblue')

  # Plot formatting
  plt.legend()
  plt.xlabel('Items')
  plt.ylabel('Frequency')
  plt.title(title)
  plt.savefig(name1, bbox_inches='tight')


  # Density Plot and Histogram of streamings
  p = sns.distplot(streaming, hist=True, kde=True,
              bins=1800, color = 'darkblue',
              hist_kws={'edgecolor':'black'},
              kde_kws={'linewidth': 4})
  fig = p.get_figure()
  fig.savefig(name2)

generate_histogram(streaming0, "weibull-0.15.png", 'weibull-0.15-hist.png', 'Weibull (500k, 0.15)')
generate_histogram(streaming1, "weibull-0.32.png", 'weibull-0.32-hist.png', 'Weibull (500k, 0.32)')
generate_histogram(streaming2, "geometric-0.03.png", 'geometric-0.03-hist.png', 'Geometric (0.03)')
generate_histogram(streaming3, "weibull-truncated-0.15.png", 'weibull-truncated-0.15-hist.png', 'Weibull Truncated (500k, 0.15)')
generate_histogram(streaming4, "weibull-truncated-0.32.png", 'weibull-truncated-0.32-hist.png', 'Weibull Truncated (500k, 0.32)')
generate_histogram(streaming5, "zipfian-1.1.png", 'zipfian-1.1-hist.png', 'Zipfian (1.1)')
generate_histogram(streaming6, "zipfian-2.png", 'zipfian-2-hist.png', 'Zipfian (2)')
generate_histogram(streaming7, "zipfian-3.png", 'zipfian-3-hist.png', 'Zipfian (3)')
generate_histogram(streaming8, "zipfian-4.png", 'zipfian-4-hist.png', 'Zipfian (4)')





In [None]:
m = [100, 200, 300]

final_results_unbiased_coordinated = []

q = 100
n = 40000
data = get_streamings(n)
for t in range(q):
  iteration_result_unbiased_coordinated = {}
  print(t)
  for d in data:
    streaming1 = d.get("s1")
    streaming2 = d.get("s1")
    for size in m:
      params = list(d.get("params").values())
      params.append(size)
      key = ""
      for p in params:
        key+=f"{p}-"
      key = key[:len(key)-1]

      t1 = time.time()
      sketch11 = unbiased_space_saving_coordinated(streaming1, n, size)
      sketch22 = unbiased_space_saving_coordinated(streaming2, n, size)
      t2 = time.time()
      answer1 = get_answer(streaming1, n)
      answer2 = get_answer(streaming2, n)
      v = get_inner_vector(sketch11, sketch22)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      iteration_result_unbiased_coordinated[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

  final_results_unbiased_coordinated.append(iteration_result_unbiased_coordinated)


In [None]:
keys = list(final_results_unbiased_coordinated[0].keys())
print("Unbiased Space Saving Coordinated")
mean_final_results_unbiased_coordinated = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased_coordinated:
    absolute_error += result.get(k).get("absolute_error")
    relative_error += result.get(k).get("relative_error")
    duration += result.get(k).get("duration")
  mean_final_results_unbiased_coordinated.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving Coordinated
distribution: 500000-0.32-weibull-100
100 iteractions
Mean Absolute Error: 87297.0
Mean Relative Error: 0.22703676925717647
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-200
100 iteractions
Mean Absolute Error: 24632.0
Mean Relative Error: 0.06406141906758293
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.32-weibull-300
100 iteractions
Mean Absolute Error: 21276.0
Mean Relative Error: 0.05533333680098626
Mean Duration: 0.09 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-100
100 iteractions
Mean Absolute Error: 2282556.0
Mean Relative Error: 0.3600561815831177
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.32-1-1000-truncated_weibull-200
100 iteractions
Mean Absolute Error: 1294864.0
Mean Relative Error: 0.2042551365703364
Mean Duration: 0.08 se

In [None]:
import pandas as pd
methods = ["Unbiased Space Saving Coordinated"]
datas = [mean_final_results_unbiased_coordinated]
datasets = [d[0] for d in mean_final_results_unbiased_coordinated]

di = {}
for i in range(len(methods)):
  di[methods[i]] = [d[2] for d in datas[i]]

di_duration = {}
for i in range(len(methods)):
  di_duration[methods[i]] = [d[3] for d in datas[i]]

df = pd.DataFrame(di, index=datasets)
df_2 = pd.DataFrame(di_duration, index=datasets)

df.to_csv("relative_errors_unbiased_first.csv")
df_2.to_csv("durations_unbiased_first.csv")

In [None]:
m = [100, 200, 300]

final_results_unbiased_coordinated = []

q = 100
n = 40000
data = get_weibull_multiple_params(n)
for t in range(q):
  iteration_result_unbiased_coordinated = {}
  print(t)
  for d in data:
    streaming1 = d.get("s1")
    streaming2 = d.get("s1")
    for size in m:
      params = list(d.get("params").values())
      params.append(size)
      key = ""
      for p in params:
        key+=f"{p}-"
      key = key[:len(key)-1]

      t1 = time.time()
      sketch11 = unbiased_space_saving_coordinated(streaming1, n, size)
      sketch22 = unbiased_space_saving_coordinated(streaming2, n, size)
      t2 = time.time()
      answer1 = get_answer(streaming1, n)
      answer2 = get_answer(streaming2, n)
      v = get_inner_vector(sketch11, sketch22)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      iteration_result_unbiased_coordinated[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

  final_results_unbiased_coordinated.append(iteration_result_unbiased_coordinated)

In [None]:
keys = list(final_results_unbiased_coordinated[0].keys())
print("Unbiased Space Saving Coordinated")
mean_final_results_unbiased_coordinated = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased_coordinated:
    absolute_error += result.get(k).get("absolute_error")
    relative_error += result.get(k).get("relative_error")
    duration += result.get(k).get("duration")
  mean_final_results_unbiased_coordinated.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving Coordinated
distribution: 500000-0.0-weibull-100
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.07 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-200
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.07 seconds
-------------------------------------------
distribution: 500000-0.0-weibull-300
100 iteractions
Mean Absolute Error: 0.0
Mean Relative Error: 0.0
Mean Duration: 0.07 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-100
100 iteractions
Mean Absolute Error: 247919754.0
Mean Relative Error: 0.9970774966785754
Mean Duration: 0.25 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-200
100 iteractions
Mean Absolute Error: 247878391.0
Mean Relative Error: 0.996911143994576
Mean Duration: 0.25 seconds
-------------------------------------------
distribution: 500000-0.05-weibull-300
100 it

In [None]:
import pandas as pd
methods = ["Unbiased Space Saving Coordinated"]
datas = [mean_final_results_unbiased_coordinated]
datasets = [d[0] for d in mean_final_results_unbiased_coordinated]

di = {}
for i in range(len(methods)):
  di[methods[i]] = [d[2] for d in datas[i]]

di_duration = {}
for i in range(len(methods)):
  di_duration[methods[i]] = [d[3] for d in datas[i]]

df = pd.DataFrame(di, index=datasets)
df_2 = pd.DataFrame(di_duration, index=datasets)

df.to_csv("relative_errors_unbiased_weibull_multiple.csv")
df_2.to_csv("durations_unbiased_weibull_multiple.csv")

In [None]:
m = [100, 200, 300]

final_results_unbiased_coordinated = []

q = 100
n = 40000
data = get_weibull_shifted(n)
for t in range(q):
  iteration_result_unbiased_coordinated = {}
  print(t)
  for d in data:
    streaming1 = d.get("s1")
    streaming2 = d.get("s1")
    for size in m:
      params = list(d.get("params").values())
      params.append(size)
      key = ""
      for p in params:
        key+=f"{p}-"
      key = key[:len(key)-1]

      t1 = time.time()
      sketch11 = unbiased_space_saving_coordinated(streaming1, n, size)
      sketch22 = unbiased_space_saving_coordinated(streaming2, n, size)
      t2 = time.time()
      answer1 = get_answer(streaming1, n)
      answer2 = get_answer(streaming2, n)
      v = get_inner_vector(sketch11, sketch22)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      iteration_result_unbiased_coordinated[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

  final_results_unbiased_coordinated.append(iteration_result_unbiased_coordinated)

In [None]:
keys = list(final_results_unbiased_coordinated[0].keys())
print("Unbiased Space Saving Coordinated")
mean_final_results_unbiased_coordinated = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased_coordinated:
    absolute_error += result.get(k).get("absolute_error")
    relative_error += result.get(k).get("relative_error")
    duration += result.get(k).get("duration")
  mean_final_results_unbiased_coordinated.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving Coordinated
distribution: 500000-0.15-1-weibull-100
100 iteractions
Mean Absolute Error: 206854.0
Mean Relative Error: 0.00861566999502353
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-200
100 iteractions
Mean Absolute Error: 195627.0
Mean Relative Error: 0.00814805454144696
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-1-weibull-300
100 iteractions
Mean Absolute Error: 95110.0
Mean Relative Error: 0.003961423870104951
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-100
100 iteractions
Mean Absolute Error: 338530.0
Mean Relative Error: 0.014979655580756883
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-2-weibull-200
100 iteractions
Mean Absolute Error: 276344.0
Mean Relative Error: 0.01222797962310189
Mean Duration: 0.08 seconds
--------------

In [None]:
import pandas as pd
methods = ["Unbiased Space Saving Coordinated"]
datas = [mean_final_results_unbiased_coordinated]
datasets = [d[0] for d in mean_final_results_unbiased_coordinated]

di = {}
for i in range(len(methods)):
  di[methods[i]] = [d[2] for d in datas[i]]

di_duration = {}
for i in range(len(methods)):
  di_duration[methods[i]] = [d[3] for d in datas[i]]

df = pd.DataFrame(di, index=datasets)
df_2 = pd.DataFrame(di_duration, index=datasets)

df.to_csv("relative_errors_unbiased_weibull_shifted.csv")
df_2.to_csv("durations_unbiased_weibull_shifted.csv")

In [None]:
m = [100, 200, 300]

final_results_unbiased_coordinated = []

q = 100
n = 40000
data = get_weibull_sorted(n)
for t in range(q):
  iteration_result_unbiased_coordinated = {}
  print(t)
  for d in data:
    streaming1 = d.get("s1")
    streaming2 = d.get("s1")
    for size in m:
      params = list(d.get("params").values())
      params.append(size)
      key = ""
      for p in params:
        key+=f"{p}-"
      key = key[:len(key)-1]

      t1 = time.time()
      sketch11 = unbiased_space_saving_coordinated(streaming1, n, size)
      sketch22 = unbiased_space_saving_coordinated(streaming2, n, size)
      t2 = time.time()
      answer1 = get_answer(streaming1, n)
      answer2 = get_answer(streaming2, n)
      v = get_inner_vector(sketch11, sketch22)
      v_ans = get_inner_vector(answer1, answer1)
      ans = calculate_inner_product(v_ans)
      est = calculate_inner_product(v)

      iteration_result_unbiased_coordinated[key] = {
          "relative_error": abs(ans - est)/ans,
          "absolute_error": abs(ans - est),
          "duration": t2 - t1
      }

  final_results_unbiased_coordinated.append(iteration_result_unbiased_coordinated)

In [None]:
keys = list(final_results_unbiased_coordinated[0].keys())
print("Unbiased Space Saving Coordinated")
mean_final_results_unbiased_coordinated = []
for k in keys:
  absolute_error = 0
  relative_error = 0
  duration = 0
  for result in final_results_unbiased_coordinated:
    absolute_error += result.get(k).get("absolute_error")
    relative_error += result.get(k).get("relative_error")
    duration += result.get(k).get("duration")
  mean_final_results_unbiased_coordinated.append([k, absolute_error/q, relative_error/q, duration/q])

  print(f"distribution: {k}")
  print(f"{q} iteractions")
  print(f"Mean Absolute Error: {absolute_error/q}")
  print(f"Mean Relative Error: {relative_error/q}")
  print(f"Mean Duration: {round(duration/q, 2)} seconds")
  print("-------------------------------------------")

Unbiased Space Saving Coordinated
distribution: 500000-0.15-weibull-100
100 iteractions
Mean Absolute Error: 120317.0
Mean Relative Error: 0.005070494356026523
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-200
100 iteractions
Mean Absolute Error: 101971.0
Mean Relative Error: 0.004297342686223727
Mean Duration: 0.08 seconds
-------------------------------------------
distribution: 500000-0.15-weibull-300
100 iteractions
Mean Absolute Error: 86389.0
Mean Relative Error: 0.0036406736946796854
Mean Duration: 0.08 seconds
-------------------------------------------


In [None]:
import pandas as pd
methods = ["Unbiased Space Saving Coordinated"]
datas = [mean_final_results_unbiased_coordinated]
datasets = [d[0] for d in mean_final_results_unbiased_coordinated]

di = {}
for i in range(len(methods)):
  di[methods[i]] = [d[2] for d in datas[i]]

di_duration = {}
for i in range(len(methods)):
  di_duration[methods[i]] = [d[3] for d in datas[i]]

df = pd.DataFrame(di, index=datasets)
df_2 = pd.DataFrame(di_duration, index=datasets)

df.to_csv("relative_errors_unbiased_weibull_sorted.csv")
df_2.to_csv("durations_unbiased_weibull_sorted.csv")