In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from probables import (CountMinSketch)
from tqdm import tqdm
from collections import defaultdict
import math
import heapq
import os
import json

In [2]:
def sample_until(xs, size, seed=0):
    """Draw up to ``size`` distinct values from ``xs``, weighted by frequency.

    Each round picks one element uniformly at random from the remaining pool
    and then removes every occurrence of that value from the pool, so values
    that occur more often are more likely to be drawn early and no value is
    returned twice.

    Parameters
    ----------
    xs : array-like
        One-dimensional values (list, NumPy array or pandas Series).
    size : int
        Maximum number of distinct values to return.
    seed : int, optional
        Seed for the private ``RandomState`` used for the draws.

    Returns
    -------
    list
        The distinct values in the order they were drawn. Shorter than
        ``size`` when ``xs`` holds fewer distinct values.
    """
    r = np.random.RandomState(seed)
    # Work on a plain ndarray so list inputs support boolean masking too.
    pool = np.asarray(xs)
    # A list (not a set) keeps draw order, which makes the output reproducible
    # for a given seed even when the values are strings.
    picked = []
    while len(pool) > 0 and len(picked) < size:
        new_x = r.choice(pool)
        picked.append(new_x)
        if pd.isna(new_x):
            # NaN != NaN, so an inequality mask would never drop missing
            # values and the loop would keep drawing them.
            pool = pool[~pd.isna(pool)]
        else:
            pool = pool[pool != new_x]
    return picked

In [3]:
def sample_until_lazy(xs, size, seed=0):
    """Draw up to ``size`` distinct values from ``xs``, pruning the pool lazily.

    Elements are drawn uniformly at random (with replacement) from the pool.
    Repeated draws of an already-picked value are ignored. Every 1000 draws
    the already-picked values are removed from the pool so that later draws
    are not wasted on them.

    Parameters
    ----------
    xs : array-like
        One-dimensional values (list, NumPy array or pandas Series).
    size : int
        Maximum number of distinct values to return.
    seed : int, optional
        Seed for the private ``RandomState`` used for the draws.

    Returns
    -------
    list
        The distinct values in first-drawn order. Shorter than ``size`` when
        ``xs`` holds fewer distinct values.
    """
    r = np.random.RandomState(seed)
    pool = np.asarray(xs)
    # dict keys give set semantics while preserving insertion order, so the
    # result is reproducible for a given seed.
    picked = {}
    nan_picked = False
    draws = 0
    while len(picked) < size and len(pool) > 0:
        new_x = r.choice(pool)
        if pd.isna(new_x):
            # NaN never compares equal to itself; record it at most once.
            if not nan_picked:
                picked[new_x] = None
                nan_picked = True
        else:
            picked[new_x] = None
        draws += 1
        if draws % 1000 == 0:
            # np.isin treats a set as ONE opaque element, so the picked values
            # must be handed over as an array for the prune to remove anything.
            seen = np.array(list(picked), dtype=pool.dtype)
            keep = np.isin(pool, seen, invert=True)
            if nan_picked:
                # Equality-based membership cannot match NaN; drop it explicitly.
                keep &= ~pd.isna(pool)
            pool = pool[keep]
    return list(picked)

# Microsoft

In [4]:
# Load the Microsoft cube CSV export into `df`.
df = pd.read_csv("/Users/edwardgan/Documents/Projects/datasets/msft/mb-3M-cube.csv")

In [5]:
# Group the cube rows by the four dimension columns.
gs = df.groupby(
    by=[
        "TenantId",
        "AppInfo_Version",
        "UserInfo_TimeZone",
        "DeviceInfo_NetworkType",
    ]
)

In [82]:
# Sample up to 200 distinct OS-build values (seed 0) and save them as the
# tracked set for this column.
x_track = sample_until(
    df["DeviceInfo_OsBuild"],
    size=200,
    seed=0,
)
df_track = pd.DataFrame(data={"x_track": x_track})
df_track.to_csv(
    "/Users/edwardgan/Documents/Projects/datasets/msft/mb-3M-os-track.csv",
    index=False,
)

In [111]:
# Sample up to 200 distinct network-provider values (seed 0) and save them
# as the tracked set for this column.
x_track = sample_until_lazy(
    df["DeviceInfo_NetworkProvider"],
    size=200,
    seed=0,
)
df_track = pd.DataFrame(data={"x_track": x_track})
df_track.to_csv(
    "/Users/edwardgan/Documents/Projects/datasets/msft/mb-3M-network-track.csv",
    index=False,
)

In [48]:
# Track the 0th..100th percentiles (101 points) of the non-null
# records_received_count values and write them out.
x_to_track = np.percentile(df["records_received_count"].dropna(), q=np.arange(101))
df_track = pd.DataFrame(data={"x_track": x_to_track})
df_track.to_csv(
    "/Users/edwardgan/Documents/Projects/datasets/msft/mb-3M-records-track.csv",
    index=False,
)

# Instacart Data

In [89]:
# Load the Instacart orders table and the prior order/product line items.
df_orders = pd.read_csv("/Users/edwardgan/Documents/Projects/datasets/instacart/orders.csv")
df_op = pd.read_csv(
    "/Users/edwardgan/Documents/Projects/datasets/instacart/order_products__prior.csv"
)

In [90]:
# Inspect the column names of the order/product line-item table.
df_op.columns

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered'], dtype='object')

In [91]:
# Load the Instacart lookup tables: aisles, departments and products.
df_aisles = pd.read_csv(
    "/Users/edwardgan/Documents/Projects/datasets/instacart/aisles.csv"
)
df_dept = pd.read_csv(
    "/Users/edwardgan/Documents/Projects/datasets/instacart/departments.csv"
)
df_products = pd.read_csv(
    "/Users/edwardgan/Documents/Projects/datasets/instacart/products.csv"
)

In [92]:
# Attach each order's day-of-week and hour-of-day to its line items
# (inner join on order_id).
df_g = pd.merge(
    df_op[["order_id", "product_id", "reordered", "add_to_cart_order"]],
    df_orders[["order_id", "order_dow", "order_hour_of_day"]],
    how="inner",
    on=["order_id"],
)

In [103]:
# Sample up to 200 distinct product ids (seed 1) and save them under column "f".
x_track = sample_until_lazy(
    df_g["product_id"],
    size=200,
    seed=1,
)
track_df = pd.DataFrame(data={"f": x_track})
track_df.to_csv(
    "/Users/edwardgan/Documents/Projects/datasets/instacart/tracked.csv",
    index=False,
)

In [105]:
# Cap the cart position to the range [0, 30] so it can serve as a bounded
# grouping dimension.
df_g["add_to_cart_order"] = df_g["add_to_cart_order"].clip(lower=0, upper=30)

In [106]:
# Group by the four dimension columns and report how many groups result.
gs = df_g.groupby(
    by=[
        "reordered",
        "order_dow",
        "order_hour_of_day",
        "add_to_cart_order",
    ]
)
print(len(gs))

10080


In [53]:
# Persist the product id together with the four grouping dimensions as Feather.
df_g.loc[
    :,
    ["product_id", "reordered", "order_dow", "order_hour_of_day", "add_to_cart_order"],
].to_feather(
    "/Users/edwardgan/Documents/Projects/datasets/instacart/p_df.feather"
)

In [233]:
# Scratch constants. NOTE(review): n is presumably a row count and k a
# tracked-set size -- confirm against where they are used.
n = 32_434_489
k = 200

In [3]:
# Preview the first rows of the orders table.
df_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [46]:
# Rebuild df_g as (order_id, product_id) rows sorted by and indexed on
# (order_dow, order_hour_of_day), so one time slot can be looked up with .loc.
# Note: this replaces the wider df_g built in an earlier cell.
df_g = (
    pd.merge(
        df_op[["order_id", "product_id"]],
        df_orders[["order_id", "order_dow", "order_hour_of_day"]],
        how="inner",
        on=["order_id"],
    )
    .sort_values(["order_dow", "order_hour_of_day"])
    .set_index(["order_dow", "order_hour_of_day"])
)

In [90]:
# Collect one array of product ids per (day-of-week, hour-of-day) slot that
# actually occurs in the index, in day-major order.
product_lists = []
for day_of_week in tqdm(range(7)):
    for hour_of_day in range(24):
        cur_idx = (day_of_week, hour_of_day)
        if cur_idx not in df_g.index:
            # No orders were placed in this slot; skip it.
            continue
        current_list = df_g.loc[cur_idx]["product_id"].values
        product_lists.append(current_list)

100%|██████████| 7/7 [00:00<00:00, 42.70it/s]


In [91]:
# Write one JSON array of product ids per line (one line per time slot).
with open("/Users/edwardgan/Documents/Projects/datasets/instacart/products_grouped.txt", "w") as f:
    for plist in product_lists:
        # print appends the trailing newline for us.
        print(json.dumps(plist.tolist()), file=f)

# MSFT Data

In [65]:
# Column names, in file order, applied to the header-less MSFT TSV export.
column_names = [
    "PipelineInfo_IngestionTime",
    "SDKVersion",
    "APIVersion",
    "DeviceHash_Id",
    "AppInfo_Language",
    "AppInfo_Version",
    "DeviceInfo_Make",
    "DeviceInfo_OsBuild",
    "DeviceInfo_OsVersion",
    "DeviceInfo_Model",
    "DeviceInfo_NetworkType",
    "DeviceInfo_NetworkProvider",
    "UserInfo_Language",
    "UserInfo_TimeZone",
    "eventpriority",
    "records_received_count",
    "records_tried_to_send_count",
    "records_sent_count",
    "olsize",
    "olsize_start",
    "olc_start",
    "ol_w",
    "olc",
    "records_dropped_count",
    # "UserHash_Id",  (commented out: not part of the column list)
    "inq",
    "infl",
    "r_count",
    "PipelineInfo_ClientCountry",
    "EventInfo_InitId",
    "EventInfo_Sequence",
    "e_meth",
    "TenantId",
    "DataPackageId",
    "EventInfo_Time",
    "r_no_name",
    "r_size",
    "r_ban",
    "r_kl",
    "r_ps",
    "r_403",
    "r_inv",
    "d_assert",
    "d_bad_tenant",
    "d_disk_full",
    "d_io_fail",
    "d_bond_fail",
    "d_disk_off",
    "d_unk",
]

In [125]:
# Load the tab-separated MSFT sample; the file has no header row, so the
# names from column_names are applied.
df = pd.read_csv(
    "/Users/edwardgan/Documents/Projects/datasets/msft/mb200k.tsv",
    sep="\t",
    header=None,
    names=column_names,
)

In [126]:
# Treat missing values in the q_metric column as zero.
# NOTE: q_metric is assigned in a later cell of this notebook; run that first.
df[q_metric] = df[q_metric].fillna(value=0)

In [127]:
# Label-encode every categorical column: missing values become the literal
# "na", then each distinct value is replaced by its position in value_counts()
# order (0 = most frequent).
# NOTE: f_metrics and dims are assigned in a later cell (In [105]); run that
# cell first on a fresh kernel.
for cur_f in tqdm(f_metrics + dims):
    df[cur_f] = df[cur_f].fillna("na")
    vc = df[cur_f].value_counts()
    vc_rep = dict(zip(vc.index, range(len(vc))))
    # Series.map is a single dictionary lookup per row on just this column.
    # It avoids a whole-frame in-place replace and cannot be confused by
    # original values that coincide with replacement codes.
    df[cur_f] = df[cur_f].map(vc_rep)

100%|██████████| 6/6 [00:05<00:00,  1.21it/s]


In [105]:
# Column roles for the MSFT data: two tracked item columns, one numeric
# column, and the four grouping dimensions.
f_metrics = ["DeviceInfo_OsBuild", "DeviceInfo_NetworkProvider"]
q_metric = "records_received_count"
dims = ["TenantId", "AppInfo_Version", "UserInfo_TimeZone", "DeviceInfo_NetworkType"]

In [130]:
# Reload the MSFT sample from its Feather copy (this replaces the `df`
# built from the TSV).
df = pd.read_feather(
    "/Users/edwardgan/Documents/Projects/datasets/msft/mb200k.feather"
)

  labels, = index.labels


# Avazu Data

In [170]:
# Load only the first two million rows of the Avazu training file.
df = pd.read_csv(
    "/Users/edwardgan/Documents/Projects/datasets/avazu/all/train",
    nrows=2_000_000,
)

In [191]:
# Tracked column and grouping dimensions for the Avazu data.
target = "site_id"
dims = [
    "hour",
    "click",
    "banner_pos",
    "app_category",
    "C18",
    "C21",
]

In [184]:
# Inspect the columns of the hour-filtered frame.
# NOTE(review): dfh is assigned in a later cell (In [187]); on a fresh kernel
# that cell must run before this one.
dfh.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

In [192]:
# Largest per-group row count.
# NOTE: dfg is assigned in a later cell (In [187]); run that first.
dfg.max()

33984

In [187]:
# Keep a single hour of traffic, count rows per dimension combination, and
# report the total row count relative to the largest group.
dfh = df.loc[df["hour"] == 14102100]
dfg = dfh.groupby(dims)["id"].count()
rr = dfh.shape[0] / dfg.max()
print("ratio: {}".format(rr))
print("num groups: {}".format(len(dfg)))

ratio: 3.5018243879472695
num groups: 379


In [134]:
# Number of entries in the grouped counts `dfg` (one per dimension combination
# when dfg comes from the groupby cell above).
len(dfg)

535

In [135]:
# Scratch arithmetic. NOTE(review): the meaning of 500 and 240 is not stated
# anywhere in this notebook -- document or remove.
500*240

120000

# CAIDA PCap

In [27]:
# Load the CAIDA destination stream from the working directory.
df = pd.read_csv(
    "caida1M-dest-stream.csv"
)

In [37]:
# Draw 400 destination values from the stream. Sampling is WITH replacement,
# so the result may contain repeated destinations.
# A seeded RandomState replaces the global NumPy RNG so that re-running the
# notebook reproduces the same tracked set, like every other sample here.
x_track = np.random.RandomState(0).choice(df["Destination"], 400)

In [42]:
# Save the sampled destinations as a one-column CSV in the working directory.
pd.DataFrame(data={"x_track": x_track}).to_csv(
    "caida1M-xtrack.csv",
    index=False,
)

In [6]:
# Load the CAIDA packet-capture extract.
df_i = pd.read_csv(
    "/Users/edwardgan/Documents/Projects/datasets/caida-pcap/time_10M.csv"
)

In [None]:
# Fill every missing value with the placeholder "eth", then encode each
# destination IP by its position in value_counts() order (0 = most frequent).
df = df_i.fillna("eth")
# A comprehension keeps the loop variables local. The previous top-level
# `for i, k in ...` loop overwrote the notebook-level constant `k`.
ip_map = {
    ip: rank
    for rank, ip in enumerate(df["ip.dst"].value_counts().keys())
}
mapped_dest_ips = df["ip.dst"].map(ip_map)
df_out = pd.DataFrame({"ip.dst": mapped_dest_ips})

In [None]:
# Persist the integer-encoded destination column.
df_out.to_csv(
    "/Users/edwardgan/Documents/Projects/datasets/caida-pcap/caida10M-ipdst.csv",
    index=False,
)

In [19]:
# Reload the integer-encoded destination column from disk.
df_out = pd.read_csv(
    "/Users/edwardgan/Documents/Projects/datasets/caida-pcap/caida10M-ipdst.csv"
)

In [20]:
# Sample up to 200 distinct encoded destinations (seed 0, the function default).
x_track = sample_until_lazy(
    df_out["ip.dst"],
    size=200,
    seed=0,
)

In [21]:
# Save the tracked destinations as a one-column CSV.
pd.DataFrame(data={"x_track": x_track}).to_csv(
    "/Users/edwardgan/Documents/Projects/datasets/caida-pcap/caida10M-ipdst-xtrack.csv",
    index=False,
)

# Power

In [31]:
# Load the household power readings; the file is ';'-separated and marks
# missing values with '?'.
df = pd.read_csv(
    "/Users/edwardgan/Documents/Projects/datasets/household/household_power_consumption.txt",
    na_values=["?"],
    sep=";",
)

In [33]:
# Export the non-null Global_active_power readings as a single-column CSV.
df.loc[:, ["Global_active_power"]].dropna().to_csv(
    "/Users/edwardgan/Documents/Projects/datasets/household/power.csv",
    index=False,
)

In [34]:
# Track the 0th..100th percentiles (101 points) of the non-null power readings.
x_to_track = np.percentile(df["Global_active_power"].dropna(), q=np.arange(101))

In [37]:
# Save the tracked percentile values as a one-column CSV.
pd.DataFrame(data={"x_track": x_to_track}).to_csv(
    "/Users/edwardgan/Documents/Projects/datasets/household/power_tracked.csv", index=False
)

# Zipf

In [49]:
# Build a synthetic Zipf(1.1) stream with a fixed seed.
r = np.random.RandomState(seed=0)
total_size = 10_000_000
n_max = 1_000_000
# Draw twice as many samples as needed because the filter below discards some.
x_stream = r.zipf(1.1, size=2*total_size)
# Keep only values below n_max, then truncate to total_size. If fewer than
# total_size samples survive the filter, the stream is silently shorter.
x_stream = x_stream[x_stream < n_max][:total_size]

In [56]:
# Save the synthetic stream as a one-column CSV in the working directory.
pd.DataFrame(data={"x": x_stream}).to_csv(
    "zipf10M.csv",
    index=False,
)

In [16]:
# Reload the synthetic Zipf stream from disk.
df = pd.read_csv(
    "zipf10M.csv"
)

In [17]:
# Sample up to 200 distinct stream values (seed 1) and save them as the
# tracked set.
x_to_track = sample_until_lazy(
    df["x"],
    size=200,
    seed=1,
)
pd.DataFrame(data={"x_track": x_to_track}).to_csv(
    "zipf10M-xtrack.csv",
    index=False,
)