# P2V-MAP data streamer

In [1]:
import os

import pandas as pd
import p2vmap_data_streamer

In [2]:
# the path that contains the data
# NOTE: `os.path.expandvars` only substitutes `$VAR` / `${VAR}` references and
# leaves a leading `~` untouched, so `os.path.expanduser` is required to resolve
# the home directory. `expandvars` is kept so that environment variables used in
# the path are still substituted.
path = os.path.expanduser(
    os.path.expandvars("~/Dropbox/teaching/big-data/data/dr-g/upload")
)

## Data

In [3]:
# Load the market baskets, keep the first 1,000 rows, and retain only the rows
# of baskets that contain more than two distinct products.
baskets = (
    pd.read_parquet(f"{path}/market-baskets.parquet")
    .head(1_000)
    .loc[lambda rows: rows.groupby("basket")["product"].transform("nunique") > 2]
)
baskets.head()

Unnamed: 0,customer,product,basket
0,0,51,0
1,0,113,0
2,0,154,0
3,0,165,0
4,0,185,0


In [4]:
# show every row whose basket id equals "00000000"
baskets.loc[baskets["basket"].eq("00000000")]

Unnamed: 0,customer,product,basket
0,0,51,0
1,0,113,0
2,0,154,0
3,0,165,0
4,0,185,0
5,0,253,0
6,0,266,0


## Initialize streamer

In [5]:
# Build the streamer on the filtered baskets.
data_stream_p2v = p2vmap_data_streamer.DataStreamP2V(
    # input frame and the columns that identify baskets and products
    data=baskets,
    variable_basket="basket",
    variable_product="product",
    # batching options -- presumably the number of (center, context) pairs per
    # batch and whether the data is shuffled first; confirm against
    # `p2vmap_data_streamer`
    batch_size=4,
    shuffle=False,
    # presumably the number of negative samples drawn per pair -- TODO confirm
    n_negative_samples=4,
)

## Generate one batch

In [6]:
# draw a single batch from the streamer and unpack its three returned arrays
(
    center_products,
    context_products,
    negative_samples,
) = data_stream_p2v.generate_batch()

In [7]:
# inspect the first array returned by `generate_batch()`: the center products
center_products

array([51, 51, 51, 51])

In [8]:
# inspect the second array returned by `generate_batch()`: the context products
context_products

array([113, 154, 165, 185])

In [9]:
# inspect the third array returned by `generate_batch()`: the negative samples
# (presumably one row per center/context pair with `n_negative_samples`
# columns -- TODO confirm against `p2vmap_data_streamer`)
negative_samples

array([[241, 189, 110, 107],
       [  6,  25, 105, 293],
       [226,  37, 136, 139],
       [145, 141,  71, 151]], dtype=int32)


<br>
<br>
&mdash; <br>
Sebastian Gabel <br>
`Learning from Big Data` 2021/22