# Creating the daily graph batches

In [79]:
# Necessary imports
%autosave 120
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path as osp
import utils as ut
%load_ext autoreload
%autoreload 2

Autosaving every 120 seconds
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# GNN related imports
import torch
from torch_geometric.nn import Node2Vec

import torch_geometric.datasets as datasets
import torch_geometric.data as data
import torch_geometric.transforms as transforms
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import TUDataset
from torch_geometric.data import InMemoryDataset, Data, download_url, extract_zip, HeteroData

import networkx as nx
from torch_geometric.utils.convert import to_networkx

from sklearn.manifold import TSNE
from tqdm.notebook import tqdm
from sklearn.preprocessing import minmax_scale


In [3]:
# Helper functions
def walk_up_folder(path, depth=1):
    """
    Helper method to navigate the file system and get to the file location
    """
    _cur_depth = 1
    while _cur_depth < depth:
        path = os.path.dirname(path)
        _cur_depth += 1
    return path



device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = "cpu"


log_dir = os.path.join(walk_up_folder(os.getcwd(), 3), "dataset/logs/")
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    print("Logs folder created")


raw_dir = os.path.join(walk_up_folder(os.getcwd(), 3), "dataset/root/raw/")
proc_dir = os.path.join(walk_up_folder(os.getcwd(), 3), "dataset/root/processed/")
mapping_dir = os.path.join(walk_up_folder(os.getcwd(), 3), "dataset/root/mapping/")
split_dir = os.path.join(walk_up_folder(os.getcwd(), 3), "dataset/root/split/")
node_dir = os.path.join(raw_dir, "node-feat")
edge_dir = os.path.join(raw_dir, "relations")


Logs folder created


## Creating out own inmemory dataset

You can find graph data to test from 

* https://networkrepository.com/econ.php 
* https://networkrepository.com/dynamic.php 
* https://networkrepository.com/heter.php
* https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html?highlight=OGB_MAG#torch_geometric.datasets.OGB_MAG  
* https://github.com/Soughing0823/LAGNN/blob/main/utils.py 
* 

From the official PyG documentation we see that some methods need to be override:

1) torch_geometric.data.InMemoryDataset.`raw_file_names()`: A list of files in the raw_dir which needs to be found in order to skip the download.

2) torch_geometric.data.InMemoryDataset.`processed_file_names()`: A list of files in the processed_dir which needs to be found in order to skip the processing.

3) torch_geometric.data.InMemoryDataset.`download()`: Downloads raw data into raw_dir.

4) torch_geometric.data.InMemoryDataset.`process()`: Processes raw data and saves it into the processed_dir.






In [31]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= os.path.join(ut.walk_up_folder(os.getcwd(), 5), "wmt-mobius-dev-svcmobius.json")

In [None]:
uploaded = []

In [40]:
myFolder = osp.join(ut.walk_up_folder(os.getcwd(),5),'dataset_old')
fileset = set()
for root, dirs, files in os.walk(myFolder):
    for fileName in files:
        if fileName=='.DS_Store': continue
        fileset.add( os.path.join( root[len(myFolder):], fileName )[1:])



# ut.gcp_df_to_csv()


'root/raw/relations/2021-02-24.csv'

In [85]:
for file in uploaded:
    try: 
        fileset.remove(file)
    except:
        x=1

In [86]:
for f in fileset: print(f)
# fileset.remove('root/processed/data.pt')

In [84]:
for file in fileset:
    # if file in uploaded: continue
    source = os.path.join(myFolder,file)
    destination =  os.path.join('n0s011m/dataset_s2h_and_mp_30_subcats',file)
    ut.gcp_df_to_csv('mobius_data_science',source,destination)
    uploaded.append(file)

CSV file - /Users/a0s0u22/Mobius/dataset_old/root/raw/relations/2021-06-23.csv has been uploaded
CSV file - /Users/a0s0u22/Mobius/dataset_old/root/raw/relations/2021-08-15.csv has been uploaded
CSV file - /Users/a0s0u22/Mobius/dataset_old/root/raw/relations/2021-11-24.csv has been uploaded
CSV file - /Users/a0s0u22/Mobius/dataset_old/root/raw/relations/2021-01-19.csv has been uploaded
CSV file - /Users/a0s0u22/Mobius/dataset_old/root/raw/relations/2021-01-25.csv has been uploaded
CSV file - /Users/a0s0u22/Mobius/dataset_old/root/raw/relations/2021-02-06.csv has been uploaded
CSV file - /Users/a0s0u22/Mobius/dataset_old/root/raw/relations/2021-11-22.csv has been uploaded
CSV file - /Users/a0s0u22/Mobius/dataset_old/root/raw/relations/2021-06-20.csv has been uploaded
CSV file - /Users/a0s0u22/Mobius/dataset_old/root/raw/relations/2021-07-08.csv has been uploaded
CSV file - /Users/a0s0u22/Mobius/dataset_old/root/raw/relations/2021-05-05.csv has been uploaded
CSV file - /Users/a0s0u22/Mobi

In [12]:
class IdentityEncoder(object):
    def __init__(self, dtype=None):
        self.dtype = dtype

    def __call__(self, df):
        return torch.from_numpy(df.values).view(-1, 1).to(self.dtype)


class WmtNetworkDataset(InMemoryDataset):
    def __init__(self, root_dir, transform=None, pre_transform=None):
        """
        root = Where the dataset should be stored. This folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data).
        """
        self.root = root_dir
        super(WmtNetworkDataset, self).__init__(root_dir, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_dir(self):
        return osp.join(self.root, 'raw/relations')

    @property
    def processed_dir(self):
        return osp.join(self.root, 'processed')

    @property
    def raw_file_names(self):
        # edge relations files
        file_list = sorted(os.listdir(self.raw_dir))
        return [osp.join(self.raw_dir, f) for f in file_list]

    @property
    def processed_file_names(self):
        return 'data.pt'

    def download(self):
        ...

    def process(self):

        proc_dir = osp.join(self.root, "processed/")
        mapping_dir = osp.join(self.root, "mapping/")
        split_dir = osp.join(self.root, "split/")

        node_dir = osp.join(ut.walk_up_folder(self.raw_dir, 2), "node-feat")
        edge_dir = self.raw_dir

        # File locations for the mappings and the features of the nodes
        cust_mapping_path = os.path.join(mapping_dir, 'cust_entidx2name.csv')
        ship_mapping_path = os.path.join(mapping_dir, 'shipnode_entidx2name.csv')
        prod_mapping_path = os.path.join(mapping_dir, 'product_entidx2name.csv')

        cust_node_feats = os.path.join(node_dir, 'custnode-feat.csv')
        ship_node_feats = os.path.join(node_dir, 'shipnode-feat.csv')
        prod_node_feats = os.path.join(node_dir, 'productnode-feat.csv')

        # Get node features
        # Loading Node mappings and their features
        cust_x, cust_idx_mapping = self.load_node_csv(cust_node_feats, cust_mapping_path)
        ship_x, ship_idx_mapping = self.load_node_csv(ship_node_feats, ship_mapping_path)
        prod_x, prod_idx_mapping = self.load_node_csv(prod_node_feats, prod_mapping_path)

        # Get edge features
        # In the loop we extract the nodes' embeddings, edges connectivity for
        # and label for a graph, process the information and put it in a Data
        # object, then we add the object to a list
        sdc_edge_indices, sdc_edge_attrs, sdc_edge_labels, cop_edge_indices, cop_edge_attrs, cop_edge_labels = self.get_edges_from_files(
            cust_idx_mapping, ship_idx_mapping, prod_idx_mapping)

        # Create data object

        data = HeteroData()
        data['product'].x = prod_x  # [num_products, num_features_product]
        data['customer'].x = cust_x  # [num_customers, num_features_customer]
        data['shipnode'].x = ship_x  # [num_shipnodes, num_features_shipnode]

        for node_type in ['customer', 'product', 'shipnode']:
            data[node_type].num_nodes = data[node_type].x.size(0)

        for edge_type in [('customer', 'orders', 'product'), ('shipnode', 'delivers', 'customer')]:
            if edge_type == ('customer', 'orders', 'product'):
                data['customer', 'orders', 'product'].edge_index = torch.cat(cop_edge_indices,
                                                                             1)  # [2, num_edges_orders]
                data['customer', 'orders', 'product'].edge_attr = minmax_scale(torch.cat(cop_edge_attrs,
                                                                            0))  # [num_edges_orders, num_features_orders]
            else:
                data['shipnode', 'delivers', 'customer'].edge_index = torch.cat(sdc_edge_indices,
                                                                                1)  # [2, num_edges_delivered]
                data['shipnode', 'delivers', 'customer'].edge_attr = minmax_scale(torch.cat(sdc_edge_attrs,
                                                                               0))  # [num_edges_affiliated, num_features_affiliated]
                data['shipnode', 'delivers', 'customer'].edge_label = torch.cat(sdc_edge_labels, 0)  # [num_edges,1]

        # Apply the functions specified in pre_filter and pre_transform
        if self.pre_filter is not None:
            data = self.pre_filter(data)

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        # Store the processed data
        torch.save(self.collate([data]), self.processed_paths[0])

    def load_node_csv(self, featpath, idxpath):
        """
        This will return a matrix / 2d array of the shape
        [Number of Nodes, Node Feature size]
        """
        df = pd.read_csv(featpath)
        map_df = pd.read_csv(idxpath)
        mapping = dict(zip(map_df["ent_name"], map_df["ent_idx"]))
        x = torch.tensor(df.values, dtype=torch.float)
        return x, mapping

    def load_edge_csv(self, edge_file_path, edge_cols, src_index_col, src_mapping, dst_index_col, dst_mapping,
                      encoders=None):
        """
        This will return a matrix / 2d array of the shape
        [Number of edges, Edge Feature size]
        """
        df = pd.read_csv(edge_file_path)
        # src = [src_mapping[index] for index in df[src_index_col]]
        # dst = [dst_mapping[index] for index in df[dst_index_col]]
        src = []
        dst = []
        for index, row in df.iterrows():
            try:
                s = src_mapping[row[src_index_col]]
                d = dst_mapping[row[dst_index_col]]
            except:
                df.drop(index, inplace=True)
                # print("Missed a key")
                continue
            src.append(s)
            dst.append(d)
        edge_index = torch.tensor([src, dst])

        edge_attr = df[edge_cols]
        edge_attr = torch.tensor(edge_attr.values, dtype=torch.float)

        edge_label = None
        if encoders is not None:
            edge_label = [encoder(df[col]) for col, encoder in encoders.items()]
            edge_label = torch.cat(edge_label, dim=-1)

        return edge_index, edge_attr, edge_label

    def get_edges_from_files(self, cust_idx_mapping, ship_idx_mapping, prod_idx_mapping):
        """
        """
        # Between customer and shipnode - involves delivery distance and how iot is shipped
        # cust_order_type

        dlvr_edge_attrb = ['same_zip', 'edge_travel_dist', 'shpg_cost_amt', 'inbound_shpg_cost_actl_amt']

        # Between shipnode and product - involves item quantity and shipping costs
        prod_edge_attrb = ['prod_cost_plcd_amt', 'net_qty', 'plcd_orig_qty', 'tax_plcd_amt', 'fulfmt_cost_amt',
                           'gmv_plcd_amt']

        time_attrb = ['order_hour', 'order_minute', 'order_plcd_tsYear', 'order_plcd_tsMonth',
                      'order_plcd_tsWeek', 'order_plcd_tsDay', 'order_plcd_tsDayofweek',
                      'order_plcd_tsDayofyear', 'order_plcd_tsIs_month_end',
                      'order_plcd_tsIs_month_start', 'order_plcd_tsIs_quarter_end',
                      'order_plcd_tsIs_quarter_start', 'order_plcd_tsIs_year_end',
                      'order_plcd_tsIs_year_start']

        label = ['SLA']

        sdc_edge_indices = []
        sdc_edge_attrs = []
        sdc_edge_labels = []
        cop_edge_indices = []
        cop_edge_attrs = []
        cop_edge_labels = []
        edges = []

        file_list = self.raw_file_names
        time_steps = len(file_list)
        print(time_steps)
        train_files, validate_files, test_files = np.split(file_list, [int(.8 * time_steps), int(.9 * time_steps)])

        for file_name in tqdm(file_list):
            edge_file_path = file_name
            # print(edge_file_path)

            sdc_edge_index, sdc_edge_attr, sdc_edge_label = self.load_edge_csv(edge_file_path,
                                                                               edge_cols=dlvr_edge_attrb + time_attrb,
                                                                               src_index_col="ship_node_codename",
                                                                               src_mapping=ship_idx_mapping,
                                                                               dst_index_col="custnode",
                                                                               dst_mapping=cust_idx_mapping,
                                                                               encoders={'SLA': IdentityEncoder(
                                                                                   dtype=torch.long)})
            edges.append(len(sdc_edge_label))

            cop_edge_index, cop_edge_attr, cop_edge_label = self.load_edge_csv(edge_file_path,
                                                                               edge_cols=prod_edge_attrb,
                                                                               src_index_col="custnode",
                                                                               src_mapping=cust_idx_mapping,
                                                                               dst_index_col="prmry_sku_id",
                                                                               dst_mapping=prod_idx_mapping)
            # encoders={'SLA': IdentityEncoder(dtype=torch.long)})
            sdc_edge_indices.append(sdc_edge_index)
            sdc_edge_attrs.append(sdc_edge_attr)
            sdc_edge_labels.append(sdc_edge_label)
            cop_edge_indices.append(cop_edge_index)
            cop_edge_attrs.append(cop_edge_attr)
            cop_edge_labels.append(cop_edge_label)

        return sdc_edge_indices, sdc_edge_attrs, sdc_edge_labels, cop_edge_indices, cop_edge_attrs, cop_edge_labels


    def __repr__(self) -> str:
        return 'wmt-network()'



In [32]:
import utils as ut
root_dir = os.path.join(walk_up_folder(os.getcwd(), 3), "dataset/root/")
data = WmtNetworkDataset(root_dir)[0]

In [33]:
data

HeteroData(
  [1mproduct[0m={
    x=[231219, 759],
    num_nodes=231219
  },
  [1mcustomer[0m={
    x=[754603, 66],
    num_nodes=754603
  },
  [1mshipnode[0m={
    x=[9818, 68],
    num_nodes=9818
  },
  [1m(customer, orders, product)[0m={
    edge_index=[2, 1278579],
    edge_attr=[1278579, 4],
    edge_label=[1278579, 1]
  },
  [1m(shipnode, delivers, customer)[0m={
    edge_index=[2, 1278579],
    edge_attr=[1278579, 18],
    edge_label=[1278579, 1]
  }
)

In [30]:
from copy import copy
import torch_geometric.transforms as T

def get_edge_split(data):
    edge_types = data.edge_types
    data = T.ToUndirected()(data)
    data = T.AddSelfLoops()(data)
    data = T.NormalizeFeatures()(data)
    rev_edge_types = [e for e in data.edge_types if e not in edge_types]

    for e in rev_edge_types:
        del data[e].edge_label
    #
    # data["edge_label"] = data[('shipnode', 'delivers', 'customer')].edge_label
    # del data[('shipnode', 'delivers', 'customer')].edge_label
    # Make a copy
    train_data, val_data, test_data = copy(data), copy(data), copy(data)
    # Create an array based on total data size
    num_edges = data[('shipnode', 'delivers', 'customer')].num_edges
    arr = torch.arange(num_edges)
    # Train test split of 80:10:10
    train_idx = arr[:int(0.8 * num_edges)]
    val_idx = arr[int(0.8 * num_edges):int(0.9 * num_edges)]
    test_idx = arr[int(0.9 * num_edges):]

    # Create indexing masks
    train_mask = torch.LongTensor(train_idx)
    val_mask = torch.LongTensor(val_idx)
    test_mask = torch.LongTensor(test_idx)

    # edge indexing
    for edge in edge_types+rev_edge_types:
        train_data[edge].edge_index = train_data[edge].edge_index[:, train_mask]
        train_data[edge].edge_attr = train_data[edge].edge_attr[train_mask, :]

        val_data[edge].edge_index = val_data[edge].edge_index[:, val_mask]
        val_data[edge].edge_attr = val_data[edge].edge_attr[val_mask, :]

        test_data[edge].edge_index = test_data[edge].edge_index[:, test_mask]
        test_data[edge].edge_attr = test_data[edge].edge_attr[test_mask, :]

    # Split labels
    for label in data.edge_label_dict:
        train_data[label].edge_label = train_data[label].edge_label[train_mask]
        val_data[label].edge_label = val_data[label].edge_label[val_mask]
        test_data[label].edge_label = test_data[label].edge_label[test_mask]

    return train_data, val_data, test_data

In [34]:
train_data, val_data, test_data = get_edge_split(data)
train_data

HeteroData(
  [1mproduct[0m={
    x=[231219, 759],
    num_nodes=231219
  },
  [1mcustomer[0m={
    x=[754603, 66],
    num_nodes=754603
  },
  [1mshipnode[0m={
    x=[9818, 68],
    num_nodes=9818
  },
  [1m(customer, orders, product)[0m={
    edge_index=[2, 1022863],
    edge_attr=[1022863, 4],
    edge_label=[1022863, 1]
  },
  [1m(shipnode, delivers, customer)[0m={
    edge_index=[2, 1022863],
    edge_attr=[1022863, 18],
    edge_label=[1022863, 1]
  },
  [1m(product, rev_orders, customer)[0m={
    edge_index=[2, 1022863],
    edge_attr=[1022863, 4]
  },
  [1m(customer, rev_delivers, shipnode)[0m={
    edge_index=[2, 1022863],
    edge_attr=[1022863, 18]
  }
)

In [55]:
data[("customer", "orders", "product")].edge_label.flatten().numpy()

array([1, 1, 1, ..., 1, 1, 2])

In [53]:
labels = pd.DataFrame({'netqty':data[("customer", "orders", "product")].edge_label.flatten().numpy()})
Q1 = labels['netqty'].quantile(0.25)
Q3 = labels['netqty'].quantile(0.75)
    
IQR = Q3 - Q1


lower_range = Q1 - 1.5 * IQR
upper_range = Q3 + 1.5 * IQR

lower_range,upper_range

(1.0, 1.0)

In [60]:
len(labels[(labels['netqty']<0) | (labels['netqty']>15)])

58

In [47]:
from torch_geometric.data import Data,Batch,LightningNodeData

In [49]:
LightningNodeData

ModuleNotFoundError: No module named 'pytorch_lightning'

In [50]:
!pip install pytorch_lightning

Looking in indexes: https://repository.cache.walmart.com/repository/pypi-proxy/simple/, https://pypi.org/simple
Collecting pytorch_lightning
  Downloading https://repository.cache.walmart.com/repository/pypi-proxy/packages/pytorch-lightning/pytorch_lightning-1.5.10-py3-none-any.whl?originalHref=aHR0cHM6Ly9maWxlcy5weXRob25ob3N0ZWQub3JnL3BhY2thZ2VzLzE4L2YxL2Y1OWIzMDdmNzVkYjE4ODZjOTZlMzk2ZWVjODc4NTAxNTEwNjc3Mzk0ODY4NjgwYjhkMmI4YjU4YzQ3Yy9weXRvcmNoX2xpZ2h0bmluZy0xLjUuMTAtcHkzLW5vbmUtYW55LndobCNzaGEyNTY9Yzg2ZWQ2MzNjZThkMjU2Njc3OTc2MzVkYjNlYTFmZTlmZTZiNTExMjFkNDNkNzAzMWQ1YmRiNGM1YjA0Njg5NQ== (527 kB)
[K     |████████████████████████████████| 527 kB 12 kB/s eta 0:00:011
[?25hCollecting setuptools==59.5.0
  Downloading https://repository.cache.walmart.com/repository/pypi-proxy/packages/setuptools/setuptools-59.5.0-py3-none-any.whl?originalHref=aHR0cHM6Ly9maWxlcy5weXRob25ob3N0ZWQub3JnL3BhY2thZ2VzLzQwL2E5LzdkZWFjNzZjNThmYTQ3Yzk1MzYwMTE2YTA2YjUzYjliNjJmNmRiMTEzMzZmZTYxYjZhYjUzNzg0ZDk4Yi9zZXR1cH

Collecting torchmetrics>=0.4.1
  Downloading https://repository.cache.walmart.com/repository/pypi-proxy/packages/torchmetrics/torchmetrics-0.7.2-py3-none-any.whl?originalHref=aHR0cHM6Ly9maWxlcy5weXRob25ob3N0ZWQub3JnL3BhY2thZ2VzL2Y3L2VjLzMxNjBmZDJkMzBiNTViMzVlOWNmZDg2NzBjOTVmY2FlYjFkYWE5ZGJhMjhhYTkxMmNmZTQwZDY5NmEzYi90b3JjaG1ldHJpY3MtMC43LjItcHkzLW5vbmUtYW55LndobCNzaGEyNTY9ZDBmYmY4NDQwOTEyZWY5M2YyMmUyMWJhZTQzZmRhOGZhMjZhNjUxMzEzYWNjM2VhOTNiZWFmZTNjODZkZDQ3NA== (397 kB)
[K     |████████████████████████████████| 397 kB 34 kB/s eta 0:00:011
Collecting absl-py>=0.4
  Downloading https://repository.cache.walmart.com/repository/pypi-proxy/packages/absl-py/absl_py-1.0.0-py3-none-any.whl?originalHref=aHR0cHM6Ly9maWxlcy5weXRob25ob3N0ZWQub3JnL3BhY2thZ2VzLzJjLzAzL2UzZTE5ZDNmYWY0MzBlZGUzMmU0MTIyMWIyOTRlMzc5NTJlMDZhY2M5Njc4MWM0MTdhYzI1ZDRhMDMyNC9hYnNsX3B5LTEuMC4wLXB5My1ub25lLWFueS53aGwjc2hhMjU2PTg0ZTZkY2RjNjljOTQ3ZDBjMTNlNTQ1N2QwNTZiZDQzY2FkZTRjMjM5M2RjZTAwZDY4NGFlZGVhNzdkZGMyYTM= (126 kB)
[K     

      Successfully uninstalled setuptools-58.0.4
Successfully installed absl-py-1.0.0 markdown-3.3.6 pyDeprecate-0.3.1 pytorch-lightning-1.5.10 setuptools-59.5.0 tensorboard-2.8.0 tensorboard-data-server-0.6.1 tensorboard-plugin-wit-1.8.1 torchmetrics-0.7.2 werkzeug-2.0.3


In [52]:
# from torch_geometric.loader import DataLoader
from torch.utils.data import DataLoader
loader = DataLoader(train_data, batch_size=32, shuffle=False)

In [44]:
for indices in loader.batch_sampler:
    collate_fn([train_data[i] for i in indices])

NameError: name 'collate_fn' is not defined

In [36]:
sampled_data = next(iter(loader))
sampled_data

TypeError: object of type 'int' has no len()