### Importing required libraries

In [47]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
from torch_geometric.data import Data
from torch_geometric import utils
from pyvis.network import Network
import re


### Convert raw data to graph

In [48]:

def process_edges(edges):
    edge_index = []
    in_degrees = np.zeros((262111, 1))

    for line in edges:
        if line.startswith('#'):
            continue
        start, end = map(int, line.strip().split())
        in_degrees[end][0] += 1
        edge_index.append([start, end])

    return edge_index, in_degrees

def create_graph(edges):
    edge_index, in_degrees = process_edges(edges)
    edge_index = torch.tensor(edge_index).t().contiguous()
    graph = Data(x=torch.tensor(in_degrees), edge_index=edge_index)
    return graph

#Import the node to node raw data taken from SNAP
with open('../AmazonNodeData.txt', 'r') as f:
    edges = f.readlines()

graph = create_graph(edges)
torch.save(graph, 'AmazonNodeGraph.pt')

In [49]:
print(f"The Number of nodes in the graph is {graph.num_nodes}")
print(f"The Number of edges in the graph is {graph.num_edges}")

The Number of nodes in the graph is 262111
The Number of edges in the graph is 1234877


### Extract the metadata


In [50]:
def parse_metadata(file_path):
    metadata = {}

    with open(file_path, 'r', encoding="utf8") as file:
        # Skip the first two lines
        for _ in range(2):
            next(file)

        product_data = {}
        for line in file:
            line = line.strip()
            if line:
                try:
                    key, value = map(str.strip, line.split(':', 1))
                    product_data[key] = value
                except ValueError:
                    continue  # Skip lines that cannot be split into key-value pairs
            else:
                if product_data:
                    product_id = product_data.get('Id')
                    if product_id:
                        metadata[product_id] = product_data.copy()  # Save a copy to avoid modifying the same dict
                    product_data.clear()  # Clear the dictionary for the next product

    return metadata


# Example usage
metadata = parse_metadata('../amazon-meta.txt')

product_1_metadata = metadata.get('1')

if product_1_metadata:
    print(product_1_metadata)
else:
    print("Product with ID '1' not found.")

{'Id': '1', 'ASIN': '0827229534', 'title': 'Patterns of Preaching: A Sermon Sampler', 'group': 'Book', 'salesrank': '396585', 'similar': '5  0804215715  156101074X  0687023955  0687074231  082721619X', 'categories': '2', 'reviews': 'total: 2  downloaded: 2  avg rating: 5', '2000-7-28  cutomer': 'A2JW67OY8U6HHK  rating: 5  votes:  10  helpful:   9', '2003-12-14  cutomer': 'A2VE83MZF98ITY  rating: 5  votes:   6  helpful:   5'}


In [51]:
product_676_metadata = metadata.get('676')
print(product_676_metadata)

{'Id': '676', 'ASIN': '1565113306', 'title': 'Fight Club', 'group': 'Book', 'salesrank': '359382', 'similar': '5  0385498721  0385720920  0393319296  0385722192  0385509472', 'categories': '5', 'reviews': 'total: 551  downloaded: 550  avg rating: 4.5', '1998-2-25  cutomer': 'ATVPDKIKX0DER  rating: 5  votes:   1  helpful:   0', '1998-6-18  cutomer': 'ATVPDKIKX0DER  rating: 5  votes:   3  helpful:   1', '1998-6-29  cutomer': 'AKZGJL6DVTYKM  rating: 3  votes:   2  helpful:   2', '1998-7-28  cutomer': 'ATVPDKIKX0DER  rating: 5  votes:   1  helpful:   0', '1998-8-14  cutomer': 'ATVPDKIKX0DER  rating: 5  votes:   2  helpful:   1', '1999-2-2  cutomer': 'ATVPDKIKX0DER  rating: 5  votes:   2  helpful:   1', '1999-2-17  cutomer': 'ATVPDKIKX0DER  rating: 5  votes:   2  helpful:   0', '1999-2-23  cutomer': 'ATVPDKIKX0DER  rating: 3  votes:   3  helpful:   2', '1999-2-25  cutomer': 'A2E942BYMDZWNE  rating: 5  votes:   1  helpful:   0', '1999-3-15  cutomer': 'A3Q39DYUT1PON8  rating: 5  votes:   1  h

In [52]:
print(f"The Number of ids in the metadata = {len(metadata)}")

The Number of ids in the metadata = 548552


#### Create a sample graph for representation

In [53]:
# Load the graph data from the file 'AmazonNodeGraph.pt'
graph = torch.load('AmazonNodeGraph.pt')

# Generate a mask where True indicates nodes to be preserved, and False indicates nodes to be discarded
mask = torch.zeros(graph.num_nodes, dtype=torch.bool)
mask[:100] = True

# Create and store a smaller graph by selecting nodes based on the mask
g = Data(x=graph.x[mask], edge_index=utils.subgraph(mask, graph.edge_index)[0])
torch.save(g, 'sampledGraph.pt')

In [54]:
# Load the graph
g = torch.load('sampledGraph.pt')

# Initialize the PyVis network
net = Network(height="750px", width="100%", bgcolor="#0b0114", font_color="#875cad")

# Iterate over edges and add nodes and edges to the PyVis network
for src, dst in tqdm(g.edge_index.T):
    src_id = str(src.item())
    dst_id = str(dst.item())
    
    if src_id == '0' or dst_id == '0':
        continue
    
    src_metadata = metadata.get(src_id, {})
    dst_metadata = metadata.get(dst_id, {})
    
    src_title = f"Title: {src_metadata.get('title', 'N/A')}"
    
    dst_title = f"Title: {dst_metadata.get('title', 'N/A')}"
    
    net.add_node(src_id, label=src_title, title=src_title)
    net.add_node(dst_id, label=dst_title, title=dst_title)
    net.add_edge(src_id, dst_id, value=0.1)

# Save the network visualization as an HTML file
net.show("sampledGraph.html", notebook=False)

100%|██████████| 353/353 [00:00<00:00, 19577.01it/s]

sampledGraph.html





## Generate customer data csv

In [55]:
import re

f = open('../amazon-meta.txt',mode='r',encoding='utf-8')
fulltxt=f.readlines()
f.close()
s=pd.Series(fulltxt)[2:]
s=s.reset_index(drop=True)
indices=s[s=='\n'].index

dataframe=pd.DataFrame(columns=['Id','ASIN','cutomer_id'])

txt=[]
for i,indx in enumerate(indices):
    if i<len(indices)-1:
        b=s[(indices[i]+1):indices[i+1]].sum()
        txt.append(b)


s_txt=pd.Series(txt)
dataframe=pd.DataFrame(s_txt)
dataframe.columns=['fulltxt']

dataframe['id']=dataframe['fulltxt'].map(lambda b:re.findall('Id:   \\d+',b)[0][6:])
dataframe['ASIN']=dataframe['fulltxt'].map(lambda b:re.findall('ASIN: .+\n',b)[0][6:-1])
dataframe['DiscontinuedORNOT']=dataframe['fulltxt'].map(lambda b:'discontinued' in b)

dataframe_F=dataframe[dataframe.DiscontinuedORNOT==False]
dataframe_T=dataframe[dataframe.DiscontinuedORNOT==True]
dataframe_F['cutomer_id']=dataframe_F['fulltxt'].map(lambda b:[a[10:] for a in re.findall('cutomer: \\w+',b)])
dataframe_F.to_csv('amazon_customer_data.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_F['cutomer_id']=dataframe_F['fulltxt'].map(lambda b:[a[10:] for a in re.findall('cutomer: \\w+',b)])
