In [38]:
import pandas as pd
import networkx as nx
import numpy as np
from itertools import combinations
from tqdm import tqdm

## Loading and Exploring Data

In [2]:
# Read the CSV at `data_file` into a DataFrame and preview its first five rows.
data_file = 'data/2023_filtered_data.csv'
df = pd.read_csv(filepath_or_buffer=data_file)
df.head(5)

Unnamed: 0,user_id,parent_asin,rating,timestamp,category
0,AFZUK3MTBIBEDQOPAK3OATUOUKLA,B0BFR5WF1R,1.0,1675826333052,All_Beauty
1,AF5PN3FPG5Z66P7Z7UWL56D6CGMA,B0BL3HSBZB,1.0,1674411398983,All_Beauty
2,AGAM2CCKV52HI4YZU7ASZTSXA7YQ,B0BXB4J297,4.0,1678571121367,All_Beauty
3,AGAHANLSS7DG4ZHNPP5S56W4SKHA,B07TT8JK51,4.0,1672923265267,All_Beauty
4,AFH7ZT2NJSHDKJ43IODXCE6RHWVA,B09XBSDCXP,1.0,1676202555202,All_Beauty


In [14]:
# The raw `timestamp` column holds epoch milliseconds; replace it in `df`
# with proper datetimes so later cells can sort and filter by date.
df['timestamp'] = pd.to_datetime(arg=df['timestamp'], unit='ms')
df.head(5)

Unnamed: 0,user_id,parent_asin,rating,timestamp,category
0,AFZUK3MTBIBEDQOPAK3OATUOUKLA,B0BFR5WF1R,1.0,2023-02-08 03:18:53.052,All_Beauty
1,AF5PN3FPG5Z66P7Z7UWL56D6CGMA,B0BL3HSBZB,1.0,2023-01-22 18:16:38.983,All_Beauty
2,AGAM2CCKV52HI4YZU7ASZTSXA7YQ,B0BXB4J297,4.0,2023-03-11 21:45:21.367,All_Beauty
3,AGAHANLSS7DG4ZHNPP5S56W4SKHA,B07TT8JK51,4.0,2023-01-05 12:54:25.267,All_Beauty
4,AFH7ZT2NJSHDKJ43IODXCE6RHWVA,B09XBSDCXP,1.0,2023-02-12 11:49:15.202,All_Beauty


In [18]:
# Print a structural summary of `df`: index range, column dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25283082 entries, 0 to 25283081
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      object        
 1   parent_asin  object        
 2   rating       float64       
 3   timestamp    datetime64[ns]
 4   category     object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 964.5+ MB


In [24]:
# Newest rows first; `df` itself is left in its original order.
df_sorted = df.sort_values('timestamp', ascending=False)
df_sorted.head(5)

Unnamed: 0,user_id,parent_asin,rating,timestamp,category
25104356,AFA6NBLDWCSLBLULY6XVIL6SUAUA,B085WF87DT,5.0,2023-09-14 13:16:54.993,Unknown
25104948,AFWIHPLU3U5DFPDWC2IZUIL3QKVA,B0BNP3ZYM4,5.0,2023-09-14 10:58:54.302,Unknown
25104022,AGOENGSCHOXXR7UCE3CNP7X3ZNEQ,B0CHL96VJB,4.0,2023-09-14 07:44:15.818,Unknown
21877294,AHZOQB35MWVZUUJVSHGDN23JMORQ,B08C2C8RKV,5.0,2023-09-14 05:40:41.162,Sports_and_Outdoors
25098078,AHW6IYJIDWA7TZACSMKHCJ7A2IOQ,B007SWYQVW,5.0,2023-09-14 02:43:35.987,Unknown


In [3]:
# List the distinct values of the `category` column, in order of first appearance.
df['category'].unique()

array(['All_Beauty', 'Amazon_Fashion', 'Appliances',
       'Arts_Crafts_and_Sewing', 'Automotive', 'Baby_Products',
       'Beauty_and_Personal_Care', 'Books', 'CDs_and_Vinyl',
       'Cell_Phones_and_Accessories', 'Clothing_Shoes_and_Jewelry',
       'Digital_Music', 'Electronics', 'Gift_Cards',
       'Grocery_and_Gourmet_Food', 'Handmade_Products',
       'Health_and_Household', 'Health_and_Personal_Care',
       'Home_and_Kitchen', 'Industrial_and_Scientific', 'Kindle_Store',
       'Magazine_Subscriptions', 'Movies_and_TV', 'Musical_Instruments',
       'Office_Products', 'Patio_Lawn_and_Garden', 'Pet_Supplies',
       'Software', 'Sports_and_Outdoors', 'Tools_and_Home_Improvement',
       'Toys_and_Games', 'Unknown', 'Video_Games'], dtype=object)

In [4]:
# Number of distinct `user_id` values in `df`.
df['user_id'].nunique()

9462700

In [5]:
# Number of distinct `parent_asin` values in `df`.
df['parent_asin'].nunique()

4931159

In [12]:
# Group rows by product id and rank products by how many rows each one has.
grouped_by_parent_asin = df.groupby(by='parent_asin')
(
    grouped_by_parent_asin
    .size()
    .sort_values(ascending=False)
)

parent_asin
B07TVHSDMQ    20515
B07XFXXZMV     4844
B008A0GNA8     4401
B08PQ4Y1KZ     4273
B0BRBSNXNT     4206
              ...  
B09GV9L88C        1
B01IAE1VE6        1
B01IAE1QZK        1
B09GV9NZ9S        1
B010K9VP86        1
Length: 4931159, dtype: int64

In [13]:
# Keep only the rows whose `parent_asin` occurs more than once in `df`.
# The per-row group size is computed in one vectorised pass instead of
# `groupby.filter(lambda x: len(x) > 1)`, which calls a Python function once
# per group (millions of groups here). The selected rows and their order are
# the same as with `filter`.
filtered_parent_asin = df[
    grouped_by_parent_asin['parent_asin'].transform('size') > 1
]
# Regroup the surviving rows and rank the remaining products by row count.
grouped_filtered_parent_asin = filtered_parent_asin.groupby('parent_asin')
grouped_filtered_parent_asin.size().sort_values(ascending=False)

parent_asin
B07TVHSDMQ    20515
B07XFXXZMV     4844
B008A0GNA8     4401
B08PQ4Y1KZ     4273
B0BRBSNXNT     4206
              ...  
B0BGQRMD1B        2
B07KG7QSQ3        2
B07KG7VNFS        2
B01DGLPR8Y        2
0000098906        2
Length: 2293167, dtype: int64

In [12]:
# Group rows by user and rank users by how many rows each one has.
grouped_by_user_id = df.groupby(by='user_id')
(
    grouped_by_user_id
    .size()
    .sort_values(ascending=False)
)

user_id
AEA7TVOBS3KUGMLOPFKUJVXCQHNA      1582
AG4D4BZAETIVJNMIUYMW6C4YGRUQ_1    1273
AGZAR2FVIRB3ZPU2LD2CMBJ2QKNA       830
AGJCZ3TS5CHMAP4DVRV2752DQ3NA       821
AHCLIDPL6OI3LWPULCTGUBWAVILQ       816
                                  ... 
AGGDIWAQ4AUS4YZTYBWJXUMG6REQ         1
AGGDIWA2YDCMV6UPOY363LTQEFNA         1
AGGDIW26TF6YJX7UEE7ADQNWWCTQ         1
AEJDVJG4DHHKM7FNRNWLYIEIZBGA         1
AHZZZZVLBL7D2RJVNDSW7MJ7LWBQ         1
Length: 9462700, dtype: int64

In [13]:
# Count distinct products per user (as opposed to rows per user) and rank
# users by that count, largest first.
(
    grouped_by_user_id['parent_asin']
    .nunique()
    .sort_values(ascending=False)
)

user_id
AEA7TVOBS3KUGMLOPFKUJVXCQHNA      1582
AG4D4BZAETIVJNMIUYMW6C4YGRUQ_1    1273
AGZAR2FVIRB3ZPU2LD2CMBJ2QKNA       830
AGJCZ3TS5CHMAP4DVRV2752DQ3NA       821
AHCLIDPL6OI3LWPULCTGUBWAVILQ       816
                                  ... 
AGGDIWAQ4AUS4YZTYBWJXUMG6REQ         1
AGGDIWA2YDCMV6UPOY363LTQEFNA         1
AGGDIW26TF6YJX7UEE7ADQNWWCTQ         1
AEJDVJG4DHHKM7FNRNWLYIEIZBGA         1
AHZZZZVLBL7D2RJVNDSW7MJ7LWBQ         1
Name: parent_asin, Length: 9462700, dtype: int64

## Smaller Data: Product Network from Rows After 2023-08-01

In [73]:
# Restrict to rows with a timestamp strictly later than 2023-08-01 00:00.
# Requires `timestamp` to already be a datetime column.
smaller_df = df.loc[df['timestamp'] > '2023-08-01']

In [74]:
# Rebinds `grouped_by_user_id`: from here on it groups the date-restricted
# subset (`smaller_df`) by user.
grouped_by_user_id = smaller_df.groupby(by='user_id')

In [75]:
# One entry per user: the list of `parent_asin` values from that user's rows
# (repeated values are kept as-is).
users_products = (
    grouped_by_user_id['parent_asin']
    .apply(list)
)

In [76]:
# Preview the first few per-user product lists.
users_products.head()

user_id
AE2222GSOK3SXGGGIW5U5JGTFVXA                                         [B001EG0S0G]
AE2226YDSSVJIQVGFA57OVYURSQQ                                         [B083KHKZY8]
AE222BXSMOAK65FICCF3SAG2FTIA    [B09TT4SRNB, B0C3CNYF3Y, B0BLV4NC6T, B01BHFZY3...
AE222W6PJZ4OQBFODCYTF4Q5WKPA                             [B091JHW9B6, B07SH11M7B]
AE2233G7TWVYUT6ECTVL3NTK4WRQ                                         [1681063522]
Name: parent_asin, dtype: object

In [77]:
# Count, for every unordered pair of products, how many users have both
# products in their list. Keys are (smaller_id, larger_id) tuples.
edge_weights = {}
for products in tqdm(users_products, desc='Calculating edge weights'):
    # De-duplicate first: if a user has several rows for the same parent_asin,
    # pairing the raw list would emit (A, A) self-pairs and count (A, B) more
    # than once for that single user. Sorting gives each pair a canonical key.
    for pair in combinations(sorted(set(products)), 2):
        edge_weights[pair] = edge_weights.get(pair, 0) + 1


Calculating edge weights: 100%|██████████| 350196/350196 [00:00<00:00, 817138.03it/s]


In [79]:
# Snapshot of all pair counts as a plain list.
values = [*edge_weights.values()]

In [80]:
# Build an undirected graph with one edge per product pair; the pair count is
# stored in the edge attribute 'weight'.
G = nx.Graph()
G.add_weighted_edges_from(
    (product1, product2, weight)
    for (product1, product2), weight in edge_weights.items()
)

In [59]:
# Mean node degree comes from the graph; mean edge weight comes from the
# `edge_weights` dict, so it ignores any edges removed from G.
average_degree = np.mean([degree for _, degree in G.degree()])
average_weight = np.mean([*edge_weights.values()])
print(f'Average degree: {average_degree}')
print(f'Average weight: {average_weight}')

Average degree: 8.788692517702103
Average weight: 1.0035125120804793


In [60]:
# Largest degree of any node currently in G.
max_degree = max(degree for _, degree in G.degree())
max_degree

2214

In [61]:
# Largest pair count, read from the `edge_weights` dict rather than from G.
max_weight = max(edge_weights.values())
max_weight

18

In [62]:
# Prune edges whose weight is exactly 1. The edge list is materialised first
# because G must not be mutated while its edges are being iterated.
edges_to_remove = [
    (source, target)
    for source, target, weight in G.edges(data='weight')
    if weight == 1
]
G.remove_edges_from(edges_to_remove)

In [63]:
# Drop nodes that have no edges left (degree 0).
nodes_to_remove = list(nx.isolates(G))
G.remove_nodes_from(nodes_to_remove)

In [69]:
# Number of nodes currently in G.
G.number_of_nodes()

2864

In [81]:
# Write the current edge list of G to CSV as (node1, node2, weight) rows.
edges_data = list(G.edges(data='weight'))
edges_df = pd.DataFrame(edges_data, columns=['node1', 'node2', 'weight'])
edges_df.to_csv('data/network_edges.csv', index=False)