In [1]:
import ast

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Load the electronics product table; row 0 of the CSV supplies the column names.
df = pd.read_csv(
    'electronics_final.csv',
    header=0,
)

In [3]:
# Report the (rows, columns) shape, then show summary statistics for every
# column, numeric and non-numeric alike, as the cell output.
print('dataset dimensions are:', df.shape)
df.describe(include='all')

dataset dimensions are: (2006, 10)


Unnamed: 0,Product_code,Product_name,Product_description,Product_URL,Breadcrumb_parent,Breadcrumb_active,Product_price,Rating_Value,Rating_Count,Recommended_Prods
count,2006,2006,1640,2006,2006,2006,2006,2006.0,2006,2006
unique,1968,1957,1569,1968,16,255,1077,,370,1611
top,4655HRO97D87,"SAMSUNG 65"" Class 4K UHD (2160P) the Frame QLE...",Enter a world saturated with color and sharpen...,https://walmart.com/ip/4655HRO97D87,Electronics,/Digital SLR Cameras,$99.00,,0 ratings,['Not available']
freq,5,5,6,5,1616,233,22,,450,379
mean,,,,,,,,3.115105,,
std,,,,,,,,1.838493,,
min,,,,,,,,0.0,,
25%,,,,,,,,1.85,,
50%,,,,,,,,4.0,,
75%,,,,,,,,4.5,,


In [4]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,Product_code,Product_name,Product_description,Product_URL,Breadcrumb_parent,Breadcrumb_active,Product_price,Rating_Value,Rating_Count,Recommended_Prods
0,35X5YDKAGYGK,"RCA 50"" Class 4K Ultra HD (2160P) LED TV (RLDE...","The RCA 50"" 2160p Class 4K Ultra High-Definiti...",https://walmart.com/ip/35X5YDKAGYGK,Electronics,/All TVs,$219.99,3.9,1254 ratings,"['5YZBEUJO11MO', '1CFXZI1QNUKO', '6SA8LLNNSLMP..."
1,2RKSQC7C8QKZ,"RCA 55"" Class 4K Ultra HD (2160P) HDR Roku Sma...",Enjoying 4K Ultra HD just got easier. RCA Roku...,https://walmart.com/ip/2RKSQC7C8QKZ,Electronics,/All TVs,$279.99,4.2,595 ratings,"['1CFXZI1QNUKO', '2X3MQ9SM84AA', '6SA8LLNNSLMP..."
2,4U07UQJGM4NA,"LG 65"" Class 4K UHD 2160P Smart TV 65UN7300PUF...",Empower entertainment with LG UHD TV. LG's UHD...,https://walmart.com/ip/4U07UQJGM4NA,Electronics,/All TVs,$646.99,4.3,154 ratings,"['2MJRWZI33VET', '46VQULKOD37S', '2Z0G09Y8QKOL..."
3,75BV5KCCV6DE,"SAMSUNG 70"" Class 4K Crystal UHD (2160P) LED S...",This smart TV unlocks hidden detail at four ti...,https://walmart.com/ip/75BV5KCCV6DE,Electronics,/All TVs,$947.00,4.4,2843 ratings,"['46VQULKOD37S', '5NGD72QFL38K', '3YFRMHEUUEZD..."
4,7K3V50QQP6XT,"TCL 32"" Class 1080P FHD LED Roku Smart TV 3 Se...",The 3-Series TCL Roku TV puts all your enterta...,https://walmart.com/ip/7K3V50QQP6XT,Electronics,/All TVs,$149.99,4.4,208 ratings,"['2Z3IRIL1ATIA', '60R8ZDOQCUAS', '5HF3T8I5XH2A..."


In [5]:
# Keep the first 2006 rows as the working sample. The loaded frame is reported
# as 2006 rows, so for this file the slice currently retains every row; lower
# the bound to actually shrink the data and speed up the later computations.
df_sample = df.head(2006)

In [6]:
# Collect every recommended product code across all rows (duplicates kept).
# Recommended_Prods holds the text of a Python list, e.g.
# "['5YZBEUJO11MO', '1CFXZI1QNUKO']" or "['Not available']". Parse it as a
# literal instead of slicing fixed 16-character windows: the slicing only works
# when every entry is exactly 12 characters long and truncates anything else
# (the placeholder came out as 'Not availabl').
rec_prods_ids = [
    rec_code
    for raw_list in df_sample.Recommended_Prods
    for rec_code in ast.literal_eval(raw_list)
]

In [7]:
# Product codes of the sampled rows, as a pandas Series.
pr_code = df_sample['Product_code']

In [8]:
# Copy the product codes into a plain Python list, in row order.
pr = [pr_code[row] for row in range(len(pr_code))]

In [9]:
# Recommended codes that also occur as product codes in the sample;
# the trailing expression displays how many distinct codes that is.
a = set(pr).intersection(rec_prods_ids)
len(a)

922

In [10]:
# Build a square 0/1 matrix over the sampled products: coef_mat[i, j] is 1 when
# the product at position j appears in the recommendation list of row i.
#
# The recommendation text is parsed with ast.literal_eval rather than fixed
# 16-character slices (which assumed 12-character codes), and the code -> column
# lookup is built once instead of rebuilding set(pr) and scanning pr with
# list.index for every row.
column_of = {}
for position, code in enumerate(pr):
    # setdefault keeps the first position of a repeated code, matching list.index.
    column_of.setdefault(code, position)

coef_mat = np.zeros((len(pr_code), len(pr_code)))
for i, raw_list in enumerate(df_sample.Recommended_Prods):
    # Recommendations that are not product codes in the sample (including the
    # 'Not available' placeholder) have no column and are skipped.
    ind = [column_of[code] for code in set(ast.literal_eval(raw_list)) if code in column_of]
    if ind:
        coef_mat[i, ind] = 1

In [11]:
# Display the recommendation matrix (NumPy abbreviates large arrays with "...").
coef_mat

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
#Data pre-processing

#Drop rows whose Product_code is missing (if there is such a case)
cleaned_retail = df_sample.loc[df_sample.Product_code.notna()]

#Create a lookup table of the distinct breadcrumb pairs, with the parent breadcrumb stored as str
item_lookup = (
    cleaned_retail[['Breadcrumb_parent', 'Breadcrumb_active']]
    .drop_duplicates()
    .assign(Breadcrumb_parent=lambda frame: frame.Breadcrumb_parent.astype(str))
)

In [None]:
# Display the breadcrumb lookup table.
item_lookup

In [None]:
#Do some 'data cleaning' to raw data
# NOTE(review): the describe() output for df lists no CustomerID, StockCode or
# Quantity columns, so this cell looks carried over from a different
# (customer/purchase) dataset and is expected to fail on this data -- confirm
# which columns should play these roles before running it.
# cleaned_retail['CustomerID'] = cleaned_retail.CustomerID.astype(int)
# cleaned_retail = cleaned_retail[['StockCode', 'Quantity', 'CustomerID']]
grouped_cleaned = cleaned_retail.groupby(['CustomerID', 'StockCode']).sum().reset_index()
# Treat a summed quantity of 0 as a single purchase. Assign through one .loc
# call: the chained form `frame.Quantity.loc[mask] = 1` writes to an
# intermediate Series, which raises SettingWithCopyWarning and leaves the frame
# unchanged under pandas copy-on-write.
grouped_cleaned.loc[grouped_cleaned.Quantity == 0, 'Quantity'] = 1
# Keep only the (customer, product) rows with a positive quantity.
grouped_purchased = grouped_cleaned.query('Quantity > 0')

In [None]:
#Count the distinct customers and distinct products in the reduced dataset
no_customers = grouped_purchased['CustomerID'].nunique(dropna=False)
no_products = grouped_purchased['StockCode'].nunique(dropna=False)
print('Number of customers in dataset:', no_customers)
print('Number of products in dataset:', no_products)

In [None]:
#Turn raw data to pivot ('ratings' matrix): one row per CustomerID, one column per StockCode,
#missing combinations filled with 0
ratings = (
    grouped_purchased
    .pivot(index='CustomerID', columns='StockCode', values='Quantity')
    .fillna(0)
    .astype('int')
)
#Binarize the ratings matrix (1 where the quantity is non-zero, 0 otherwise)
ratings_binary = ratings.ne(0).astype('int')

In [None]:
#Count how many times each product pair has been purchased
print('Counting how many times each pair of products has been purchased...')
# ratings_binary holds 0/1 flags (one row per customer, one column per product),
# so entry (i, j) of flags.T @ flags is the number of customers flagged for both
# product i and product j. A single matrix product replaces the nested loop that
# sliced a temporary two-column frame for every ordered pair of products and
# wrote each symmetric cell twice.
purchase_flags = ratings_binary.to_numpy()
products_integer = (purchase_flags.T @ purchase_flags).astype(float)
# A product is not paired with itself: keep the diagonal at zero.
np.fill_diagonal(products_integer, 0)

In [None]:
#Total the pair counts per product
print('Counting how many times each individual product has been purchased...')
# NOTE(review): this is the row sum of the pair-count matrix (a product's pair
# counts added up over all other products), not the number of customers who
# bought the product -- confirm this is the intended normaliser.
times_purchased = np.sum(products_integer, axis=1)

In [None]:
#Construct final weighted matrix of item interactions
print('Building weighted product matrix...')
# weight[i, j] = products_integer[i, j] / (times_purchased[i] + times_purchased[j]).
# Cells whose denominator is zero are skipped by `where` and keep the 0 they
# were initialised with, so nothing is divided by zero.
pair_totals = times_purchased[:, None] + times_purchased[None, :]
products_weighted = np.divide(
    products_integer,
    pair_totals,
    out=np.zeros((no_products, no_products)),
    where=pair_totals != 0,
)

In [None]:
#Get list of item labels (instead of Codes)
# NOTE(review): item_lookup is built from the Breadcrumb_parent and
# Breadcrumb_active columns only, so the Description and StockCode attributes
# read here are not columns it is shown to have -- this lookup presumably needs
# remapping to the current data; confirm.
nodes_codes = np.array(ratings_binary.columns).astype('str')
item_lookup_dict = pd.Series(
    item_lookup.Description.values,
    index=item_lookup.StockCode,
).to_dict()
# One label per column code, in column order; an unknown code raises KeyError.
nodes_labels = list(map(item_lookup_dict.__getitem__, nodes_codes))

In [None]:
#Create Graph object using the weighted product matrix as adjacency matrix
# nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is the
# supported constructor for a NumPy adjacency array.
G = nx.from_numpy_array(products_weighted)
# Fixed seed so the random node positions are the same on every run.
pos = nx.random_layout(G, seed=42)
# Map each node to its label, pairing nodes with nodes_labels by position.
labels = {node: nodes_labels[idx] for idx, node in enumerate(G.nodes())}

# Draw nodes, edges and labels as separate layers on the current figure.
nx.draw_networkx_nodes(G, pos, node_color="skyblue", node_size=30)
nx.draw_networkx_edges(G, pos, edge_color='k', width=0.3, alpha=0.5)
nx.draw_networkx_labels(G, pos, labels, font_size=4)
plt.axis('off')
plt.show() # display

In [None]:
#Export graph to Gephi
# Build a relabelled copy of the graph whose nodes carry the product labels,
# then write it to a GEXF file for visualisation in Gephi.
# NOTE(review): relabel_nodes merges nodes that map to the same label, so the
# labels presumably need to be unique -- confirm.
H = nx.relabel_nodes(G, labels, copy=True)
nx.write_gexf(H, "products.gexf")

In [None]:
#Find communities of nodes (products)
# NOTE(review): community_louvain is not imported in the notebook's import
# cell; presumably it is python-louvain (`import community as community_louvain`)
# -- confirm and add the import, otherwise this cell raises NameError.
partition = community_louvain.best_partition(G, resolution=1.5)
# Community id per node, in the partition's own iteration order.
values = [community_id for community_id in partition.values()]

In [None]:
#Check how many distinct community ids were assigned
print('Number of communities:', np.unique(values).size)

In [None]:
#Create dataframe pairing each product label with its community id (matched by position)
products_communities = (
    pd.DataFrame(nodes_labels, columns=['product_description'])
    .assign(community_id=values)
)

In [None]:
#Let's take a peek at community 1: show up to its first 15 products
products_communities.loc[products_communities['community_id'].eq(1)].head(15)