## Experiments with PageRank

In [1]:
import matplotlib.pyplot as plt
import networkit as nk
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

from sklearn.cluster import SpectralClustering
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import rbf_kernel, cosine_similarity

import utils

In [2]:
# Load data
df_data = pd.read_csv('../data/ice-cat-office-products.csv.gz', dtype=str, index_col=0)

# Filter out small product categories
# top_categories = utils.find_top_n_categories(df_data, top_n=10)
top_categories = [
   'Folders',
   'Self-Adhesive Labels',
   'Multimedia Carts & Stands',
   'Calculators',
   'Writing Notebooks'
]
df_data = df_data[df_data.category_name.isin(top_categories)]

# Sort by category for later visualisation
df_data.sort_values('category_name', inplace=True)

# select a few features to avoid computational issues.
product_features = ['Product colour', 'Format', 'Width',
    'Height', 'Maximum capacity', 'Orientation', 'Weight',
    'Material', 'Depth', 'Media weight', 'Quantity per pack',
    'supplier_name'
]
df_play_data = df_data[['category_name'] + product_features].copy()

In [3]:
df_data.category_name.value_counts()

Folders                      645
Self-Adhesive Labels         324
Multimedia Carts & Stands    317
Calculators                  305
Writing Notebooks            300
Name: category_name, dtype: int64

In [4]:
X = utils.preprocess_dataframe(df_play_data)
X.shape

(1891, 145)

In [5]:
# Reduce to number of dimensions so the remaining dimensions
# explain at least 90 percent of the variance.
svd = PCA(n_components=0.9, random_state=42)
X_transformed = svd.fit_transform(X)
X_transformed.shape

(1891, 20)

In [6]:
similarity_matrix = cosine_similarity(X_transformed)

# Ensure similarity matrix entries are normalised in the range [0,1]
similarity_matrix = (similarity_matrix - np.min(similarity_matrix)) / np.ptp(similarity_matrix)

In [7]:
np.min(similarity_matrix), np.max(similarity_matrix)

(0.0, 1.0)

In [8]:
g = nk.Graph(weighted=True, directed=False)

n = similarity_matrix.shape[0]
g.addNodes(n)
for i in range(n):
    for j in range(i, n):
        if i != j:
            g.addEdge(i, j, similarity_matrix[i, j])

In [9]:
communities = nk.community.detectCommunities(g)

PLM(balanced,pc,turbo) detected communities in 0.04289674758911133 [s]
solution properties:
-------------------  ----------
# communities          3
min community size   238
max community size   888
avg. community size  630.333
modularity             0.158258
-------------------  ----------


In [10]:
nk.community.Modularity().getQuality(communities, g)

0.15825763300109352

In [11]:
pr = nk.centrality.PageRank(g, 1e-6)

In [12]:
pr.run()

<networkit.centrality.PageRank at 0x7f8e43752dc0>

In [13]:
pr.ranking()[0:10]

[(1635, 0.0005288207550132773),
 (1798, 0.0005288207539329771),
 (1496, 0.0005288207538572518),
 (1502, 0.0005288207538572518),
 (1526, 0.0005288207538572518),
 (1527, 0.0005288207538572518),
 (1852, 0.0005288207537722454),
 (1511, 0.0005288207537598213),
 (1556, 0.0005288207537598213),
 (1432, 0.0005288207537202342)]