Goal in mind: Achieving Differential Privacy for data distribution using k-means clustering, and/or locally private k-means implementation

https://jmlr.csail.mit.edu/papers/volume22/20-721/20-721.pdf

In [1]:
from src.data_examples.ex2_data_loader import ExampleDataLoader

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse.linalg import svds as scipy_svds
from sklearn.cluster import KMeans as sklearn_kmeans

pd.options.mode.chained_assignment = None

In [2]:
RAND_SEED = 1337
np.random.seed(RAND_SEED)

In [3]:
data_loader = ExampleDataLoader()
data_loader.download().load().clean().remap().merge()

using cached file cache\datasets\movielens\ml-100k.zip
extracting zip file content:
file already exists: data\ex2\ml-100k
file already exists: data\ex2\ml-100k\allbut.pl
file already exists: data\ex2\ml-100k\mku.sh
file already exists: data\ex2\ml-100k\README
file already exists: data\ex2\ml-100k\u.data
file already exists: data\ex2\ml-100k\u.genre
file already exists: data\ex2\ml-100k\u.info
file already exists: data\ex2\ml-100k\u.item
file already exists: data\ex2\ml-100k\u.occupation
file already exists: data\ex2\ml-100k\u.user
file already exists: data\ex2\ml-100k\u1.base
file already exists: data\ex2\ml-100k\u1.test
file already exists: data\ex2\ml-100k\u2.base
file already exists: data\ex2\ml-100k\u2.test
file already exists: data\ex2\ml-100k\u3.base
file already exists: data\ex2\ml-100k\u3.test
file already exists: data\ex2\ml-100k\u4.base
file already exists: data\ex2\ml-100k\u4.test
file already exists: data\ex2\ml-100k\u5.base
file already exists: data\ex2\ml-100k\u5.test
fil

  return func(*args, **kwargs)
  return func(*args, **kwargs)
  return func(*args, **kwargs)
  return func(*args, **kwargs)
  return func(*args, **kwargs)


<src.data_examples.ex2_data_loader.ExampleDataLoader at 0x22d94573c70>

In [4]:
def df_to_np_sparse_matrix(df, U_c, V_c, val_c):
  df = df[[U_c, V_c, val_c]]
  df_pivot = df.pivot(index=U_c, columns=V_c, values=val_c).fillna(0)
  return df_pivot.values

In [5]:
features_X = ['user_id', 'movie_id']
feature_Y = 'rating'
df_main = data_loader.df[features_X + [feature_Y]]
for feature in features_X:
  df_main[feature] = df_main[feature].astype('int32')
df_main

Unnamed: 0,user_id,movie_id,rating
0,195,241,0.2
1,304,241,0.6
2,5,241,0.4
3,233,241,0.4
4,62,241,0.2
...,...,...,...
99995,862,1678,0.2
99996,862,1677,-0.2
99997,862,1679,0.0
99998,895,1680,0.2


### trial: k-means clustering straight from csr sparse matrix from the dataset

In [6]:
# pred, (U, sig, Vt) = eval_svd_factorization(df_main, 'user_id', 'movie_id', 'rating')
mat_sparse_main = df_to_np_sparse_matrix(df_main, 'user_id', 'movie_id', 'rating')

mat_sparse_main

array([[0.6, 0.2, 0.4, ..., 0. , 0. , 0. ],
       [0.4, 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0.6, 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0.6, 0. , ..., 0. , 0. , 0. ]])

In [7]:
n_clusters = 10

kmeans = sklearn_kmeans(n_clusters=n_clusters)
kmeans.fit(mat_sparse_main)
kmeans.inertia_, kmeans.cluster_centers_



(9793.840887696468,
 array([[ 1.93063584e-01,  3.69942197e-02,  1.84971098e-02, ...,
          0.00000000e+00,  1.08420217e-19,  1.08420217e-19],
        [ 3.62500000e-01,  3.75000000e-02,  1.25000000e-02, ...,
          0.00000000e+00,  2.71050543e-20,  2.71050543e-20],
        [ 4.00000000e-02,  5.55555556e-04,  2.22222222e-03, ...,
          0.00000000e+00,  3.79470760e-19,  3.79470760e-19],
        ...,
        [ 1.40983607e-01,  2.29508197e-02,  3.27868852e-03, ...,
          0.00000000e+00, -8.13151629e-20, -8.13151629e-20],
        [ 3.55000000e-01,  1.40000000e-01, -1.00000000e-02, ...,
          0.00000000e+00,  5.00000000e-03, -2.71050543e-20],
        [ 3.42857143e-01,  1.71428571e-01,  1.14285714e-01, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00]]))

In [8]:
user_idx = df_main['user_id'].unique()

user_clusters = []
for n in range(n_clusters):
  user_clusters.append(user_idx[kmeans.labels_ == n])


user_clusters

[array([ 62, 248,  89,  17, 269, 239, 143,  20, 128, 201, 203,  85, 460,
        485, 484, 532, 565, 573, 616, 679, 723, 719, 732, 757, 868, 893,
        936,  21,  86, 266, 127, 212, 124, 150, 313, 325, 335, 378, 386,
        428, 434, 477, 503, 513, 523, 544, 605, 624, 641, 642, 689, 704,
        711, 737, 795, 845, 859, 877, 896, 933, 255, 448, 564, 785, 794,
        343, 467, 755, 935,  61, 115, 234, 321, 487, 526, 617, 805, 822,
        912, 345, 111,  46,  38, 445, 468, 506, 798, 841, 873, 147, 213,
        296, 390, 420, 492, 604, 606, 775, 391, 451, 156, 250,  67, 191,
        401, 433, 500, 629, 717, 778, 769, 791, 824, 871, 167,  77,  52,
        226, 265, 533, 583, 833,  49, 240, 258, 160, 133, 145, 142, 146,
        257, 323, 371, 422, 528, 555, 569, 625, 644, 684, 735, 780, 796,
        826, 828, 874, 923, 187, 927, 247, 444, 549, 648, 729, 792, 159,
        608, 856, 525, 149, 741, 374, 800, 577, 812, 254, 385, 728,  32,
          3, 168, 854, 570]),
 array([270, 112, 696