In [1]:
import pandas as pd
import numpy as np

dataset = pd.read_csv(
    './ml-100k/u.data',
    names=["user_id","item_id","rating","timestamp"],
    sep='\t'
)

In [2]:
print(dataset)

       user_id  item_id  rating  timestamp
0          196      242       3  881250949
1          186      302       3  891717742
2           22      377       1  878887116
3          244       51       2  880606923
4          166      346       1  886397596
...        ...      ...     ...        ...
99995      880      476       3  880175444
99996      716      204       5  879795543
99997      276     1090       1  874795795
99998       13      225       2  882399156
99999       12      203       3  879959583

[100000 rows x 4 columns]


In [3]:
from scipy.sparse import csr_matrix

matrix_data = csr_matrix(
    (dataset.rating,
    (dataset.user_id,dataset.item_id))
)

print(matrix_data)

  (1, 1)	5
  (1, 2)	3
  (1, 3)	4
  (1, 4)	3
  (1, 5)	3
  (1, 6)	5
  (1, 7)	4
  (1, 8)	1
  (1, 9)	5
  (1, 10)	3
  (1, 11)	2
  (1, 12)	5
  (1, 13)	5
  (1, 14)	5
  (1, 15)	5
  (1, 16)	5
  (1, 17)	3
  (1, 18)	4
  (1, 19)	5
  (1, 20)	4
  (1, 21)	1
  (1, 22)	4
  (1, 23)	4
  (1, 24)	3
  (1, 25)	4
  :	:
  (943, 739)	4
  (943, 756)	2
  (943, 763)	4
  (943, 765)	3
  (943, 785)	2
  (943, 794)	3
  (943, 796)	3
  (943, 808)	4
  (943, 816)	4
  (943, 824)	4
  (943, 825)	3
  (943, 831)	2
  (943, 840)	4
  (943, 928)	5
  (943, 941)	1
  (943, 943)	5
  (943, 1011)	2
  (943, 1028)	2
  (943, 1044)	3
  (943, 1047)	2
  (943, 1067)	2
  (943, 1074)	4
  (943, 1188)	3
  (943, 1228)	3
  (943, 1330)	3


In [4]:
topk = 10
rank_list = [i+1 for i in range(topk)]

In [5]:
user_data = csr_matrix(
    (dataset.rating,
    (dataset.user_id,dataset.item_id))
)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

user_similarity = cosine_similarity(user_data)
print(user_similarity.shape)

(944, 944)


In [7]:
uq_users = np.sort(dataset.user_id.unique().tolist())

df_similar_user_list = pd.DataFrame(
    columns=[
        "base_user_id",
        "similar_user_id",
        "cosine_score",
        "rank"
    ]
)

print(df_similar_user_list)

Empty DataFrame
Columns: [base_user_id, similar_user_id, cosine_score, rank]
Index: []


In [8]:
for user_id in uq_users:
    
    similar_score = user_similarity[user_id]

    df_similar_user = pd.DataFrame()

    df_similar_user['base_user_id'] = [user_id] * topk
    df_similar_user['similar_user_id'] = np.argsort(similar_score)[::-1][1:topk+1]
    df_similar_user['cosine_score'] = np.sort(similar_score)[::-1][1:topk+1]
    df_similar_user['rank'] = rank_list

    df_similar_user_list = pd.concat([df_similar_user_list, df_similar_user])
    # df_similar_user_list = df_similar_user_list.append(
    #     df_similar_user,
    #     ignore_index=True
    # )

In [9]:
print(df_similar_user_list)

   base_user_id similar_user_id  cosine_score rank
0             1             916      0.569066    1
1             1             864      0.547548    2
2             1             268      0.542077    3
3             1              92      0.540534    4
4             1             435      0.538665    5
..          ...             ...           ...  ...
5           943             276      0.498243    6
6           943             709      0.493219    7
7           943             586      0.491972    8
8           943             472      0.488882    9
9           943             864      0.488014   10

[9430 rows x 4 columns]
