In [46]:
"""
    Setting up Environment
"""

'\n    Setting up Environment\n'

In [61]:
import os
import numpy as np
import pandas as pd
from tabulate import tabulate
import datetime
import random
import time
import networkx as nx
from sklearn.metrics import pairwise_distances
from cdlib import algorithms
from surprise import accuracy, Dataset, Reader
from surprise import (
    BaselineOnly,
    CoClustering,
    Dataset,
    KNNBaseline,
    KNNBasic,
    KNNWithMeans,
    NMF,
    NormalPredictor,
    SlopeOne,
    SVD,
    SVDpp,
)
from surprise.model_selection import cross_validate, KFold
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [48]:
"""
    Setting experiment parameters
"""

'\n    Setting experiment parameters\n'

In [49]:
algos = {
    'SVD': SVD(random_state=0),
    'SVDpp': SVDpp(random_state=0),
    'NMF':NMF(random_state=0),
    'SlopeOne':SlopeOne(),
    'kNNBasic':KNNBasic(),
    'kNNWithMeans':KNNWithMeans(),
    'kNNBaseLine':KNNBaseline(),
    'CoClustering':CoClustering(random_state=0),
    'BaselineOnly':BaselineOnly(),
    'NormalPredictor':NormalPredictor(),
}

kf = KFold(n_splits=3)
np.random.seed(0)
random.seed(0)

#datasets = ['ml-100k', 'ml-1m', 'jester']
dataset = 'ml-100k'

In [50]:
"""
    Generating train and test sets for experiments
"""

'\n    Generating train and test sets for experiments\n'

In [51]:
data = Dataset.load_builtin(dataset)

train_sets=[]
test_sets=[]
for trainset, testset in kf.split(data):
        train_sets.append(trainset)
        test_sets.append(testset)

In [52]:
"""
    Executing experiments
"""

'\n    Executing experiments\n'

In [53]:
df_trains=[]
df_tests=[]
for trainset, testset in zip(train_sets, test_sets):
    iterator = trainset.all_ratings()
    df_train = pd.DataFrame(columns=['uid', 'iid', 'rating'])
    i = 0
    for (uid, iid, rating) in iterator:
        df_train.loc[i] = [uid, iid, rating]
        i = i+1
    df_trains.append(df_train)

    df_test = pd.DataFrame.from_records(testset, columns = ['uid', 'iid', 'rating'])
    df_test = df_test.astype({'uid':'float', 'iid':'float', 'rating':'float'})
    df_test['uid'] = df_test['uid']-1
    df_test['iid'] = df_test['iid']-1
    df_tests.append(df_test)

In [54]:
df_trains[0].head()

Unnamed: 0,uid,iid,rating
0,0.0,0.0,4.0
1,0.0,23.0,5.0
2,0.0,53.0,5.0
3,0.0,299.0,5.0
4,0.0,61.0,5.0


In [55]:
df_tests[0].head()

Unnamed: 0,uid,iid,rating
0,22.0,527.0,4.0
1,694.0,241.0,5.0
2,773.0,27.0,3.0
3,416.0,549.0,3.0
4,233.0,1034.0,3.0


In [62]:
for trainset, testset in zip(df_trains, df_tests):
    df_ratings = trainset.pivot_table(index=['uid'],columns=['iid'],values='rating')
    df_ratings_dummy = df_ratings.copy().fillna(0)
    similarity_matrix = cosine_similarity(df_ratings_dummy, df_ratings_dummy)
    similarity_matrix_df = pd.DataFrame(similarity_matrix, index=df_ratings.index, columns=df_ratings.index)

    def calculate_ratings(id_movie, id_user):
        if id_movie in df_ratings:
            cosine_scores = similarity_matrix_df[id_user] 
            ratings_scores = df_ratings[id_movie]      
            index_not_rated = ratings_scores[ratings_scores.isnull()].index
            ratings_scores = ratings_scores.dropna()
            cosine_scores = cosine_scores.drop(index_not_rated)
            ratings_movie = np.dot(ratings_scores, cosine_scores)/cosine_scores.sum()
        else:
            return 2.5
        return ratings_movie

    def score_on_test_set(data):
        user_movie_pairs = zip(data['iid'], data['uid'])
        predicted_ratings = [calculate_ratings(movie, user) for (movie,user) in user_movie_pairs]
        true_ratings = list(data['rating'])
        results = pd.DataFrame({'y_pred': predicted_ratings, 'y_true': true_ratings})
        results.dropna(axis=0, inplace=True)
        predicted_ratings =  np.array(results['y_pred'])
        true_ratings =  np.array(results['y_true'])
        score = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
        return score

    test_set_score = score_on_test_set(testset)
    print(test_set_score)

1.279733663554009
1.267145293273595
1.2820285032007164


In [64]:
# == Weighted Average Approach (Improved)

for trainset, testset in zip(df_trains, df_tests):
    df_ratings = trainset.pivot_table(index=['uid'],columns=['iid'],values='rating')
    df_ratings_dummy = df_ratings.copy().fillna(0)
    similarity_matrix = cosine_similarity(df_ratings_dummy, df_ratings_dummy)
    similarity_matrix_df = pd.DataFrame(similarity_matrix, index=df_ratings.index, columns=df_ratings.index)
    # |-------- Creating an adjacency matrix
    adjacency_matrix = similarity_matrix.copy()
    adjacency_matrix[similarity_matrix > np.percentile(similarity_matrix, 75)] = 1
    adjacency_matrix[similarity_matrix <= np.percentile(similarity_matrix, 75)] = 0
    # |-------- Transforming the adjacency matrix into a graph
    G = nx.from_numpy_matrix(adjacency_matrix)  
    # |-------- Louvain Community Detection
    coms_louvain = algorithms.louvain(G, weight='weight', resolution=1., randomize=False)

    def calculate_ratings_new(item_id, user_id):
        global community_nodes
        coms = coms_louvain
        for i in coms.communities:
            for j in i:
                # if (user_id - 1) == j:
                if user_id == j:
                    community_index = coms.communities.index(i) #--return the user's community
                    community_nodes = coms.communities[community_index] #--return the user's in that community
                    #community_nodes = [x+1 for x in community_nodes]
        
        rt_train_data = trainset["uid"].isin(community_nodes)
        rt_train_data = trainset[rt_train_data]

        rt_df_ratings = rt_train_data.pivot(index='uid', columns='iid', values='rating')
        rt_df_ratings_dummy = rt_df_ratings.copy().fillna(0)

        similarity_matrix = cosine_similarity(rt_df_ratings_dummy, rt_df_ratings_dummy)
        similarity_matrix_df = pd.DataFrame(similarity_matrix, index=rt_df_ratings_dummy.index, columns=rt_df_ratings_dummy.index)


        if item_id in rt_df_ratings:
            cosine_scores = similarity_matrix_df[user_id] #similarity of id_user with every other user
            ratings_scores = rt_df_ratings[item_id]      #ratings of every other user for the movie id_movie
            #won't consider users who havent rated id_movie so drop similarity scores and ratings corresponsing to np.nan
            index_not_rated = ratings_scores[ratings_scores.isnull()].index
            ratings_scores = ratings_scores.dropna()
            cosine_scores = cosine_scores.drop(index_not_rated)
            #calculating rating by weighted mean of ratings and cosine scores of the users who have rated the movie
            ratings_movie = np.dot(ratings_scores, cosine_scores)/cosine_scores.sum()

        else:
            ratings_movie = 2.5
        return ratings_movie

    def score_on_test_set(data):
        user_movie_pairs = zip(data['iid'], data['uid'])
        predicted_ratings = [calculate_ratings_new(movie, user) for (movie,user) in user_movie_pairs]
        true_ratings = list(data['rating'])
        results = pd.DataFrame({'y_pred': predicted_ratings, 'y_true': true_ratings})
        results.dropna(axis=0, inplace=True)
        #exl = np.where(np.isnan(predicted_ratings))[0][0]
        #predicted_ratings.pop(exl)
        predicted_ratings =  np.array(results['y_pred'])
        #true_ratings = list(data['rating'])
        #true_ratings.pop(exl)
        true_ratings =  np.array(results['y_true'])
        score = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
        return score
    test_set_score = score_on_test_set(testset)
    print(test_set_score)

1.3509045567321694
1.35236926783618
1.3580617000373258


In [20]:
"""
    Runing Experiments
"""

'\n    Runing Experiments\n'

In [21]:
# Loading the dataset
data = Dataset.load_builtin(dataset)

In [23]:
for key, value in algos:
    print(key)

ValueError: too many values to unpack (expected 2)

In [25]:
algos['SVD']

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2accb1dd790>

In [27]:
algos.keys()

dict_keys(['SVD', 'SVDpp', 'NMF', 'SlopeOne', 'kNNBasic', 'kNNWithMeans', 'kNNBaseLine', 'CoClustering', 'BaselineOnly', 'NormalPredictor'])

In [28]:
for trainset, testset in kf.split(data):
    # train and test algorithm.
    for key in algos:
        print(f'Algorithm => {key}')
        algos[key].fit(trainset)
        predictions = algos[key].test(testset)
        # Compute and print Root Mean Squared Error
        accuracy.rmse(predictions, verbose=True)
    

Algorithm => SVD
RMSE: 0.9310
Algorithm => SVDpp
RMSE: 0.9165
Algorithm => NMF
RMSE: 0.9554
Algorithm => SlopeOne
RMSE: 0.9406
Algorithm => kNNBasic
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9740
Algorithm => kNNWithMeans
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9501
Algorithm => kNNBaseLine
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9272
Algorithm => CoClustering
RMSE: 0.9632
Algorithm => BaselineOnly
Estimating biases using als...
RMSE: 0.9417
Algorithm => NormalPredictor
RMSE: 1.5204
Algorithm => SVD
RMSE: 0.9428
Algorithm => SVDpp
RMSE: 0.9269
Algorithm => NMF
RMSE: 0.9713
Algorithm => SlopeOne
RMSE: 0.9484
Algorithm => kNNBasic
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9851
Algorithm => kNNWithMeans
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9563
Algorithm => 

In [5]:
for trainset, testset in kf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9480
RMSE: 0.9392
RMSE: 0.9518


In [11]:
testt = accuracy.rmse(predictions, verbose=True)
testt

RMSE: 0.9518


0.9518102470214965

In [121]:


for trainset, testset in kf.split(data):
    abc =  trainset
    bcd =  testset

In [22]:
abc.all_ratings

<generator object Trainset.all_ratings at 0x289d809e0>

In [37]:
abc.ur

defaultdict(list,
            {0: [(0, 4.0),
              (13, 3.0),
              (430, 5.0),
              (105, 3.0),
              (226, 3.0),
              (129, 4.0),
              (159, 4.0),
              (48, 4.0),
              (98, 5.0),
              (748, 4.0),
              (468, 4.0),
              (344, 4.0),
              (442, 3.0),
              (300, 4.0),
              (403, 5.0),
              (741, 3.0),
              (1097, 5.0),
              (370, 5.0),
              (941, 3.0),
              (529, 3.0),
              (1103, 4.0),
              (452, 5.0),
              (321, 3.0),
              (253, 3.0),
              (140, 5.0),
              (436, 4.0),
              (697, 3.0),
              (428, 4.0),
              (131, 3.0),
              (69, 4.0),
              (865, 4.0),
              (49, 4.0),
              (564, 3.0),
              (117, 3.0),
              (1209, 5.0),
              (580, 3.0),
              (348, 4.0),
              (243, 4

In [7]:
iterator = trainset.all_ratings()
new_df = pd.DataFrame(columns=['uid', 'iid', 'rating'])
i = 0
for (uid, iid, rating) in iterator:
    new_df.loc[i] = [uid, iid, rating]
    i = i+1

new_df.head(2)

Unnamed: 0,uid,iid,rating
0,0.0,0.0,4.0
1,0.0,463.0,3.0


In [8]:
new_df['uid'].unique().size

943

In [9]:
df = new_df.copy()
ratings_matrix = df.pivot_table(index=['uid'],columns=['iid'],values='rating')
ratings_matrix.fillna(0, inplace = True)
ratings_matrix

iid,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,1604.0,1605.0,1606.0,1607.0,1608.0,1609.0,1610.0,1611.0,1612.0,1613.0
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,0.0,1.0,3.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,4.0,0.0,3.0,4.0,3.0,0.0,5.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
similarity_matrix = 1 - pairwise_distances(ratings_matrix.to_numpy(), metric='cosine')
np.fill_diagonal(similarity_matrix, 0)
similarity_matrix

array([[0.        , 0.21296654, 0.31883426, ..., 0.04657311, 0.01249149,
        0.07039025],
       [0.21296654, 0.        , 0.3486366 , ..., 0.03179044, 0.02557978,
        0.05855829],
       [0.31883426, 0.3486366 , 0.        , ..., 0.0453604 , 0.0160082 ,
        0.06540013],
       ...,
       [0.04657311, 0.03179044, 0.0453604 , ..., 0.        , 0.02890222,
        0.08143279],
       [0.01249149, 0.02557978, 0.0160082 , ..., 0.02890222, 0.        ,
        0.09828585],
       [0.07039025, 0.05855829, 0.06540013, ..., 0.08143279, 0.09828585,
        0.        ]])

In [11]:
adjacency_matrix = similarity_matrix.copy()
adjacency_matrix[similarity_matrix > np.percentile(similarity_matrix, 0.75)] = 1
adjacency_matrix[similarity_matrix <= np.percentile(similarity_matrix, 0.75)] = 0
adjacency_matrix

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [12]:
G = nx.from_numpy_matrix(adjacency_matrix)

In [13]:
G

<networkx.classes.graph.Graph at 0x1769a90f0>

In [14]:
coms_louvain = algorithms.louvain(G, weight='weight', resolution=1., randomize=False)

In [15]:
len(coms_louvain.communities)

2

In [16]:
"""
    See below all the pipeline 
"""

'\n    See below all the pipeline \n'

In [48]:

data = Dataset.load_builtin("ml-100k") #==> dataset
kf = KFold(n_splits=3, random_state=0) #==> folds 
algo = SVD() #==> algorithm

In [17]:
for trainset, testset in kf.split(data):
    train_set =  trainset
    test_set =  testset

In [18]:
iterator = train_set.all_ratings()
df_train = pd.DataFrame(columns=['uid', 'iid', 'rating'])
i = 0
for (uid, iid, rating) in iterator:
    df_train.loc[i] = [uid, iid, rating]
    i = i+1

df_train.head()

Unnamed: 0,uid,iid,rating
0,0.0,0.0,2.0
1,0.0,148.0,3.0
2,0.0,159.0,3.0
3,0.0,131.0,2.0
4,0.0,397.0,5.0


In [19]:
testset

[('617', '185', 5.0),
 ('305', '143', 3.0),
 ('123', '132', 3.0),
 ('183', '202', 4.0),
 ('852', '826', 3.0),
 ('234', '623', 2.0),
 ('799', '654', 5.0),
 ('690', '443', 3.0),
 ('936', '137', 4.0),
 ('518', '1114', 2.0),
 ('293', '568', 4.0),
 ('741', '95', 2.0),
 ('222', '276', 5.0),
 ('10', '463', 4.0),
 ('429', '58', 4.0),
 ('174', '80', 1.0),
 ('753', '523', 4.0),
 ('113', '262', 2.0),
 ('746', '281', 3.0),
 ('551', '1039', 4.0),
 ('697', '455', 4.0),
 ('936', '257', 3.0),
 ('312', '639', 5.0),
 ('296', '705', 5.0),
 ('922', '15', 4.0),
 ('487', '781', 3.0),
 ('758', '292', 4.0),
 ('635', '1', 4.0),
 ('642', '1076', 2.0),
 ('838', '111', 4.0),
 ('705', '393', 4.0),
 ('308', '521', 3.0),
 ('766', '176', 2.0),
 ('634', '340', 4.0),
 ('57', '204', 4.0),
 ('78', '294', 3.0),
 ('311', '418', 4.0),
 ('227', '1010', 3.0),
 ('488', '474', 2.0),
 ('791', '322', 4.0),
 ('313', '423', 4.0),
 ('677', '475', 4.0),
 ('222', '90', 2.0),
 ('621', '1', 3.0),
 ('302', '307', 4.0),
 ('537', '513', 4.

In [20]:
#Creating an user-item matrix
ratings_matrix = df_train.pivot_table(index=['uid'],columns=['iid'],values='rating') # pivoting the test_data datafrane to an matrix layout
ratings_matrix.fillna(0, inplace = True) # replacing the 'nulls' for 0
ratings_matrix

iid,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,1616.0,1617.0,1618.0,1619.0,1620.0,1621.0,1622.0,1623.0,1624.0,1625.0
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,2.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
#Creating an user-user similarity matrix
similarity_matrix = 1 - pairwise_distances(ratings_matrix.to_numpy(), metric="cosine")
np.fill_diagonal(similarity_matrix, 0) # zeroing the self-similarity
similarity_matrix

array([[0.        , 0.10783792, 0.31174978, ..., 0.19532948, 0.06389934,
        0.        ],
       [0.10783792, 0.        , 0.11551116, ..., 0.12122193, 0.10110901,
        0.11147192],
       [0.31174978, 0.11551116, 0.        , ..., 0.1856178 , 0.0223805 ,
        0.        ],
       ...,
       [0.19532948, 0.12122193, 0.1856178 , ..., 0.        , 0.        ,
        0.        ],
       [0.06389934, 0.10110901, 0.0223805 , ..., 0.        , 0.        ,
        0.14009081],
       [0.        , 0.11147192, 0.        , ..., 0.        , 0.14009081,
        0.        ]])

In [22]:
adjacency_matrix = similarity_matrix.copy()
adjacency_matrix[similarity_matrix > np.percentile(similarity_matrix, 75)] = 1
adjacency_matrix[similarity_matrix <= np.percentile(similarity_matrix, 75)] = 0

In [23]:
adjacency_matrix

array([[0., 0., 1., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
#Transforming the adjacency matrix into a graph
G = nx.from_numpy_matrix(adjacency_matrix)

In [25]:
print(f'# nodes: {G.number_of_nodes():f}')
print(f'# edges: {G.number_of_edges():f}')

# nodes: 943.000000
# edges: 111156.000000


In [26]:
#Louvain Community Detection
coms_louvain = algorithms.louvain(G, weight='weight', resolution=1., randomize=False)

In [31]:
len(coms_louvain.communities)


4

In [28]:
for i in coms_louvain.communities:
  print(min(i))

0
1
3
376


In [32]:
data={}
for i, communitie in enumerate(coms_louvain.communities):
    data[i] = pd.DataFrame({'uid': communitie, 'community': i})

In [33]:
dataset = pd.concat(data, ignore_index=True)
dataset

Unnamed: 0,uid,community
0,0,0
1,2,0
2,4,0
3,5,0
4,6,0
...,...,...
938,942,2
939,376,3
940,629,3
941,839,3


In [34]:
df_train_with_community = pd.merge(df_train,dataset, on='uid', how='left')
df_train_with_community

Unnamed: 0,uid,iid,rating,community
0,0.0,0.0,2.0,0
1,0.0,148.0,3.0,0
2,0.0,159.0,3.0,0
3,0.0,131.0,2.0,0
4,0.0,397.0,5.0,0
...,...,...,...,...
66662,942.0,326.0,4.0,2
66663,942.0,272.0,4.0,2
66664,942.0,956.0,4.0,2
66665,942.0,957.0,4.0,2


In [35]:
# treinando para o fold XPTO
 # community 0
temp = df_train_with_community.copy()
temp = temp[temp['community'] == 0]
temp.drop(['community'], axis=1, inplace=True)
temp

Unnamed: 0,uid,iid,rating
0,0.0,0.0,2.0
1,0.0,148.0,3.0
2,0.0,159.0,3.0
3,0.0,131.0,2.0
4,0.0,397.0,5.0
...,...,...,...
66505,931.0,1561.0,5.0
66506,931.0,480.0,1.0
66507,931.0,388.0,4.0
66508,931.0,394.0,4.0


In [36]:
len(set(temp['uid']))

472

In [37]:
temp['rating'].value_counts()

4.0    17314
3.0    13766
5.0    10535
2.0     5590
1.0     2782
Name: rating, dtype: int64

In [38]:
#pandas para trainset
reader = Reader(rating_scale=(1, 5))
new_df2 = Dataset.load_from_df(temp, reader)

In [39]:
new_df2 = new_df2.build_full_trainset()
type(new_df2)

surprise.trainset.Trainset

In [40]:
new_df2.n_users

472

In [41]:
algo.fit(new_df2)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x176bc1690>

In [None]:
### e o testset?


In [42]:
testset

[('617', '185', 5.0),
 ('305', '143', 3.0),
 ('123', '132', 3.0),
 ('183', '202', 4.0),
 ('852', '826', 3.0),
 ('234', '623', 2.0),
 ('799', '654', 5.0),
 ('690', '443', 3.0),
 ('936', '137', 4.0),
 ('518', '1114', 2.0),
 ('293', '568', 4.0),
 ('741', '95', 2.0),
 ('222', '276', 5.0),
 ('10', '463', 4.0),
 ('429', '58', 4.0),
 ('174', '80', 1.0),
 ('753', '523', 4.0),
 ('113', '262', 2.0),
 ('746', '281', 3.0),
 ('551', '1039', 4.0),
 ('697', '455', 4.0),
 ('936', '257', 3.0),
 ('312', '639', 5.0),
 ('296', '705', 5.0),
 ('922', '15', 4.0),
 ('487', '781', 3.0),
 ('758', '292', 4.0),
 ('635', '1', 4.0),
 ('642', '1076', 2.0),
 ('838', '111', 4.0),
 ('705', '393', 4.0),
 ('308', '521', 3.0),
 ('766', '176', 2.0),
 ('634', '340', 4.0),
 ('57', '204', 4.0),
 ('78', '294', 3.0),
 ('311', '418', 4.0),
 ('227', '1010', 3.0),
 ('488', '474', 2.0),
 ('791', '322', 4.0),
 ('313', '423', 4.0),
 ('677', '475', 4.0),
 ('222', '90', 2.0),
 ('621', '1', 3.0),
 ('302', '307', 4.0),
 ('537', '513', 4.

In [43]:
type(testset[0][0])

str

In [44]:
test_set = pd.DataFrame.from_records(testset, columns = ['uid', 'iid', 'rating'])
test_set

Unnamed: 0,uid,iid,rating
0,617,185,5.0
1,305,143,3.0
2,123,132,3.0
3,183,202,4.0
4,852,826,3.0
...,...,...,...
33328,395,1060,2.0
33329,372,159,5.0
33330,871,286,3.0
33331,326,507,2.0


In [45]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33333 entries, 0 to 33332
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   uid     33333 non-null  object 
 1   iid     33333 non-null  object 
 2   rating  33333 non-null  float64
dtypes: float64(1), object(2)
memory usage: 781.4+ KB


In [46]:
test_set['uid'] = pd.to_numeric(test_set['uid'])
test_set['iid'] = pd.to_numeric(test_set['iid'])

In [47]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33333 entries, 0 to 33332
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   uid     33333 non-null  int64  
 1   iid     33333 non-null  int64  
 2   rating  33333 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 781.4 KB


In [48]:
df_test_with_community = pd.merge(test_set,dataset, on='uid', how='left')
df_test_with_community

Unnamed: 0,uid,iid,rating,community
0,617,185,5.0,1.0
1,305,143,3.0,0.0
2,123,132,3.0,0.0
3,183,202,4.0,0.0
4,852,826,3.0,1.0
...,...,...,...,...
33328,395,1060,2.0,0.0
33329,372,159,5.0,0.0
33330,871,286,3.0,0.0
33331,326,507,2.0,1.0


In [49]:
# predict para o fold XPTO
 # community 0
temp2 = df_test_with_community.copy()
temp2 = temp2[temp2['community'] == 0]
temp2.drop(['community'], axis=1, inplace=True)
temp2

Unnamed: 0,uid,iid,rating
1,305,143,3.0
2,123,132,3.0
3,183,202,4.0
5,234,623,2.0
9,518,1114,2.0
...,...,...,...
33322,284,906,3.0
33328,395,1060,2.0
33329,372,159,5.0
33330,871,286,3.0


In [50]:
temp2['uid'] = temp2['uid'].apply(lambda x: str(x))

In [51]:
temp2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16000 entries, 1 to 33332
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   uid     16000 non-null  object 
 1   iid     16000 non-null  int64  
 2   rating  16000 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 500.0+ KB


In [52]:
temp2

Unnamed: 0,uid,iid,rating
1,305,143,3.0
2,123,132,3.0
3,183,202,4.0
5,234,623,2.0
9,518,1114,2.0
...,...,...,...
33322,284,906,3.0
33328,395,1060,2.0
33329,372,159,5.0
33330,871,286,3.0


In [53]:
tttt = temp2.to_records(index=False)
tttt

rec.array([('305', 143, 3.), ('123', 132, 3.), ('183', 202, 4.), ...,
           ('372', 159, 5.), ('871', 286, 3.), ('327', 186, 2.)],
          dtype=[('uid', 'O'), ('iid', '<i8'), ('rating', '<f8')])

In [54]:
predictions = algo.test(tttt)

In [55]:
predictions

[Prediction(uid='305', iid=143, r_ui=3.0, est=2.992289048685051, details={'was_impossible': False}),
 Prediction(uid='123', iid=132, r_ui=3.0, est=3.9654910944481667, details={'was_impossible': False}),
 Prediction(uid='183', iid=202, r_ui=4.0, est=3.7674566311191544, details={'was_impossible': False}),
 Prediction(uid='234', iid=623, r_ui=2.0, est=3.2106255257257397, details={'was_impossible': False}),
 Prediction(uid='518', iid=1114, r_ui=2.0, est=3.594872038503579, details={'was_impossible': False}),
 Prediction(uid='222', iid=276, r_ui=5.0, est=3.7685236865050142, details={'was_impossible': False}),
 Prediction(uid='10', iid=463, r_ui=4.0, est=4.031921988508553, details={'was_impossible': False}),
 Prediction(uid='429', iid=58, r_ui=4.0, est=3.1433173447027762, details={'was_impossible': False}),
 Prediction(uid='113', iid=262, r_ui=2.0, est=3.5884200167448497, details={'was_impossible': False}),
 Prediction(uid='746', iid=281, r_ui=3.0, est=3.7054385670643164, details={'was_imposs

In [63]:
predictions[0][3]

2.992289048685051

In [65]:
accuracy.rmse(predictions, verbose=True)

RMSE: 1.1717


1.1717153496142778

In [66]:
tste_1 = pd.DataFrame (predictions, columns = ['uid', 'iid', 'r_ui', 'est', 'details'])
tste_1

Unnamed: 0,uid,iid,r_ui,est,details
0,305,143,3.0,2.992289,{'was_impossible': False}
1,123,132,3.0,3.965491,{'was_impossible': False}
2,183,202,4.0,3.767457,{'was_impossible': False}
3,234,623,2.0,3.210626,{'was_impossible': False}
4,518,1114,2.0,3.594872,{'was_impossible': False}
...,...,...,...,...,...
15995,284,906,3.0,3.153708,{'was_impossible': False}
15996,395,1060,2.0,3.653659,{'was_impossible': False}
15997,372,159,5.0,3.396015,{'was_impossible': False}
15998,871,286,3.0,4.098881,{'was_impossible': False}


In [67]:
tste_1['diff'] =  tste_1['r_ui'] - tste_1['est']
tste_1

Unnamed: 0,uid,iid,r_ui,est,details,diff
0,305,143,3.0,2.992289,{'was_impossible': False},0.007711
1,123,132,3.0,3.965491,{'was_impossible': False},-0.965491
2,183,202,4.0,3.767457,{'was_impossible': False},0.232543
3,234,623,2.0,3.210626,{'was_impossible': False},-1.210626
4,518,1114,2.0,3.594872,{'was_impossible': False},-1.594872
...,...,...,...,...,...,...
15995,284,906,3.0,3.153708,{'was_impossible': False},-0.153708
15996,395,1060,2.0,3.653659,{'was_impossible': False},-1.653659
15997,372,159,5.0,3.396015,{'was_impossible': False},1.603985
15998,871,286,3.0,4.098881,{'was_impossible': False},-1.098881


In [68]:
tste_1['sqd'] = tste_1['diff']**2
tste_1

Unnamed: 0,uid,iid,r_ui,est,details,diff,sqd
0,305,143,3.0,2.992289,{'was_impossible': False},0.007711,0.000059
1,123,132,3.0,3.965491,{'was_impossible': False},-0.965491,0.932173
2,183,202,4.0,3.767457,{'was_impossible': False},0.232543,0.054076
3,234,623,2.0,3.210626,{'was_impossible': False},-1.210626,1.465614
4,518,1114,2.0,3.594872,{'was_impossible': False},-1.594872,2.543617
...,...,...,...,...,...,...,...
15995,284,906,3.0,3.153708,{'was_impossible': False},-0.153708,0.023626
15996,395,1060,2.0,3.653659,{'was_impossible': False},-1.653659,2.734587
15997,372,159,5.0,3.396015,{'was_impossible': False},1.603985,2.572767
15998,871,286,3.0,4.098881,{'was_impossible': False},-1.098881,1.207540


In [70]:
sum(tste_1['sqd'])/len(tste_1)

1.3729168605217068

In [161]:
testset

[('468', '724', 4.0),
 ('117', '117', 5.0),
 ('209', '898', 3.0),
 ('253', '523', 4.0),
 ('88', '302', 3.0),
 ('545', '449', 2.0),
 ('334', '10', 4.0),
 ('16', '602', 5.0),
 ('287', '250', 3.0),
 ('747', '315', 4.0),
 ('606', '124', 3.0),
 ('104', '926', 1.0),
 ('790', '401', 4.0),
 ('162', '943', 4.0),
 ('286', '408', 4.0),
 ('43', '294', 5.0),
 ('76', '156', 3.0),
 ('327', '117', 3.0),
 ('880', '2', 3.0),
 ('533', '77', 4.0),
 ('233', '12', 2.0),
 ('405', '1110', 1.0),
 ('846', '700', 2.0),
 ('655', '236', 3.0),
 ('330', '465', 5.0),
 ('776', '525', 2.0),
 ('432', '246', 4.0),
 ('938', '258', 5.0),
 ('308', '7', 4.0),
 ('615', '629', 4.0),
 ('276', '300', 4.0),
 ('638', '511', 3.0),
 ('612', '924', 5.0),
 ('881', '69', 3.0),
 ('171', '340', 3.0),
 ('249', '96', 4.0),
 ('2', '293', 4.0),
 ('62', '697', 4.0),
 ('188', '1213', 2.0),
 ('496', '7', 4.0),
 ('381', '134', 5.0),
 ('459', '989', 5.0),
 ('145', '448', 5.0),
 ('87', '732', 4.0),
 ('8', '341', 2.0),
 ('11', '12', 2.0),
 ('843', 

In [170]:
tttt[0][0] == testset[1][0]

False

In [169]:
testset[1][0]

'117'

In [172]:
temp2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16921 entries, 1 to 33330
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   uid     16921 non-null  object 
 1   iid     16921 non-null  object 
 2   rating  16921 non-null  float64
dtypes: float64(1), object(2)
memory usage: 528.8+ KB
