In [None]:
# if this way of importing another jupyter notebook fails for you
# then you can use any one of the many methods described here:
# https://stackoverflow.com/questions/20186344/ipynb-import-another-ipynb-file
%run 'revisedcode.ipynb'

# IV: Test Script / Experiment
The following jupyter notebook cells make calls to above cells to run experiments on a recommendation dataset.

### Setting constants

In [13]:
'''Dataset Parameters'''
DATA_PATH = './ml-100k/u.data' # ml-100k data set has 100k ratings, 943 users and 1682 items
#DATA_TYPE =  0              # 0: CSR format, 1: 2D matrix format  # TODO: use it
DELIMITER = "\t"             # tab separated or comma separated data format
FIRST_INDEX = 1
N_RATINGS = 100000
USERS = 943
ITEMS = 1682

In [14]:
'''Hyperparameters'''
C1 = 0.2                # probability of edges in training set going to E1
C2 = 0.3                # probability of edges in training set going to E2
C3 = 1 - C1 - C2        # probability of edges in training set going to E3
RADIUS = 3              # radius of neighborhood, radius = # edges between start and end vertex, keep it -1 to use default value given in paper
THRESHOLD = 943

#checks on parameters
if C3 <= 0:
    print('ERROR: Please set the values of C1 and C2, s.t, C1+C2 < 1')

In [15]:
'''Hardcoding values'''
OFFSET = USERS + 10                     # offset so that user_id and item_id are different in graph; keep it >= #USERS
UNOBSERVED = -1
GET_PRODUCT_FAIL_RETURN = UNOBSERVED    #TODO: This hardcoding can be removed in future
TRAIN_TEST_SPLIT = 0.2                  # %age of test ratings wrt train rating ; value in between 0 and 1
AVG_RATING = 3                          # ratings for which we dont have predicted rating

### Read and prepare the dataset

In [16]:
data_csr = read_data_csr(fname=DATA_PATH, delimiter=DELIMITER)

if data_csr.shape[0] == N_RATINGS:  # gives total no of ratings read; useful for verification
    print('Reading dataset: done')
else:
    print('Reading dataset: FAILED')
    #print( '# of missing ratings: ' + str(N_RATINGS - data_csr.shape[0]))  #TODO

Reading dataset: done


In [17]:
check_dataset_csr(data_csr=data_csr)

USERS: 943
ITEMS: 1682
All users and items have at least one rating! Good!
Sparsity of given matrix p: 0.0630466936422
Sparsity of large symmetricized matrix p: 0.0290249433107
Asymm matrix: p is polynomially larger than 1/n, all guarantees applicable
Sym matrix : p is polynomially larger than 1/n, all guarantees applicable


In [18]:
#TODO : normalize the ratings and symmtericize the given matrix

In [19]:
[train_data_csr, test_data_csr] = generate_train_test_split_csr(data_csr=data_csr, split=TRAIN_TEST_SPLIT)

Generating train test split: done


### Make predictions using THE algorithm 

##### Step 1: Sample splitting

In [20]:
[m1_csr, m2_csr, m3_csr] = sample_splitting_csr(data_csr=data_csr, c1=C1, c2=C2, shuffle=False)

Sample splitting: done


##### Step 2: Expanding the Neighborhood

In [21]:
product_matrix = generate_product_matrix(data_csr, m1_csr, c1=C1, radius=RADIUS)
#TODO: check why generating product matrix is taking about a minute longer w.r.t. rawcode

Creating graph as dictionary:


100%|██████████| 2635/2635 [00:04<00:00, 568.31it/s]

Generating product matrix:



100%|██████████| 943/943 [01:42<00:00,  9.18it/s]


##### Step 3: Computing the distances

In [22]:
user_sim_matrix = generate_user_sim_matrix(data_csr, m1_csr, product_matrix)
# del product_matrix

Generating user sim matrix (pearson similarity):


  r = r_num / r_den
100%|██████████| 943/943 [00:53<00:00, 17.77it/s]


##### Step 4: Averaging datapoints to produce final estimate

In [23]:
predicted_matrix = generated_weighted_averaged_prediction_matrix(data_csr, m3_csr, user_sim_matrix, bounded=True)
# del user_sim_matrix

Generating prediction matrix:


100%|██████████| 943/943 [00:49<00:00, 18.88it/s]


### Evaluate the predictions

In [24]:
[y_actual, y_predict] = generate_true_and_test_labels(test_data_csr, predicted_matrix)
# del predicted_matrix

Generating true and test label:


100%|██████████| 20000/20000 [00:00<00:00, 199634.65it/s]


In [25]:
get_rmse(y_actual, y_predict)

1.2437242459645144

In [26]:
get_avg_err(y_actual, y_predict)

0.99985000000000002

In [27]:
check_mse(data_csr, y_actual, y_predict) # TODO: this might be because the matrix considered here is not symmetric?

ERROR: Contrary to the discusssion in the paper, MSE is NOT bounded by O((pn)**(-1/5))
