In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, save_npz, load_npz
# Scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import os

# Model Evaluation
from evaluation import evaluate
%matplotlib inline

ModuleNotFoundError: No module named 'ml_metrics'

In [None]:
! explorer.exe .

## Understanding the data

- The dataset that you selected is appropriated for building a RS?
- Do you have data regarding the items or only about the users' preference?
- Do you have a test dataset or do you have to create it?

In [3]:
print("BookRatings.csv \n")
!head -4 data/BookRatings.csv

BookRatings.csv 

User-ID,ISBN,Book-Rating
99,0316748641,7
99,0446677450,10
99,0553347594,9


In [4]:
print("BooksMetaInfo.csv \n")
!head -2 data/BooksMetaInfo.csv

BooksMetaInfo.csv 

ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,authors,description,pageCount,categories
0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg,"['Mark P. O. Morford', 'Robert J. Lenardon']","Provides an introduction to classical myths placing the addressed topics within their historical context, discussion of archaeological evidence as support for mythical events, and how these themes have been portrayed in literature, art, music, and film.",808.0,['Social Science']


In [5]:
print("BooksUsers.csv \n")
!head -4 data/BooksUsers.csv

BooksUsers.csv 

User-ID,Location,Age
2,"stockton, california, usa",18.0
8,"timmins, ontario, canada",
9,"germantown, tennessee, usa",


In [6]:
print("test_users.csv \n")
!head -4 data/test_users.csv

test_users.csv 

User-ID
114
507
850


## Load the Data

In [7]:
path = os.path.join('data','BookRatings.csv')
book_ratings_df = pd.read_csv(path)
book_ratings_df.head(n=20)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,99,0316748641,7
1,99,0446677450,10
2,99,0553347594,9
3,99,0451166892,3
4,99,0671621009,10
5,99,0312252617,8
6,114,0671027360,10
7,114,0553584383,9
8,114,0451208080,8
9,243,155874262X,5


In [8]:
book_ratings_df.shape

(109209, 3)

In [9]:
path = os.path.join('data','BooksMetaInfo.csv')
book_meta_info_df = pd.read_csv(path)
book_meta_info_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,authors,description,pageCount,categories
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,"['Mark P. O. Morford', 'Robert J. Lenardon']",Provides an introduction to classical myths pl...,808.0,['Social Science']
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,['Richard Bruce Wright'],"In a small town in Canada, Clara Callan reluct...",414.0,['Actresses']
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,"[""Carlo D'Este""]","Here, for the first time in paperback, is an o...",555.0,['1940-1949']
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,['Gina Bari Kolata'],"Describes the great flu epidemic of 1918, an o...",330.0,['Medical']
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,['E. J. W. Barber'],A look at the incredibly well-preserved ancien...,240.0,['Design']


In [10]:
path = os.path.join('data','BooksUsers.csv')
book_user_df = pd.read_csv(path)
book_user_df.head()

Unnamed: 0,User-ID,Location,Age
0,2,"stockton, california, usa",18.0
1,8,"timmins, ontario, canada",
2,9,"germantown, tennessee, usa",
3,10,"albacete, wisconsin, spain",26.0
4,12,"fort bragg, california, usa",


In [11]:
path = os.path.join('data','test_users.csv')
test_user_df = pd.read_csv(path)
test_user_df.head()

Unnamed: 0,User-ID
0,114
1,507
2,850
3,3346
4,4092


In [12]:
test_user_df.shape

(589, 1)

## Process and clean data
- Check if data needs to be processed and cleaned.
- Process and clean data if necessary.

In [13]:
book_ratings_df.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

## Identify and separate the Users
- Which users are present in the training data?
- Make sure that you identify which test users are present in the training data and which are not.
- Can you use personalized methologies for all users?

In [14]:
# YOUR CODE HERE
# The indexes in training data(book_ratings_df)
test_index_in_training_data = test_user_df[test_user_df['User-ID'].isin(book_ratings_df['User-ID'].values).values].index
test_index_in_training_data

Int64Index([  0,   1,   2,   3,   4,   5,   6,   8,   9,  10,
            ...
            577, 578, 579, 580, 581, 582, 583, 584, 585, 588],
           dtype='int64', length=489)

In [15]:
# indixes not in training data(book_ratings_df)
test_index_not_in_training_data = test_user_df[~test_user_df['User-ID'].isin(book_ratings_df['User-ID'].values).values].index
test_index_not_in_training_data

Int64Index([  7,  11,  18,  28,  31,  37,  51,  54,  56,  57,  59,  62,  66,
             71,  86,  91,  96,  99, 107, 113, 121, 125, 126, 142, 144, 151,
            158, 160, 164, 173, 174, 177, 186, 197, 199, 205, 206, 209, 212,
            225, 228, 229, 244, 249, 258, 259, 263, 264, 266, 267, 280, 288,
            296, 298, 300, 302, 304, 306, 318, 338, 342, 359, 361, 364, 368,
            379, 399, 413, 414, 415, 421, 423, 426, 437, 448, 453, 454, 460,
            461, 462, 463, 465, 487, 488, 490, 505, 508, 509, 511, 513, 520,
            539, 545, 549, 555, 566, 567, 571, 586, 587],
           dtype='int64')

In [16]:
len(test_index_not_in_training_data)

100

In [17]:
len(test_index_in_training_data)

489

In [31]:
!explorer.exe .

In [32]:
#data frmae for the test data exists in training data
test_index_in_training_data_df = test_user_df.iloc[test_index_in_training_data,:]
test_index_in_training_data_df

Unnamed: 0,User-ID
0,114
1,507
2,850
3,3346
4,4092
...,...
582,276463
583,276626
584,276681
585,276847


In [33]:
# test indexes not in training data dataframe
test_index_not_in_training_data_df = test_user_df.iloc[test_index_not_in_training_data,:]
test_index_not_in_training_data_df

Unnamed: 0,User-ID
7,5490
11,8454
18,12569
28,15185
31,17003
...,...
566,272715
567,273190
571,274056
586,277901


## Create the Ratings Matrix

In [34]:
# YOUR CODE HERE
user_id, user_pos = np.unique(book_ratings_df.iloc[:, 0].values, return_inverse=True)
ISBN, ISBN_pos = np.unique(book_ratings_df.iloc[:, 1].values, return_inverse=True)
values = book_ratings_df.iloc[:, 2].fillna(0).values
    
#R Matrix dimensions (n_users, n_items)
shape = (len(user_id), len(ISBN))
R = csr_matrix((values, (user_pos, ISBN_pos)), shape=shape)
R.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [35]:
ISBN[10]

'0002197154'

In [36]:
ISBN_pos[10]

11141

# Split train and validation set from train set and creating rating matrix for each one of them

In [37]:
test_size = 0.2
train_data, val_data = train_test_split(book_ratings_df, test_size=test_size, random_state=8)

In [38]:
train_data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
24506,56959,0385313632,10
56801,135149,014023828X,3
84895,209486,0679450424,5
666,2110,0590879979,9
99455,248718,0415073294,9


In [39]:
val_data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
26666,62891,140119906,10
95881,240568,425108783,5
16464,35859,70212570,10
59610,141493,385336179,10
46170,106816,679721037,8


In [40]:
 #Store the indexes of each observation to identify which records to replace with zero
train_index = train_data.index
val_index = val_data.index
#make copies of data to replace the observations
train_data_clean = book_ratings_df.copy()
val_data_clean = book_ratings_df.copy()
train_data_clean.loc[val_index,['Book-Rating']]=0
val_data_clean.loc[train_index,['Book-Rating']] = 0



In [41]:
# function to make rating matrix
def make_ratings(data: pd.DataFrame) -> csr_matrix:
    """Creates the ratings matrix of listening history with optional shape
    
    Creates the ratings matrix from the listening history imported using the read_users_history() method.
    
    Args:
        data (pd.DataFrame):  Listening history for the users.
        shape (tuple): The overall (n_users, n_items) shape desired for the matrix. 
                       If None, define the shape with the (n_users, n_items) from data argument.
        
    Returns:
        ratings (csr_matrix): Ratings matrix with shape (n_users, n_items).
    
    """
    users, user_pos = np.unique(data.iloc[:, 0].values, return_inverse=True)
    items, item_pos = np.unique(data.iloc[:, 1].values, return_inverse=True)
    values = data.iloc[:, 2].fillna(0).values
    
    #R Matrix dimensions (n_users, n_items)
    shape = (len(users), len(items))

    R_ = csr_matrix((values, (user_pos, item_pos)), shape=shape)
    return R_


In [44]:
R_train = make_ratings(train_data_clean)
R_val = make_ratings(val_data_clean)
R_train.eliminate_zeros()
R_val.eliminate_zeros()

In [45]:
R_train.shape

(5719, 47768)

In [46]:
R_val.shape

(5719, 47768)

## Non-Personalized Recommendations
- Create non-personalized recommendations as a baseline.
- Apply the recommendations to the test users.
- Store results in the required format for submission.
- Submit baseline recommendations.

In [47]:
# This is for the test ids which are not in the training set

def get_most_rated(ratings: csr_matrix,treshold : int, n: int) -> np.matrix:
    """Returns the n most rated items in a ratings matrix.
    
    Args:
        ratings (csr_matrix): A sparse ratings matrix
        n (int): The number of top-n items we should retrieve.
        
    Returns:
        most_rated (np.matrix): An array of the most rated items.
    
    """ 
    ratings_=ratings.toarray()
    R_greater =  np.greater(ratings_,treshold)
    R_mask_int = R_greater.astype(np.int)
    R_treshold = np.multiply(ratings_,R_mask_int)
    R_best_array = R_treshold.sum(axis=0)
        
        
    return np.negative(R_best_array).argsort()[:n]


non_pers_most_rated = get_most_rated(R_train,5,10)
non_pers_most_rated


array([ 7192,  5791, 23964,  3950, 16148, 17420, 25509, 19960, 32475,
       16034])

In [48]:
non_pers_most_rated_isbn = ISBN[non_pers_most_rated]
non_pers_most_rated_isbn

array(['0316666343', '0312195516', '059035342X', '0142001740',
       '043935806X', '0446310786', '0671027360', '0452282152',
       '0786868716', '0439139597'], dtype=object)

In [49]:
book_ratings_unique_df = pd.DataFrame(np.unique(book_ratings_df.iloc[:,0].values),columns = ['User-ID'])
book_ratings_unique_df

Unnamed: 0,User-ID
0,99
1,114
2,243
3,244
4,254
...,...
5714,278554
5715,278582
5716,278633
5717,278843


In [50]:
non_pers_df = pd.DataFrame(np.zeros((len(book_ratings_unique_df), 1), dtype=non_pers_most_rated.dtype) + non_pers_most_rated)
non_pers_df = pd.concat([book_ratings_unique_df, non_pers_df], axis=1)
non_pers_df = non_pers_df.set_index('User-ID')
non_pers_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
99,7192,5791,23964,3950,16148,17420,25509,19960,32475,16034
114,7192,5791,23964,3950,16148,17420,25509,19960,32475,16034
243,7192,5791,23964,3950,16148,17420,25509,19960,32475,16034
244,7192,5791,23964,3950,16148,17420,25509,19960,32475,16034
254,7192,5791,23964,3950,16148,17420,25509,19960,32475,16034


In [51]:
def create_dict_preds(preds_df: pd.DataFrame) -> dict:
    """Convert the predictions DataFrame (index:users -> columns: items) to a dictionary of key (user->list of items).
    
    Args: 
        preds_df (pd.DataFrame): DataFrame containing the users and the ordered predictions.
        
    Returns:
        preds_dict (dict): Dict of (user_id: list of items) used for evaluating the performance.
    
    """
    return {preds_df.index[i]: preds_df.values[i].tolist() for i in range(len(preds_df))}


non_pers_dict = create_dict_preds(non_pers_df)
# Since dicts in python are not ordered, we need to HAMMER DOWN a way to print some values.
dict(list(non_pers_dict.items())[0:5])

{99: [7192, 5791, 23964, 3950, 16148, 17420, 25509, 19960, 32475, 16034],
 114: [7192, 5791, 23964, 3950, 16148, 17420, 25509, 19960, 32475, 16034],
 243: [7192, 5791, 23964, 3950, 16148, 17420, 25509, 19960, 32475, 16034],
 244: [7192, 5791, 23964, 3950, 16148, 17420, 25509, 19960, 32475, 16034],
 254: [7192, 5791, 23964, 3950, 16148, 17420, 25509, 19960, 32475, 16034]}

In [52]:
def get_y_true(R_val_: csr_matrix, users_to_pred: pd.DataFrame, n=10):
    """Get the ground truth (best recommendations) of the users in the validation set.
    
    Args:
        R_val_ (csr_matrix): Validation set ratings matrix.
        users_to_pred: 
        n (int): Number of top-n items.
        
    Returns:
        y_true_df (pd.DataFrame): DataFrame which returns the y_true items.
        
    """
    top_from_R_val = pd.DataFrame(np.negative(R_val_).toarray().argsort()[:, :n])
    y_true_df = pd.concat([users_to_pred, top_from_R_val], axis=1)
    y_true_df = y_true_df.set_index("User-ID")
    return y_true_df


y_true_df = get_y_true(R_val, book_ratings_unique_df, n=10)
y_true_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
99,17954,0,31839,31840,31841,31842,31843,31844,31845,31846
114,23519,0,31839,31840,31841,31842,31843,31844,31845,31846
243,11141,3301,0,31841,31842,31843,31844,31845,31846,31847
244,33261,31840,31841,31842,31843,31844,31845,31846,31847,31848
254,15995,12652,11700,42600,11699,6424,10942,0,31844,31845


In [53]:
# Create the dictionary with the ground truth.
y_true_dict = create_dict_preds(y_true_df)
# Since dicts in python are not ordered, we need to HAMMER DOWN a way to print some values.
dict(list(y_true_dict.items())[0:5])

{99: [17954, 0, 31839, 31840, 31841, 31842, 31843, 31844, 31845, 31846],
 114: [23519, 0, 31839, 31840, 31841, 31842, 31843, 31844, 31845, 31846],
 243: [11141, 3301, 0, 31841, 31842, 31843, 31844, 31845, 31846, 31847],
 244: [33261, 31840, 31841, 31842, 31843, 31844, 31845, 31846, 31847, 31848],
 254: [15995, 12652, 11700, 42600, 11699, 6424, 10942, 0, 31844, 31845]}

## Evaluate results
- Calculate the evaluation metric on the validation users.
- Compare it later with the personalized recommendations

In [54]:
# YOUR CODE HERE
evaluate(y_true_dict, non_pers_dict)

NameError: name 'evaluate' is not defined

# Predict

In [None]:
# Join both dataframes with user_id's
all_test_users = pd.concat([test_index_in_training_data_df,test_index_not_in_training_data_df]).reset_index(drop=True)
all_test_users

In [None]:
def convert_non_pers_recommendations_to_df(non_pers_recs: np.array, users_to_pred: pd.DataFrame) -> pd.DataFrame:
    non_pers_df = pd.DataFrame(np.zeros((len(users_to_pred), 1), dtype=non_pers_recs.dtype) + non_pers_recs)
    non_pers_df = pd.concat([users_to_pred, non_pers_df], axis=1)
    non_pers_df = non_pers_df.set_index("User-ID")
    
    return non_pers_df


non_pers_test_most_rated_df = convert_non_pers_recommendations_to_df(non_pers_most_rated, all_test_users)
print(non_pers_test_most_rated_df.shape)
non_pers_test_most_rated_df.head()

# Final format of test recomendation

In [None]:
values = ISBN[non_pers_test_most_rated_df.values].reshape(-1,1)
index_to_list = 10*non_pers_test_most_rated_df.index.tolist()
index_to_list.sort()
test_final_format = pd.DataFrame(index_to_list, columns = ['User-ID'])
test_final_format['ISBN']=values
#test_final_format = test_final_format.set_index('User-ID')
test_final_format

In [None]:
def save_predictions(predictions: pd.DataFrame, output_path: str):
    """Save predictions to csv.
    
    Saves the predictions into a csv file with the format we need.
    We keep the index since it contains the user ids.
    
    Args:
        predictions (pd.DataFrame): DataFrame with user_id as index and ordered recommendations in the columns.
        output_path (str): Filepath for the predictions file.
    
    """
    predictions.to_csv(output_path,index = False)
    print(f"Saved to csv in '{output_path}'.")
    
    
save_predictions(non_pers_test_most_rated_df, os.path.join("data", "test_non_personalized_recommendations_books.csv"))

In [None]:
save_predictions(test_final_format, os.path.join("data", "test_final_format.csv"))

In [None]:
path2 = os.path.join('data','test_final_format.csv')
ff = pd.read_csv(path2)
ff

In [None]:
# Filter the non-personalized recommendations for the users without training data and save
new_users_recommendations = non_pers_test_most_rated_df.iloc[test_index_not_in_training_data]
save_predictions(new_users_recommendations, os.path.join("data", "new_users_non_personalized_without_infor.csv"))

In [None]:
from validate_sample_submission import validate_submission


In [None]:
validate_submission(test_final_format)

## Personalized Recommendations: Collaborative Filtering
- Compute the user similarities matrix.
- Predict ratings.
- Select the best recommendations.
- Submit recommendations.

In [None]:
# YOUR CODE HERE
def make_user_similarities(R_: csr_matrix) -> csr_matrix:
    """Creates the user similarities matrix.
    
    Args:
        R_ (csr_matrix): Ratings matrix.
        
    Returns:
        user_similarities (csr_matrix): Matrix with user similarities.
    
    """
    return cosine_similarity(R_, dense_output=False)


user_similarities = make_user_similarities(R_train)
user_similarities

In [None]:
def make_user_predictions_collab_filt(S: csr_matrix, R_: csr_matrix):
    """Predict using collaborative filtering.
    
    Args:
        S (csr_matrix): Similarities matrix (tipically using the cosine_similarity).
        R_ (csr_matrix): Ratings matrix.
        
    Returns:
        preds (csr_matrix): Predictions matrix.
    
    """
    weighted_sum = np.dot(S, R_)
    
    # We use the absolute value to support negative similarities.
    # In this particular example there are none.
    sum_of_weights = np.abs(S).sum(axis=1)
    
    preds = weighted_sum / sum_of_weights
    
    # Exclude previously rated items.
    preds[R_.nonzero()] = 0
    
    return csr_matrix(preds)
 

collab_filt_user_preds = make_user_predictions_collab_filt(user_similarities, R_train)
collab_filt_user_preds

In [None]:
def sparsity(matrix: csr_matrix) -> float:
    """Calculates the sparsity of a matrix.
    
    Args:
        matrix (csr_matrix): Sparse matrix.
        
    Returns:
        sparsity_ (float): Sparsity percentage (between 0 and 1).
    
    """
    return 1 - matrix.nnz / (matrix.shape[0] * matrix.shape[1])


sparsity(collab_filt_user_preds)

In [None]:
def get_most_rated_from_user_preds(user_preds_: csr_matrix, n: int) -> np.matrix:
    """Returns the n most rated items from the user predictions.
    
    Args:
        user_preds_ (csr_matrix): A sparse ratings matrix
        n (int): The number of top-n items we should retrieve.
        
    Returns:
        most_rated (np.matrix): An array of the most rated items.
    
    """
    pred_ = np.negative(user_preds_).toarray()
    return pred_.argsort()[:, :n]


collab_filt_most_rated = get_most_rated_from_user_preds(collab_filt_user_preds, 10)
print(collab_filt_most_rated.shape)
collab_filt_most_rated

In [None]:
def convert_pers_recommendations_to_df(pers_recs: np.array, users_to_pred: pd.DataFrame) -> pd.DataFrame:
    """Converts the personalized most rated to an DataFrame with the users and the recommendations.
    
    Args:
        pers_recs (np.array): Array of indices for the best personalized items to recommend.
        users_to_pred (pd.DataFrame): DataFrame containing the users which need recommendations.
        
    Returns:
        non_pers_most_rated_matrix (np.array): Two dimensional array of (n_users, top_n_items)
    
    """
    pers_df = pd.concat([users_to_pred, pd.DataFrame(pers_recs)], axis=1)
    pers_df = pers_df.set_index("User-ID")
    
    return pers_df


collab_filt_most_rated_df = convert_pers_recommendations_to_df(collab_filt_most_rated, book_ratings_unique_df)
collab_filt_most_rated_df.head()

In [None]:
collab_filt_dict = create_dict_preds(collab_filt_most_rated_df)
# Since dicts in python are not ordered, we need to HAMMER DOWN a way to print some values.
dict(list(collab_filt_dict.items())[0:1])

## Evaluate results (Again)
- Calculate the evaluation metric on the validation users.

In [None]:
evaluate(y_true_dict, collab_filt_dict)

# Predict

In [None]:
pers_test_most_rated_df = convert_pers_recommendations_to_df(collab_filt_most_rated, all_test_users)
print(pers_test_most_rated_df.shape)
pers_test_most_rated_df.head()

# Final Format for submission

In [None]:
values = ISBN[df_test_prediction.values].reshape(-1,1)
index_to_list_pers = 10*df_test_prediction.index.tolist()
#index_to_list_pers.sort()
test_final_format_pers = pd.DataFrame(index_to_list, columns = ['User-ID'])
test_final_format_pers['ISBN']=values
#test_final_format = test_final_format.set_index('User-ID')
test_final_format_pers

In [None]:
validate_submission(test_final_format_pers)

In [None]:
save_predictions(test_final_format_pers, os.path.join("data", "test_final_format_pers_.csv"))

In [None]:
!explorer.exe .

## Content-based Recommendations

- Compute the item similarities matrix.
- Predict ratings.
- Select the best recommendations.
- Submit recommendations.

In [None]:
# YOUR CODE HERE

## Evaluate results (Yet again)
- Calculate the evaluation metric on the validation users.

In [None]:
# YOUR CODE HERE

## Potential improvements

At this point you can try to improve your prediction using several approaches:
- Aggregation of ratings from different sources. 
- Mixing Collaborative Filtering and Content-based Recommendations.
- Matrix Factorization.
- Could you use a classification or regression models to predict users' preference? 🤔

In [None]:
# YOUR CODE HERE

In [None]:
!explorer.exe .