# BLU10 - Exercises Notebook

In [1]:
# Import all the necessary dependencies
import os
import numpy as np
import scipy as sp
import pandas as pd

import scipy.sparse

from mlxtend.frequent_patterns import apriori
import hashlib # for grading purposes

## Q0: Create the ratings matrix (ungraded)

In [2]:
# Location of the ml-latest-small ratings CSV, relative to the notebook's working directory.
path = os.path.join('data', 'ml-latest-small', 'ratings.csv')
data = pd.read_csv(path)
# Not just a shuffle: draw a random subset of 10493 rows. The fixed random_state
# makes the draw reproducible (presumably so the graded hashes below stay stable).
# The sampled rows keep their original index labels, as the output below shows.
data = data.sample(10493, random_state=200)
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
58111,423,1206,5.0,1353691236
94054,624,3268,1.0,1028111170
97308,652,26843,5.0,1440269953
55435,401,924,5.0,977458816
22437,157,5378,2.5,1323618006


In [3]:
len(data.userId.unique())

657

In [4]:
len(data[data.userId==652])

28

So, we'll ignore the _timestamp_ and use the _rating_ column as our only source of information for our recommender system. Remember that if you had data of other types of interactions between users and the movies, you could create a metric that incorporates all of that information in the ratings matrix (maybe by averaging them).

Keep the following ratings matrix schema in your mind while developing non-personalized systems. These systems rely heavily on the ratings matrix, so maybe also write it on a piece of paper to remember it better!

<img align="left" width="413" height="239" src="./media/ratings_matrix3.png">


In the following exercise you will build the ratings matrix with users as rows and products as columns.

Tip: you can use the pandas' _pivot_ function or flex your numpy muscles with the _genfromtxt_ function (it is good for your health!).

In [5]:
data.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [6]:
def make_ratings(original_data: pd.DataFrame):
    """
    Build the dense user-by-movie ratings matrix.

    Parameters
        original_data - the original data with ratings per user and product
        (must contain the columns 'userId', 'movieId' and 'rating', with at
        most one row per (userId, movieId) pair).

    Returns:
        R - (numpy.ndarray) Ratings matrix with one row per userId and one
        column per movieId. Each cell holds the rating the user gave the
        movie, or 0 where the user did not rate it.

    Raises:
        ValueError - (from pandas pivot) if a (userId, movieId) pair appears
        more than once.
    """
    # Pivot the argument itself. The previous version pivoted the notebook-level
    # `data` global, so the parameter was silently ignored.
    ratings_wide = pd.DataFrame(original_data).pivot(index='userId', columns='movieId', values='rating')
    # Pairs absent from the input come out of the pivot as NaN; the matrix
    # convention in this notebook is 0 == "no rating".
    return ratings_wide.fillna(0).to_numpy()
    

R = make_ratings(data)
R

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
f"We have {R.shape[0]} user and {R.shape[1]} items."

'We have 657 user and 3628 items.'

In [8]:
expected_hash = '0825c15053e635376af0a569e8f37cfaef0e1dfce37ae6878517e14e061f13c4'
assert hashlib.sha256(str(R.shape).encode()).hexdigest() == expected_hash

expected_hash_1 = '8ab31b5afaea56114427e1f01b81d001b079a0f59539f6db3f099816ca794055'
assert hashlib.sha256(str(R[0].sum()).encode()).hexdigest() == expected_hash_1

expected_hash_2 = 'b5967724d1225caa9c6af28a9b333a29e6d5c11a24e9d381acf5c3377524b776'
assert hashlib.sha256(str(R[:,0].sum()).encode()).hexdigest() == expected_hash_2



## Q1: Convert the Ratings Matrix to a Sparse Representation

In this exercise, convert the ratings matrix to a sparse row representation. 

Hint: Remember what we have done with scipy library!

In [9]:
from scipy.sparse import csr_matrix

def get_csr(orig_matrix):
    """
    Convert the ratings matrix to SciPy's Compressed Sparse Row format.

    Parameters
        orig_matrix - The original (dense) Ratings Matrix.
    
    Returns
        H_ - The Compressed Sparse Row Matrix holding the same values.
    """
    compressed_rows = csr_matrix(orig_matrix)
    return compressed_rows
    
sparse_mat = get_csr(R)

In [10]:
expected_hash = 'e33275c9e0741880dc0334b76fb8cb17e27020dec75dda07ab221dbb97277d30'
assert hashlib.sha256(str(sparse_mat).encode()).hexdigest() == expected_hash

In [11]:
sparse_mat.indices

array([1093, 1444, 1738, ..., 1479, 1665, 2135], dtype=int32)

## Q2: What is the density score of this matrix?

In this exercise, let's understand the density score (that is, the fraction of entries that are non-zero in the original ratings matrix).

Calculate the get_density_score function below:

In [12]:
B=np.array([[1,0,0],[0,0,3],[2,5,0]])
B

array([[1, 0, 0],
       [0, 0, 3],
       [2, 5, 0]])

In [13]:
B.nonzero()

(array([0, 1, 2, 2]), array([0, 2, 0, 1]))

In [14]:
B[B.nonzero()].size/B.size

0.4444444444444444

In [15]:
def get_density_score(orig_matrix):
    """
    Compute the density of a ratings matrix: non-zero cells divided by all cells.

    Parameters
        orig_matrix - Ratings Matrix (dense array-like; 0 means "no rating")
        
    Returns:
        dense_score - (float) Density Score of matrix Orig Matrix, between 0 and 1.
        An empty matrix (no cells at all) has density 0.0.
    """
    ratings = np.asarray(orig_matrix)
    # Guard the division: a matrix with zero cells would otherwise raise
    # ZeroDivisionError.
    if ratings.size == 0:
        return 0.0
    # count_nonzero counts in place; no need to materialise the non-zero
    # entries with fancy indexing just to read their number.
    return np.count_nonzero(ratings) / ratings.size
    
dense_score = get_density_score(R)

In [16]:
np.testing.assert_almost_equal(dense_score,0.004,3)

In [17]:
f"The Density Score is {dense_score}."

'The Density Score is 0.004402172180184897.'

## Q3: Popular Items - What are the Top-3 Most Rated items?
More ratings give us the current trends but not necessarily the best suggestions - but let's check Items that have more ratings given.

In this exercise you will have to retrieve the indexes of the products so you may need to recreate the ratings matrix as a dataframe or come up with another creative solution!

Return the product IDs ordered from the product with the most ratings to the one with the fewest.

**Hint: To get the IDs, it's easier to work with the original data frame rather than the ratings matrix!**
**Remember the pandas pivot method! ;)**

In [18]:
def top3items(original_df, n):
    """
    Return the ids of the n items that received the most ratings.

    Parameters
        original_df - Original Data Frame with ratings (must contain the
        columns 'userId', 'movieId' and 'rating')
        n - Number of Top-n items to retrieve
        
    Returns
        top_ids - (list) list of product ids of the top-n most rated items,
        ordered from most to least rated. Items with the same number of
        ratings are ordered by ascending product id.
    """
    ratings = pd.DataFrame(original_df)
    # Treat a missing or zero rating as "not rated", matching the convention
    # of the ratings matrix used throughout this notebook.
    was_rated = ratings['rating'].fillna(0).ne(0)
    # Popularity is the NUMBER of users who rated the item. The previous
    # version summed the rating values, so a single 5-star rating outranked
    # three 1-star ratings.
    rating_counts = ratings[was_rated].groupby('movieId')['userId'].nunique()
    ranked = sorted(rating_counts.items(), key=lambda item: (-item[1], item[0]))
    return [movie_id for movie_id, _ in ranked[:n]]

most_wanted = top3items(data, 3)

In [19]:
expected_hash = '3f54e8a66e4924557721acdcb1d40e4399376f780ce6d2b11f48931fc5e1376f'
assert hashlib.sha256(str(most_wanted).encode()).hexdigest() == expected_hash

## Q4: Influencers - What are the Top-5 Most Active Users?

Now let's do the same for users! Return the top 5 most active users - that is, the ones that have rated the most products.

Return the user IDs from the users with most ratings to the lowest ones (but only the top 5).

In [20]:
def get_influencers(original_df, n):
    """
    Find the users who rated the largest number of items.

    Parameters
        original_df - Original DataFrame with ratings
        n - number of top-n most active users
        
    Returns
        influencers - (list) list of 
        ids of the top-n most active users, most active first
    """
    user_by_movie = (
        pd.DataFrame(original_df)
        .pivot(index='userId', columns='movieId', values='rating')
        .fillna(0)
    )
    # One count per row of the matrix, i.e. per user.
    ratings_per_user = np.count_nonzero(user_by_movie, axis=1)
    # sorted() is stable, so users with equal counts stay in index order.
    ranked = sorted(
        zip(user_by_movie.index, ratings_per_user),
        key=lambda pair: pair[1],
        reverse=True,
    )
    influencers = [user_id for user_id, _ in ranked[:n]]
    return influencers

influencers = get_influencers(data, 5)

In [21]:
expected_hash = '2cbf47fd3da8e22be1bff24eb3e2ad0b99b8992d870e7a5ea4f21880588accc6'
assert hashlib.sha256(str(influencers).encode()).hexdigest() == expected_hash

## Q5: Elite - What are the Top-7 Better Rated Items (On Average)?

Since this can be biased by a low number of ratings, we need items to have more than 10 ratings. Use average to obtain the ids of the top average rated products. Return the product ID from the highest rated to the lowest rated item.

Hint: In this exercise and to filter the movies by rating, it may be easier to use the original data and then reconstruct the ratings matrix!
<br>
Hint 2: Don't forget that we are asking for the **top** rated items so you have to sort your average ratings in some way!

In [22]:
data

Unnamed: 0,userId,movieId,rating,timestamp
58111,423,1206,5.0,1353691236
94054,624,3268,1.0,1028111170
97308,652,26843,5.0,1440269953
55435,401,924,5.0,977458816
22437,157,5378,2.5,1323618006
...,...,...,...,...
56012,405,2405,3.0,1061363297
72495,509,1079,4.0,940018077
9949,70,376,4.0,853954323
37093,268,231,3.5,1314894209


In [23]:
def elite(original_data, n, k):
    """
    Return the ids of the n items with the highest mean rating, considering
    only items that have more than k ratings.

    Parameters
        original_data - The original dataframe with ratings (must contain the
        columns 'userId', 'movieId' and 'rating').
        n - Top-n items
        k - Mininum number of ratings; an item qualifies only with MORE than k ratings.
        
    Returns
        best_items - (list) list of ids of top-n best mean rated items, best first.
        Your indices should refer only to items with more than k ratings (subset of original matrix).
        Items with equal means are ordered by ascending item id.
    """
    rating_matrix = (
        pd.DataFrame(original_data)
        .pivot(index='userId', columns='movieId', values='rating')
        .fillna(0)
    )
    # Number of actual (non-zero) ratings per item.
    ratings_per_item = (rating_matrix != 0).sum(axis=0)
    # Select the qualifying columns in one vectorised step. Dropping them one
    # by one in a loop copies the whole frame on every drop.
    enough_ratings = ratings_per_item > k
    rating_totals = rating_matrix.loc[:, enough_ratings].sum(axis=0)
    # Average over the ratings actually given; the zero-filled cells add
    # nothing to the total and are excluded from the count.
    mean_rating = rating_totals / ratings_per_item[enough_ratings]
    # sorted() is stable, so ties keep the ascending column (item id) order.
    ranked = sorted(mean_rating.items(), key=lambda item: item[1], reverse=True)
    return [movie_id for movie_id, _ in ranked[:n]]

    
best_items = elite(data, 7, 10)
best_items 

[527, 2959, 50, 318, 1193, 1219, 2571]

In [24]:
expected_hash = '0ed83bd0067fb63b2c591d0039829a3a9bc6887a7bb8f076bc15f6e6944e9216'
assert hashlib.sha256(str(best_items).encode()).hexdigest() == expected_hash

## Q6: Apriori - What are the 5 most common 2-piece itemsets?
We define "common itemsets" as sets of at least 2 different items that are rated together by at least 0.5% of the population (erheeem, support!).
Show your results sorted by support in descending order.

Hint: Check the mlxtend documentation for help: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

In [33]:
def getBundlesSolution(original_data, n=None, min_support=None, top=None):
    """
    Find the most frequently co-rated itemsets of a given size with apriori.

    Parameters
        original_data: Original data frame with ratings (must contain the
            columns 'userId', 'movieId' and 'rating').
        n: Number of items in commonset. When None, itemsets of size 2 are
            returned (the size this function previously hard-coded).
        min_support: Minimum fraction of users that contains the itemset
            (e.g. 0.005 for 0.5%).
        top: Number of most common itemsets; None returns all of them.
        
    Return
        df: the return dataframe should have two columns ["support", "itemsets"],
            with the support percentage and the itemsets, sorted by support
            in descending order.
    """
    # Filter on the requested size instead of a hard-coded 2, so that e.g.
    # n=3 really returns triples. n=None keeps the old size-2 behaviour.
    itemset_size = 2 if n is None else n
    rating_matrix = (
        pd.DataFrame(original_data)
        .pivot(index='userId', columns='movieId', values='rating')
        .fillna(0)
    )
    # apriori works on a boolean "user rated this item" matrix.
    has_rating = rating_matrix > 0
    frequent_itemsets = apriori(has_rating, min_support=min_support, max_len=itemset_size, use_colnames=True)
    # apriori also returns the smaller itemsets; keep only the requested size.
    is_requested_size = frequent_itemsets['itemsets'].apply(len) == itemset_size
    df = frequent_itemsets[is_requested_size].sort_values('support', ascending=False)
    return df.iloc[:top, :]
    

df = getBundlesSolution(data, n=2, min_support=0.005, top=5)

In [34]:
df

Unnamed: 0,support,itemsets
834,0.010654,"(457, 318)"
846,0.010654,"(480, 377)"
823,0.009132,"(288, 377)"
847,0.009132,"(377, 589)"
813,0.009132,"(296, 260)"


In [35]:
expected_hash = 'f7441550a0ca5274581d023417c99540e3a8a4cca68824a87cbe6d95c07742ea'
assert hashlib.sha256(str(df.shape).encode()).hexdigest() == expected_hash

np.testing.assert_almost_equal(df.iloc[0,0],0.011,3)

np.testing.assert_almost_equal(df.iloc[4,0],0.009,3)