# Recommender for Amazon Books with Matrix Factorisation

Based on Nick Becker's <a href="https://beckernick.github.io/matrix-factorization-recommender/">work</a>.

## Creating a SQLite database

We create a SQLite database and insert the ratings data into it. This makes it easier for us to work with this data.

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import datetime as dt

# Shared SQLAlchemy engine for the SQLite database file `ratings.db`.
# The loading and query code in the cells below reads and writes through it.
disk_engine = create_engine('sqlite:///ratings.db')

def load_data(chunksize = 100000):
    """Stream the ratings CSV into the ``ratings`` table of the database.

    ``dist/ratings_Books.csv`` is read in chunks so the whole file never has
    to be held in memory. Every chunk is appended to the ``ratings`` table
    through the module-level ``disk_engine``.

    Args:
        chunksize: Maximum number of CSV rows read and inserted per batch.
    """
    start = dt.datetime.now()
    rows_written = 0
    index_start = 1
    data_columns = ['user_id', 'item_id', 'rating', 'time']

    for df in pd.read_csv('dist/ratings_Books.csv', names=data_columns, chunksize=chunksize, iterator=True, encoding='utf-8'):
        # Number the rows 1, 2, 3, ... across all chunks explicitly, so the
        # stored index does not depend on how the reader numbered this chunk.
        df.index = pd.RangeIndex(index_start, index_start + len(df))

        df.to_sql('ratings', disk_engine, if_exists='append')

        index_start += len(df)
        rows_written += len(df)

        # Report only after the chunk has been written, and count the rows
        # actually written: the last chunk is usually shorter than chunksize.
        print('{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, rows_written))

#load_data(1000000)

## Inserting meta data into the database

As we want to show the titles and pictures of the items that the system recommends, we build a table that contains this information.

In [2]:
import json
import re
from sqlalchemy import create_engine
import datetime as dt

def extract_title(line):
    """Pull the item title out of one raw metadata line.

    The title value may be wrapped in double or in single quotes. An empty
    string is returned unless the line contains exactly one
    ``'title': ...`` entry.
    """
    title_pattern = "'title': (?:\"([^\"]+)\"|'([^']+)')"
    found = re.findall(title_pattern, line)
    if len(found) != 1:
        return ''

    # Only one of the two groups is filled, depending on the quote style used.
    double_quoted, single_quoted = found[0]
    return double_quoted or single_quoted

def extract_price(line):
    """Return the price found in a raw metadata line, expressed in cents.

    The line must contain exactly one ``'price': <digits>.<digits>`` entry;
    otherwise ``-1`` is returned. Fraction digits beyond the second are
    dropped (truncated, not rounded).

    Args:
        line: One raw line of the metadata file.

    Returns:
        The price in cents as an ``int``, or ``-1`` when no unambiguous
        price could be extracted.
    """
    matches = re.findall(r"'price': (\d+)\.(\d+)", line)
    if len(matches) != 1:
        return -1

    whole, fraction = matches[0]
    # Normalise the fraction to exactly two digits before treating it as
    # cents: "9.5" means 50 cents, not 5, so it must become 950 rather than 905.
    cents = int(fraction.ljust(2, '0')[:2])
    return int(whole) * 100 + cents

def extract_property(line, prop_name):
    """Return the single-quoted string value of ``prop_name`` in a raw line.

    Args:
        line: One raw line of the metadata file.
        prop_name: Name of the property to look up, matched literally.

    Returns:
        The property value, or an empty string unless the line contains
        exactly one ``'<prop_name>': '<value>'`` entry.
    """
    # Escape the name so characters such as "." or "+" are matched literally
    # instead of being interpreted as regular-expression syntax.
    pattern = "'{0}': '([^']+)'".format(re.escape(prop_name))
    matches = re.findall(pattern, line)
    if len(matches) == 1:
        return matches[0]
    return ''

def insert_meta_data(chunksize = 10000):
    """Parse ``dist/metadata.json`` and bulk-insert its items into ``items``.

    Each line of the file is scanned for the ``asin``, ``title``, ``imUrl``
    and ``price`` properties. Lines without a usable ``asin`` are copied
    verbatim to ``dist/ignored_data.json`` and skipped. The remaining rows
    are inserted in batches into the ``items`` table of ``ratings.db``; the
    table itself is not created here and has to exist already.

    Args:
        chunksize: Number of parsed rows collected before a batch is
            written to the database.
    """
    # Imported locally so this function brings its own dependency with it.
    from sqlalchemy import text

    start = dt.datetime.now()
    lines_read = 0
    rows_inserted = 0

    sql = text("INSERT INTO items (asin, title, image_url, price) VALUES (:asin, :title, :image_url, :price)")
    engine = create_engine('sqlite:///ratings.db')

    def flush(batch):
        # One transaction per batch. `Engine.execute` no longer exists in
        # SQLAlchemy 2.0, so go through an explicit connection instead.
        with engine.begin() as connection:
            connection.execute(sql, batch)
        return len(batch)

    print('Inserting meta data into the database ...')
    with open('dist/ignored_data.json','w') as ignored_file:
        with open('dist/metadata.json') as json_file:
            data_array = []
            for line in json_file:
                lines_read += 1

                asin = extract_property(line, 'asin')
                if not asin:
                    ignored_file.write(line)
                    continue

                data_array.append({
                    'asin': asin,
                    'title': extract_title(line),
                    'image_url': extract_property(line, 'imUrl'),
                    'price': extract_price(line),
                })

                # Flush on the batch size rather than on the line counter, so
                # an ignored line can never cause a flush to be skipped.
                if len(data_array) >= chunksize:
                    rows_inserted += flush(data_array)
                    data_array = []
                    print('{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, lines_read))

            # Write whatever is left over from the last, incomplete batch.
            if data_array:
                rows_inserted += flush(data_array)

    # Always report the total, counting only rows that were really inserted.
    print('{} seconds: Inserted {} rows in total!'.format((dt.datetime.now() - start).seconds, rows_inserted))
#insert_meta_data(300000)

## Building the model

Now, we build a model so we can make predictions for any user in the database. We create a function that can build this model given a DataFrame containing ratings.

In [3]:
from scipy.sparse.linalg import svds

def build_model(ratings_df, svd_k = 50):
    """Build rating predictions for every user with a truncated SVD.

    Args:
        ratings_df: DataFrame with the columns ``user_id``, ``item_id`` and
            ``rating``, containing at most one row per (user, item) pair.
        svd_k: Number of singular values/vectors to keep. ``svds`` requires
            it to be smaller than both the number of users and of items.

    Returns:
        A tuple ``(predictions_df, R_df)``. ``R_df`` is the user x item
        ratings matrix (index: user ids, columns: item ids, 0 for "not
        rated"). ``predictions_df`` holds the predicted ratings with the same
        columns and a positional row index: row ``i`` belongs to the user
        ``R_df.index[i]``.
    """
    # One row per user and one column per item; missing ratings become 0.
    R_df = ratings_df.pivot(index = 'user_id', columns ='item_id', values = 'rating').fillna(0)

    # Plain NumPy ratings matrix R. `DataFrame.as_matrix()` was removed in
    # pandas 1.0; `.values` is available in old and new versions alike.
    R = R_df.values

    # Normalise by each user's mean (de-mean); kept as a column vector so it
    # broadcasts across the items of each user.
    user_ratings_mean = np.mean(R, axis = 1).reshape(-1, 1)
    R_demeaned = R - user_ratings_mean

    # Decompose the de-meaned matrix into U, the singular values and Vt.
    U, sigma, Vt = svds(R_demeaned, k = svd_k)

    # Rebuild the low-rank approximation and add the user means back. Scaling
    # the columns of U by sigma is the same as multiplying with diag(sigma).
    all_user_predicted_ratings = np.dot(U * sigma, Vt) + user_ratings_mean
    predictions_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)
    return (predictions_df, R_df)

## Generating the recommendations

The next step is to generate the recommendations.

In [4]:
def generate_recommendations(predictions_df, R_df, user_ratings_df, user_id, number_of_recommendations=20):
    """Return the best-predicted, not yet rated items for one user.

    Args:
        predictions_df: Predicted ratings, one row per user (positional) and
            one column per item id.
        R_df: Ratings matrix whose index holds the user ids in the same row
            order as ``predictions_df``.
        user_ratings_df: DataFrame with an ``item_id`` column listing the
            items the user has already rated; these are excluded.
        user_id: Id of the user to recommend for; must be in ``R_df.index``.
        number_of_recommendations: Maximum number of items to return.

    Returns:
        DataFrame with the columns ``asin``, ``title`` and ``image_url``,
        ordered from the highest to the lowest predicted rating. Items that
        have no row in the ``items`` table are missing from the result.

    Raises:
        KeyError: If ``user_id`` is not part of ``R_df.index``.
    """
    user_row_index = R_df.index.get_loc(user_id)

    # Get and sort the selected user's predictions
    sorted_user_predictions = predictions_df.iloc[user_row_index].sort_values(ascending=False)

    # From the recommendation list, remove items that the user has already rated
    recommendations = sorted_user_predictions[~sorted_user_predictions.index.isin(user_ratings_df['item_id'])]

    # Select only the top recommendations (best first)
    top_recommendations = list(recommendations.head(number_of_recommendations).index)

    # Render the ids as SQL string literals, doubling embedded single quotes.
    # With nothing to recommend, fall back to a literal that matches no item.
    recommended_item_ids = ', '.join(
        "'{0}'".format(str(item_id).replace("'", "''")) for item_id in top_recommendations
    ) or "''"

    # Look up title and picture of the recommended items
    sql = 'SELECT `asin`, `title`, `image_url` FROM `items` WHERE `asin` IN ({0})'.format(recommended_item_ids)
    recommended_items_df = pd.read_sql_query(sql, disk_engine)

    # `IN (...)` returns the rows in no particular order, so put them back
    # into ranking order before handing them out.
    rank_by_item = {item_id: rank for rank, item_id in enumerate(top_recommendations)}
    recommended_items_df['_rank'] = recommended_items_df['asin'].map(rank_by_item)
    recommended_items_df = recommended_items_df.sort_values('_rank').drop('_rank', axis=1)
    return recommended_items_df.reset_index(drop=True)

## Utility functions

These functions are needed to properly display the recommendations and user preferences. 


In [5]:
from IPython.display import display, HTML

def get_user_ratings(user_id):
    """Fetch the items a user has rated, together with their meta data.

    Only ratings whose item has a row in the ``items`` table are returned
    (inner join). The number of rows found is printed as a side effect.

    Args:
        user_id: Id of the user whose ratings are requested.

    Returns:
        DataFrame with the columns ``item_id``, ``title``, ``image_url`` and
        ``rating``.
    """
    # The user id is passed as a bound parameter rather than pasted into the
    # statement: a double-quoted value is identifier syntax in SQL, and an id
    # containing a quote character would break the query.
    sql = ('SELECT r.item_id, i.title, i.image_url, r.rating '
           'FROM ratings r INNER JOIN items i ON r.item_id=i.asin '
           'WHERE r.user_id = ?')
    user_ratings_df = pd.read_sql_query(sql, disk_engine, params=(str(user_id),))
    print('User {0} has already rated {1} items'.format(user_id, user_ratings_df.shape[0]))
    return user_ratings_df

def get_random_user_id(ratings_df):
    """Pick one user id at random from the ``user_id`` column of ``ratings_df``.

    Every distinct user has the same chance of being picked, regardless of
    how many ratings they have.
    """
    distinct_users = ratings_df['user_id'].drop_duplicates()
    picked = distinct_users.sample(n=1)
    return picked.values[0]

def render_recommendations(recommendations_df):
    """Display the recommended items as a scrollable strip of cover images.

    Args:
        recommendations_df: DataFrame whose second and third columns hold
            the title and the image URL, as returned by
            ``generate_recommendations``.
    """
    from html import escape

    parts = [
        '<h3>Recommending {0} items</h3>'.format(recommendations_df.shape[0]),
        '<div style="max-height:400px;overflow:auto;margin:5px 0 0 0;">',
    ]
    for row in recommendations_df.itertuples():
        # row[0] is the DataFrame index, the columns start at row[1].
        # Escape the values: a quote inside a title would otherwise end the
        # HTML attribute early and corrupt the markup.
        title = escape(str(row[2]), quote=True)
        image_url = escape(str(row[3]), quote=True)
        parts.append('<div style="float:left;width:100px;height:190px;margin-left:5px;margin-bottom:5px">')
        parts.append('<img src="{0}" alt="{1}" title="{1}" style="width:99px;">'.format(image_url, title))
        parts.append('</div>')
    # Close the scroll container opened above.
    parts.append('</div>')
    display(HTML(''.join(parts)))

def render_user_ratings(user_id, user_ratings_df):
    """Display the items a user has rated as a scrollable strip of covers.

    Args:
        user_id: Id of the user, shown in the heading.
        user_ratings_df: DataFrame whose second, third and fourth columns
            hold the title, the image URL and the rating, as returned by
            ``get_user_ratings``.
    """
    from html import escape

    parts = [
        '<h3>User {0} rated {1} items</h3>'.format(escape(str(user_id)), user_ratings_df.shape[0]),
        '<div style="max-height:400px;overflow:auto;margin:5px 0 0 0;">',
    ]
    for row in user_ratings_df.itertuples():
        # row[0] is the DataFrame index, the columns start at row[1].
        # Escape the values: a quote inside a title would otherwise end the
        # HTML attribute early and corrupt the markup.
        title = escape(str(row[2]), quote=True)
        image_url = escape(str(row[3]), quote=True)
        rating = escape(str(row[4]))
        parts.append('<div style="float:left;width:100px;height:190px;margin-left:5px;margin-bottom:5px">')
        parts.append('<img src="{0}" alt="{1}" title="{1}" style="width:99px;">'.format(image_url, title))
        parts.append('<p style="margin:0;padding:0">Rating: {0}</p>'.format(rating))
        parts.append('</div>')
    parts.append('</div>')
    display(HTML(''.join(parts)))

## Connecting everything together

Now for the code that connects everything together. To avoid running into memory issues, we only select ratings from users who have rated more than 700 items. The variable <code>user_id</code> specifies the user for whom we would like to generate recommendations. Remember that this user must exist in the <code>ratings_df</code> DataFrame. You can use <code>get_random_user_id(ratings_df)</code> to pick a random user from the pool.

In [10]:
start_time = dt.datetime.now()
print('Query the database ...')

# Only load the ratings of users with more than 700 ratings, to keep the
# user x item matrix small enough for memory. Every fragment ends with a
# space so the concatenated statement keeps its keywords separated.
ratings_df = pd.read_sql_query(
    'SELECT `user_id`, `item_id`, `rating` FROM `ratings` '
    'WHERE `user_id` IN ('
    'SELECT `user_id` FROM `ratings` '
    'GROUP BY `user_id` '
    'HAVING COUNT(`rating`) > 700)',
    disk_engine)

runtime_seconds = (dt.datetime.now() - start_time).seconds
print('Fetched {0} rows in {1} seconds'.format(ratings_df.shape[0], runtime_seconds))

# Alternative: pick an arbitrary user from the loaded pool instead.
#user_id = get_random_user_id(ratings_df)
user_id = 'A3NIQK6ZLYEP1L' # rated 1330 items, history books and other novels, interesting

# Fetches user's current ratings
user_ratings_df = get_user_ratings(user_id)

# Note that k is an arbitrary choice here, not an optimised value for the SVD.
# It needs to be tuned in order to optimise the performance of the model.
predictions_df, R_df = build_model(ratings_df, svd_k = 50)

# Finally, we generate the recommendations
recommendations_df = generate_recommendations(predictions_df, R_df, user_ratings_df, user_id, 25)

# Time elapsed since the start of the cell, database query included
runtime_seconds = (dt.datetime.now() - start_time).seconds
print('Recommendations in {0} seconds'.format(runtime_seconds))

# Render the results
render_recommendations(recommendations_df)
render_user_ratings(user_id, user_ratings_df)

Query the database ...
Fetched 560409 rows in 45 seconds
User A3NIQK6ZLYEP1L has already rated 1330 items
Recommendations in 88 seconds
