In [None]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset CSV
# stored there can be read by the next cell.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the dataset CSV on the mounted Drive.
path = "/content/drive/MyDrive/apr.csv"

# Load the full dataset; later cells read its star_rating, purchased_counts,
# customer_id and product_* columns.
apr = pd.read_csv(path)

## Adjusted Rating System

In [None]:
# Universal mean rating: star ratings averaged with purchased_counts as weights
universal_mean = (apr.star_rating * apr.purchased_counts).sum() / apr.purchased_counts.sum()

# k is the 50% quantile (median) of purchased_counts.
# Use .median() instead of describe()[5]: an integer key on the label-indexed
# describe() result is a deprecated positional lookup and silently depends on
# the row order of describe().
k = apr.purchased_counts.median()

# Adjusted rating per review: a weighted blend of the row's star rating
# (weight = purchased_counts) and the universal mean (weight = k), so rows
# with few purchases are pulled toward the universal mean.
apr['adjusted_rating'] = (
    (apr.purchased_counts * apr.star_rating + k * universal_mean)
    / (apr.purchased_counts + k)
)

In [None]:
# Report the purchase-weighted mean computed above
mean_report = 'universal mean rating: {}\n'.format(universal_mean)
print(mean_report)

# From here on pandas renders floats with two decimals (global display option)
pd.options.display.float_format = "{:.2f}".format

# Summary statistics of purchased_counts (count, mean, std, quartiles, ...)
counts_summary = apr.purchased_counts.describe()
print('product ratings distribution: \n{}'.format(counts_summary))

universal mean rating: 4.339513043878752

product ratings distribution: 
count   5833870.00
mean       4922.08
std        9012.30
min         100.00
25%         341.00
50%        1064.00
75%        4679.00
max       49789.00
Name: purchased_counts, dtype: float64


We can now see how the adjusted rating system works in various situations, accounting for popularity and perceived quality.

In [None]:
# Show how purchased counts affect adjusted ratings of various random products
np.random.seed(13)

# np.random.choice draws index *labels* from apr.index, so the rows must be
# selected with label-based .loc; position-based .iloc is only correct when
# the index happens to be the default 0..n-1 range.
apr.loc[
    np.random.choice(apr.index, size=1000, replace=False),
    ['customer_id',
     'product_id',
     'product_title',
     'purchased_counts',
     'star_rating',
     'adjusted_rating'],
].set_index('customer_id').head()

Unnamed: 0_level_0,product_id,product_title,purchased_counts,star_rating,adjusted_rating
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26842075,B00QW8TYWO,Crossy Road,27876,5.0,4.98
2255176,B00DR0PDNE,Google Chromecast HDMI Streaming Media Player,35756,5.0,4.98
44838902,B00I29VGMC,Archer's Voice,2614,3.0,3.39
14193348,B004VMVTNU,Alarm Clock Xtreme & Timer,1632,5.0,4.74
28359012,B00G4NUQYM,Game of Words,515,5.0,4.55


In [None]:
# Show how purchased counts affect adjusted ratings of one product
np.random.seed(15)

# Draw the random sample first (labels from apr.index -> label-based .loc) ...
candy_sample = apr.loc[
    np.random.choice(apr.index, size=1000, replace=False),
    ['customer_id',
     'product_id',
     'product_title',
     'purchased_counts',
     'star_rating',
     'adjusted_rating'],
]

# ... then filter with a mask built from the sample itself. A mask built from
# the full `apr` frame has a different index than the sample and only works
# through pandas' implicit reindexing (which emits a warning that this
# notebook suppresses).
candy_sample[
    candy_sample.product_title == 'Candy Crush Saga'].set_index('customer_id').head()

Unnamed: 0_level_0,product_id,product_title,purchased_counts,star_rating,adjusted_rating
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
46450853,B00FAPF5U0,Candy Crush Saga,49789,3.0,3.03
31942905,B00FAPF5U0,Candy Crush Saga,49789,4.0,4.01
20746569,B00FAPF5U0,Candy Crush Saga,49789,5.0,4.99
3163390,B00FAPF5U0,Candy Crush Saga,49789,5.0,4.99
28858375,B00FAPF5U0,Candy Crush Saga,49789,2.0,2.05


## Build Keyword Search Module
We will now build a keyword search module that takes a product title, a product category, or both, and returns recommendations.  First, though, we need a dataset with one row per unique product that holds the product's average rating.

In [None]:
# Condense dataset to one row per unique product, averaging every numeric
# column (star_rating, adjusted_rating, purchased_counts, ...) over its rows.
# numeric_only=True is required on pandas >= 2.0, where averaging a remaining
# non-numeric column raises TypeError instead of silently dropping it.
apr_mean = (
    apr.groupby(['product_id', 'product_category', 'product_title'])
    .mean(numeric_only=True)
    .reset_index()
)
# apr_mean.to_csv('APR_data/apr_mean.csv')

In [None]:
class Recommender:
    """Rank products by rating and narrow them down with keyword filters.

    The recommender works on a per-product DataFrame that provides the columns
    listed in ``product_variables``. By default it reads the notebook-level
    ``apr_mean`` frame.
    """

    def __init__(self, n=5, adjusted_rating=True):
        """Create a recommender.

        Parameters
        ----------
        n : int
            Number of recommendations to print, default is 5.
        adjusted_rating : bool
            Sort by 'adjusted_rating' when True (default); sort by the
            original 'star_rating' when False.
        """

        self.n = n
        self.adjusted_rating = adjusted_rating
        # Columns shown when recommendations are printed
        self.product_variables = ['product_id', 'product_title',
                                  'product_category', 'star_rating', 'adjusted_rating', 'purchased_counts']

        # Start from every product, best rated first
        self.recommend = apr_mean.sort_values(self._rating_column(), ascending=False)

    def _rating_column(self):
        """Return the name of the column used for sorting."""
        return 'adjusted_rating' if self.adjusted_rating else 'star_rating'

    def _filter_by_text(self, column, query):
        """Keep rows of ``self.recommend`` whose ``column`` contains ``query``.

        The match is a case-insensitive substring test on the raw cell text.
        Cells that are not strings (e.g. missing values) never match. Row
        order is preserved.
        """
        needle = query.lower()
        # Match against the text itself rather than the repr of a comma-split
        # list, so queries containing commas or quotes behave as expected.
        mask = np.array(
            [isinstance(value, str) and needle in value.lower()
             for value in self.recommend[column]],
            dtype=bool,
        )
        self.recommend = self.recommend.loc[mask]

    def _filter_by_product_category(self):
        """Filter recommendations by the product category
        Note: should only be called in 'keyword' method"""
        self._filter_by_text('product_category', self.product_category)

    def _filter_by_product_title(self):
        """Filter recommendations by the product title
        Note: should only be called in 'keyword' method"""
        self._filter_by_text('product_title', self.product_title)

    def return_recommendations(self):
        """Print the top n recommended products (or all of them if fewer
        than n are available). Nothing is returned."""

        available = len(self.recommend)
        if available == 0:
            print('No products recommended.')
            return

        # Slicing past the end simply yields every row, so one slice covers
        # both the "more than n" and the "n or fewer" cases.
        shown = min(self.n, available)
        print('Top {} recommended products for you:'.format(shown))
        print(self.recommend.iloc[:self.n][self.product_variables])

    # Keyword search filtering recommender module
    def keyword(self, df=None, product_category=None, product_title=None):
        """Keyword search filtering recommendation system.

        Filters by product title, product category, or both, then sorts the
        matches by the configured rating and prints the top n.

        Parameters
        ----------
        df : DataFrame, optional
            Products to search. Defaults to the notebook-level ``apr_mean``,
            looked up at call time so a recomputed ``apr_mean`` is picked up.
        product_category : str, optional
            Case-insensitive substring to look for in 'product_category'.
        product_title : str, optional
            Case-insensitive substring to look for in 'product_title'.

        Returns
        -------
        DataFrame or None
            All matching products sorted by rating (not only the top n), or
            None when a filter leaves no products.
        """

        if df is None:
            df = apr_mean

        self.recommend = df # Assign dataframe
        self.product_variables = ['product_id', 'product_title',
                                  'product_category', 'star_rating', 'adjusted_rating', 'purchased_counts']

        # Assign variables based on user's keyword search
        self.product_title = product_title
        self.product_category = product_category

        # Filter by product title
        if self.product_title is not None:
            self._filter_by_product_title()
            if len(self.recommend) == 0:
                print('No matching products found for {}'.format(self.product_title))
                return None

        # Filter by product category
        if self.product_category is not None:
            self._filter_by_product_category()
            if len(self.recommend) == 0:
                print('No matching products found for {}'.format(self.product_category))
                return None

        # Sort by rating of interest
        self.recommend = self.recommend.sort_values(self._rating_column(), ascending=False)

        # Print top n recommendations
        self.return_recommendations()

        return self.recommend

## Test Keyword Search Module

We want to test the keyword module with a series of queries to make sure no errors occur.

**Adjusted Rating System**

In [None]:
%%time


# Original ratings: rank by the raw star_rating column
kw = Recommender(n=10, adjusted_rating=False)

# Test 1: no keyword search, just print the top of the list built in __init__
print('\n-------------------\nTest 1: top products only, no adjusted rating system')
kw.return_recommendations()

# Adjusted rating system: a fresh recommender that ranks by adjusted_rating
kw = Recommender(n=10)

# Test 2: same as test 1, but ranked by adjusted_rating
print('\n-------------------\nTest 2: top products only, adjusted rating system')
kw.return_recommendations()

# Test 3: keyword() without filters sorts and prints from its default frame
print('\n-------------------\nTest 3: No keywords, top products only')
kw.keyword()

# Test 4: title filter only (matching is case-insensitive)
print('\n-------------------\nTest 4: product title only')
kw.keyword(product_title='Candy')

# Test 5: title filter followed by category filter
print('\n-------------------\nTest 5: product title and category')
kw.keyword(product_category='mobile apps', product_title='candy')

# Test 6: broad single-word title query
print('\n-------------------\nTest 6: one word')
kw.keyword(product_title='fire')

# Test 7: multi-word title query narrowing the previous search
print('\n-------------------\nTest 7: more detail in search')
kw.keyword(product_title='played with fire')

# Test 8: another multi-word title query
print('\n-------------------\nTest 8: more detail in search')
kw.keyword(product_title='the girl who')

# Test 9: multi-word title query combined with a category
print('\n-------------------\nTest 9: specifying category in more detailed search')
kw.keyword(product_category='digital', product_title='the girl')

# Test 10: category filter only. As the last expression of the cell, the
# DataFrame returned by keyword() is also rendered as the cell's output.
print('\n-------------------\nTest 10: category only')
kw.keyword(product_category='books')


-------------------
Test 1: top products only, no adjusted rating system
Top 10 recommended products for you:
       product_id                                      product_title  \
3048   B00005JOFQ                                 Brokeback Mountain   
1417   0887431461             BIG Kindergarten Workbook - Ages 5 - 6   
3763   B0007OCG4W                          Closer (Superbit Edition)   
8712   B009E3EWPI  Celebration Day (2 CD + 1 DVD, DVD sized digipak)   
554    0375831002                                     The Book Thief   
3955   B000E1MXSW                                    The Nun's Story   
3042   B00005JO20                     King Kong (Widescreen Edition)   
9247   B00AYJCEVK                                    Revealing Jesus   
10064  B00EEPFLHE                                No More Hell To Pay   
1909   6300151379                                    Labyrinth [VHS]   

      product_category  star_rating  adjusted_rating  purchased_counts  
3048             Books 

Unnamed: 0.1,product_id,product_category,product_title,Unnamed: 0,customer_id,product_parent,star_rating,helpful_votes,downvotes,total_votes,purchased_counts,adjusted_rating
1858,1616550414,Books,The Legend of Zelda: Hyrule Historia,3854317.00,24133042.06,929151668.00,4.85,0.57,0.62,1.18,2319.00,4.69
1040,0679805273,Books,"Oh, the Places You'll Go!",195833.50,27891411.89,689067271.00,4.86,0.69,0.35,1.04,1860.00,4.67
1471,1250038820,Books,Humans of New York,4644116.00,25818407.67,477457103.00,4.83,0.98,0.63,1.62,1941.00,4.65
1334,0802473156,Books,The 5 Love Languages: The Secret to Love That ...,355857.50,26894127.98,277335936.00,4.76,1.42,0.44,1.86,2202.00,4.62
561,0375869026,Books,Wonder,2516690.50,29078005.33,585109048.00,4.81,2.52,0.83,3.35,1540.00,4.62
...,...,...,...,...,...,...,...,...,...,...,...,...
452,0316228532,Books,The Casual Vacancy,3310627.50,33131169.33,165338194.00,3.05,6.89,4.68,11.57,1592.00,3.57
629,0385504225,Books,The Lost Symbol,1361776.00,36018201.90,17146931.00,3.00,5.20,2.17,7.37,2202.00,3.44
1904,193700788X,Books,Dead Ever After (Sookie Stackhouse/True Blood),4122623.00,30795659.23,278368308.00,2.72,24.47,13.31,37.78,1565.00,3.37
516,0345803485,Books,Fifty Shades of Grey: Book One of the Fifty Sh...,2680621.50,28621078.19,600633062.00,3.12,6.91,2.44,9.34,8973.00,3.25


**Original Star Rating**

As the tests show, the original star rating gives different results than the adjusted rating system.  We can also narrow a search by adding more detail to the product title or by adding a product category.  When a search query matches fewer than `n` products, only the matches that were found are displayed (tests 4, 5 and 7).  Each result shows the product title, the product id (which users can search online to find the product on Amazon), the star rating, the adjusted rating and the number of purchases.