In [1]:
import os
import ujson
import gzip
import pandas as pd
import gc
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import GridSearchCV
from surprise import SVD
from collections import defaultdict
import operator
from surprise.model_selection import KFold
from surprise import accuracy
from surprise import KNNWithMeans

In [2]:
# Location of the pickled review DataFrame. Override with the ALDA_DATA_PATH
# environment variable so the notebook is not tied to one machine's home directory;
# the default keeps the original location.
DATA_PATH = os.environ.get(
    "ALDA_DATA_PATH",
    "/Users/sidhantarora/work/ALDA/Project/data_nov_8.pkl",
)
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
data = pd.read_pickle(DATA_PATH)

In [3]:
# List the columns available in the loaded review DataFrame.
data.columns

Index(['overall', 'reviewerID', 'asin', 'unixReviewTime', 'category',
       'description', 'title', 'also_buy', 'brand', 'feature', 'main_cat',
       'date', 'price', 'review_summary_combined', 'rating_category', 'time'],
      dtype='object')

In [4]:
# Peek at the first row's `description` value (the output shows it is a list of strings).
data.description.iloc[0]

['The videosecu TV mount is a mounting solution for most 22"-47" LCD LED Plasma TV and some LED up to 55" with VESA 600x400mm (24"x16"), 400x400mm (16"x16"),600x300mm(24"x12"), 400x200mm (16"x8"),300x300mm (12"x12"),300x200mm(12"x8"),200x200mm (8"x8"),200x100mm (8"x4") mounting hole pattern .Heavy gauge steel construction provides safety loading up to 66lbs display .It can tilt 15 degree forward or backward and swivel 180 degree. The removable VESA plate can be taken off for easy installation. Post-installation level adjustment allows the TV to perfectly level. The on arm cable management ring system design, guides wires and prevent cable pinching. Standard hardware and user manual included. <br />Notice: This mount fits most 22-47" TVs (VESA 200x200mm (8x8")/200x100mm(8x4")/100x100mm(4x4") without extender, fits VESA 600x400(24x16")/400x400mm(16x16")/600x300mm(24x12")/400x200mm(16x8")/300x300mm(12x12")/300x200mm(12x8")with 4 plate extender), some up to 50 55 inch TVs manufactured in r

In [5]:
# Peek at the first row's `feature` value (the output shows it is a list of strings).
data.feature.iloc[0]

['Fits most 22" to 47" HDTV and some up to 55" LED TV (check VESA and weight)',
 'Fits VESA (mounting hole patterns) 100x100/200x100/200x200mm without extender, and also fits VESA 600x400/400x400/600x300/400x200/300x300/300x200mm with 4 plate extender',
 'Heavy-duty steel construction, loading capacity up to 66 lbs',
 'Adjustable tilt +/-15 degrees, swivel 180 degrees, extends 24" from the wall',
 'VESA plate can be taken off, quick release design for easy installation; Post-installation level adjustment allows the TV to perfectly level']

In [6]:
# Keep only the three columns collaborative filtering needs (item, user, rating);
# .copy() detaches the result from `data` so later edits do not touch the source frame.
newData = data.loc[:, ['asin', 'reviewerID', 'overall']].copy()


In [7]:
# Preview the first five rows (head() defaults to 5).
newData.head()

Unnamed: 0,asin,reviewerID,overall
246,972683275,A1KECIKIWDB4HH,4.0
247,972683275,A2MQ47BBL7F6B6,5.0
248,972683275,ANWW7PT6Z6MHD,5.0
249,972683275,A2DEU0B3AUINV9,5.0
250,972683275,AE8R1JSMJYIU,4.0


In [8]:
# Distinct rating values present in the data.
set(newData.overall)

{1.0, 2.0, 3.0, 4.0, 5.0}

In [9]:
# Switch to the itemID / userID / rating column names used in the rest of the notebook.
newData = newData.rename(
    columns=dict(asin='itemID', reviewerID='userID', overall='rating')
)

In [10]:
# Show the renamed frame (pandas truncates the display to the first and last rows).
newData

Unnamed: 0,itemID,userID,rating
246,0972683275,A1KECIKIWDB4HH,4.0
247,0972683275,A2MQ47BBL7F6B6,5.0
248,0972683275,ANWW7PT6Z6MHD,5.0
249,0972683275,A2DEU0B3AUINV9,5.0
250,0972683275,AE8R1JSMJYIU,4.0
...,...,...,...
3368220,B01HISA452,AG8SMUXFYGYH8,4.0
3368221,B01HISA452,AB443G89K25H2,5.0
3368222,B01HISA452,A1X6MABURKFRQA,5.0
3368223,B01HISA452,A3NJEEYRMW3FJR,5.0


In [11]:
# Ratings in this dataset run from 1 to 5.
reader = Reader(rating_scale=(1, 5))

# Build the Surprise dataset from the pandas frame; the columns are passed
# in (user, item, rating) order.
rating_columns = ["userID", "itemID", "rating"]
surprise_data = Dataset.load_from_df(newData[rating_columns], reader)

### Finding the best model for item-item filtering

In [12]:
# Search space for the similarity options: two similarity measures, four
# minimum-support thresholds, item-based only (user_based=False).
param_grid = {
    "sim_options": dict(
        name=["msd", "cosine"],
        min_support=[3, 6, 9, 12],
        user_based=[False],
    )
}
# Kept as a separate name because the original cell exposed it.
param = param_grid["sim_options"]

In [13]:
# Grid-search KNNWithMeans over `param_grid`, scored by RMSE with 5-fold CV.
# An explicit, seeded KFold replaces `cv=5` (which builds an unseeded shuffled
# KFold) so that re-running the notebook reproduces the same folds and scores.
memory_based = GridSearchCV(
    KNNWithMeans,
    param_grid,
    measures=["rmse"],
    cv=KFold(n_splits=5, random_state=42, shuffle=True),
)

In [14]:
# Run the grid search: 8 parameter combinations (2 similarity names x 4 min_support
# values) x 5 folds, so the similarity matrix is computed 40 times.
memory_based.fit(surprise_data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [17]:
# Best RMSE from the grid search, then the parameter combination that produced it.
print(
    memory_based.best_score["rmse"],
    memory_based.best_params["rmse"],
    sep="\n",
)

1.1674483770360704
{'sim_options': {'name': 'msd', 'min_support': 12, 'user_based': False}}


In [18]:
# Tabulate the per-split and aggregate grid-search results, one row per parameter combination.
evaluation = pd.DataFrame(memory_based.cv_results)

In [19]:
# Display the grid-search results table.
evaluation

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_sim_options
0,1.17447,1.176033,1.174122,1.17094,1.174991,1.174111,0.001712,7,1.100323,0.008396,1.42071,0.301961,"{'sim_options': {'name': 'msd', 'min_support':...","{'name': 'msd', 'min_support': 3, 'user_based'..."
1,1.170822,1.172223,1.1704,1.166309,1.17079,1.170109,0.001998,5,1.086726,0.028467,1.060581,0.234371,"{'sim_options': {'name': 'msd', 'min_support':...","{'name': 'msd', 'min_support': 6, 'user_based'..."
2,1.169056,1.1708,1.168779,1.164748,1.168919,1.16846,0.001996,3,1.091813,0.029273,1.054139,0.233335,"{'sim_options': {'name': 'msd', 'min_support':...","{'name': 'msd', 'min_support': 9, 'user_based'..."
3,1.16755,1.17002,1.167677,1.163749,1.168247,1.167448,0.002049,1,1.081077,0.021934,1.058894,0.254803,"{'sim_options': {'name': 'msd', 'min_support':...","{'name': 'msd', 'min_support': 12, 'user_based..."
4,1.177301,1.178879,1.176987,1.174101,1.177883,1.17703,0.0016,8,1.32673,0.067824,1.105675,0.283655,"{'sim_options': {'name': 'cosine', 'min_suppor...","{'name': 'cosine', 'min_support': 3, 'user_bas..."
5,1.172453,1.174006,1.172163,1.168209,1.172575,1.171881,0.001944,6,1.272382,0.024872,1.208185,0.316198,"{'sim_options': {'name': 'cosine', 'min_suppor...","{'name': 'cosine', 'min_support': 6, 'user_bas..."
6,1.170174,1.171955,1.169978,1.166041,1.170074,1.169644,0.001944,4,1.234103,0.038445,1.072659,0.249411,"{'sim_options': {'name': 'cosine', 'min_suppor...","{'name': 'cosine', 'min_support': 9, 'user_bas..."
7,1.16841,1.170925,1.168567,1.164727,1.16914,1.168354,0.002021,2,1.23516,0.027785,1.076477,0.249897,"{'sim_options': {'name': 'cosine', 'min_suppor...","{'name': 'cosine', 'min_support': 12, 'user_ba..."


### Testing on Common Data

In [20]:
# Similarity options matching the best grid-search result reported above
# (msd, min_support=12, item-based).
sim_options = dict(name='msd', min_support=12, user_based=False)

In [21]:
# KNNWithMeans configured with the `sim_options` defined in the previous cell.
test_model = KNNWithMeans(sim_options=sim_options)

In [22]:
# Seeded 5-fold split so the reported RMSE values are reproducible across runs
# (KFold shuffles the ratings; without random_state every run gives different folds).
kf = KFold(n_splits=5, random_state=42, shuffle=True)

for trainset, testset in kf.split(surprise_data):

    # Fit on the training folds and predict the held-out fold.
    test_model.fit(trainset)
    predictions = test_model.test(testset)

    # Compute and print Root Mean Squared Error for this fold.
    accuracy.rmse(predictions, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1631
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1658
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1650
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1725
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1714


### Using Best Parameters

In [23]:
# Estimator configured with the parameters that gave the best RMSE in the grid search.
# It is fitted on the full trainset in the next cell before being used for predictions.
best = memory_based.best_estimator['rmse']

In [24]:
# Train the selected estimator on every available rating (no hold-out split).
full_trainset = surprise_data.build_full_trainset()
best.fit(full_trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fdb6940a700>

In [25]:
# Sanity check: predicted rating for one (user ID, item ID) pair; the `est` field of the
# returned Prediction is the estimated rating.
best.predict("A28T6TZRAJF7J5","B01HIY64XM")

Prediction(uid='A28T6TZRAJF7J5', iid='B01HIY64XM', r_ui=None, est=5, details={'actual_k': 1, 'was_impossible': False})

### Generating Recommendations

In [26]:
#No. of all unique items
all_items = list(set(list(newData.itemID)))
len(all_items)

6576

In [27]:
def get_rating_predictions(user_id):
    """Predict `user_id`'s rating for every item in `all_items`.

    Returns a ``defaultdict(int)`` mapping each item ID to the ``est`` value of
    ``best.predict(user_id, item)``.
    """
    predictions = defaultdict(int)
    predictions.update(
        (candidate, best.predict(user_id, candidate).est)
        for candidate in all_items
    )
    return predictions

In [28]:
# Number of distinct reviewers in the full dataset.
len(set(data['reviewerID']))

815053

In [29]:
# Number of distinct items (asin) in the full dataset.
len(set(data['asin']))

6576

In [30]:
# Total number of rows in the dataset.
len(data)

1071586

In [31]:
def reviewed_items(user_id):
    """Return the set of item IDs (``asin``) that `user_id` has reviewed.

    Parameters
    ----------
    user_id : value compared against the ``reviewerID`` column of `data`.

    Returns
    -------
    set
        The ``asin`` values of every row whose ``reviewerID`` equals `user_id`;
        empty if the user does not appear in `data`.
    """
    # One vectorised comparison over the column instead of a Python loop doing two
    # data.iloc[idx] row lookups per row, which is impractically slow at this size.
    user_rows = data['reviewerID'] == user_id
    return set(data.loc[user_rows, 'asin'])

In [None]:
# Example: items already reviewed by one user.
reviewed_items("A28T6TZRAJF7J5")

### Mapping Item ID to Product Name

In [33]:
# Map each item ID (asin) to its product title. Unknown IDs fall back to "" via
# defaultdict(str). Built in one pass over the two columns instead of row-by-row
# data.iloc lookups; as before, if an asin occurs on several rows the last title wins.
itemID_to_name = defaultdict(str, zip(data['asin'], data['title']))

In [35]:
def get_recommendation(user_id, top_n=10):
    """Recommend product names for `user_id`, skipping items they already reviewed.

    Parameters
    ----------
    user_id : user identifier passed to `get_rating_predictions` and `reviewed_items`.
    top_n : int, default 10
        Maximum number of product names to return.

    Returns
    -------
    list
        Up to `top_n` product names (looked up in `itemID_to_name`), ordered by
        predicted rating, highest first.
    """
    item_rating = get_rating_predictions(user_id)
    already_bought = reviewed_items(user_id)

    # sorted() is stable even with reverse=True, so items with equal predicted
    # ratings keep the order in which get_rating_predictions produced them.
    sorted_items = sorted(item_rating.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_items[:top_n])

    items_to_suggest = []

    for item_id, _predicted_rating in sorted_items:

        if len(items_to_suggest) == top_n:
            break

        # Compare the item ID itself: the original tested the (item_id, rating) tuple
        # against a set of IDs, which never matched, so reviewed items were not removed.
        if item_id not in already_bought:
            items_to_suggest.append(itemID_to_name[item_id])

    return items_to_suggest

####  Items purchased by user A3AKVALGT4Y02G

In [36]:
# Print the product name of every item this user has already reviewed.
bought_items = reviewed_items("A3AKVALGT4Y02G")
for reviewed_id in bought_items:
    product_name = itemID_to_name[reviewed_id]
    print(product_name)

  PTC Premium Gold Series HDMI hembra/hembra Coupler
Panasonic KX-TCA60 Hands-Free Headset with Comfort Fit Headband for Use with Cordless Phones
VideoSecu 24" Long Arm TV Wall Mount Low Profile Articulating Full Motion Cantilever Swing Tilt wall bracket for most 22" to 55" LED LCD TV Monitor Flat Screen VESA 200x200 400x400 up to 600x400mm MAH
Belkin N300 Pocket WiFi Adapter, 300Mbps Link Rate
Mediabridge HDMI Cable (10 Feet) Supports 4K@60Hz, High Speed, Hand-Tested, HDMI 2.0 Ready - UHD, 18Gbps, Audio Return Channel
Belkin N150 Wireless/WiFi N Router (Latest Generation)


In [37]:
# Recommendations for the same user whose reviewed items are listed above.
get_recommendation("A3AKVALGT4Y02G")

[('B000067SLV', 5), ('B00K6T1QIU', 5), ('B01F6KXSHK', 5), ('B0145MG480', 5), ('B00F3T2FQA', 5), ('B00IQYTTLS', 5), ('B004V1M412', 5), ('B01E7RL1SG', 5), ('B005FYNSUA', 5), ('B000MTWVMO', 5)]


['StarTech.com 6ft Standard Computer Power Cord - NEMA5-15P to C13 - PXT101',
 'Smartbuy 25gb 6x Bd-r BDR Blu-ray Single Layer Logo Blank Data Recordable Media Disc Spindle Pack (50-Disc)',
 'First2savvv XJD-XT1-D10 dark Brown Leather Half Camera Case Bag Cover base for FUJIFILM X-T1 XT1',
 'NEC NP-V332W Multimedia Projector',
 'UpBright 18V AC/DC Adapter For Cricut 14-0001 JOD-SWR-05758 JOD-SDU40A-6 J0D-SWR-05758 J0D-SDU40A-6 ALL Cutting Machine Cutter Expression Create KSAH1800250T1M2 KSAS0451800250M2 18VDC Power Supply',
 'Projector Ceiling Mount for OPTOMA DH1014 DH1017 EH500 TH1060 TH1060P',
 '7x-45x Trinocular Stereo Zoom Microscope +3D Boom Stand',
 '36W 4.8Amp 3-Port USB Rapid Car Charger with Smart Detect for Amazon Kindle, DX, Fire, Fire HD 6, HD 7 8 10 - Fire HD 8.9, HD8, HD10, Kids Edition - Kindle Fire HDX, HDX 7, HDX 8.9',
 'SanDisk Cruzer Fit 8GB USB 2.0 Low-Profile Flash Drive- SDCZ33-008G-B35',
 'CRU DX115 Rugged Removable 6G SAS/SATA Drive Carrier']