In [51]:
import os
import ujson
import gzip
import pandas as pd
import gc
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import GridSearchCV
from surprise import SVD
from collections import defaultdict
import operator
from surprise.model_selection import KFold
from surprise import accuracy

In [2]:
# Load the review dataset from a pickle file.
# NOTE(review): this is a hardcoded absolute path inside one user's home directory, so the
# notebook cannot be re-run on another machine without editing it — consider a
# configurable data directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you trust.
data = pd.read_pickle("/Users/sidhantarora/work/ALDA/Project/data_nov_8.pkl")

In [3]:
data.columns

Index(['overall', 'reviewerID', 'asin', 'unixReviewTime', 'category',
       'description', 'title', 'also_buy', 'brand', 'feature', 'main_cat',
       'date', 'price', 'review_summary_combined', 'rating_category', 'time'],
      dtype='object')

In [35]:
data.description.iloc[0]

['The videosecu TV mount is a mounting solution for most 22"-47" LCD LED Plasma TV and some LED up to 55" with VESA 600x400mm (24"x16"), 400x400mm (16"x16"),600x300mm(24"x12"), 400x200mm (16"x8"),300x300mm (12"x12"),300x200mm(12"x8"),200x200mm (8"x8"),200x100mm (8"x4") mounting hole pattern .Heavy gauge steel construction provides safety loading up to 66lbs display .It can tilt 15 degree forward or backward and swivel 180 degree. The removable VESA plate can be taken off for easy installation. Post-installation level adjustment allows the TV to perfectly level. The on arm cable management ring system design, guides wires and prevent cable pinching. Standard hardware and user manual included. <br />Notice: This mount fits most 22-47" TVs (VESA 200x200mm (8x8")/200x100mm(8x4")/100x100mm(4x4") without extender, fits VESA 600x400(24x16")/400x400mm(16x16")/600x300mm(24x12")/400x200mm(16x8")/300x300mm(12x12")/300x200mm(12x8")with 4 plate extender), some up to 50 55 inch TVs manufactured in r

In [36]:
data.feature.iloc[0]

['Fits most 22" to 47" HDTV and some up to 55" LED TV (check VESA and weight)',
 'Fits VESA (mounting hole patterns) 100x100/200x100/200x200mm without extender, and also fits VESA 600x400/400x400/600x300/400x200/300x300/300x200mm with 4 plate extender',
 'Heavy-duty steel construction, loading capacity up to 66 lbs',
 'Adjustable tilt +/-15 degrees, swivel 180 degrees, extends 24" from the wall',
 'VESA plate can be taken off, quick release design for easy installation; Post-installation level adjustment allows the TV to perfectly level']

In [4]:
newData = data[['asin', 'reviewerID', 'overall']].copy()


In [5]:
newData.head(5)

Unnamed: 0,asin,reviewerID,overall
246,972683275,A1KECIKIWDB4HH,4.0
247,972683275,A2MQ47BBL7F6B6,5.0
248,972683275,ANWW7PT6Z6MHD,5.0
249,972683275,A2DEU0B3AUINV9,5.0
250,972683275,AE8R1JSMJYIU,4.0


In [6]:
set(list(newData.overall))

{1.0, 2.0, 3.0, 4.0, 5.0}

In [7]:
newData = newData.rename(columns={'asin': 'itemID', 'reviewerID': 'userID','overall': 'rating' })

In [8]:
newData

Unnamed: 0,itemID,userID,rating
246,0972683275,A1KECIKIWDB4HH,4.0
247,0972683275,A2MQ47BBL7F6B6,5.0
248,0972683275,ANWW7PT6Z6MHD,5.0
249,0972683275,A2DEU0B3AUINV9,5.0
250,0972683275,AE8R1JSMJYIU,4.0
...,...,...,...
3368220,B01HISA452,AG8SMUXFYGYH8,4.0
3368221,B01HISA452,AB443G89K25H2,5.0
3368222,B01HISA452,A1X6MABURKFRQA,5.0
3368223,B01HISA452,A3NJEEYRMW3FJR,5.0


In [9]:
# Ratings in this dataset take the values 1.0-5.0, so declare a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Build a Surprise Dataset from the DataFrame. The columns are selected in
# (user, item, rating) order, which is the order Dataset.load_from_df expects.
surprise_data = Dataset.load_from_df(newData[["userID", "itemID","rating"]], reader)

### Finding the best model

In [10]:
%%time
tuning_parameters = {
    'n_epochs': [5, 10, 20 ], 'lr_all': [0.001, 0.002, 0.005],
    'reg_all': [0.2, 0.4, 0.6]
}
SVD_model = GridSearchCV(SVD, tuning_parameters, measures=["rmse","mae"], cv= 5)

SVD_model.fit(surprise_data)

CPU times: user 1h 5min 35s, sys: 2min 3s, total: 1h 7min 39s
Wall time: 10h 3min 48s


In [45]:
evaluation = pd.DataFrame.from_dict(SVD_model.cv_results)

In [46]:
evaluation

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,...,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_lr_all,param_reg_all
0,1.236987,1.232424,1.234208,1.234724,1.235913,1.234851,0.00155,25,0.949761,0.9473,...,0.000977,18,13.477587,0.332194,1.581788,0.787205,"{'n_epochs': 5, 'lr_all': 0.001, 'reg_all': 0.2}",5,0.001,0.2
1,1.239317,1.234837,1.236361,1.23709,1.238183,1.237158,0.001532,26,0.954488,0.952192,...,0.000925,25,13.906854,0.727332,1.816025,1.526542,"{'n_epochs': 5, 'lr_all': 0.001, 'reg_all': 0.4}",5,0.001,0.4
2,1.241848,1.237272,1.238606,1.239573,1.240644,1.239589,0.001584,27,0.958509,0.95621,...,0.000905,27,13.26158,0.06228,1.655491,1.200488,"{'n_epochs': 5, 'lr_all': 0.001, 'reg_all': 0.6}",5,0.001,0.6
3,1.231118,1.226968,1.228922,1.228835,1.230185,1.229206,0.001403,18,0.945212,0.943254,...,0.000832,15,13.344432,0.18634,1.767615,1.474384,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.2}",5,0.002,0.2
4,1.233899,1.229705,1.231336,1.231836,1.232882,1.231932,0.001421,22,0.950662,0.948636,...,0.000833,23,13.314748,0.136251,1.726158,1.424686,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}",5,0.002,0.4
5,1.236813,1.232498,1.234026,1.234683,1.235702,1.234745,0.001467,23,0.95517,0.953015,...,0.000809,26,13.288942,0.112221,2.325212,1.633734,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}",5,0.002,0.6
6,1.223729,1.219868,1.221545,1.221507,1.222787,1.221887,0.001307,9,0.939904,0.938015,...,0.000737,10,13.363988,0.238036,1.774897,1.481649,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.2}",5,0.005,0.2
7,1.227037,1.223153,1.224701,1.224995,1.226027,1.225183,0.001307,13,0.945969,0.944069,...,0.00074,16,13.369319,0.082838,1.669669,1.320448,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}",5,0.005,0.4
8,1.230407,1.226391,1.227671,1.228293,1.22927,1.228406,0.001369,16,0.950833,0.948664,...,0.000798,21,13.229221,0.079921,1.740592,1.463883,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}",5,0.005,0.6
9,1.231032,1.226787,1.228707,1.228812,1.230241,1.229116,0.001458,17,0.943963,0.941861,...,0.000923,12,24.452202,0.130784,1.834519,1.573918,"{'n_epochs': 10, 'lr_all': 0.001, 'reg_all': 0.2}",10,0.001,0.2


In [11]:
print(SVD_model.best_score["rmse"])
print(SVD_model.best_params["rmse"])

1.1984498132451091
{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}


### Testing on Common Data

In [48]:
test_model = SVD(n_epochs =  20, lr_all =  0.005, reg_all = 0.2)

In [52]:
kf = KFold(n_splits=5)

# For each of the 5 folds: fit on the training part, predict the held-out part,
# and print that fold's Root Mean Squared Error.
for trainset, testset in kf.split(surprise_data):
    predictions = test_model.fit(trainset).test(testset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.1943
RMSE: 1.2017
RMSE: 1.2010
RMSE: 1.1969
RMSE: 1.2006


### Using Best Parameters

In [12]:
# Take the SVD instance configured with the grid search's best-RMSE parameters and
# train it on the full trainset built from all of surprise_data (no hold-out split)
# before using it for predictions.
best = SVD_model.best_estimator['rmse']
best.fit(surprise_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f92d8db5cd0>

In [13]:
best.predict("A28T6TZRAJF7J5","B01HIY64XM")

Prediction(uid='A28T6TZRAJF7J5', iid='B01HIY64XM', r_ui=None, est=4.4638176589821885, details={'was_impossible': False})

### Generating Recommendations

In [14]:
#No. of all unique items
all_items = list(set(list(newData.itemID)))
len(all_items)

6576

In [15]:
def get_rating_predictions(user_id):
    """Predict ``user_id``'s rating for every item in ``all_items``.

    Args:
        user_id: Raw user ID passed to ``best.predict``.

    Returns:
        A ``defaultdict(int)`` mapping each item ID in ``all_items`` to the
        ``est`` value of the model's prediction for that (user, item) pair.
    """
    estimates = (
        (candidate, best.predict(user_id, candidate).est)
        for candidate in all_items
    )
    return defaultdict(int, estimates)

In [16]:
len(set(list(data['reviewerID'])))

815053

In [17]:
len(set(list(data['asin'])))

6576

In [18]:
len(data)

1071586

In [19]:
def reviewed_items(user_id):
    """Return the set of item IDs (``asin``) that ``user_id`` has reviewed.

    Args:
        user_id: Value to match against the ``reviewerID`` column of ``data``.

    Returns:
        A ``set`` of the ``asin`` values from every row of ``data`` whose
        ``reviewerID`` equals ``user_id``; empty if the user has no rows.
    """
    # One vectorised comparison over the column instead of a Python loop doing
    # two per-row .iloc lookups for every row of the DataFrame.
    user_rows = data['reviewerID'] == user_id
    return set(data.loc[user_rows, 'asin'])

In [20]:
reviewed_items("A28T6TZRAJF7J5")

{'B01HIY64XM'}

### Mapping Item ID to Product Name

In [21]:
# Map each item ID (asin) to its product title in a single pass over the two
# columns, instead of two per-row .iloc lookups for every row. When an asin
# appears on several rows the last row's title is kept (same as assigning in a
# loop); IDs that are not present fall back to '' via defaultdict(str).
itemID_to_name = defaultdict(str, zip(data['asin'], data['title']))

In [22]:
itemID_to_name["0972683275"]

'VideoSecu 24" Long Arm TV Wall Mount Low Profile Articulating Full Motion Cantilever Swing Tilt wall bracket for most 22" to 55" LED LCD TV Monitor Flat Screen VESA 200x200 400x400 up to 600x400mm MAH'

In [37]:
def get_recommendation(user_id, top_n=10):
    """Recommend the highest-predicted items that ``user_id`` has not reviewed yet.

    Args:
        user_id: Raw user ID to generate recommendations for.
        top_n: Maximum number of product names to return (default 10).

    Returns:
        A list of at most ``top_n`` product names (looked up in
        ``itemID_to_name``), ordered by predicted rating, highest first.
        Items returned by ``reviewed_items(user_id)`` are excluded.

    Side effects:
        Prints the ``top_n`` best (item ID, predicted rating) pairs before
        filtering, for inspection.
    """
    item_rating = get_rating_predictions(user_id)
    already_bought = reviewed_items(user_id)
    sorted_items = sorted(item_rating.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_items[:top_n])

    items_to_suggest = []

    for item_id, _predicted_rating in sorted_items:
        if len(items_to_suggest) >= top_n:
            break

        # Compare the item ID, not the (item_id, rating) tuple: a tuple is never
        # a member of the set of IDs, so the previous check filtered nothing.
        if item_id not in already_bought:
            items_to_suggest.append(itemID_to_name[item_id])

    return items_to_suggest

### Finding a user with at least 5 reviewed items

In [43]:
# Count the distinct items each reviewer has reviewed once up front, rather than
# rescanning the whole DataFrame for every row.
items_per_user = data.groupby('reviewerID')['asin'].nunique().to_dict()

# Walk reviewers in row order and stop at the first one with at least 5 distinct items.
for user in data.reviewerID:
    count = items_per_user.get(user, 0)
    if count >= 5:
        break

print(user)

A3AKVALGT4Y02G


#### Items purchased by user A3AKVALGT4Y02G

In [44]:
bought_items = reviewed_items("A3AKVALGT4Y02G")
for i in bought_items:
    print(itemID_to_name[i]) 

Panasonic KX-TCA60 Hands-Free Headset with Comfort Fit Headband for Use with Cordless Phones
Belkin N300 Pocket WiFi Adapter, 300Mbps Link Rate
Belkin N150 Wireless/WiFi N Router (Latest Generation)
Mediabridge HDMI Cable (10 Feet) Supports 4K@60Hz, High Speed, Hand-Tested, HDMI 2.0 Ready - UHD, 18Gbps, Audio Return Channel
VideoSecu 24" Long Arm TV Wall Mount Low Profile Articulating Full Motion Cantilever Swing Tilt wall bracket for most 22" to 55" LED LCD TV Monitor Flat Screen VESA 200x200 400x400 up to 600x400mm MAH
  PTC Premium Gold Series HDMI hembra/hembra Coupler


In [38]:
get_recommendation("A3AKVALGT4Y02G")

[('B00JHPLUMQ', 4.850044437459997), ('B004ZQIEO4', 4.840560496088436), ('B005B2XMY8', 4.838719446326047), ('B00LV1D0UE', 4.83277433283397), ('B004Z2BK4O', 4.820361287540826), ('B000JQV8FG', 4.81919005577683), ('B017T5NLYU', 4.815183676845506), ('B007PFRCGS', 4.811453938155106), ('B00066HOXY', 4.802800606878478), ('B0009VPFT6', 4.801426487554226)]


['Axiom 3Pack Rechargeable Battery For VTech BT1018, BT1022, BT1011, BT183348, BT18432, BT283348, BT800, BT6010, BT84342, BY0736, CBD8005',
 'Zinwell MS2X4RO-03 2x4 Multi-Switch',
 'AmScope Cordless LED Stereo Microscope 20x-40x-80x',
 'Buteny Car Aux Cable Compatible with iPhone 7/8/X , [Upgraded] 3.5mm Premium Auxiliary Audio Cable Accessories',
 "GLS Audio 50 feet Speaker Cable 12AWG Patch Cords - 50 ft Speakon to Speakon Professional Cables Black Neutrik NL4FX (NL4FC) 12 Gauge Wire - Pro 50' Speak-on Cord 12G - Single",
 'Kingston Technology 2 GB Unbuffered System Specific Memory Model 2 Not a kit (Single) DDR2 667 (PC2 5300) 200-Pin SO-DIMM KTH-ZD8000B/2G',
 'Wireless Earbuds Upgraded Graphene 3D Stereo Sound Bluetooth 5.0 with 28Hr Play Time Noise Cancelling HonShoop Lightweight Bluetooth Headphones Built-in Mic',
 'KnuKonceptz Krystal Kable 2 Channel Twisted Pair OFC RCA Interconnect Cable 6 Feet',
 'StarTech.com Slimline Serial DB9 Gender Changer - M/M',
 'Transcend TS1GCF80 1G