In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm


In [2]:
class Data:
    def __init__(self):
        '''init Data class'''
        self.data = None
        
    def load_data(self, filename, format='txt'):
        '''loads data from excel, csv, tsv, or txt file'''
        if format == 'excel':
            self.data = pd.read_excel(filename)
        elif format == 'csv':
            self.data = pd.read_csv(filename)
        elif format == 'tsv':
            self.data = pd.read_csv(filename, sep='\t')
        elif format == 'txt':
            self.data = pd.read_table(filename)
        else:
            raise ValueError('Invalid file format.  Please specify "excel", "csv", "tsv", or "txt".')
    
    def drop_small_orders(self, order_col='order_number', min_order_size=2):
        '''drop orders from self.data that have min_order_size or less unique items in basket'''
        self.data = self.data[self.data.groupby('order_number').order_number.transform(len) >= min_order_size]
    
    def expand_columns(self, columns=[]):
        '''performs one-hot encoding on specified columns and appends them to self.data'''
        dfs = []
        dfs.append(data.data)
        for col in columns:
            dfs.append(pd.get_dummies(self.data[col], prefix=None, sparse=False))
        data.data = pd.concat(dfs, axis=1)
          
    def drop_columns(self, columns=[]):
        '''drops columns from self.data'''
        self.data.drop(columns, axis=1, inplace=True)
        
    def consolidate_orders(self, order_col='order_number'):
        '''consolidates each order in self.data into single record.  order number is maintained and all other columns summed.'''
        data_cols = list(data.data.columns)
        data_cols.remove(order_col)
        self.data = self.data.groupby(order_col).sum()[data_cols].reset_index()

In [3]:
load_and_process_data = True
get_columns = True
run_rec_engine = True

if load_and_process_data:
    # drop orders with few items, one-hot encode l3 category information, drop unnecessary columns, 
    # and consolidate unique orders into single records
    data = Data()
    data.load_data('All Transations - 2 Weeks.txt', format='tsv')
    data.drop_small_orders(order_col='order_number', min_order_size=10)
#     data.expand_columns(['l3'])  
#     data.drop_columns(['l1', 'l2', 'l3', 'sku', 'brand'])
#     data.consolidate_orders(order_col='order_number')

KeyboardInterrupt: 

In [12]:
sample = data.data[:1000]

In [15]:
data.data.head()

Unnamed: 0,order_number,l1,l2,l3,sku,brand
22,173984,Cleaning,Paper Products and Dispensers,"Paper Towels, Rolls",46841,1726
37,649979,Power Tools,Drilling Accessories,Step Drill Bits,299689,2163
41,565917,"Paint, Equipment and Supplies",Spray Paints and Primers,Spray Paints,310773,3612
58,482732,Hand Tools,Marking Tools,Paint Markers,38789,4477
64,173984,Cleaning,Floor Care,Floor Finishes and Sealers,264751,3712


In [21]:
data.data['item_id'] = data.data.groupby(['l3']).ngroup()

In [45]:
 data.data.groupby('l3', group_keys=False).apply(lambda df: df.head(1))

Unnamed: 0,order_number,l1,l2,l3,sku,brand,item_id,rating
3388,598075,Fleet and Vehicle Maintenance,Automotive Interior,12 Volt Accessories,182640,449,0,1
181778,351245,Fasteners,Screws,12-Point Flange Head Cap Screws,107792,1793,1,1
284486,752055,Office Supplies,Binders and Clipboards,3-Ring Binder Accessories,160036,358,2,1
20109,603976,Office Supplies,Binders and Clipboards,3-Ring Binders,114226,4724,3,1
15302,151029,Motors,HVAC Motors,3.3 Inch Diameter Motors,193852,1068,4,1
...,...,...,...,...,...,...,...,...
439049,636335,Plumbing,Hose Bibs and Hydrants,Yard Hydrants,129403,698,4564,1
431130,824837,Fasteners,Bolts,Yoke Ends,259551,1793,4565,1
222168,863700,HVAC and Refrigeration,HVAC Controls and Thermostats,Zone Valve Actuators,213320,2237,4566,1
75749,648050,HVAC and Refrigeration,HVAC Controls and Thermostats,Zone Valves,105484,1989,4567,1


In [24]:
data.data[data.data['l3']=='Paper Towels, Rolls']

Unnamed: 0,order_number,l1,l2,l3,sku,brand,item_id
22,173984,Cleaning,Paper Products and Dispensers,"Paper Towels, Rolls",46841,1726,2850
241,258067,Cleaning,Paper Products and Dispensers,"Paper Towels, Rolls",184171,1726,2850
2108,548174,Cleaning,Paper Products and Dispensers,"Paper Towels, Rolls",277217,1726,2850
2259,243426,Cleaning,Paper Products and Dispensers,"Paper Towels, Rolls",277216,1726,2850
3000,363486,Cleaning,Paper Products and Dispensers,"Paper Towels, Rolls",121881,1726,2850
...,...,...,...,...,...,...,...
2103215,800215,Cleaning,Paper Products and Dispensers,"Paper Towels, Rolls",117663,1726,2850
2103792,799745,Cleaning,Paper Products and Dispensers,"Paper Towels, Rolls",177681,1726,2850
2105550,764155,Cleaning,Paper Products and Dispensers,"Paper Towels, Rolls",46845,1726,2850
2105664,223389,Cleaning,Paper Products and Dispensers,"Paper Towels, Rolls",184171,1726,2850


In [1]:
import os
import csv
import sys
import re

from surprise import Dataset
from surprise import Reader

import numpy as np
import pandas as pd
from operator import itemgetter

from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut

In [1]:
from surprise import KNNBasic

from ECom import ECom
from UserBased import UserBased
from ItemBased import ItemBased
from HybridAlgorithm import HybridAlgorithm

In [2]:
ecom = ECom('Par Transations - 2 Weeks.txt', format='tsv')

In [3]:
UserKNN = KNNBasic(sim_options={})
alg = UserBased(UserKNN, 'user')
alg.Evaluate(ecom)

Evaluating hit rate...

Using recommender  user
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


{'HR': 0.13924050632911392}

In [5]:
ItemKNN = KNNBasic(sim_options={})
alg = ItemBased(ItemKNN, 'item')
alg.Evaluate(ecom)

Evaluating hit rate...

Using recommender  item
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


{'HR': 0.0108499095840868}

In [4]:
hybrid = HybridAlgorithm()
hybrid.Evaluate(ecom)

Evaluating hit rate...

Using recommender  UserBased
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.

Using recommender  ItemBased
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


{'HR': 0.009041591320072333}

In [26]:
import tensorflow as tf

In [367]:
set({'a':2}) | set({})

{'a'}

In [273]:
KNNBasic.__name__

'KNNBasic'

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\sabareesh\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-440-782467b8f0c5>", line 3, in <module>
    alg.SampleTopNRecs(ecom)
  File "<ipython-input-439-aea10071f294>", line 22, in SampleTopNRecs
    candidates = self.getAllRecs(ECom, testOrderID)
  File "<ipython-input-439-aea10071f294>", line 11, in getAllRecs
    self.simsMatrix = self.getSimilarity(ECom)
AttributeError: 'ItemBased' object has no attribute 'getSimilarity'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\sabareesh\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 2044, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'AttributeError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred

AttributeError: 'ItemBased' object has no attribute 'getSimilarity'

In [165]:
for innerID, score in enumerate(simsMatrix[0][1]):
    print(score)

0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0


In [None]:
class Evaluator:
    
    algorithms = []
    
    def __init__(self, dataset, rankings):
        ed = EvaluationData(dataset, rankings)
        self.dataset = ed
        
    def AddAlgorithm(self, algorithm, name):
        alg = EvaluatedAlgorithm(algorithm, name)
        self.algorithms.append(alg)
        
    def Evaluate(self, doTopN):
        results = {}
        for algorithm in self.algorithms:
            print("Evaluating ", algorithm.GetName(), "...")
            results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)

        # Print results
        print("\n")
        
        if (doTopN):
            print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
                    "Algorithm", "RMSE", "MAE", "HR", "cHR", "ARHR", "Coverage", "Diversity", "Novelty"))
            for (name, metrics) in results.items():
                print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
                        name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["cHR"], metrics["ARHR"],
                                      metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
        else:
            print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
            for (name, metrics) in results.items():
                print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
                
        print("\nLegend:\n")
        print("RMSE:      Root Mean Squared Error. Lower values mean better accuracy.")
        print("MAE:       Mean Absolute Error. Lower values mean better accuracy.")
        if (doTopN):
            print("HR:        Hit Rate; how often we are able to recommend a left-out rating. Higher is better.")
            print("cHR:       Cumulative Hit Rate; hit rate, confined to ratings above a certain threshold. Higher is better.")
            print("ARHR:      Average Reciprocal Hit Rank - Hit rate that takes the ranking into account. Higher is better." )
            print("Coverage:  Ratio of users for whom recommendations above a certain threshold exist. Higher is better.")
            print("Diversity: 1-S, where S is the average similarity score between every possible pair of recommendations")
            print("           for a given user. Higher means more diverse.")
            print("Novelty:   Average popularity rank of recommended items. Higher means more novel.")
        
    def SampleTopNRecs(self, ml, testSubject=85, k=10):
        
        for algo in self.algorithms:
            print("\nUsing recommender ", algo.GetName())
            
            print("\nBuilding recommendation model...")
            trainSet = self.dataset.GetFullTrainSet()
            algo.GetAlgorithm().fit(trainSet)
            
            print("Computing recommendations...")
            testSet = self.dataset.GetAntiTestSetForUser(testSubject)
        
            predictions = algo.GetAlgorithm().test(testSet)
            
            recommendations = []
            
            print ("\nWe recommend:")
            for userID, movieID, actualRating, estimatedRating, _ in predictions:
                intMovieID = int(movieID)
                recommendations.append((intMovieID, estimatedRating))
            
            recommendations.sort(key=lambda x: x[1], reverse=True)
            
            for ratings in recommendations[:10]:
                print(ml.getMovieName(ratings[0]), ratings[1])


In [None]:
from surprise import KNNBasic
import heapq
from collections import defaultdict
from operator import itemgetter
from surprise.model_selection import LeaveOneOut
from RecommenderMetrics import RecommenderMetrics
from EvaluationData import EvaluationData

data = ecom.convertToSurprise()

evalData = EvaluationData(data, ecom.getPopularityRanks())

# Train on leave-One-Out train set
trainSet = evalData.GetLOOCVTrainSet()
sim_options = {'name': 'cosine',
               'user_based': True}

model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()

leftOutTestSet = evalData.GetLOOCVTestSet()

In [5]:
# Build up dict to lists of (int(movieID), predictedrating) pairs
topN = defaultdict(list)
k = 10
for uiid in range(trainSet.n_users):
    
    testUserRatings = trainSet.ur[uiid]
    kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])
    # Get top N similar users to this one    
    candidates = defaultdict(float)
    for itemID, rating in kNeighbors:
        similarityRow = simsMatrix[itemID]
        for innerID, score in enumerate(similarityRow):
            candidates[innerID] += score
    # Build a dictionary of stuff the user has already seen
    watched = {}
    for itemID, rating in trainSet.ur[uiid]:
        watched[itemID] = 1
        
    # Get top-rated items from similar users:
    pos = 0
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if not itemID in watched:
            try:
                movieID = trainSet.to_raw_iid(itemID)
                topN[int(trainSet.to_raw_uid(uiid))].append( (int(movieID), 0.0) )
                pos += 1
            except:
                pass
            if (pos > 40):
                break
    
# Measure
print("HR", RecommenderMetrics.HitRate(topN, leftOutTestSet))   

HR 0.005986570665803738


In [11]:
ecom.data.shape

(346556, 3)

In [27]:
data.data['rating'] = 1

data.data.head()

Unnamed: 0,order_number,l1,l2,l3,sku,brand,item_id,rating
22,173984,Cleaning,Paper Products and Dispensers,"Paper Towels, Rolls",46841,1726,2850,1
37,649979,Power Tools,Drilling Accessories,Step Drill Bits,299689,2163,3852,1
41,565917,"Paint, Equipment and Supplies",Spray Paints and Primers,Spray Paints,310773,3612,3771,1
58,482732,Hand Tools,Marking Tools,Paint Markers,38789,4477,2807,1
64,173984,Cleaning,Floor Care,Floor Finishes and Sealers,264751,3712,1670,1


In [37]:
for index, row in data.data[['order_number', 'item_id', 'rating']].iterrows():
    print(row[0])

173984
649979
565917
482732
173984
339808
258067
649979
565917
482732
173984
339808
258067
649979
565917
700098
482732
173984
339808
258067
649979
565917
700098
482732
173984
339808
258067
649979
565917
700098
482732
173984
339808
258067
649979
565917
700098
482732
173984
339808
258067
649979
565917
700098
482732
173984
339808
258067
649979
565917
700098
482732
173984
339808
429170
258067
838326
649979
565917
700098
482732
173984
339808
429170
258067
838326
649979
565917
700098
482732
173984
339808
429170
258067
838326
649979
700098
482732
173984
339808
429170
838326
649979
700098
482732
173984
339808
429170
838326
649979
482732
173984
339808
429170
838326
649979
380746
482732
173984
339808
429170
838326
649979
380746
756820
482732
173984
339808
429170
838326
649979
380746
756820
482732
173984
429170
838326
380746
756820
482732
173984
258237
429170
290364
838326
380746
672584
756820
482732
258237
290364
838326
380746
672584
451350
756820
482732
258237
290364
838326
380746
672584
451350

280786
613157
315841
332850
234449
280983
402536
178852
243319
700706
625136
280786
613157
315841
332850
234449
280983
402536
374633
243319
625136
280786
613157
315841
234449
280983
402536
374633
243319
625136
280786
613157
315841
550477
234449
280983
402536
374633
243319
625136
280786
613157
315841
117191
550477
234449
280983
402536
374633
243319
625136
280786
613157
315841
117191
550477
234449
371829
280983
848050
402536
374633
243319
625136
280786
613157
315841
117191
550477
234449
371829
280983
848050
402536
374633
243319
280786
613157
315841
264141
117191
550477
234449
253311
371829
280983
848050
402536
374633
243319
222670
280786
613157
315841
264141
117191
550477
234449
253311
371829
280983
848050
402536
374633
243319
222670
280786
613157
315841
264141
117191
550477
234449
253311
371829
280983
848050
402536
374633
222670
280786
613157
315841
264141
117191
550477
234449
253311
371829
848050
402536
374633
222670
280786
613157
315841
264141
117191
550477
234449
253311
371829
848050

796674
116633
495377
386861
739886
563330
632944
549361
662408
796674
176975
116633
495377
386861
563330
632944
549361
187342
662408
796674
176975
116633
495377
386861
563330
632944
187342
662408
796674
176975
116633
495377
136185
386861
563330
632944
187342
662408
796674
176975
116633
495377
136185
386861
563330
114130
187342
662408
796674
176975
116633
495377
527682
136185
386861
563330
114130
187342
662408
796674
176975
116633
495377
196693
527682
136185
386861
563330
114130
187342
662408
796674
176975
116633
495377
196693
527682
136185
386861
563330
114130
187342
662408
796674
176975
495377
196693
527682
136185
386861
563330
114130
187342
176975
495377
196693
527682
136185
386861
563330
114130
643250
187342
176975
495377
196693
527682
136185
386861
563330
752444
114130
643250
187342
176975
495377
196693
527682
136185
386861
563330
752444
114130
643250
187342
176975
495377
196693
527682
136185
386861
287726
563330
752444
114130
790147
643250
187342
176975
495377
196693
527682
569026

KeyboardInterrupt: 

In [46]:
reader = Reader(line_format='user item rating', skip_lines=1, rating_scale=(0, 1))

In [47]:
sd = Dataset.load_from_df(data.data[['order_number', 'item_id', 'rating']], reader)

In [52]:
dict(zip(data.data['item_id'], data.data['l3']))[4569]


KeyError: 4569

In [53]:
data.data[['order_number', 'item_id', 'rating']].sort_values('order_number')

Unnamed: 0,order_number,item_id,rating
729645,2,1957,1
729309,2,373,1
729351,2,4158,1
729435,2,373,1
729477,2,4158,1
...,...,...,...
399338,1240944,3673,1
399632,1240944,3846,1
399674,1240944,3846,1
399716,1240944,3846,1
