In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import sklearn.metrics.pairwise as pw
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [28]:
# Load Data
df = pd.read_csv("RecommendationDataCleaned.csv",lineterminator='\n')

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4194 entries, 0 to 4193
Data columns (total 70 columns):
Customers.id                     4194 non-null int64
Customers.fname                  4194 non-null object
Customers.lname                  4194 non-null object
Customers.create_date            4194 non-null int64
Customers.mailing                4194 non-null float64
Customers.last_modified          4194 non-null int64
Orders.id                        4194 non-null int64
Orders.fname                     4194 non-null object
Orders.lname                     4194 non-null object
Orders.order_number              4194 non-null object
Orders.currency                  4194 non-null object
Orders.subtotal                  4194 non-null float64
Orders.shipping                  4194 non-null float64
Orders.total                     4194 non-null float64
Orders.shipping_carrier          4194 non-null object
Orders.shipping_method           4194 non-null object
Orders.tracking              

In [104]:
# Redo dtypes conversion

df[['Customers.mailing','Orders.shipping','Orders.payment_status','Orders.status','Products.vendor'\
    ,'Products.import_id','Products.flags','Products.taxable','Products.shopping_flags','Products.import_flags']]\
= \
df[['Customers.mailing','Orders.shipping','Orders.payment_status','Orders.status','Products.vendor'\
    ,'Products.import_id','Products.flags','Products.taxable','Products.shopping_flags','Products.import_flags']]\
    .apply(lambda x: x.astype('category'))


df['Products.case_qty'] = df['Products.case_qty'].astype('int')

In [31]:
# Definition of Popular - Purchase frequency - Products Purchased by most number of customer by ordering most number of times and purchase qty

In [105]:
# Get popular prods based on def

popular_prods = pd.DataFrame(df.groupby(['Products.id','Products.name']).agg({'Customers.id':'count','Order_Items.id':'count','Order_Items.qty': 'sum'}).reset_index().sort_values(by=['Customers.id','Order_Items.id','Order_Items.qty',],ascending=False))
popular_prods

Unnamed: 0,Products.id,Products.name,Customers.id,Order_Items.id,Order_Items.qty
599,1842,"Emesis Bags,Blue,36.000 OZ",245,245,910
662,2107,"MoliCare Disposable Super Plus Briefs,27""-47""",71,71,228
256,910,"Aluminum Transport Chair with 12"" Wheels,Blue,...",67,67,68
257,911,"Aluminum Transport Chair with 12"" Wheels,Red,F...",67,67,68
607,1867,Silent Knight Pill Crushers,53,53,61
221,858,K1 Basic Extra-Wide Wheelchairs,52,52,56
605,1862,Silent Knight Pill Crusher Pouches,45,45,66
495,1469,Protection Plus Super Protective Adult Underwe...,31,31,68
663,2109,"MoliCare Disposable Super Plus Briefs,39""-59""",31,31,61
207,837,K1 Basic Wheelchairs,25,25,27


In [106]:
# Top 10 products
popular_prods[:10]

Unnamed: 0,Products.id,Products.name,Customers.id,Order_Items.id,Order_Items.qty
599,1842,"Emesis Bags,Blue,36.000 OZ",245,245,910
662,2107,"MoliCare Disposable Super Plus Briefs,27""-47""",71,71,228
256,910,"Aluminum Transport Chair with 12"" Wheels,Blue,...",67,67,68
257,911,"Aluminum Transport Chair with 12"" Wheels,Red,F...",67,67,68
607,1867,Silent Knight Pill Crushers,53,53,61
221,858,K1 Basic Extra-Wide Wheelchairs,52,52,56
605,1862,Silent Knight Pill Crusher Pouches,45,45,66
495,1469,Protection Plus Super Protective Adult Underwe...,31,31,68
663,2109,"MoliCare Disposable Super Plus Briefs,39""-59""",31,31,61
207,837,K1 Basic Wheelchairs,25,25,27


In [34]:
# Memory Based Collaborative Filtering - Get similar users/products based on cosine distance


In [35]:
# Product based recommendation

    # take care of multi colinearity between features
    # get important features using Random Forest (create dummies on categorical columns)
    # Use PCA for variable reduction (Only on normalized numerical variables)
    # get distances between those features
    # sort by lowest distance

In [108]:

qty = df.groupby(['Products.id']).agg({'Order_Items.id':'count','Order_Items.qty': 'mean'}).sort_values(by=['Order_Items.id'], ascending=False)
qty

Unnamed: 0_level_0,Order_Items.id,Order_Items.qty
Products.id,Unnamed: 1_level_1,Unnamed: 2_level_1
1842,245,3.714286
2107,71,3.211268
911,67,1.014925
910,67,1.014925
1867,53,1.150943
858,52,1.076923
1862,45,1.466667
1469,31,2.193548
2109,31,1.967742
837,25,1.080000


In [109]:
qtyNormalized = qty[['Order_Items.id']]
qtyNormalized = qtyNormalized.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
qtyNormalized.head()

Unnamed: 0_level_0,Order_Items.id
Products.id,Unnamed: 1_level_1
1842,1.0
2107,0.286885
911,0.270492
910,0.270492
1867,0.213115


In [246]:
x = df[['Products.id','Products.name','Products.google_shopping_type']]
X = pd.get_dummies(x, prefix=['Products.google_shopping_type'], columns=['Products.google_shopping_type'], drop_first=True)
X.to_csv("Recommendation_cosine.csv",index=False,header=False,line_terminator='\n',sep = '|' )
X

Unnamed: 0,Products.id,Products.name,Products.google_shopping_type_Apparel > Gloves > Glove Liners,Products.google_shopping_type_Apparel > Gloves > Nonsterile PF LF Synthetic Exam Gloves > Accutouch&laquo;,Products.google_shopping_type_Apparel > Gloves > Nonsterile PF LF Synthetic Exam Gloves > Aloetouch&laquo;,Products.google_shopping_type_Apparel > Gloves > Nonsterile PF LF Synthetic Exam Gloves > Curad&laquo;,Products.google_shopping_type_Apparel > Gloves > Nonsterile PF LF Synthetic Exam Gloves > Generation Pink&laquo;,Products.google_shopping_type_Apparel > Gloves > Nonsterile PF LF Synthetic Exam Gloves > Medline,Products.google_shopping_type_Apparel > Gloves > Nonsterile PF LF Synthetic Exam Gloves > SensiCare,Products.google_shopping_type_Apparel > Gloves > Nonsterile PF LF Synthetic Exam Gloves > Venom&laquo;,...,Products.google_shopping_type_Wheelchairs > Manual Wheelchairs > Transport Wheelchairs,Products.google_shopping_type_Wheelchairs > Power Wheelchairs & Scooters > 3 & 4 Wheel Power Scooters,Products.google_shopping_type_Wheelchairs > Power Wheelchairs & Scooters > FWD Power Wheelchairs,Products.google_shopping_type_Wheelchairs > Power Wheelchairs & Scooters > Folding Power Wheelchairs,Products.google_shopping_type_Wheelchairs > Wheelchairs/Power Mobility Parts & Accassories > Manual Wheelchair Parts & Accessories,Products.google_shopping_type_Wheelchairs > Wheelchairs/Power Mobility Parts & Accassories > Power Mobility Accessories,Products.google_shopping_type_Wheelchairs > Wheelchairs/Power Mobility Parts & Accassories > Scooter and Wheelchair Ramps,Products.google_shopping_type_Wheelchairs > Wheelchairs/Power Mobility Parts & Accassories > Wheelchair Cushions > Foam Seat Cushions,Products.google_shopping_type_Wheelchairs > Wheelchairs/Power Mobility Parts & Accassories > Wheelchair Cushions > Gel Seat Cushions,Products.google_shopping_type_Wheelchairs > Wheelchairs/Power Mobility Parts & Accassories > Wheelchair Cushions > Seat / back Cushions
0,2310,"Basic Steel Rollators,Green",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,177,Urinary Drain Bags,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,"SensiCare Nitrile Exam Gloves,Blue,XX-Large",0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,983,Basket for 2-Button Walkers,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,991,TENS 3000 Analog Unit,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1842,"Emesis Bags,Blue,36.000 OZ",0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,1842,"Emesis Bags,Blue,36.000 OZ",0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1379,Aloetouch Sensitive Personal Cleansing Baby Wipes,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,815,Universal Raised Toilet Seat,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1842,"Emesis Bags,Blue,36.000 OZ",0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [257]:
prodDict = {}
with open(r'Recommendation_cosine.csv') as f:

    for line in f:
        fields = line.rstrip('\n').split('|')
        #print(int(fields[0]))
        prodID = int(fields[0])
        #print(type(fields[0]))
        name = fields[1]
        shop_typ = fields[2:]
        shop_typ = list(map(int,shop_typ))
        prodDict[prodID] = (name, shop_typ, qtyNormalized.loc[prodID].get('Order_Items.id')\
                        ,qty.loc[prodID].get('Order_Items.qty'))


In [259]:
prodDict[2310][0], prodDict[177][0]

('Basic Steel Rollators,Green', 'Urinary Drain Bags')

In [263]:
prodDict[2310][1], prodDict[177][1]

([0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [264]:
prodDict[2310][2], prodDict[177][2]

(0.0040983606557377051, 0.0)

In [266]:
from scipy import spatial
def ComputeDistance(a, b):
    shopTypA = a[1]
    shopTypB = b[1]
    shopTypDistance = spatial.distance.cosine(shopTypA, shopTypB)
    popularityA = a[2]
    popularityB = b[2]
    popularityDistance = abs(popularityA - popularityB)
    return shopTypDistance + popularityDistance

In [269]:
print (prodDict[2310])
print (prodDict[177])

('Basic Steel Rollators,Green', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0.0040983606557377051, 1.0)
('

In [271]:
ComputeDistance (prodDict[2310], prodDict[177])

1.0040983606557377

In [282]:
import operator

def getNeighbors(prodID, K):
    distances = []
    for prod in prodDict:
        if (prod != prodID):
            dist = ComputeDistance(prodDict[prodID], prodDict[prod])
            distances.append((prod, dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(K):
        neighbors.append(distances[x][0])
    return neighbors
    
K = 10
avgRating = 0
neighbors = getNeighbors(1842, K)
for neighbor in neighbors:
    avgRating += prodDict[neighbor][3]
    print (prodDict[neighbor][0] + " " + str(prodDict[neighbor][3]))

Emesis Bags,Blue,36.000 OZ 1.13333333333
"MoliCare Disposable Super Plus Briefs,27""-47""" 3.21126760563
"Aluminum Transport Chair with 12"" Wheels,Blue,F: 8   R: 12" 1.01492537313
"Aluminum Transport Chair with 12"" Wheels,Red,F: 8   R: 12" 1.01492537313
Silent Knight Pill Crushers 1.15094339623
K1 Basic Extra-Wide Wheelchairs 1.07692307692
Silent Knight Pill Crusher Pouches 1.46666666667
Protection Plus Super Protective Adult Underwear,2X-Large 2.1935483871
Bariatric Bath Bench without Back 1.38888888889
Disposable Emergency Blanket,Not Applicable 22.0


  dist = 1.0 - uv / np.sqrt(uu * vv)


In [283]:
#avgRating = avgRating/float(K)
avgRating /= float(K)
avgRating

3.5651422101037618

In [284]:
prodDict[1842]

('Emesis Bags,Blue,36.000 OZ',
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,