#  <center> Recommending products to shoppers </center>

Of the various approaches to building a recommendation system, this notebook presents item-item collaborative recommendation. This means that a visitor who has viewed lots of T-shirts will be recommended other T-shirts, perhaps of a different color or brand.

Having said that, there are a lot of ways we could go ahead and improve the model. This is a simple approach to solving the recommendation problem.

## Importing necessary modules for data analysis

In [29]:
import numpy as np
import pandas as pd
from collections import defaultdict
import pickle
from sklearn.metrics.pairwise import cosine_similarity

## 1) Importing events dataset

In [2]:
# Load the visitor event log from CSV into a DataFrame and preview its first rows.
events = pd.read_csv("../data/events.csv")
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [3]:
# Print the total number of rows, then show summary statistics for the
# quantitative (numeric) columns of the events dataset.
print ("No. of rows in events dataset : {0}".format(len(events)))
events.describe()

No. of rows in events dataset : 2756101


Unnamed: 0,timestamp,visitorid,itemid,transactionid
count,2756101.0,2756101.0,2756101.0,22457.0
mean,1436424000000.0,701922.9,234922.5,8826.497796
std,3366312000.0,405687.5,134195.4,5098.99629
min,1430622000000.0,0.0,3.0,0.0
25%,1433478000000.0,350566.0,118120.0,4411.0
50%,1436453000000.0,702060.0,236067.0,8813.0
75%,1439225000000.0,1053437.0,350715.0,13224.0
max,1442545000000.0,1407579.0,466867.0,17671.0


In [4]:
# Statistics for the categorical (object-dtype) columns.
# The builtin `object` is used here because the `np.object` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
events.describe(include=[object])

Unnamed: 0,event
count,2756101
unique,3
top,view
freq,2664312


## 2) Importing item_properties dataset

In [5]:
# Load the first part of the item-properties data, report how many rows it
# holds, and preview its first rows.
item_prop = pd.read_csv("../data/item_properties_part1.csv")
print ("No. of rows in item_prop : {0}".format(len(item_prop)))
item_prop.head()

No. of rows in item_prop : 10999999


Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


## 3) Formatting property feature to numeric type

In [153]:
# replace categoryid with 0, and available with 1
def preProcessDF(item_prop):
    """
    Given   : item_prop as pandas dataframe with a 'property' column
    Return  : the same dataframe object (its 'property' column is overwritten
              in place) with "categoryid" encoded as 0, "available" encoded
              as 1, and the whole column converted to a numeric type
    """
    named_property_codes = {"categoryid": 0, "available": 1}
    encoded = item_prop['property'].replace(named_property_codes)
    item_prop['property'] = pd.to_numeric(encoded.values)
    return item_prop

# preProcessDF overwrites the 'property' column of the frame it receives, so a
# copy is passed in and the name item_prop is rebound to the processed result.
item_prop = preProcessDF(item_prop.copy())
# Persist only the itemid/property columns as a pickle.
item_prop[['itemid','property']].to_pickle("../output/item_prop.pickle")
item_prop.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,0,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


## 4) Vectorizing itemid feature from item_prop dataFrame

In [7]:
def getDictOfItemIdVector(item_prop):
    """
    Given   : item_prop as pandas dataframe with 'itemid' and 'property' columns
    Return  : defaultdict(set) mapping every unique itemid in the product
              catalog to the set of property values recorded for that item
    """
    dictOfItemIdVector = defaultdict(set)
    # Walk only the two needed columns in lockstep rather than using
    # DataFrame.iterrows(): iterrows() builds a Series for every row (very slow
    # on millions of rows) and may upcast values to a common dtype across columns.
    for itemid, prop in zip(item_prop['itemid'], item_prop['property']):
        dictOfItemIdVector[itemid].add(prop)

    return dictOfItemIdVector
    
def writeDictToFile(dictToFile, filename):
    """
    Given   : dictToFile as a picklable object, filename as the output path
    Effect  : serialises dictToFile into filename (opened in binary write
              mode) using the highest pickle protocol available
    """
    with open(filename, 'wb') as out_file:
        pickler = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(dictToFile)
        
def readDictFromFile(filename):
    """
    Given   : filename as the path of a pickle file
    Return  : the object stored in that file
    Note    : unpickling can execute arbitrary code, so only load files that
              come from a trusted source
    """
    with open(filename, 'rb') as in_file:
        return pickle.Unpickler(in_file).load()
    
    
# One-off build step, left commented out: it builds the itemid -> property-set
# dictionary from item_prop, converts every set to a list, and pickles the result.
# dictOfItemIdVector = getDictOfItemIdVector(item_prop)
# dictOfItemIdVector = defaultdict(list, ((k, list(v)) for k, v in dictOfItemIdVector.items()))
# dictOfItemIdVector2  = dictOfItemIdVector.copy()
# writeDictToFile(dictOfItemIdVector, "../output/dictOfItemVector2.pickle")
# NOTE(review): the step above writes "dictOfItemVector2.pickle" while the line
# below reads "dictOfItemVector.pickle" -- confirm which file is intended.


# Read the previously pickled dictionary from file.
dictOfItemIdVector = readDictFromFile("../output/dictOfItemVector.pickle")

In [9]:
dictOfItemIdVector

defaultdict(list,
            {0: [1056,
              225,
              1,
              227,
              6,
              678,
              776,
              364,
              112,
              917,
              888,
              283,
              189,
              159],
             1: [0,
              33,
              1,
              678,
              839,
              296,
              776,
              364,
              813,
              689,
              284,
              790,
              981,
              888,
              185,
              59,
              764],
             2: [0, 641, 332, 877, 443, 790, 282, 283, 159],
             3: [0,
              1,
              33,
              227,
              678,
              839,
              459,
              689,
              562,
              888,
              917,
              790,
              1080,
              698,
              250,
              283,
              764,
           

## 5) Compute Cosine similarity of 2 vectors

In [72]:
def square_rooted(x):
    """
    Given   : x as a 1-D sequence of numbers
    Return  : Euclidean norm of x, rounded to 3 decimal places
    """
    # For a 1-D vector np.dot(x, x) is already the scalar sum of squares.
    return round(np.sqrt(np.dot(x, x)), 3)


def cosine_similarity2(x, y):
    """
    Given   : x and y as 1-D sequences of numbers
    Return  : cosine similarity of x and y, rounded to 3 decimal places;
              0.0 when either vector has a (rounded) norm of zero, which
              includes empty vectors

    When the lengths differ, the dot product only covers the overlapping
    prefix (zip stops at the shorter input) while each norm uses the full
    vector, which is the same as zero-padding the shorter vector.
    The norms are rounded to 3 decimals by square_rooted before dividing.
    """
    # Builtin sum: calling np.sum on a generator is deprecated in NumPy.
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = square_rooted(x) * square_rooted(y)
    if denominator == 0:
        # The similarity is undefined for a zero vector; report "no
        # similarity" instead of dividing by zero.
        return 0.0
    return round(numerator / float(denominator), 3)

# Sanity check with scikit-learn: cosine_similarity expects 2-D input of shape
# (n_samples, n_features), so the single vector is wrapped in a list to form a
# one-row matrix. The result is a 1x1 matrix holding the item's similarity
# with itself.
cosine_similarity([dictOfItemIdVector[0]], [dictOfItemIdVector[0]])



array([[ 1.]])

## Recommend products to a user based on the items the user viewed

In [88]:
def createColumnsForCosineSimilarity(dictOfItemIdVector):
    """
    Given   : dictOfItemIdVector as dictionary keyed by item id
    Return  : new list holding the dictionary's keys in iteration order
    """
    return list(dictOfItemIdVector.keys())
    
def computeSimilarity(X, userid, dictOfItemIdVector):
    """
    Given   : X as the property vector to compare against,
              userid as the key in dictOfItemIdVector whose score is forced
              to 0 (despite its name it is compared with the dictionary's
              keys, so that entry never matches itself),
              dictOfItemIdVector as dictionary of key -> property vector
    Return  : one-row pandas dataframe (row index 0) with one column per key
              of dictOfItemIdVector holding the cosine similarity between X
              and that key's vector
    """
    print ("Starting building compute similarity")
    columns = list(dictOfItemIdVector)
    # Progress message kept byte-for-byte so the printed output is unchanged.
    print ("Built empty dataframe")

    similarities = {}
    for key, value in dictOfItemIdVector.items():
        if key == userid:
            similarities[key] = 0
        else:
            similarities[key] = cosine_similarity2(X, value)

    # DataFrame.append was removed in pandas 2.0, so the single row is built
    # directly instead of being appended to an empty frame.
    return pd.DataFrame([similarities], columns=columns)
    
# cosineSimilarityMatrix = computeSimilarity(dictOfItemIdVector[0], 0, dictOfItemIdVector)
# cosineSimilarityMatrix

# computeMaxofCosineSimMatrix(cosineSimilarityMatrix)

In [105]:
# Top 5 recommended products
# Transposing turns the item columns into rows so they can be sorted, highest
# first, by the score held in column 0; head() keeps the first five.
# NOTE(review): in this file cosineSimilarityMatrix is only assigned in
# commented-out code -- confirm it has been computed before this cell runs.
print ("Top 5 recommended products : {0}".format(cosineSimilarityMatrix.transpose().sort_values(by=0, ascending=False).head().index.values))
cosineSimilarityMatrix.transpose().sort_values(by=0, ascending=False).head()

Top 5 recommended products : [     0  66420 254871 136697  48477]


Unnamed: 0,0
0,1.0
66420,0.988
254871,0.987
136697,0.974
48477,0.97


In [140]:
# Keep the five highest-scoring items: one row per item, score in column 0.
df = cosineSimilarityMatrix.transpose().sort_values(by=0, ascending=False).head()

def getPropertyVectorForItemId(itemId):
    """
    Given   : itemId as an item id to look up in the global item_prop dataframe
    Return  : list of the distinct 'property' values recorded for that item
              (built from a set, so the order is not meaningful)
    """
    rows_for_item = item_prop[item_prop['itemid'] == itemId]
    distinct_properties = set(rows_for_item['property'].values)
    return list(distinct_properties)

def returnValue(df):
    """
    Given   : df as pandas dataframe indexed by item id with the similarity
              score in column 0
    Return  : defaultdict(list) mapping "item<N>Suggested" (N counts rows
              from 1, in row order) to
              [item id,
               ("confidenceLevel", "<score as a percentage>%"),
               ("suggestedProperty", "<str() of the item's property list>")]
    """
    result = defaultdict(list)
    for count, (index, rows) in enumerate(df.iterrows(), start=1):
        # Rounding removes binary floating-point artefacts from the
        # percentage (e.g. 0.57 * 100 -> 56.99999999999999) without dropping
        # any meaningful digits of the score.
        confidence = str(round(rows[0] * 100, 10)) + "%"
        result["item" + str(count) + "Suggested"].extend([
            index,
            ("confidenceLevel", confidence),
            ("suggestedProperty", str(getPropertyVectorForItemId(index))),
        ])

    return result

# Build the per-item suggestion summary for the rows of df and display it.
result = returnValue(df)
result

defaultdict(list,
            {'item1Suggested': [0,
              ('confidenceLevel', '100.0%'),
              ('suggestedProperty',
               '[1056, 225, 1, 227, 6, 678, 776, 364, 112, 917, 888, 283, 189, 159]')],
             'item2Suggested': [66420,
              ('confidenceLevel', '98.8%'),
              ('suggestedProperty',
               '[928, 1, 96, 227, 6, 678, 776, 364, 112, 917, 888, 283, 159]')],
             'item3Suggested': [254871,
              ('confidenceLevel', '98.7%'),
              ('suggestedProperty',
               '[960, 1, 0, 227, 6, 776, 810, 364, 112, 917, 790, 283, 159]')],
             'item4Suggested': [136697,
              ('confidenceLevel', '97.4%'),
              ('suggestedProperty',
               '[928, 1, 0, 325, 6, 776, 713, 558, 19, 917, 888, 159]')],
             'item5Suggested': [48477,
              ('confidenceLevel', '97.0%'),
              ('suggestedProperty',
               '[928, 0, 1, 227, 6, 678, 839, 364, 275, 790, 888]