In [1]:
#Dependencies
import pandas as pd
import numpy as np

In [2]:
#Reading the csv file
events = pd.read_csv('events.csv')

In [3]:
#Taking a subset of the file for faster prototyping
events = events.iloc[1:10000,:]

In [4]:
#convering timestamp into pandas datetime format
events.timestamp = pd.to_datetime(events.timestamp, unit='ms')

In [5]:
#Checking different unique events in the dataframe
events.event.unique()

array(['view', 'addtocart', 'transaction'], dtype=object)

In [6]:
#creating dataframe for each event
view = events.loc[events.event == 'view']
addtocart = events.loc[events.event == 'addtocart']
transaction = events.loc[events.event == 'transaction']

In [7]:
#Keeping relevant columns
view = view.loc[:,['visitorid','itemid']]
addtocart = addtocart.loc[:,['visitorid', 'itemid']]
transaction = transaction.loc[:,['visitorid', 'itemid']]

In [8]:
#Creating visitorid as index
view.index = view.visitorid
view.drop('visitorid', axis=1, inplace =True)
addtocart.index = addtocart.visitorid
addtocart.drop('visitorid', axis=1, inplace = True)
transaction.index = transaction.visitorid
transaction.drop('visitorid', axis=1, inplace=True)

In [9]:
#Creating history matrix
history_view = pd.get_dummies(view, columns=['itemid'])
history_addtocart = pd.get_dummies(addtocart, columns=['itemid'])
history_transaction = pd.get_dummies(transaction, columns=['itemid'])

In [72]:
#Check which visitor has viewed which item
df = history_view.copy()
cols = df.columns
bt = df.apply(lambda x: x > 0)
bt.apply(lambda x: list(cols[x.values]), axis=1)

visitorid
992329     [itemid_248676]
111016     [itemid_318965]
483717     [itemid_253185]
951259     [itemid_367447]
972639      [itemid_22556]
810725     [itemid_443030]
794181     [itemid_439202]
824915     [itemid_428805]
339335      [itemid_82389]
176446      [itemid_10572]
929206     [itemid_410676]
15795       [itemid_44872]
598426     [itemid_156489]
223343     [itemid_402625]
57036      [itemid_334662]
1377281    [itemid_251467]
1370216    [itemid_176721]
1398644    [itemid_135256]
653756     [itemid_132316]
1213673    [itemid_343861]
864246      [itemid_36642]
125625      [itemid_17655]
608100     [itemid_187722]
781127      [itemid_21989]
1076270    [itemid_262799]
453474     [itemid_250696]
1153198    [itemid_388242]
273888     [itemid_205392]
849453     [itemid_123990]
487887     [itemid_345560]
                ...       
1105700    [itemid_133968]
431115     [itemid_287918]
932927     [itemid_273743]
543510     [itemid_125159]
808588      [itemid_55955]
1238871     [itemi

In [59]:
#Functions to calculate entropy
#k11, k12, k21, and k22 are the counts of simultaneously occuring of two events

def denormEntropy1(counts):
    '''Computes the entropy of a list of counts scaled by the sum of the counts. If the inputs sum to one, this is just the normal definition of entropy'''
    lg = np.log(np.divide(k,float(np.sum(k))))
    lg[lg==-np.inf]=0
    return -np.sum(k*lg)

def llr_2x2(k11, k12, k21, k22):
    '''Special case of llr with a 2x2 table'''
    return 2 * (denormEntropy([k11+k12, k21+k22]) +
                denormEntropy([k11+k21, k12+k22]) -
                denormEntropy([k11, k12, k21, k22]))

In [104]:
#calculating counts k11, k12, k21, k22 for each item, item combination in history matrix
#Then calculating cross-cooccurence matrix

def calc_counts_row(item1, item2):
    new_item = np.concatenate((item1.reshape(-1,1) + item2.reshape(-1,1)), axis=1)
    if((np.any(new_item[:,1]<0)==True)|(np.any(new_item[:,0]<0)==True)):
        raise ValueError('History matrix has negative element')
    k22 = len(new_item[(new_item[:,0]==0)&(new_item[:,1]==0)])
    k21 = len(new_item[(new_item[:,0]==0)&(new_item[:,1]!=0)])
    k12 = len(new_item[(new_item[:,0]!=0)&(new_item[:,1]==0)])
    k11 = len(new_item[(new_item[:,0]!=0)&(new_item[:,1]!=0)])
    
    return k11,k12,k21,k22

def calc_cooccurence_matrix(history_view):
    coo_matrix = np.zeros((history_view.shape[1],history_view.shape[1]))
    for item1_index in range(history_view.shape[1]):
        for item2_index in range(history_view.shape[1]):
            item1 = history_view[:,item1_index]
            item2 = history_view[:,item2_index]
            llr = llr_2x2(calc_counts_row(item1,item2))
            coo_matrix[item1_index,item2_index] = llr
    return coo_matrix


In [None]:
params = {'eventNames': ['view','addtocart', 'transaction'],
          'primaryEvent' : 2,
          'algorithm' : {'name': ['UR'],
                         'no_recommendations': 3,
                         'time_dependent': True,
                         'time_weight_ratio':}}

In [None]:
class urecommend:
    def __init__(self, params):
        self.eventNames = params['eventNames']
        self.primaryEvent = params['primaryEvent']
        
        if not isinstance(self.eventNames, list):
            raise TypeError('eventNames should be of type list')
        if not isinstance(self.primaryEvent, int):
            raise TypeError('primaryEvent should be of type int')
    
    def fit(self, X):
        
        if isinstance(X, pd.DataFrame):
            self.X = X.as_matrix()
        
    def history_matrix(self):
        

In [34]:
getattr(self, params.keys)

dict_keys(['eventNames', 'primaryEvent', 'algorithm'])