In [1]:
#Dependencies
import pandas as pd
import numpy as np
import logging
import imp

In [2]:
#Reading the csv file (relative path, so it is resolved against the kernel's working directory).
#The preview further down shows the columns timestamp, visitorid, event, itemid and transactionid.
events = pd.read_csv('events.csv')

In [3]:
#Taking a subset of the file for faster prototyping.
#.copy() makes the subset an independent frame, so the column assignment done later
#(timestamp conversion) does not trigger pandas' SettingWithCopyWarning.
#NOTE(review): iloc[1:100] skips row 0 and keeps 99 rows - confirm that is intended.
events = events.iloc[1:100,:].copy()

In [4]:
#Preview the first rows of the subset
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
5,1433224086234,972639,view,22556,


In [6]:
#Converting the epoch-millisecond timestamp column into pandas datetime format
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')

In [7]:
#Checking the distinct event types present in the dataframe
#(the recorded output lists 'view', 'addtocart' and 'transaction')
events.event.unique()

array(['view', 'addtocart', 'transaction'], dtype=object)

In [8]:
#One dataframe per event type, selected with a boolean mask on the event column
event_type = events['event']
view = events[event_type == 'view']
addtocart = events[event_type == 'addtocart']
transaction = events[event_type == 'transaction']

In [10]:
#Keeping only the visitor and item columns of every per-event dataframe
relevant_columns = ['visitorid', 'itemid']
view, addtocart, transaction = (
    frame.loc[:, relevant_columns] for frame in (view, addtocart, transaction)
)

In [286]:
#Making visitorid the index; set_index moves the column into the index in one step
view = view.set_index('visitorid')
addtocart = addtocart.set_index('visitorid')
transaction = transaction.set_index('visitorid')

In [290]:
#Creating the history matrices: one indicator column per itemid (named itemid_<id>)
history_view, history_addtocart, history_transaction = (
    pd.get_dummies(frame, columns=['itemid'])
    for frame in (view, addtocart, transaction)
)

In [291]:
#Collapsing repeated visitor rows: rows sharing a visitorid are summed, so each
#cell holds how many times that visitor produced the event for that item
history_view, history_addtocart, history_transaction = (
    frame.groupby([frame.index])[frame.filter(regex='itemid_.*').columns].sum()
    for frame in (history_view, history_addtocart, history_transaction)
)

In [303]:
#Converting into a union dataframe
def union_dataframe(history_event, ind_union, col_union):
    """Expand a history matrix to a common set of rows and columns.

    Rows from ``ind_union`` and columns from ``col_union`` that are missing in
    ``history_event`` are added and filled with 0; existing values are kept.
    The result has its rows and its columns sorted so that matrices built for
    different events line up cell by cell.

    Parameters
    ----------
    history_event : pandas.DataFrame
        History matrix to expand. It is not modified. Its index labels must
        be unique (``reindex`` rejects duplicate labels).
    ind_union : pandas.Index
        Row labels the result must contain.
    col_union : pandas.Index
        Column labels the result must contain.

    Returns
    -------
    pandas.DataFrame
        New frame covering the union of the existing and requested labels.
    """
    # Union with the frame's own labels so nothing already present is dropped,
    # even if the caller passes a label set that does not cover the frame.
    all_rows = history_event.index.union(ind_union)
    all_columns = history_event.columns.union(col_union)
    # DataFrame.reindex_axis was removed in pandas 1.0; a single reindex call
    # both adds the missing labels (zero-filled) and orders the columns.
    expanded = history_event.reindex(index=all_rows, columns=sorted(all_columns), fill_value=0)
    return expanded.sort_index()

#Build the row/column label unions across the three events, then expand every
#history matrix to that common shape
event_histories = (history_view, history_addtocart, history_transaction)
ind_union = history_view.index
col_union = history_view.columns
for other in event_histories[1:]:
    ind_union = ind_union.union(other.index)
    col_union = col_union.union(other.columns)
history_view, history_addtocart, history_transaction = (
    union_dataframe(frame, ind_union, col_union) for frame in event_histories
)

In [295]:
#Check which visitor has viewed which item: list, per visitor, the item columns
#whose view count is positive
df = history_view.copy()
cols = df.columns
bt = df > 0
bt.apply(lambda row: list(cols[row.values]), axis=1)

visitorid
137                                      [itemid_383819]
202                                       [itemid_62641]
458                                      [itemid_182419]
533                                      [itemid_360664]
581                                      [itemid_388097]
765                                      [itemid_287017]
845                                      [itemid_351530]
1322                                      [itemid_96924]
1485                                      [itemid_32971]
1654                                     [itemid_123555]
1713                                     [itemid_270144]
1722                                     [itemid_381314]
1756       [itemid_150100, itemid_296448, itemid_346892]
2081                                     [itemid_221146]
2160                                     [itemid_280029]
2366                                     [itemid_221428]
2610                                     [itemid_159856]
2900                 

In [59]:
#Functions to calculate entropy
#k11, k12, k21 and k22 are the cells of the 2x2 contingency table of two events: both occur (k11), only the first occurs (k12), only the second occurs (k21), neither occurs (k22)

def denormEntropy1(counts):
    '''Computes the entropy of a list of counts scaled by the sum of the counts.

    If the inputs sum to one, this is just the normal definition of entropy.

    counts: sequence of non-negative numbers.
    Returns -sum(c * log(c / total)) over the positive counts as a float;
    zero counts contribute nothing, and an all-zero input yields 0.0.
    '''
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    # Only positive cells enter the sum, so log(0) is never evaluated
    # (no RuntimeWarning) and an empty table cannot produce 0/0 = NaN.
    positive = counts[counts > 0]
    if total <= 0 or positive.size == 0:
        return 0.0
    return float(-np.sum(positive * np.log(positive / total)))

def llr_2x2(k11, k12, k21, k22):
    '''Special case of llr with a 2x2 table.

    k11, k12, k21, k22: cells of the contingency table (row sums are
    k11+k12 and k21+k22, column sums are k11+k21 and k12+k22).
    Returns 2 * (row entropy + column entropy - table entropy).
    '''
    # The entropy helper defined in this cell is denormEntropy1.
    return 2 * (denormEntropy1([k11+k12, k21+k22]) +
                denormEntropy1([k11+k21, k12+k22]) -
                denormEntropy1([k11, k12, k21, k22]))

In [95]:
#calculating counts k11, k12, k21, k22 for each item, item combination in history matrix
#Then calculating cross-cooccurence matrix

def calc_counts_row(item1, item2):
    """Count the four presence/absence combinations of two item columns.

    item1, item2: numpy arrays with one entry per row of the history matrix.
    Returns (k11, k12, k21, k22): rows where both are non-zero, only item1 is
    non-zero, only item2 is non-zero, and both are zero.
    Raises ValueError if either column contains a negative value.
    """
    paired = np.hstack((item1.reshape(-1, 1), item2.reshape(-1, 1)))
    first, second = paired[:, 0], paired[:, 1]
    if (second < 0).any() or (first < 0).any():
        raise ValueError('History matrix has negative element')

    has_first = first != 0
    has_second = second != 0
    k11 = int(np.count_nonzero(has_first & has_second))
    k12 = int(np.count_nonzero(has_first & (second == 0)))
    k21 = int(np.count_nonzero((first == 0) & has_second))
    k22 = int(np.count_nonzero((first == 0) & (second == 0)))

    return k11, k12, k21, k22

def calc_cooccurence_matrix(history_event):
    """LLR co-occurrence score for every pair of item columns of one history matrix.

    history_event: 2-D array-like (rows x items); it is converted with
    np.asarray, so an ndarray or a DataFrame both work.
    Returns a square float ndarray whose entry [i, j] is the llr_2x2 score of
    item column i against item column j.
    """
    history_event = np.asarray(history_event)
    n_items = history_event.shape[1]
    coo_matrix = np.zeros((n_items, n_items))
    for item1_index in range(n_items):
        item1 = history_event[:, item1_index]
        for item2_index in range(n_items):
            item2 = history_event[:, item2_index]
            # calc_counts_row returns a 4-tuple; it must be unpacked because
            # llr_2x2 takes four separate arguments.
            coo_matrix[item1_index, item2_index] = llr_2x2(*calc_counts_row(item1, item2))
    return coo_matrix

def calc_cross_coocurence_matrix(primary_history, secondary_event):
    """LLR score of every primary item column against every secondary item column.

    primary_history, secondary_event: 2-D array-likes with the same number of
    rows; both are converted with np.asarray.
    Returns a float ndarray of shape (primary items, secondary items) whose
    entry [i, j] is the llr_2x2 score of primary column i against secondary
    column j.
    """
    primary_history = np.asarray(primary_history)
    secondary_event = np.asarray(secondary_event)
    n_primary = primary_history.shape[1]
    n_secondary = secondary_event.shape[1]
    # The second dimension follows the secondary matrix; sizing it from the
    # primary one overflows or leaves unused columns when item counts differ.
    coo_matrix = np.zeros((n_primary, n_secondary))
    for item1_index in range(n_primary):
        item1 = primary_history[:, item1_index]
        for item2_index in range(n_secondary):
            item2 = secondary_event[:, item2_index]
            # Unpack the (k11, k12, k21, k22) tuple into llr_2x2's four arguments.
            coo_matrix[item1_index, item2_index] = llr_2x2(*calc_counts_row(item1, item2))
    return coo_matrix

def return_recommended_llr(coo_matrix, user_history):
    """Score every item for one user.

    coo_matrix: 2-D array (items x history items) of LLR scores.
    user_history: numpy array with one count per history item; it is reshaped
    to a column vector.
    Returns the matrix product as an (items, 1) array.
    """
    user_history = user_history.reshape(-1, 1)
    user_llr = np.dot(coo_matrix, user_history)
    # Return the computed product; `llr` is not defined in this function.
    return user_llr


# Recommendation training flow
1. Creating a history matrix per event (duplicate events by the same user on the same item are added up as counts)
2. Converting each history matrix into a union history matrix (same users and items for every event)
3. Calculating the co-occurrence and cross-co-occurrence matrices
4. Calculating user_llr for each event
5. Summing user_llr over the events and recommending the top items, up to the number of recommendations specified

In [7]:
#Configuration consumed by urecommend:
#  eventNames         - event types to build history matrices for
#  primaryEvent       - position in eventNames of the primary event (2 -> 'transaction')
#  no_recommendations - how many items predict() returns
#NOTE(review): 'name' and 'time_dependent' are not read by the urecommend class in this notebook.
params = {'eventNames': ['view','addtocart', 'transaction'],
          'primaryEvent' : 2,
          'algorithm' : {'name': ['URz'],
                         'no_recommendations': 3,
                         'time_dependent': False}}

In [72]:
#Train the recommender on the events subset.
#NOTE: the urecommend class is defined in a later cell of this notebook, so that
#cell has to be executed before this one on a fresh kernel.
u = urecommend(params)
u.fit(events)



In [73]:
#Example request: one entry per event name, each a list of [item column, count]
#pairs, or None when the user has no history for that event
user_history = {
    'view': [['itemid_128499', 1], ['itemid_22556', 3]],
    'transaction': [['itemid_21989', 1]],
    'addtocart': None,
}
u.predict(user_history)

['itemid_102061', 'itemid_328025', 'itemid_417464']

In [71]:
class urecommend:
    """Item recommender based on log-likelihood-ratio (LLR) co-occurrence.

    Training (``fit``) builds one visitor x item count matrix per event name,
    expands all of them to the same visitors and items, and scores every item
    of the primary event against every item of each event with a 2x2 LLR test.
    Prediction (``predict``) multiplies those score matrices with a user's
    per-event item counts, sums the results and returns the top items.

    Parameters
    ----------
    params : dict
        ``params['eventNames']`` (list of event names),
        ``params['primaryEvent']`` (int position of the primary event in
        ``eventNames``) and ``params['algorithm']['no_recommendations']``
        (int number of items returned by ``predict``).

    Raises
    ------
    TypeError
        If one of the three settings has the wrong type.
    """

    def __init__(self, params):
        self.eventNames = params['eventNames']
        self.primaryEvent = params['primaryEvent']
        self.no_recommendations = params['algorithm']['no_recommendations']

        # Validate first so that a bad configuration fails before the logging
        # set-up below creates or touches the log file.
        if not isinstance(self.eventNames, list):
            raise TypeError('eventNames should be of type list')
        if not isinstance(self.primaryEvent, int):
            raise TypeError('primaryEvent should be of type int')
        if not isinstance(self.no_recommendations, int):
            raise TypeError('no_recommendations should be of type int')

        # basicConfig is a no-op while the root logger already has handlers
        # (typical in a notebook kernel). Dropping them here has the effect the
        # former imp.reload(logging) was used for, without the `imp` module,
        # which no longer exists in Python 3.12.
        root_logger = logging.getLogger()
        for handler in list(root_logger.handlers):
            root_logger.removeHandler(handler)
            handler.close()
        logging.basicConfig(filename= 'urecommend_log.log',format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p',level=logging.DEBUG)

    def fit(self, X):
        """Train on an event log.

        X : pandas.DataFrame with at least the columns ``visitorid``,
            ``event`` and ``itemid``.

        Raises TypeError if X is not a DataFrame, and ValueError if a required
        column is missing or X contains an event that is not listed in
        ``eventNames``. Configured events without any rows are only logged.
        """
        #Check if input dataframe has all required columns or not
        if not isinstance(X, pd.DataFrame):
            raise TypeError('Input is not a dataframe')
        if not {'visitorid', 'event', 'itemid'}.issubset(X.columns):
            raise ValueError('Incomplete input dataframe')
        self.X = X

        # Compare the event names as sets: comparing only their number lets an
        # unconfigured event slip through whenever the counts happen to match.
        observed_events = set(self.X['event'].unique())
        configured_events = set(self.eventNames)
        if observed_events - configured_events:
            raise ValueError('Input dataframes have more unique events than specified in input params')
        missing_events = configured_events - observed_events
        if missing_events:
            logging.info('Input dataframe have no datapoints for '+ ' and '.join(sorted(missing_events)))

        #Create history matrix
        self.create_history_matrix()
        logging.info('History matrices created')

        #Create union history matrix
        self.create_union_history_matrix()
        logging.info('Union of history matrices created')

        #Calculating coocurrence and cross-coocurrence matrices
        self.create_coocurrence_matrix()
        logging.info('Coocurrence matrices created')
        logging.info('u-recommender trained to the data')

    def predict(self, user_history):
        """
        This function predicts the recommended products

        user_history (dictionary with exactly one key per trained event name)
                    :keys are event names
                     and values are 2D lists of [item column name, count] pairs;
                     use None as the value of an event without items.
                     Item names unknown to the trained model are ignored.

        Returns the item column names with the highest summed score, at most
        ``no_recommendations`` of them.

        Raises ValueError if the keys differ from the trained event names or a
        value is neither None nor a list.
        """
        # The keys must be exactly the trained events. The previous check was
        # inverted (it raised when the keys matched) and compared against the
        # events found in X rather than the events that have a score matrix.
        if set(user_history.keys()) != set(self.coocurrence.keys()):
            raise ValueError('All elements are not present in user_history')

        # Every score matrix has the primary event's items as rows.
        primary_name = self.eventNames[self.primaryEvent]
        item_index = self.coocurrence[primary_name].index
        user_llr = np.zeros((len(item_index), 1))

        for event, matrix in user_history.items():
            coo_dataframe = self.coocurrence[event]
            # One count per column of this event's score matrix, 0 by default.
            user = pd.DataFrame(0.0, index=coo_dataframe.columns, columns=['user'])
            if matrix is not None:
                #Check if input user_history is list of list
                if not isinstance(matrix, list):
                    raise ValueError('Values of user_history '+event+' key should be list of list')
                # Direct label look-up instead of scanning every item row.
                for item, count in dict(matrix).items():
                    if item in user.index:
                        user.loc[item, 'user'] = count

            user_llr += self.return_recommended_llr(coo_dataframe, user)

        ranked = pd.DataFrame(user_llr, index=item_index, columns=['llr']).sort_values('llr', axis=0, ascending=False)
        return list(ranked.index)[:self.no_recommendations]

    def create_history_matrix(self):
        """Build ``self.history``: one visitor x item count matrix per event name."""
        #Initializing empty dictionary
        self.history = {}

        for eventName in self.eventNames:
            #Rows of this event, reduced to the visitor and item columns
            event_rows = self.X.loc[self.X.event == eventName, ['visitorid', 'itemid']]
            #Visitor as index, one indicator column (itemid_<id>) per item.
            #dtype=int keeps the indicators numeric on every pandas version.
            indicators = pd.get_dummies(event_rows.set_index('visitorid'), columns=['itemid'], dtype=int)
            #adding count when user participated in an event with same item more than once
            self.history[eventName] = indicators.groupby(level=0).sum()

    def create_union_history_matrix(self):
        """Expand every history matrix to the visitors and items of all events."""
        #Calculating index and column union
        index_union = self.history[self.eventNames[0]].index
        column_union = self.history[self.eventNames[0]].columns

        for eventName in self.eventNames[1:]:
            index_union = index_union.union(self.history[eventName].index)
            column_union = column_union.union(self.history[eventName].columns)

        #Creating union history event dataframe using index and column union
        for eventName in self.eventNames:
            self.history[eventName] = self.union_dataframe(self.history[eventName], index_union, column_union)

    def create_coocurrence_matrix(self):
        """Build ``self.coocurrence``: one LLR score matrix per event name.

        The primary event is scored against itself (inserted first); every
        other event is scored against the primary event.
        """
        self.coocurrence = {}
        primary_name = self.eventNames[self.primaryEvent]

        self.coocurrence[primary_name] = self.calc_cooccurence_matrix(self.history[primary_name])
        logging.debug('cooccurence matrix of primary event - %s is calculated', primary_name)

        for eventName in self.eventNames:
            if eventName != primary_name:
                self.coocurrence[eventName] = self.calc_cross_coocurence_matrix(self.history[primary_name], self.history[eventName])
                logging.debug('cross-cooccurence matrix of event - %s is calculated',eventName)

    def union_dataframe(self, history_event, ind_union, col_union):
        """Return a copy of ``history_event`` covering ``ind_union`` x ``col_union``.

        Missing rows and columns are added and filled with 0; rows and columns
        of the result are sorted. The index labels of ``history_event`` must be
        unique. The input frame is not modified.
        """
        # Union with the frame's own labels so nothing already present is lost.
        all_rows = history_event.index.union(ind_union)
        all_columns = history_event.columns.union(col_union)
        # reindex replaces the removed DataFrame.reindex_axis and adds the
        # zero-filled labels in the same call.
        expanded = history_event.reindex(index=all_rows, columns=sorted(all_columns), fill_value=0)
        #Sort rows for uniformity (columns are already ordered above)
        return expanded.sort_index()

    def calc_cooccurence_matrix(self, history_event):
        """LLR score of every item column of ``history_event`` against every other.

        Returns a DataFrame with the item columns as both index and columns.
        """
        return self.calc_cross_coocurence_matrix(history_event, history_event)

    def calc_cross_coocurence_matrix(self, primary_history, secondary_event):
        """LLR score of every primary item column against every secondary one.

        Both frames must have the same number of rows. Returns a float
        DataFrame indexed by the primary columns, with the secondary columns
        as columns.
        """
        primary_values = primary_history.values
        secondary_values = secondary_event.values
        # Scores are collected in a float array: writing floats cell by cell
        # into a DataFrame initialised with integer 0 changes its dtype on the
        # fly and is rejected or warned about by recent pandas versions.
        llr_values = np.zeros((primary_values.shape[1], secondary_values.shape[1]))
        for item1_index in range(primary_values.shape[1]):
            item1 = primary_values[:, item1_index]
            for item2_index in range(secondary_values.shape[1]):
                item2 = secondary_values[:, item2_index]
                k11,k12,k21,k22 = self.calc_counts_row(item1,item2)
                llr_values[item1_index, item2_index] = self.llr_2x2(k11,k12,k21,k22)
        return pd.DataFrame(llr_values, index=primary_history.columns, columns=secondary_event.columns)

    def calc_counts_row(self, item1, item2):
        """Count the presence/absence combinations of two item columns.

        Returns (k11, k12, k21, k22): rows where both are non-zero, only item1
        is non-zero, only item2 is non-zero, and both are zero.
        Raises ValueError if either column contains a negative value.
        """
        item1 = np.asarray(item1).reshape(-1)
        item2 = np.asarray(item2).reshape(-1)
        if item1.shape != item2.shape:
            raise ValueError('Item columns must have the same length')
        if np.any(item2 < 0) or np.any(item1 < 0):
            raise ValueError('History matrix has negative element')
        k22 = int(np.count_nonzero((item1 == 0) & (item2 == 0)))
        k21 = int(np.count_nonzero((item1 == 0) & (item2 != 0)))
        k12 = int(np.count_nonzero((item1 != 0) & (item2 == 0)))
        k11 = int(np.count_nonzero((item1 != 0) & (item2 != 0)))

        return k11,k12,k21,k22

    def llr_2x2(self, k11, k12, k21, k22):
        '''Special case of llr with a 2x2 table'''
        return 2 * (self.denormEntropy([k11+k12, k21+k22]) +
                    self.denormEntropy([k11+k21, k12+k22]) -
                    self.denormEntropy([k11, k12, k21, k22]))

    def denormEntropy(self,counts):
        '''Computes the entropy of a list of counts scaled by the sum of the counts.
            If the inputs sum to one, this is just the normal definition of entropy.
            Zero counts contribute nothing; an all-zero input yields 0.0.'''
        counts = np.asarray(counts, dtype=float)
        total = counts.sum()
        # Restrict to positive cells so log(0) is never evaluated and an empty
        # table cannot produce 0/0 = NaN.
        positive = counts[counts > 0]
        if total <= 0 or positive.size == 0:
            return 0.0
        return float(-np.sum(positive * np.log(positive / total)))

    def return_recommended_llr(self, coo_dataframe, user_history):
        """Multiply a score matrix with a user's count vector.

        coo_dataframe: DataFrame of LLR scores; user_history: pandas object
        holding one count per column of ``coo_dataframe``.
        Returns an (items, 1) numpy array.
        """
        # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
        coo_matrix = coo_dataframe.values
        user = np.asarray(user_history.values).reshape(-1,1)
        user_llr = np.dot(coo_matrix, user)
        return user_llr


In [8]:
#Scratch: small dict used by the key-indexing experiments further down
a={'a':1,'b':2}

In [29]:
#Scratch: check how ' and '.join renders a list (the same idiom builds the
#missing-events log message in urecommend.fit). Note that this rebinds `a` to a list.
a=['s','d']
print(' and '.join(a))

s and d


In [53]:
#Scratch: list of [name, count] pairs, the shape predict() accepts per event
matrix = [['a',1],['b',2]]

In [57]:
#Scratch: going through a NumPy array turns the counts into strings (see the recorded output)
dict(np.array(matrix))

{'a': '1', 'b': '2'}

In [58]:
#Scratch: dict() on the plain list of pairs keeps the counts as ints
dict(matrix)

{'a': 1, 'b': 2}

In [28]:
#Scratch: the mixed str/int list becomes a single string-typed array (dtype '<U1' in the recorded output)
np.array(matrix)

array([['a', '1'],
       ['b', '2']], 
      dtype='<U1')

In [29]:
#Scratch: demonstrates that a dict_keys view cannot be indexed (TypeError in the recorded output)
a.keys()[0]

TypeError: 'dict_keys' object does not support indexing

In [14]:
#Scratch: materialise the keys as a list first, then index - this is the form predict() uses
a[list(a.keys())[0]]

1

In [34]:
#Scratch: NOTE(review) - as written this needs `self` (not defined outside a method) and
#passes the bound method params.keys as the attribute name; the recorded output looks
#like the result of params.keys() instead - confirm what was actually run.
getattr(self, params.keys)

dict_keys(['eventNames', 'primaryEvent', 'algorithm'])

In [43]:
#Scratch: zero-filled single-column frame with string labels, used by the cells below
b = pd.DataFrame(0, index=['0','100','200'], columns=['4'])

In [44]:
#Scratch: assigning through .loc with labels that are not in the index ('1', '2')
#raises KeyError, as the recorded output shows
b.loc[['1','2'],:] = [2]

KeyError: "['1' '2'] not in index"

In [26]:
#Scratch: first column of b as a NumPy array.
#.values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
b.values[:,0]

array([0, 2, 2])

In [3]:
#Scratch: keys of `a` as a list.
#NOTE(review): the recorded output (['a', 'v']) does not match any `a` defined in this
#notebook - presumably left over from an earlier kernel state.
list(a.keys())

['a', 'v']

In [9]:
#Scratch: turn a clock time into the integer HHMMSS.
s = '21:55:29'
#Parse once and let strftime zero-pad each field; concatenating str(hour), str(minute)
#and str(second) would turn '21:05:09' into 2159 instead of 210509.
hhmmss = int(pd.to_datetime(s).strftime('%H%M%S'))
hhmmss

215529

In [51]:
#Scratch: give each row of b a distinct value so the sort below has something to order
b.loc['0','4']=49
b.loc['100','4']=120
b.loc['200','4']=11

In [52]:
#Scratch: display b after the assignments
b

Unnamed: 0,4
0,49
100,120
200,11


In [61]:
#Scratch: labels of the three largest values - the same sort/slice idiom predict() uses
#to pick the recommendations
list(b.sort_values('4',axis=0,ascending=False).index)[:3]

['100', '0', '200']

In [50]:
#Scratch: iterate over the rows of b and print column '4'; `integer` (the enumerate
#counter) and `index` (the row label) are not used in the loop body
for integer, (index, series) in enumerate(b.iterrows()):
    print(series['4'])

0
0
0
