# DEFINING FUNCTIONS

In [1]:
#Dependencies
import pandas as pd
import numpy as np
import logging
import imp

In [2]:
#Reading the csv file
#The path is relative, so events.csv must sit in the notebook's working directory
events = pd.read_csv('events.csv')

In [3]:
#Taking a subset of the file for faster prototyping
#NOTE: iloc[1:100] keeps rows 1..99 (99 rows) and skips row 0
#.copy() makes the subset an independent frame, so the column assignment done on it
#in a later cell does not act on a slice of the full data (avoids SettingWithCopyWarning)
events = events.iloc[1:100, :].copy()

In [4]:
#Sanity check of the subset: row count, column dtypes and non-null counts
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 1 to 99
Data columns (total 5 columns):
timestamp        99 non-null int64
visitorid        99 non-null int64
event            99 non-null object
itemid           99 non-null int64
transactionid    0 non-null float64
dtypes: float64(1), int64(3), object(1)
memory usage: 3.9+ KB


In [6]:
#Converting the timestamp column (integer milliseconds) into pandas datetime format
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')

In [7]:
#Listing the distinct event types present in the dataframe
events['event'].unique()

array(['view', 'addtocart', 'transaction'], dtype=object)

In [8]:
#Splitting the events into one dataframe per event type
view = events[events['event'] == 'view']
addtocart = events[events['event'] == 'addtocart']
transaction = events[events['event'] == 'transaction']

In [10]:
#Only the visitor and the item are needed from here on
view = view[['visitorid', 'itemid']]
addtocart = addtocart[['visitorid', 'itemid']]
transaction = transaction[['visitorid', 'itemid']]

In [286]:
#Using visitorid as the index
#set_index moves the column into the index and returns a new frame, so no frame is
#mutated in place (the previous index assignment + drop(inplace=True) modified frames
#that were derived from slices of events)
view = view.set_index('visitorid')
addtocart = addtocart.set_index('visitorid')
transaction = transaction.set_index('visitorid')

In [290]:
#Building the history matrices: the itemid column is expanded into itemid_<id> dummy columns
history_view, history_addtocart, history_transaction = (
    pd.get_dummies(event_frame, columns=['itemid'])
    for event_frame in (view, addtocart, transaction)
)

In [291]:
#Collapsing the rows that share an index label (same visitor) by summing their item columns
history_view, history_addtocart, history_transaction = (
    dummies.groupby([dummies.index])[dummies.filter(regex='itemid_.*').columns].sum()
    for dummies in (history_view, history_addtocart, history_transaction)
)

In [303]:
#Converting into a union dataframe
def union_dataframe(history_event, ind_union, col_union):
    '''Pad a history matrix with zeros so that it covers a common set of rows and columns.

    Parameters
    ----------
    history_event : pandas.DataFrame
        History matrix of one event type. It is copied, never modified.
    ind_union : pandas.Index
        Row labels the result must contain.
    col_union : pandas.Index
        Column labels the result must contain.

    Returns
    -------
    pandas.DataFrame
        The original data plus zero-filled rows/columns for every label of
        ind_union / col_union that was missing, with columns and rows sorted.
    '''
    history_event = history_event.copy()

    # Rows requested by the union but absent from this event type
    missing_rows = ind_union.difference(history_event.index)
    if len(missing_rows) > 0:
        row_padding = pd.DataFrame(0, index=missing_rows, columns=history_event.columns)
        history_event = pd.concat([history_event, row_padding], axis=0)

    # Columns requested by the union but absent from this event type
    missing_cols = col_union.difference(history_event.columns)
    if len(missing_cols) > 0:
        col_padding = pd.DataFrame(0, index=history_event.index, columns=missing_cols)
        history_event = pd.concat([history_event, col_padding], axis=1)

    #Sort rows and columns for uniformity
    # DataFrame.reindex_axis no longer exists in pandas; reindex(columns=...) replaces it
    return history_event.reindex(columns=sorted(history_event.columns)).sort_index()

#Every visitor id and every item column seen in any of the three event types
ind_union = (
    history_view.index
    .union(history_addtocart.index)
    .union(history_transaction.index)
)
col_union = (
    history_view.columns
    .union(history_addtocart.columns)
    .union(history_transaction.columns)
)
#Bringing the three history matrices onto the same rows and columns
history_view, history_addtocart, history_transaction = (
    union_dataframe(history, ind_union, col_union)
    for history in (history_view, history_addtocart, history_transaction)
)

In [295]:
#Check which visitor has viewed which item
df = history_view.copy()
cols = df.columns
#True wherever the count is positive
bt = df > 0
#For every visitor, the labels of the columns flagged True
bt.apply(lambda row: cols[row.values].tolist(), axis=1)

visitorid
137                                      [itemid_383819]
202                                       [itemid_62641]
458                                      [itemid_182419]
533                                      [itemid_360664]
581                                      [itemid_388097]
765                                      [itemid_287017]
845                                      [itemid_351530]
1322                                      [itemid_96924]
1485                                      [itemid_32971]
1654                                     [itemid_123555]
1713                                     [itemid_270144]
1722                                     [itemid_381314]
1756       [itemid_150100, itemid_296448, itemid_346892]
2081                                     [itemid_221146]
2160                                     [itemid_280029]
2366                                     [itemid_221428]
2610                                     [itemid_159856]
2900                 

In [59]:
#Functions to calculate entropy
#k11, k12, k21 and k22 are the cells of the 2x2 contingency table of two events:
#k11 = both occurred, k12 = only the first, k21 = only the second, k22 = neither

def denormEntropy1(counts):
    '''Computes the entropy of a list of counts scaled by the sum of the counts.

    If the inputs sum to one, this is just the normal definition of entropy.

    Parameters
    ----------
    counts : sequence of non-negative numbers

    Returns
    -------
    float
        -sum(c * log(c / total)) over the positive entries; zero entries
        contribute nothing, and an all-zero input yields 0.0.
    '''
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    # Only positive entries are passed to log: avoids the log(0) RuntimeWarning
    # and the NaN produced by 0/0 when every count is zero
    positive = counts[counts > 0]
    if positive.size == 0:
        return 0.0
    return float(-np.sum(positive * np.log(positive / total)))

# llr_2x2 refers to the function as denormEntropy; expose it under both names
denormEntropy = denormEntropy1

def llr_2x2(k11, k12, k21, k22):
    '''Special case of llr with a 2x2 table.

    Combines the scaled entropies of the row sums, the column sums and the
    four individual cells: 2 * (H(rows) + H(columns) - H(cells)).
    '''
    return 2 * (denormEntropy([k11+k12, k21+k22]) +
                denormEntropy([k11+k21, k12+k22]) -
                denormEntropy([k11, k12, k21, k22]))

In [95]:
#calculating counts k11, k12, k21, k22 for each item, item combination in history matrix
#Then calculating cross-cooccurence matrix

def calc_counts_row(item1, item2):
    '''Count the four zero / non-zero combinations of two equally long item vectors.

    Returns the tuple (k11, k12, k21, k22):
    k11 - both entries non-zero, k12 - only item1 non-zero,
    k21 - only item2 non-zero,  k22 - both entries zero.

    Raises ValueError when either vector contains a negative value.
    '''
    paired = np.hstack((item1.reshape(-1, 1), item2.reshape(-1, 1)))
    if (paired < 0).any():
        raise ValueError('History matrix has negative element')

    first_zero = paired[:, 0] == 0
    first_set = paired[:, 0] != 0
    second_zero = paired[:, 1] == 0
    second_set = paired[:, 1] != 0

    k11 = int(np.count_nonzero(first_set & second_set))
    k12 = int(np.count_nonzero(first_set & second_zero))
    k21 = int(np.count_nonzero(first_zero & second_set))
    k22 = int(np.count_nonzero(first_zero & second_zero))
    return k11, k12, k21, k22

def calc_cooccurence_matrix(history_event):
    '''Build the item-by-item LLR matrix of a single history matrix.

    Parameters
    ----------
    history_event : 2-D array-like
        One column per item; converted with np.asarray, so a DataFrame works too.

    Returns
    -------
    numpy.ndarray
        Square matrix whose entry [i, j] is llr_2x2 of the counts returned by
        calc_counts_row for columns i and j.
    '''
    # Positional column slicing below needs an ndarray
    history_event = np.asarray(history_event)
    n_items = history_event.shape[1]
    coo_matrix = np.zeros((n_items, n_items))
    for item1_index in range(n_items):
        item1 = history_event[:, item1_index]
        for item2_index in range(n_items):
            item2 = history_event[:, item2_index]
            # calc_counts_row returns the tuple (k11, k12, k21, k22); it must be
            # unpacked because llr_2x2 takes four separate arguments
            coo_matrix[item1_index, item2_index] = llr_2x2(*calc_counts_row(item1, item2))
    return coo_matrix

def calc_cross_coocurence_matrix(primary_history, secondary_event):
    '''Build the LLR matrix between the items of two history matrices.

    Parameters
    ----------
    primary_history : 2-D array-like
        History matrix of the primary event, one column per item.
    secondary_event : 2-D array-like
        History matrix of the secondary event, one column per item.

    Returns
    -------
    numpy.ndarray
        Matrix of shape (primary columns, secondary columns); entry [i, j] is
        llr_2x2 of the counts returned by calc_counts_row for primary column i
        and secondary column j.
    '''
    # Positional column slicing below needs ndarrays
    primary_history = np.asarray(primary_history)
    secondary_event = np.asarray(secondary_event)
    n_primary = primary_history.shape[1]
    n_secondary = secondary_event.shape[1]
    # One row per primary item and one column per secondary item; sizing both axes
    # by the primary matrix would overflow when the secondary one has more columns
    coo_matrix = np.zeros((n_primary, n_secondary))
    for item1_index in range(n_primary):
        item1 = primary_history[:, item1_index]
        for item2_index in range(n_secondary):
            item2 = secondary_event[:, item2_index]
            # calc_counts_row returns the tuple (k11, k12, k21, k22); it must be
            # unpacked because llr_2x2 takes four separate arguments
            coo_matrix[item1_index, item2_index] = llr_2x2(*calc_counts_row(item1, item2))
    return coo_matrix

def return_recommended_llr(coo_matrix, user_history):
    '''Multiply a (cross-)co-occurrence matrix with one user's history vector.

    Parameters
    ----------
    coo_matrix : 2-D array-like
    user_history : array-like
        Flattened into a column vector; its length must equal the number of
        columns of coo_matrix.

    Returns
    -------
    numpy.ndarray
        Column vector np.dot(coo_matrix, user_history).
    '''
    user_history = np.asarray(user_history).reshape(-1, 1)
    user_llr = np.dot(coo_matrix, user_history)
    # Return the product computed here (the name llr is not defined in this function)
    return user_llr


# Recommendation training flow
1. Create a history matrix for each event type (summing the counts of duplicate events by the same user)
2. Convert each history matrix into a union history matrix, so that all of them share the same rows and columns
3. Calculate the co-occurrence and cross-co-occurrence matrices
4. Calculate user_llr for each event
5. Add up the user_llr values and return as many recommendations as specified

# TEST RUN

In [14]:
#Appending path and importing
#NOTE(review): the path below is absolute and machine-specific, so the import only works
#where this directory exists - consider a configurable or relative location
import sys
sys.path.append('/home/siddharth/Desktop/Sidd_files/Recommendation/urecommend')
import urecommend

In [15]:
#Metadata parameters handed to the recommender
#NOTE(review): primaryEvent presumably is a position in eventNames (2 -> 'transaction') - confirm against urecommend
params = dict(
    eventNames=['view', 'addtocart', 'transaction'],
    primaryEvent=2,
    algorithm=dict(
        name='Universal Recommender',
        no_recommendations=3,
        time_dependency=False,
    ),
    log_path='/home/siddharth/Desktop/Sidd_files/Recommendation/urecommend',
    ipython_notebook=True,
)

In [16]:
#Training the recommender
#urecommend is a project module whose code is not part of this notebook;
#the object is built from the params dictionary and fitted on the events dataframe
u = urecommend.urecommend(params)
u.fit(events)

  lg = np.log(np.divide(counts,float(np.sum(counts))))


In [17]:
#History of one user: for every event type either None or a list of [item column, number] pairs
user_history = {
    'view': [['itemid_128499', 1], ['itemid_22556', 3]],
    'transaction': [['itemid_21989', 1]],
    'addtocart': None,
}

In [18]:
#Predicting the recommendations
#The fitted recommender is queried with the user_history dictionary; the cell output is the returned value
u.predict(user_history)

['itemid_102061', 'itemid_328025', 'itemid_417464']