*Note: run command ```D:\elasticsearch-7.12.1\bin\elasticsearch``` in the terminal to start elasticsearch server*

(change path accordingly)

In [16]:
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [3]:
data_path = './data'

In [6]:
df = pd.read_csv(os.path.join(data_path,'events.csv'))
df.shape

(2756101, 5)

In [7]:
df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [8]:
df.event.unique()

array(['view', 'addtocart', 'transaction'], dtype=object)

In [9]:
trans = df[df['event'] == 'transaction']
trans.shape

(22457, 5)

In [10]:
visitors = trans['visitorid'].unique()
items = trans['itemid'].unique()
print(visitors.shape)
print(items.shape)

(11719,)
(12025,)


In [11]:
trans2 = trans.groupby(['visitorid']).head(50)
trans2.shape

(19939, 5)

In [17]:
trans2.loc[:,'visitors'] = trans2['visitorid'].apply(lambda x : np.argwhere(visitors == x)[0][0])
trans2.loc[:,'items'] = trans2['itemid'].apply(lambda x : np.argwhere(items == x)[0][0])
trans2.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,visitors,items
130,1433222276276,599528,transaction,356475,4000.0,0,0
304,1433193500981,121688,transaction,15335,11117.0,1,1
418,1433193915008,552148,transaction,81345,5444.0,2,2
814,1433176736375,102019,transaction,150318,13556.0,3,3
843,1433174518180,189384,transaction,310791,7244.0,4,4


In [18]:
from scipy.sparse import csr_matrix

In [19]:
occurences = csr_matrix((visitors.shape[0], items.shape[0]), dtype='int8')
def set_occurences(visitor, item):
    occurences[visitor, item] += 1
trans2.apply(lambda row: set_occurences(row['visitors'], row['items']), axis=1)
occurences

<11719x12025 sparse matrix of type '<class 'numpy.int8'>'
	with 18905 stored elements in Compressed Sparse Row format>

In [20]:
cooc = occurences.transpose().dot(occurences)
cooc.setdiag(0)

In [21]:
def xLogX(x):
    return x * np.log(x) if x != 0 else 0.0

def entropy(x1, x2=0, x3=0, x4=0):
    return xLogX(x1 + x2 + x3 + x4) - xLogX(x1) - xLogX(x2) - xLogX(x3) - xLogX(x4)

def LLR(k11, k12, k21, k22):
    rowEntropy = entropy(k11 + k12, k21 + k22)
    columnEntropy = entropy(k11 + k21, k12 + k22)
    matrixEntropy = entropy(k11, k12, k21, k22)
    if rowEntropy + columnEntropy < matrixEntropy:
        return 0.0
    return 2.0 * (rowEntropy + columnEntropy - matrixEntropy)

def rootLLR(k11, k12, k21, k22):
    llr = LLR(k11, k12, k21, k22)
    sqrt = np.sqrt(llr)
    if k11 * 1.0 / (k11 + k12) < k21 * 1.0 / (k21 + k22):
        sqrt = -sqrt
    return sqrt

In [22]:
row_sum = np.sum(cooc, axis=0).A.flatten()
column_sum = np.sum(cooc, axis=1).A.flatten()
total = np.sum(row_sum, axis=0)
pp_score = csr_matrix((cooc.shape[0], cooc.shape[1]), dtype='double')
cx = cooc.tocoo()
for i,j,v in zip(cx.row, cx.col, cx.data):
    if v != 0:
        k11 = v
        k12 = row_sum[i] - k11
        k21 = column_sum[j] - k11
        k22 = total - k11 - k12 - k21
        pp_score[i,j] = rootLLR(k11, k12, k21, k22)

In [23]:
result = np.flip(np.sort(pp_score.A, axis=1), axis=1)
result_indices = np.flip(np.argsort(pp_score.A, axis=1), axis=1)

In [24]:
result[8456]

array([15.33511076, 14.60017668,  3.62091635, ...,  0.        ,
        0.        ,  0.        ])

In [25]:
result_indices[8456]

array([8682,  380, 8501, ..., 8010, 8009,    0], dtype=int64)

After adjusting for biases (e.g. popularity bias, trend bias etc.) using LLR method, item 8682 with score 15.34 and 380 with score 14.6 (respectively) have high co-occurence with item 8456. It means that, if someone bought 8682 and 380, we can recommend him 8456.

In [26]:
minLLR = 5
indicators = result[:, :50]
indicators[indicators < minLLR] = 0.0
indicators_indices = result_indices[:, :50]
max_indicator_indices = (indicators==0).argmax(axis=1)
max = max_indicator_indices.max()
indicators = indicators[:, :max+1]
indicators_indices = indicators_indices[:, :max+1]

In [27]:
import requests
import json

In [28]:
actions = []

for i in range(indicators.shape[0]):
    length = indicators[i].nonzero()[0].shape[0]
    real_indicators = items[indicators_indices[i, :length]].astype("int").tolist()
    id = items[i]
    
    action = { "index" : { "_index" : "items2", "_id" : str(id) } }
    
    data = {
        "id": int(id),
        "indicators": real_indicators
    }
    
    actions.append(json.dumps(action))
    actions.append(json.dumps(data))
    
    if len(actions) == 200:
        actions_string = "\n".join(actions) + "\n"
        actions = []
        
        url = "http://127.0.0.1:9200/_bulk/"
        headers = {
            "Content-Type" : "application/x-ndjson"
        }
        requests.post(url, headers=headers, data=actions_string)
        
if len(actions) > 0:
    actions_string = "\n".join(actions) + "\n"
    actions = []
    url = "http://127.0.0.1:9200/_bulk/"
    headers = {
        "Content-Type" : "application/x-ndjson"
    }
    requests.post(url, headers=headers, data=actions_string)

In [None]:
from IPython.display import JSON

In [42]:
url = 'http://127.0.0.1:9200/items2/_count'
headers = {'Content-Type':'application/x-ndjson'}
res = requests.post(url, headers=headers)
JSON(res.text, expanded=True)

<IPython.core.display.JSON object>

In [73]:
# url = 'http://127.0.0.1:9200/items2/_search?pretty&size=2'
# headers = {'Content-Type':'application/x-ndjson'}
# res = requests.get(url, headers=headers)
# JSON(res.text, expanded=True)

In [75]:
url = 'http://127.0.0.1:9200/items2/_source/240708'
headers = {'Content-Type':'application/x-ndjson'}
res = requests.get(url, headers=headers)
JSON(res.text, expanded=True)

<IPython.core.display.JSON object>

In [85]:
url = 'http://127.0.0.1:9200/items2/_search'
headers = {'Content-Type':'application/x-ndjson'}
data = {
  'query': {
    'bool': {
     'should': [
      { 'terms': {'indicators' : [240708], 'boost': 2}}
     ]
    }
  }
}
res = requests.post(url, headers=headers, data=json.dumps(data))
JSON(res.text, expanded=True)

<IPython.core.display.JSON object>

In [86]:
url = 'http://127.0.0.1:9200/items2/_search'
headers = {'Content-Type':'application/x-ndjson'}
data = {
  "query": {
    "bool": {
     "should": [
      { "terms": {"indicators" : [240708]}},
      { "constant_score": {"filter" : {"match_all": {}}, "boost" : 0.000001}}
     ]
    }
  }
}
res = requests.post(url, headers=headers, data=json.dumps(data))
JSON(res.text, expanded=True)

<IPython.core.display.JSON object>

In [87]:
popular = np.zeros(items.shape[0])
def inc_popular(index):
    popular[index] += 1
trans2.apply(lambda row: inc_popular(row['items']), axis=1)

130        None
304        None
418        None
814        None
843        None
           ... 
2755082    None
2755285    None
2755294    None
2755508    None
2755607    None
Length: 19939, dtype: object

In [89]:
actions = []

for i in range(indicators.shape[0]):
    length = indicators[i].nonzero()[0].shape[0]
    real_indicators = items[indicators_indices[i, :length]].astype("int").tolist()
    id = items[i]
    
    action = { "index" : { "_index" : "items3", "_id" : str(id) } }

    data = {
        "id": int(id),
        "indicators": real_indicators,
        "popular": popular[i]
    }
    
    actions.append(json.dumps(action))
    actions.append(json.dumps(data))
    
    if len(actions) == 200:
        actions_string = "\n".join(actions) + "\n"
        actions = []
        
        url = "http://127.0.0.1:9200/_bulk/"
        headers = {
            "Content-Type" : "application/x-ndjson"
        }
        requests.post(url, headers=headers, data=actions_string)
        
if len(actions) > 0:
    actions_string = "\n".join(actions) + "\n"
    actions = []
    url = "http://127.0.0.1:9200/_bulk/"
    headers = {
        "Content-Type" : "application/x-ndjson"
    }
    requests.post(url, headers=headers, data=actions_string)

In [90]:
url = 'http://127.0.0.1:9200/items3/_source/240708'
headers = {'Content-Type':'application/x-ndjson'}
res = requests.get(url, headers=headers)
JSON(res.text, expanded=True)

<IPython.core.display.JSON object>

In [95]:
data = {
  "query": {
    "function_score":{
     "query": {
      "bool": {
       "should": [
        { "terms": {"indicators" : [240708], "boost": 2}},
        { "constant_score": {"filter" : {"match_all": {}}, "boost" : 0.000001}}
       ]
      }
    },
     "functions":[
      {
       "filter": {"range": {"popular": {"gt": 0}}},
       "script_score" : {
                 "script" : {
                   "source": "doc['popular'].value * 0.1"
                 }
             }
      }
     ],
     "score_mode": "sum",
     "min_score" : 0
    }
  }
}

url = 'http://127.0.0.1:9200/items3/_search'
headers = {'Content-Type':'application/x-ndjson'}
res = requests.get(url, headers=headers, data=json.dumps(data))
JSON(res.text, expanded=False)

<IPython.core.display.JSON object>

In [96]:
data = {
  "query": {
    "function_score":{
     "query": {
      "bool": {
       "should": [
        { "terms": {"indicators" : [240708], "boost": 2}},
        { "constant_score": {"filter" : {"match_all": {}}, "boost" : 0.000001}}
       ]
      }
    },
     "functions":[
      {
       "filter": {"range": {"popular": {"gt": 1}}},
       "script_score" : {
                 "script" : {
                   "source": "0.1 * Math.log(doc['popular'].value)"
                 }
             }
      },
      {
       "filter": {"match_all": {}},
       "random_score": {}
      }
     ],
     "score_mode": "sum",
     "min_score" : 0
    }
  }
}

url = 'http://127.0.0.1:9200/items3/_search'
headers = {'Content-Type':'application/x-ndjson'}
res = requests.get(url, headers=headers, data=json.dumps(data))
JSON(res.text, expanded=False)

<IPython.core.display.JSON object>