In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import math

In [3]:
# Load the history (training) and future (test) purchase tables from the mounted Drive folder
df_train=pd.read_csv("/content/drive/MyDrive/Minor_Project/Instacart_history.csv")
df_test=pd.read_csv("/content/drive/MyDrive/Minor_Project/Instacart_future.csv")
# Keep only the first 10000 rows of the history table
df_train=df_train.head(10000)

In [4]:
# Mapping values
def encode(df,i):
  """Replace the values of column ``i`` of ``df`` by integer codes, in place.

  Distinct values are numbered 1, 2, ... in order of first appearance in the
  column. The value -1 is always mapped to 0, overriding any code it would
  otherwise have received. Nothing is returned; ``df`` itself is modified.
  """
  codes = {}
  for position, value in enumerate(df[i].unique().tolist(), start=1):
    codes[value] = position
  # -1 always gets code 0, whether or not it occurs in the column
  codes[-1] = 0
  df[i] = df[i].apply(lambda value: codes[value])

In [5]:
df_train

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,MATERIAL_NUMBER
0,1,1,1
1,1,1,2
2,1,1,3
3,1,1,4
4,1,1,5
...,...,...,...
9995,166,2,305
9996,166,2,2732
9997,166,2,64
9998,166,2,2733


In [6]:
df_test

Unnamed: 0,CUSTOMER_ID,ORDER_NUMBER,MATERIAL_NUMBER
0,1,6,1
1,1,6,3
2,1,6,6
3,1,6,9
4,1,7,1
...,...,...,...
1348096,19935,98,23
1348097,19935,98,4964
1348098,19935,98,5415
1348099,19935,99,3381


In [7]:
# Renaming Columns (in place, on both frames):
#   ORDER_NUMBER    -> TRANSACTION_DT
#   MATERIAL_NUMBER -> PRODUCT_ID
df_train.rename(columns = {'ORDER_NUMBER':'TRANSACTION_DT','MATERIAL_NUMBER':'PRODUCT_ID'}, inplace = True)
df_test.rename(columns = {'ORDER_NUMBER':'TRANSACTION_DT','MATERIAL_NUMBER':'PRODUCT_ID'}, inplace = True)

In [8]:
# Add a constant AMOUNT column (1 for every row) to both frames
df_train['AMOUNT']=1
df_test['AMOUNT']=1

In [9]:
# Encoding transaction date in testing data
# (in place: values are replaced by 1, 2, ... in order of first appearance)
encode(df_test,'TRANSACTION_DT')
df_test

Unnamed: 0,CUSTOMER_ID,TRANSACTION_DT,PRODUCT_ID,AMOUNT
0,1,1,1,1
1,1,1,3,1
2,1,1,6,1
3,1,1,9,1
4,1,2,1,1
...,...,...,...,...
1348096,19935,97,23,1
1348097,19935,97,4964,1
1348098,19935,97,5415,1
1348099,19935,98,3381,1


In [10]:
# User's General Preferences
def UGP(df):
  """Build the User's General Preferences matrix.

  For every customer, the AMOUNT values are summed per product and divided by
  the customer's total AMOUNT, i.e. each row holds the share of the customer's
  purchases that went to each product.

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain the columns CUSTOMER_ID, PRODUCT_ID and AMOUNT. It is not
    modified.

  Returns
  -------
  list of list of float
    One row per customer, in order of first appearance of the customer in
    ``df``; one column per product, in ascending PRODUCT_ID order. Entries
    whose summed AMOUNT is not positive are 0. An empty list is returned when
    ``df`` has no rows.
  """
  # Selecting Necessary Columns
  purchases=df[['CUSTOMER_ID','PRODUCT_ID','AMOUNT']]
  if purchases.empty:
    return []

  # Row order of the result: customers in order of first appearance
  customers=purchases['CUSTOMER_ID'].unique()

  # Customer x product table of summed AMOUNT, built in one pass
  # (replaces growing a DataFrame with pd.concat inside a loop)
  totals=(purchases.groupby(['CUSTOMER_ID','PRODUCT_ID'])['AMOUNT'].sum()
          .unstack(fill_value=0)
          .reindex(index=customers)
          .sort_index(axis=1))
  R=totals.to_numpy(dtype=float)

  # Making User's General Preferences: normalise every row by its total.
  # Cells that are not positive, and rows whose total is 0, stay at 0.
  row_totals=R.sum(axis=1,keepdims=True)
  shares=np.zeros_like(R)
  np.divide(R,row_totals,out=shares,where=(R>0)&(row_totals!=0))
  return shares.tolist()

In [11]:
ugp=UGP(df_train)

In [12]:
# Distinct customer IDs in the test data: print how many there are and the largest ID
t=df_test['CUSTOMER_ID'].unique().tolist()
print(len(t))
print(max(t))

19935
19935


In [13]:
Ugp=pd.DataFrame(ugp)

In [14]:
Ugp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2723,2724,2725,2726,2727,2728,2729,2730,2731,2732
0,0.172414,0.034483,0.172414,0.068966,0.068966,0.137931,0.068966,0.034483,0.103448,0.034483,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.012500,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
162,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
163,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.035714,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
164,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.400000,0.000000,0.000000,0.000000,...,0.2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [15]:
# Transition Pattern among Items
def TPI(df, dt=51, gamma=0.2):
  """Build the time-decayed customer x product weight matrix.

  Every row of ``df`` contributes ``gamma ** (dt - TRANSACTION_DT)``; the
  contributions are summed per (customer, product) pair.

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain the columns CUSTOMER_ID, PRODUCT_ID and TRANSACTION_DT.
    It is not modified.
  dt : number, optional
    Reference time T in the decay term gamma^(T - t). Defaults to 51.
  gamma : float, optional
    Decay base. Defaults to 0.2.

  Returns
  -------
  numpy.ndarray
    2-D float array with one row per customer (in order of first appearance
    in ``df``) and one column per product in ascending PRODUCT_ID order.
    Pairs that never occur are 0. Shape (0, 0) when ``df`` has no rows.
  """
  if df.empty:
    return np.empty((0,0))

  # gamma^(T - t) for every row, kept in a local Series so the caller's
  # frame does not silently gain a 'value' column
  decay=gamma**(dt-df['TRANSACTION_DT'])

  # Row order of the result: customers in order of first appearance
  customers=df['CUSTOMER_ID'].unique()

  # Summation of gamma^(T - t) per (customer, product), pivoted in one pass
  weights=(decay.groupby([df['CUSTOMER_ID'],df['PRODUCT_ID']]).sum()
           .unstack(fill_value=0)
           .reindex(index=customers)
           # Columns in ascending PRODUCT_ID order. The returned array must
           # come from the sorted table, otherwise its columns do not line up
           # with other per-product matrices sorted the same way.
           .sort_index(axis=1))

  return weights.to_numpy(dtype=float)

  # # Storing Columns Name
  # col=df_final.columns

  # # Doing Matrix Multiplication with identity matrix
  # s=len(col)
  # weight_matrix=np.random.randint(10, size=(s, s))
  # index=list(df_final.index)
  # result=[]
  # for i in index:
  #   x=np.array(df_final.loc[i])
  #   d=np.matmul(x,weight_matrix)
  #   d=np.array(d)
  #   d=d.flatten()
  #   result.append(d)
  
  # # Applying tanh()
  # for i in range(len(result)):
  #   for j in range(len(result[0])):
  #     result[i][j]=math.tanh(result[i][j])
  
  # #result
  # return result

In [16]:
tpi=TPI(df_train)

In [17]:
Tpi=pd.DataFrame(tpi)

In [18]:
Tpi

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2723,2724,2725,2726,2727,2728,2729,2730,2731,2732
0,8.793278e-33,1.125900e-35,8.793278e-33,6.755399e-35,1.418634e-33,8.782019e-33,7.093169e-33,5.629500e-35,8.725724e-33,2.814750e-34,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.125900e-35,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
2,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
3,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
162,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
163,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,6.755399e-35,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
164,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,3.377700e-34,0.000000e+00,0.000000e+00,0.000000e+00,...,1.125900e-35,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


In [74]:
# Global Popularities of Items
def GPI(df_train):
  """Count how many rows of ``df_train`` mention each product.

  Parameters
  ----------
  df_train : pandas.DataFrame
    Must contain an integer PRODUCT_ID column. It is not modified.

  Returns
  -------
  list of int
    ``result[p - 1]`` is the number of rows whose PRODUCT_ID equals ``p``.
    The list has ``max(PRODUCT_ID)`` entries, so product IDs are treated as
    1-based. Rows with PRODUCT_ID < 1 have no slot in this layout and are
    ignored (previously they wrapped around through negative indexing and
    were added to the counts of other products). An empty list is returned
    when there is no product ID >= 1.
  """
  product_ids=df_train['PRODUCT_ID'].to_numpy(dtype=int)

  # Only 1-based IDs can be placed at index (id - 1)
  product_ids=product_ids[product_ids>=1]
  if product_ids.size==0:
    return []

  # One pass over the rows; every row counts once, repeated rows included
  counts=np.bincount(product_ids-1,minlength=int(product_ids.max()))
  return counts.tolist()

In [17]:
gpi=GPI(df_train)
gpi

[41,
 2,
 11,
 2,
 4,
 4,
 123,
 1,
 3,
 21,
 1,
 1,
 9,
 6,
 52,
 3,
 3,
 1,
 5,
 4,
 11,
 14,
 29,
 16,
 29,
 3,
 4,
 3,
 2,
 8,
 1,
 4,
 2,
 7,
 33,
 1,
 108,
 90,
 2,
 2,
 1,
 15,
 10,
 8,
 4,
 4,
 1,
 2,
 1,
 4,
 3,
 2,
 3,
 9,
 14,
 8,
 9,
 3,
 15,
 10,
 3,
 4,
 3,
 21,
 3,
 7,
 4,
 60,
 1,
 2,
 2,
 2,
 3,
 12,
 8,
 77,
 1,
 16,
 3,
 2,
 2,
 6,
 2,
 2,
 10,
 16,
 8,
 7,
 20,
 10,
 20,
 3,
 2,
 1,
 1,
 1,
 4,
 50,
 1,
 3,
 4,
 16,
 22,
 20,
 19,
 12,
 11,
 2,
 3,
 1,
 34,
 1,
 4,
 3,
 19,
 6,
 7,
 2,
 2,
 37,
 2,
 9,
 32,
 7,
 4,
 8,
 2,
 4,
 3,
 9,
 3,
 14,
 6,
 5,
 9,
 11,
 4,
 5,
 3,
 6,
 2,
 5,
 5,
 22,
 10,
 7,
 3,
 3,
 4,
 19,
 2,
 2,
 4,
 24,
 1,
 15,
 1,
 6,
 3,
 5,
 14,
 9,
 5,
 9,
 3,
 26,
 2,
 4,
 3,
 1,
 11,
 1,
 17,
 2,
 13,
 12,
 1,
 4,
 10,
 8,
 1,
 2,
 6,
 1,
 8,
 21,
 7,
 1,
 1,
 1,
 1,
 1,
 3,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 5,
 1,
 1,
 7,
 3,
 2,
 11,
 11,
 2,
 4,
 3,
 3,
 10,
 1,
 5,
 2,
 3,
 13,
 7,
 9,
 8,
 5,
 5,
 8,
 10,
 5,
 4,
 1,
 6,
 4,
 2,
 3,
 2,
 3,
 2,

In [18]:
gpi1=pd.DataFrame(gpi)
gpi1.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2723,2724,2725,2726,2727,2728,2729,2730,2731,2732
0,41,2,11,2,4,4,123,1,3,21,...,1,1,1,1,1,1,1,1,1,1


In [None]:
# Convert the three score components to NumPy arrays
tpi=np.array(tpi)
ugp=np.array(ugp)
gpi=np.array(gpi)

In [None]:
# x: element-wise sum of tpi and ugp, weighted by 0.8 (both arrays must have the same shape)
x=0.8*(tpi+ugp)
# y: gpi weighted by 0.2
y=0.2*gpi

In [None]:
# Test Basket Preparation

In [None]:
# Calculating total number of customers and products in the test data
cus=df_test['CUSTOMER_ID'].unique().tolist()
r=len(cus)   # number of distinct customer IDs in df_test
pro=df_test['PRODUCT_ID'].unique().tolist()
c=len(pro)   # number of distinct product IDs in df_test

In [None]:
# Test Set
def Test(df_train,dt,r,c):
  """Build the 0/1 purchase matrix for one transaction date.

  Parameters
  ----------
  df_train : pandas.DataFrame
    Frame with the columns CUSTOMER_ID, PRODUCT_ID and TRANSACTION_DT
    (despite its name, any frame with these columns can be passed).
    It is not modified.
  dt : value compared against TRANSACTION_DT
    Only rows with this transaction date are used.
  r : int
    Number of rows of the result; customer ``k`` is stored in row ``k - 1``.
  c : int
    Number of columns of the result; product ``p`` is stored in column
    ``p - 1``.

  Returns
  -------
  list of list of int
    ``r`` x ``c`` matrix where ``result[k - 1][p - 1]`` is 1 if customer
    ``k`` has a row for product ``p`` on date ``dt`` and 0 otherwise.
    Customer IDs outside 1..r and product IDs outside 1..c cannot be
    represented in the matrix and are skipped.
  """
  test=[[0]*c for _ in range(r)]

  # Rows of the requested date, reduced to the two ID columns
  day=df_train.loc[df_train['TRANSACTION_DT']==dt,['CUSTOMER_ID','PRODUCT_ID']].dropna()

  for customer_id,product_id in zip(day['CUSTOMER_ID'],day['PRODUCT_ID']):
    customer_id=int(customer_id)
    product_id=int(product_id)
    # Index by the IDs themselves (1-based -> 0-based), not by the position
    # of the item inside the customer's basket
    if 1<=customer_id<=r and 1<=product_id<=c:
      test[customer_id-1][product_id-1]=1

  return test

In [None]:
# Recall calculation

In [None]:
# Evaluate the top-K predicted items of every test customer, one score per date
TOP_K=20              # number of highest-scoring items inspected per customer
HIT_THRESHOLD=0.49    # score at or above which an item counts as "predicted bought"

date=sorted(df_test['TRANSACTION_DT'].unique().tolist())
for dt in date:
  df2=df_test[df_test['TRANSACTION_DT']==dt]
  test_customer=sorted(df2['CUSTOMER_ID'].unique().tolist())
  actual=Test(df_test,dt,r,c)
  e=0
  evaluated=0
  for i in test_customer:
    # x only has one row per training customer; customer i is read from row i-1.
    # NOTE(review): this assumes training customers are numbered 1..len(x) in row order - confirm.
    if i>len(x):
      continue
    prediction=x[i-1]+y   # fresh array, so zeroing entries below does not touch x or y
    count=0
    for _ in range(TOP_K):
      # Take the current best item (first index on ties) and remove it from the pool
      top=int(np.argmax(prediction))
      a=prediction[top]
      prediction[top]=0
      # NOTE(review): this also counts confident "not bought" items as hits, which is
      # agreement between thresholded score and label rather than standard recall - confirm intent.
      if((a>=HIT_THRESHOLD and actual[i-1][top]==1) or (a<HIT_THRESHOLD and actual[i-1][top]==0)):
        count+=1
    e+=count/TOP_K
    evaluated+=1
  # Average over the customers actually scored, and report once per date
  # (the report used to sit outside the date loop, so only the last date was printed)
  score=e/evaluated if evaluated else 0.0
  print("recall@20 for date",dt, "is", score)