Data available at : https://github.com/LianHaiMiao/Attentive-Group-Recommendation 

# **Data Exploration**

In [1]:
import scipy.sparse as sp
import numpy as np

In [2]:
class Config(object):
  def __init__(self):
    self.data_path='/content/'

In [3]:

class Dataset(object):
  def __init__ (self, data_path)  :
        
        print("loading User Train Matrix...")
        self.user_trainMatrix=self.load_rating_file_as_matrix(data_path+"userRatingTrain.txt")
        print("loading Group Train Matrix...")
        self.group_trainMatrix=self.load_rating_file_as_matrix(data_path+"groupRatingTrain.txt")
        print("loading User Test Matrix...")
        self.user_testMatrix=self.load_rating_file_as_matrix(data_path+"userRatingTest.txt")
        print("loading Group Test Matrix...")
        self.group_testMatrix=self.load_rating_file_as_matrix(data_path+"groupRatingTest.txt")
        
        # process negative data
        #print("loading User Negative into User test Matrix:")
        #self.user_testMatrix=self.load_negative_data(self.user_testMatrix, data_path+"userRatingNegative.txt")
        
        #taken num items as per train data since they are different in test matrices from train matrice item counts
        self.num_users, self.num_items = self.user_trainMatrix.shape
        self.num_groups=self.group_trainMatrix.shape[0]

        #implicit matrices
        self.implicit_user_trainMatrix=np.zeros((self.user_trainMatrix.shape))
        self.implicit_user_testMatrix=np.zeros((self.user_testMatrix.shape))
        self.implicit_group_trainMatrix=np.zeros((self.group_trainMatrix.shape))
        self.implicit_group_testMatrix=np.zeros((self.group_testMatrix.shape))

        print("loading Implicit User Train Matrix...")
        self.implicit_user_trainMatrix[self.user_trainMatrix!=0]=1
        print("loading Implicit User Test Matrix...")
        self.implicit_user_testMatrix[self.user_testMatrix!=0]=1
        print("loading Implicit Group Train Matrix...")
        self.implicit_group_trainMatrix[self.group_trainMatrix!=0]=1
        print("loading Implicit Group Test Matrix...")
        self.implicit_group_testMatrix[self.group_testMatrix!=0]=1

        #group-user mapping
        print("loading Group-User Mapping Data...")
        self.group_user_Dict=self.extract_group_user_data(data_path+"groupMember.txt")

  def extract_group_user_data(self, filename):
        group_user_dict={}
        with open(filename, "r") as f:
            line=f.readline()
            while line!=None and line!="":
                arr=line.split(" ")
                arr[1] = arr[1].replace("\n", "")
                members=arr[1].split(",")
                if arr[0] not in group_user_dict:
                    group_user_dict[int(arr[0])]= [int(x) for x in members]
                line=f.readline()
        return group_user_dict

  def load_rating_file_as_matrix(self, filename):
        # Get number of users and items
        num_users, num_items = 0, 0
        with open(filename, "r") as f:
            line=f.readline()
            while line!=None and line!="":
                arr=line.split(" ")
                u,i=int(arr[0]), int(arr[1])
                num_users=max(num_users,u)
                num_items=max(num_items,i)
                line=f.readline()
      
        mat=np.zeros((num_users+1, num_items+1))
        with open(filename, "r") as f:
            line=f.readline()
            while line!=None and line!="":
                arr=line.split(" ")
                mat[int(arr[0])-1][int(arr[1])-1]=int(arr[2])
                line=f.readline()

        return mat

  #TBD
  def load_negative_data(self, matrixname, filename):
        with open(filename, "r") as f:
            line=f.readline()
            while line!=None and line!="":
                arr=line.split(" ")
                print(arr)
                line=f.readline()



In [4]:
if __name__=='__main__':
  config=Config()
  dataset=Dataset(config.data_path)
  num_users, num_items, num_groups = dataset.num_users, dataset.num_items, dataset.num_groups



loading User Train Matrix...
loading Group Train Matrix...
loading User Test Matrix...
loading Group Test Matrix...
loading Implicit User Train Matrix...
loading Implicit User Test Matrix...
loading Implicit Group Train Matrix...
loading Implicit Group Test Matrix...
loading Group-User Mapping Data...


**Dataset Details:**

In [5]:
print("num users, num items, num groups=({}, {}, {})".format(num_users, num_items, num_groups))
print()
print("user_traindata shape={}".format(dataset.user_trainMatrix.shape))
print("user_testdata shape={}".format(dataset.user_testMatrix.shape))
print("group_traindata shape={}".format(dataset.group_trainMatrix.shape))
print("group_testdata shape={}".format(dataset.group_testMatrix.shape))
print()
print("Implicit user_traindata shape={}".format( dataset.implicit_user_trainMatrix.shape))
print("implicit user_testdata shape={}".format(  dataset.implicit_user_testMatrix.shape))
print("implicit group_traindata shape={}".format(dataset.implicit_group_trainMatrix.shape))
print("implicit group_testdata shape={}".format( dataset.implicit_group_testMatrix.shape))
print()
print('Group User Data:')
print(dataset.group_user_Dict)

num users, num items, num groups=(602, 7710, 290)

user_traindata shape=(602, 7710)
user_testdata shape=(602, 7679)
group_traindata shape=(290, 7710)
group_testdata shape=(290, 7656)

Implicit user_traindata shape=(602, 7710)
implicit user_testdata shape=(602, 7679)
implicit group_traindata shape=(290, 7710)
implicit group_testdata shape=(290, 7656)

Group User Data:
{216: [346, 414], 217: [433, 526], 214: [559, 570], 215: [226, 294], 212: [415, 470], 213: [43, 267, 308], 210: [443, 520], 211: [53, 392], 165: [451, 496], 264: [105, 171], 265: [556, 253, 366], 218: [334, 386], 219: [199, 302], 133: [6, 126], 132: [141, 519], 131: [480, 500], 130: [179, 348], 137: [106, 524], 136: [304, 587], 135: [42, 510], 134: [113, 120], 139: [440, 545], 138: [365, 490], 166: [258, 397], 24: [27, 404], 25: [58, 252], 26: [157, 565, 431], 27: [347, 462], 20: [8, 435], 21: [152, 484], 22: [271, 502], 23: [155, 381], 160: [391, 405], 28: [597, 521], 29: [23, 523], 161: [210, 486], 289: [61, 475], 0: [21

**NOTE: Need to check :**



1.   Why is test data's item counts different from train data's
2.   Plan how to incorporate and make use of negative data wrt to each item and each user/group




# **Module Function Scripts**

**Gartrell, M., Xing, X., Lv, Q., Beach, A., Han, R., Mishra, S., & Seada, K. (2010, November). Enhancing group recommendation by incorporating social relationship interactions. In Proceedings of the 16th ACM international conference on Supporting group work (pp. 97-106).**

***1. Social Relationship***

In [6]:
similarity_in_group=np.zeros(num_groups)

for group_id in dataset.group_user_Dict:
  group_members= dataset.group_user_Dict[group_id]
  sum_of_weight_similarities=0
  
  
  for i in range(len(group_members)):
    for j in range(i+1,len(group_members)):
      user1=group_members[i]
      user2=group_members[j]
      if user1 != user2:
        w_ij=0
        user1_items=np.nonzero(dataset.user_trainMatrix[user1])[0]
        user2_items=np.nonzero(dataset.user_trainMatrix[user2])[0]
        common_items = set(user1_items).intersection(user2_items)
        num_common_items = len(common_items)
        total_items = set(user1_items).union(user2_items)
        num_total_items = len(total_items)

        w_ij=num_common_items/num_total_items
        sum_of_weight_similarities+=w_ij
  similarity_in_group[group_id]=(2*sum_of_weight_similarities)/(len(group_members)*(len(group_members)-1))

#print(similarity_in_group)   

In [8]:
social_descriptor_groupwise=np.zeros(num_groups)
sorted_values = sorted(similarity_in_group)

## Compute the statistical thresholds for each category (since we dont have very strong similarities in any group)
#q1 = sorted_values[int(len(sorted_values) * 0.2)]
#q2 = sorted_values[int(len(sorted_values) * 0.4)]
#q3 = sorted_values[int(len(sorted_values) * 0.6)]
#q4 = sorted_values[int(len(sorted_values) * 0.8)]
#
#print(q1,",",q2,",",q3,",",q4)
#
#for i in range(num_groups):
#        if similarity_in_group[i] <= q1:
#            social_descriptor_groupwise[i]=0
#        elif similarity_in_group[i] <= q2:
#            social_descriptor_groupwise[i]=1
#        elif similarity_in_group[i] <= q3:
#            social_descriptor_groupwise[i]=2
#        elif similarity_in_group[i] <= q4:
#            social_descriptor_groupwise[i]=3
#        else:
#            social_descriptor_groupwise[i]=4
#    
#print(social_descriptor_groupwise)

# Compute the statistical thresholds for each category (since we dont have very strong similarities in any group)
q1 = sorted_values[int(len(sorted_values) * 0.33)]
q2 = sorted_values[int(len(sorted_values) * 0.67)]


print(q1,",",q2)

for i in range(num_groups):
        if similarity_in_group[i] <= q1:
            social_descriptor_groupwise[i]=0
        elif similarity_in_group[i] <= q2:
            social_descriptor_groupwise[i]=1
        else:
            social_descriptor_groupwise[i]=2
    
print(social_descriptor_groupwise)




0.06550218340611354 , 0.1299342105263158
[0. 0. 1. 1. 0. 2. 2. 0. 1. 2. 2. 2. 1. 1. 2. 1. 2. 0. 2. 2. 1. 2. 2. 2.
 2. 1. 2. 2. 0. 0. 1. 2. 2. 2. 2. 2. 1. 1. 0. 2. 1. 0. 2. 0. 2. 0. 1. 2.
 2. 2. 2. 0. 1. 1. 2. 2. 2. 0. 1. 0. 1. 2. 1. 0. 0. 2. 0. 0. 0. 2. 2. 1.
 2. 1. 0. 0. 2. 0. 2. 2. 1. 1. 0. 1. 2. 1. 2. 0. 2. 0. 0. 1. 1. 0. 2. 0.
 0. 0. 0. 1. 1. 1. 0. 2. 1. 0. 0. 1. 1. 2. 0. 0. 2. 1. 0. 1. 0. 0. 1. 1.
 2. 0. 1. 0. 0. 1. 1. 0. 2. 2. 2. 0. 1. 2. 2. 0. 1. 1. 0. 2. 0. 1. 1. 0.
 1. 0. 1. 0. 1. 2. 2. 2. 0. 1. 0. 1. 0. 1. 2. 1. 0. 0. 1. 2. 0. 1. 2. 0.
 2. 2. 0. 2. 0. 1. 0. 0. 0. 2. 0. 0. 1. 0. 2. 0. 0. 2. 1. 0. 1. 2. 2. 0.
 0. 1. 2. 1. 1. 1. 1. 1. 2. 1. 0. 0. 1. 1. 2. 2. 0. 2. 2. 2. 2. 1. 0. 2.
 0. 1. 0. 1. 2. 1. 1. 0. 1. 2. 1. 0. 2. 0. 1. 1. 2. 0. 0. 0. 1. 0. 1. 0.
 1. 1. 0. 2. 1. 0. 1. 0. 1. 1. 2. 0. 1. 2. 1. 1. 2. 2. 1. 2. 0. 2. 1. 2.
 1. 0. 2. 0. 2. 2. 2. 2. 1. 1. 2. 0. 1. 1. 0. 1. 2. 1. 0. 0. 1. 0. 1. 2.
 1. 1.]


***2. Expertise descriptor***

In [17]:
#calculate absolute user expertise
user_expertise=np.zeros(num_users)
x=0
for user_data in dataset.user_trainMatrix:
    #print(user_data)
    user_i_item_count=len(np.nonzero(user_data)[0])
    #print(user_i_item_count)
    user_expertise[x]=user_i_item_count/num_items
    x+=1

#print(user_expertise)
absolute_user_expertise_levels=np.zeros(num_users)
sorted_expertise_values = sorted(user_expertise)

# Compute the statistical thresholds for each category (since we dont have very strong similarities in any group)
q1 = sorted_expertise_values[int(len(sorted_expertise_values) * 0.2)]
q2 = sorted_expertise_values[int(len(sorted_expertise_values) * 0.4)]
q3 = sorted_expertise_values[int(len(sorted_expertise_values) * 0.6)]
q4 = sorted_expertise_values[int(len(sorted_expertise_values) * 0.8)]

print(q1,",",q2,",",q3,",",q4)

for i in range(num_users):
        if user_expertise[i] <= q1:
            absolute_user_expertise_levels[i]=1
        elif user_expertise[i] <= q2:
            absolute_user_expertise_levels[i]=2
        elif user_expertise[i] <= q3:
            absolute_user_expertise_levels[i]=3
        elif user_expertise[i] <= q4:
            absolute_user_expertise_levels[i]=4
        else:
            absolute_user_expertise_levels[i]=5
    
print(absolute_user_expertise_levels)


0.00959792477302205 , 0.014396887159533073 , 0.020622568093385214 , 0.03333333333333333
[5. 1. 4. 4. 1. 2. 4. 4. 3. 5. 5. 4. 3. 3. 2. 2. 3. 3. 5. 4. 2. 1. 3. 5.
 1. 3. 3. 2. 5. 5. 1. 1. 3. 5. 3. 5. 5. 4. 4. 3. 3. 5. 2. 4. 4. 5. 1. 5.
 3. 3. 5. 5. 2. 3. 4. 3. 2. 3. 5. 4. 5. 5. 3. 5. 4. 4. 4. 2. 1. 1. 2. 5.
 4. 2. 5. 4. 2. 1. 3. 4. 5. 3. 4. 2. 4. 1. 5. 1. 1. 3. 2. 4. 4. 5. 3. 1.
 2. 4. 4. 2. 1. 2. 2. 5. 3. 2. 1. 4. 3. 1. 3. 5. 3. 2. 3. 3. 1. 4. 5. 3.
 4. 3. 3. 5. 3. 3. 3. 2. 3. 4. 3. 1. 5. 1. 4. 5. 3. 5. 5. 5. 4. 4. 4. 3.
 5. 1. 4. 1. 2. 4. 3. 1. 2. 1. 3. 4. 4. 4. 3. 2. 4. 2. 3. 5. 2. 2. 4. 5.
 4. 3. 4. 3. 4. 4. 3. 1. 2. 2. 4. 1. 5. 1. 5. 2. 4. 4. 5. 4. 1. 4. 1. 4.
 5. 2. 3. 4. 1. 5. 1. 5. 1. 2. 1. 4. 4. 2. 4. 1. 1. 3. 1. 1. 3. 5. 4. 3.
 4. 2. 4. 5. 3. 5. 1. 1. 4. 2. 4. 1. 2. 4. 5. 4. 5. 1. 4. 2. 3. 4. 5. 2.
 1. 3. 2. 2. 3. 5. 4. 2. 1. 4. 4. 2. 2. 5. 2. 5. 2. 5. 1. 5. 3. 3. 4. 1.
 1. 3. 1. 5. 2. 3. 5. 4. 3. 2. 2. 4. 2. 1. 1. 1. 4. 5. 5. 2. 5. 5. 5. 4.
 5. 3. 2. 5. 4. 3. 4. 3. 2. 3. 4. 2.

In [21]:
expertise_in_group_members_dict={}
for group_id in dataset.group_user_Dict:
  group_members= dataset.group_user_Dict[group_id]
  
  sum_of_expertise_in_group=0 
  for user in group_members:
      sum_of_expertise_in_group+=absolute_user_expertise_levels[user]
  list_of_member_expertise_groupwise=[] 
  for user in group_members:
      user_relative_expertise_pair={}
      E_j=absolute_user_expertise_levels[user]/sum_of_expertise_in_group
      user_relative_expertise_pair[user]=E_j
      list_of_member_expertise_groupwise.append(user_relative_expertise_pair)
  expertise_in_group_members_dict[group_id]=list_of_member_expertise_groupwise

print(expertise_in_group_members_dict)

{216: [{346: 0.4444444444444444}, {414: 0.5555555555555556}], 217: [{433: 0.3333333333333333}, {526: 0.6666666666666666}], 214: [{559: 0.5}, {570: 0.5}], 215: [{226: 0.5}, {294: 0.5}], 212: [{415: 0.5555555555555556}, {470: 0.4444444444444444}], 213: [{43: 0.3076923076923077}, {267: 0.38461538461538464}, {308: 0.3076923076923077}], 210: [{443: 0.6666666666666666}, {520: 0.3333333333333333}], 211: [{53: 0.5}, {392: 0.5}], 165: [{451: 0.5}, {496: 0.5}], 264: [{105: 0.4}, {171: 0.6}], 265: [{556: 0.3333333333333333}, {253: 0.5555555555555556}, {366: 0.1111111111111111}], 218: [{334: 0.5}, {386: 0.5}], 219: [{199: 0.5555555555555556}, {302: 0.4444444444444444}], 133: [{6: 0.5714285714285714}, {126: 0.42857142857142855}], 132: [{141: 0.8}, {519: 0.2}], 131: [{480: 0.6666666666666666}, {500: 0.3333333333333333}], 130: [{179: 0.25}, {348: 0.75}], 137: [{106: 0.25}, {524: 0.75}], 136: [{304: 0.75}, {587: 0.25}], 135: [{42: 0.6666666666666666}, {510: 0.3333333333333333}], 134: [{113: 0.33333333

***3. Dissimilarity descriptors***

**For Explicit Rating- taking rating of items into consideration**

---



a. Dis_1: APD(Average Pairwise Dissimilarity)

In [None]:
#explicit APD



b. Dis_2: VD(Variance Dissimilarity)

**For Implicit Rating- taking rating of items into consideration**

---



4. Heuristic group concensus function

Rank top k

# **Permuted Pipeline creation**

**DISCUSS: How do we create?**
1. Do we generate top 1000 from each algo and take intersection and then output top 50?
2. Do we create top 1000 from first algorithm, reduce no of columns of entire dataset, perform second algorithm to output 900 items, then reduce trainset to those common items and so on.. in the end take top 50?

# **Evaluation**

# **Analysis**