In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# Read the review data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source.
data = pd.read_pickle('../Datasets/restraunts_places_north_america.pickle')

In [4]:
# List the column labels of the loaded data.
data.columns

Index(['rating', 'reviewerName', 'reviewText', 'categories', 'gPlusPlaceId',
       'unixReviewTime', 'reviewTime', 'gPlusUserId', 'name', 'price',
       'address', 'hours', 'phone', 'closed', 'gps', 'lat', 'long',
       'open_days', 'Breakfast', 'Lunch', 'Dinner'],
      dtype='object')

In [3]:
# Count the distinct user and place ids. dropna=False keeps a missing id (if any)
# counted as its own value, exactly as taking the length of unique() does.
num_users = data['gPlusUserId'].nunique(dropna=False)
num_places = data['gPlusPlaceId'].nunique(dropna=False)
print("Number of users: ", num_users)
print("Numebr of places: ", num_places)

Number of users:  153945
Numebr of places:  155621


In [8]:
# notna() returns one boolean per row, so its len() is always the total row count.
# Summing the mask counts only the rows that actually carry review text.
print("Number of reviews from restraunts in North America: ", data['reviewText'].notna().sum())

Number of reviews from restraunts in North America:  275568


# RRP based on Text Review Content

In [4]:
# Text model: token counts (at most 5000 features) -> TF-IDF weighting -> logistic regression.
pipe = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(max_features=5000)),
        ('tfidf', TfidfTransformer()),
        ('model', LogisticRegression(max_iter=300)),
    ]
)

In [18]:
# The text-based model uses the review text as its feature space and the rating as its target.
X, y = data['reviewText'], data['rating']
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13
)

In [19]:
# Fit every step of the text pipeline on the training split.
pipe.fit(x_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer(max_features=5000)),
                ('tfidf', TfidfTransformer()),
                ('model', LogisticRegression(max_iter=300))])

In [20]:
# Predict a rating for every held-out review; kept for the accuracy check below.
text_predictions = pipe.predict(x_test)

In [21]:
# Score the fitted pipeline on the held-out split and report the value as accuracy.
score = pipe.score(x_test,y_test)
print("Accuracy: ", score)

Accuracy:  0.5791269006060166


In [22]:
# Cross-check: accuracy computed directly from the stored predictions.
accuracy_score(y_test,text_predictions)

0.5791269006060166

## RRP based on user-user similarity

In [5]:
# Keep only the columns needed for the similarity-based recommender.
# .copy() makes `ratings` an independent frame, so the columns added to it in the
# following cells do not trigger pandas' SettingWithCopyWarning.
ratings = data[['gPlusUserId','gPlusPlaceId','rating','reviewTime']].copy()

In [6]:
# Preview the first ten rating rows.
ratings.head(10)

Unnamed: 0,gPlusUserId,gPlusPlaceId,rating,reviewTime
0,100000032416892623125,106591714648856494903,4.0,2014-03-12
1,100000032416892623125,115827996910815192564,5.0,2014-03-12
2,100164982335939791768,115827996910815192564,5.0,2011-03-28
3,100915754139202877823,115827996910815192564,4.0,2012-12-07
4,100000032416892623125,116585428624152564242,5.0,2014-03-14
5,100053609898337945121,116585428624152564242,4.0,2013-08-11
6,100301170211647672384,116585428624152564242,4.0,2013-02-27
7,104495751233378069123,116585428624152564242,5.0,2012-12-27
8,100000053212755369563,100073820849130920147,2.0,2013-08-10
9,100000053212755369563,102471437282277965376,1.0,2013-08-16


In [7]:
# User ids and place ids are very large numbers, so each distinct user id is
# label-encoded into a compact integer code (LabelEncoder codes start at 0).
le = LabelEncoder().fit(ratings['gPlusUserId'].unique())
ratings['userID'] = le.transform(ratings['gPlusUserId'])

In [8]:
# Same encoding for the places: a fresh encoder maps every distinct place id to an integer code.
# Note that `le` now refers to the place encoder, not the user encoder.
le = LabelEncoder().fit(ratings['gPlusPlaceId'].unique())
ratings['itemID'] = le.transform(ratings['gPlusPlaceId'])

In [9]:
# Encoded ratings table; dataPreprocessor reads these columns by position,
# so the order (user, item, rating, time) matters.
ratings_df = ratings.loc[:, ['userID', 'itemID', 'rating', 'reviewTime']]

In [10]:
def dataPreprocessor(rating_df, num_users, num_items):
    """
        Build a dense (num_users x num_items) rating matrix from a ratings table.

        INPUT:
            rating_df: pandas DataFrame whose first three columns are, in this order,
                       the user id, the item id and the rating. Any further columns
                       (e.g. reviewTime) are ignored.
            num_users: scalar. number of rows of the matrix
            num_items: scalar. number of columns of the matrix
        OUTPUT:
            numpy array of shape (num_users, num_items); entry [userID-1, itemID-1]
            holds the rating and every other entry is 0.

        NOTES:
            * Ids are shifted by -1, i.e. they are treated as 1-based. An id of 0
              therefore lands in the LAST row/column through negative indexing.
              SimBasedRecSys.evaluate_test applies the same shift when reading the
              matrix, so the two must be changed together.
            * If a (user, item) pair occurs more than once, the last occurrence wins.
            * The matrix is dense: memory grows with num_users * num_items.
    """
    matrix = np.zeros((num_users, num_items))  # num_users x num_items

    if len(rating_df) == 0:
        return matrix

    # Only the first three columns are used, read by position.
    triples = rating_df.iloc[:, :3]
    # Keep the last rating of every (user, item) pair so that the vectorised
    # assignment below gives the same result as overwriting row by row.
    triples = triples[~triples.iloc[:, :2].duplicated(keep='last')]

    user_idx = triples.iloc[:, 0].to_numpy() - 1
    item_idx = triples.iloc[:, 1].to_numpy() - 1
    matrix[user_idx, item_idx] = triples.iloc[:, 2].to_numpy()

    return matrix

In [11]:
# Check the column dtypes after encoding (userID / itemID are expected to be integer codes).
ratings.dtypes

gPlusUserId             object
gPlusPlaceId            object
rating                 float64
reviewTime      datetime64[ns]
userID                   int32
itemID                   int32
dtype: object

In [12]:
# Build the dense user x place rating matrix from the encoded ratings.
# NOTE(review): despite its name, train_df holds the array returned by dataPreprocessor, not a DataFrame.
# NOTE(review): with the user/place counts printed earlier this is a 153945 x 155621 float64 array,
# the same shape the MemoryError at the end of the notebook reports as 178 GiB.
train_df = dataPreprocessor(ratings_df, num_users, num_places)

In [13]:
class SimBasedRecSys(object):
    """
        Similarity-based collaborative-filtering recommender.

        base:      'user' for user-user similarity, 'item' for item-item similarity
        method:    name of the similarity function: 'cosine', 'euclidean' or 'somethingelse'
        processor: callable (rating_df, num_users, num_items) -> user x item matrix,
                   or None when predict_all is handed such a matrix directly

        All intermediate matrices are dense, so memory grows with
        num_users * num_items and with the square of the number of users (base='user')
        or items (base='item').
    """

    def __init__(self, base, method, processor=dataPreprocessor):

        self.base = base
        self.method_name = method
        self.method = self._getMethod(self.method_name)
        self.processor = processor
        self.pred_column_name = self.base+'-'+self.method_name
        # No predictions exist until predict_all has run.
        self.__model = None

    def _getMethod(self, method_name):
        """
            Map a method name to the similarity function implementing it.
            Raises KeyError for an unknown name.

            Don't change this
        """
        switcher = {
            'cosine': self.cosine,
            'euclidean': self.euclidean,
            'somethingelse': self.somethingelse,
        }

        return switcher[method_name]

    @staticmethod
    def cosine(matrix):
        """
            cosine similarity between the rows of matrix: 1 - cosine distance
        """
        similarity_matrix = 1 - pairwise_distances(matrix, metric='cosine')
        return similarity_matrix

    @staticmethod
    def euclidean(matrix):
        """
            euclidean similarity between the rows of matrix: 1 / (1 + distance),
            so a smaller distance gives a larger similarity
        """
        similarity_matrix = 1/(1+pairwise_distances(matrix, metric='euclidean'))
        return similarity_matrix

    @staticmethod
    def somethingelse(matrix):
        """
            manhattan similarity between the rows of matrix: 1 / (1 + distance)
        """
        similarity_matrix = 1/(1+pairwise_distances(matrix, metric='manhattan'))
        return similarity_matrix

    @staticmethod
    def _predict_from_similarity(similarity, rating_matrix):
        """
            Similarity-weighted average of the observed ratings.

            INPUT:
                similarity: (n x n) similarity between the rows of rating_matrix
                rating_matrix: (n x m) ratings, 0 meaning "not rated"
            OUTPUT:
                (n x m) matrix of predicted ratings
        """
        rated = np.zeros(rating_matrix.shape)
        rated[rating_matrix.nonzero()] = 1  # 1 where a rating has been observed

        # Total similarity of the rows that rated each column.
        normalizer = np.matmul(similarity, rated)
        normalizer[normalizer == 0] = 1e-5  # to avoid dividing by zero

        predictions = np.matmul(similarity, rating_matrix)/normalizer

        # Cold start: a column whose predictions sum to 0 falls back to each
        # row's own average observed rating.
        row_average = np.sum(rating_matrix, axis=1)/(np.sum(rated, axis=1)+1e-5)
        unpredicted = np.sum(predictions, axis=0) == 0
        predictions[:, unpredicted] = predictions[:, unpredicted] + np.expand_dims(row_average, axis=1)

        return predictions

    def predict_all(self, train_df, num_users, num_items):
        """
            INPUT:
                train_df: pandas DataFrame, columns=['userID', 'itemID', 'rating'...],
                          or an already built user x item matrix when the
                          processor is None
                num_users: scalar. number of users
                num_items: scalar. number of items
            OUTPUT:
                no return... this method assigns the result to self.__model

            NOTES:
                self.__model contains predictions for *all* users and items,
                computed with the similarity method chosen at construction time.
                An unknown base prints a message and leaves the model untouched.
        """
        if self.processor is not None:
            train_matrix = self.processor(train_df, num_users, num_items)
        else:
            train_matrix = train_df  # already a user x item matrix

        if self.base == 'user':
            # UxU similarity applied to the UxI ratings
            uu_similarity = self.method(train_matrix)
            self.__model = self._predict_from_similarity(uu_similarity, train_matrix)

        elif self.base == 'item':
            # Same computation on the IxU view; transpose back to UxI at the end
            item_matrix = train_matrix.transpose()
            ii_similarity = self.method(item_matrix)
            self.__model = self._predict_from_similarity(ii_similarity, item_matrix).transpose()

        else:
            print('No other option available')

    def evaluate_test(self, test_df, copy=False):
        """
            Attach the predicted rating of every (userID, itemID) row of test_df
            in a column named self.pred_column_name.

            INPUT:
                test_df: pandas DataFrame with 'userID' and 'itemID' columns
                copy: if True work on a copy, otherwise test_df itself is modified
            OUTPUT:
                the DataFrame carrying the prediction column

            Raises AttributeError when no model has been computed yet.
        """
        if self.__model is None:
            raise AttributeError("No model available: call predict_all first")

        if copy:
            prediction = test_df.copy()
        else:
            prediction = test_df

        # Same -1 shift as the one used to build the rating matrix.
        user_idx = prediction['userID'].to_numpy() - 1
        item_idx = prediction['itemID'].to_numpy() - 1
        prediction[self.pred_column_name] = self.__model[user_idx, item_idx]

        return prediction

    def getModel(self):
        """
            return predicted user-item matrix (None when there is no model)
        """
        return self.__model

    def getPredColName(self):
        """
            return prediction column name
        """
        return self.pred_column_name

    def reset(self):
        """
            reuse the instance of the class by removing model
        """
        self.__model = None

In [14]:
# User-based recommender; processor=None makes predict_all use its first argument as the matrix directly.
user_cosine_recsys = SimBasedRecSys('user','cosine',processor = None)

In [15]:
# NOTE(review): on the full data this call fails with the MemoryError shown below, because
# predict_all allocates more dense arrays of the same shape as train_df.
# TODO: subsample users/places (or move to a sparse representation) before running.
user_cosine_recsys.predict_all(train_df, num_users, num_places)

MemoryError: Unable to allocate 178. GiB for an array with shape (153945, 155621) and data type float64

## Average of predictions from text based and user-user based recsys