### Excercise based on [this article](https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-recommendation-engine-python/)


In [28]:
# load training & testing data for ratings

ratings_train = pd.read_csv('ml-100k/ua.base', sep = '\t', names = ['uid', 'mid', 'rating', 'timestamp'])
ratings_test = pd.read_csv('ml-100k/ua.test', sep = '\t', names = ['uid', 'mid', 'rating', 'timestamp'])
ratings_train.timestamp = pd.to_datetime(ratings_train.timestamp, unit = 's')
ratings_test.timestamp = pd.to_datetime(ratings_test.timestamp, unit = 's')

# load items data

items = pd.read_csv('ml-100k/u.item', sep = '|', names = ['mid', 'title', 'release_date', 'vd_release_date', 'url', 'unkown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], encoding = 'latin-1', parse_dates = ['release_date'])

In [2]:
ratings_train.head()

Unnamed: 0,uid,mid,rating,timestamp
0,1,1,5,1997-09-22 22:02:38
1,1,2,3,1997-10-15 05:26:11
2,1,3,4,1997-11-03 07:42:40
3,1,4,3,1997-10-15 05:25:19
4,1,5,3,1998-03-13 01:15:12


In [3]:
data_train = ratings_train.pivot(index='uid', columns='mid', values='rating').fillna(0).values
data_train

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [6]:
# codes for algorithm

# the goal is to factorize the user-rating matrix into 2 matrices, with a new "concept" dimension added - the user-concept & item-concept
# these 2 matrices represent users' and items' affinity for concepts

class matrix_factorize():
    
    # initialize model and assign parameters
    
    def __init__(self, R, K = 20, alpha = 0.001, beta = 0.01, n_iter = 2000):
        
        self.R = R
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.n_iter = n_iter
        
        self.n_users, self.n_items = R.shape
    
        # inititalize the user-concept with size of n_user & k concepts, filled with random normal numbers
        self.P = np.random.normal(scale = 1. / self.K, size = (self.n_users, self.K))

        # inititalize the item-concept with size of n_item & k concepts, filled with random normal numbers
        self.Q = np.random.normal(scale = 1. / self.K, size = (self.n_items, self.K))

        # initialize bias terms
        self.b_u = np.zeros(self.n_users)
        self.b_i = np.zeros(self.n_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])
        
        # create list of training samples
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.n_users)
            for j in range(self.n_items)
            if self.R[i, j] > 0
        ]

    # get ratings for user i on movie j
    
    def __get_rating(self, i, j):
        pred = self.b + self.b_u[i] + self.b_i[j]+ self.P[i, :].dot(self.Q[j, :])
        return pred

    # define fucntion for stochastic gradient descent
    
    def __sgd(self):
        
        for i, j, r in self.samples:
            
                diff = r - self.__get_rating(i, j)
                
                # update bias
                self.b_u[i] += self.alpha * (diff - self.beta * self.b_u[i])
                self.b_i[j] += self.alpha * (diff - self.beta * self.b_i[j])
                
                # update P, Q matrices
                self.P[i, :] += self.alpha * (diff * self.Q[j, :] - self.beta * self.P[i, :])
                self.Q[j, :] += self.alpha * (diff * self.P[i, :] - self.beta * self.Q[j, :])
                
    # create full user-movie matrix
    
    def full_matrix(self):
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)
    
    # define function for calcuating mse
    
    def __mse(self):
        
        xs, ys = self.R.nonzero()
        prediction = self.full_matrix()
        error = 0
        
        for x, y in zip(xs, ys):
            error += np.square(self.R[x, y] - prediction[x, y])
            
        return np.sqrt(error)
    
    # define training process
    
    def train(self):
        
        train_proc = []
        
        for i in range(self.n_iter):
            np.random.shuffle(self.samples)
            self.__sgd()
            mse = self.__mse()
            train_proc.append((i, mse))
            if (i + 1) % 20 == 0:
                print (f'Iternation: {i+1}; error = {mse:4f}')
                
        return train_proc

In [7]:
# test training results

mf = matrix_factorize(R = data_train)
mf.train()

Iternation: 20; error = 281.796525
Iternation: 40; error = 276.731801
Iternation: 60; error = 273.446722
Iternation: 80; error = 268.705013
Iternation: 100; error = 260.602729
Iternation: 120; error = 250.273843
Iternation: 140; error = 239.559887
Iternation: 160; error = 229.359489
Iternation: 180; error = 220.306324
Iternation: 200; error = 212.651903
Iternation: 220; error = 206.320479
Iternation: 240; error = 201.092432
Iternation: 260; error = 196.735659
Iternation: 280; error = 193.059381
Iternation: 300; error = 189.919953
Iternation: 320; error = 187.208502
Iternation: 340; error = 184.845449
Iternation: 360; error = 182.769318
Iternation: 380; error = 180.932968
Iternation: 400; error = 179.297223
Iternation: 420; error = 177.833766
Iternation: 440; error = 176.515782
Iternation: 460; error = 175.324512
Iternation: 480; error = 174.242152
Iternation: 500; error = 173.255243
Iternation: 520; error = 172.351664
Iternation: 540; error = 171.521333
Iternation: 560; error = 170.756

[(0, 321.7415692362322),
 (1, 311.76877944363594),
 (2, 305.2871447704345),
 (3, 300.767013944792),
 (4, 297.4184492871841),
 (5, 294.83514930607305),
 (6, 292.7729456199337),
 (7, 291.08691187213145),
 (8, 289.6746276292903),
 (9, 288.46775897470866),
 (10, 287.4251819085055),
 (11, 286.5105093862041),
 (12, 285.69865568430913),
 (13, 284.9730260854026),
 (14, 284.3190623484893),
 (15, 283.72557474116013),
 (16, 283.1826120851971),
 (17, 282.6837335182105),
 (18, 282.22343249840947),
 (19, 281.79652486249574),
 (20, 281.3989132150461),
 (21, 281.02727412365505),
 (22, 280.67879368880716),
 (23, 280.35032132978876),
 (24, 280.0413198498184),
 (25, 279.7488264613471),
 (26, 279.4718071545217),
 (27, 279.20813036572713),
 (28, 278.95667544307895),
 (29, 278.71617978072203),
 (30, 278.4857685378343),
 (31, 278.2640468444798),
 (32, 278.05165014685656),
 (33, 277.84521153327796),
 (34, 277.64624025465406),
 (35, 277.4533740015914),
 (36, 277.2662791038864),
 (37, 277.08401144871067),
 (38,

In [14]:
recom = mf.full_matrix()

In [102]:
# for userid == 100, check all movies rated 5

fav_movies = ratings_train.loc[(ratings_train.uid == 100) & (ratings_train.rating >= 4)].mid.values

In [103]:
# for userid == 3, check the top 5 most recommended movies

top5 = recom[99,:].argpartition(-10)[-5:] + 1

In [104]:
# details of those movies

items.loc[items.mid.isin(fav_movies)]

Unnamed: 0,mid,title,release_date,vd_release_date,url,unkown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
257,258,Contact (1997),1997-07-11,,http://us.imdb.com/Title?Contact+(1997/I),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
268,269,"Full Monty, The (1997)",1997-01-01,,http://us.imdb.com/M/title-exact?Full+Monty%2C...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
271,272,Good Will Hunting (1997),1997-01-01,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
293,294,Liar Liar (1997),1997-03-21,,http://us.imdb.com/Title?Liar+Liar+(1997),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
299,300,Air Force One (1997),1997-01-01,,http://us.imdb.com/M/title-exact?Air+Force+One...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
312,313,Titanic (1997),1997-01-01,,http://us.imdb.com/M/title-exact?imdb-title-12...,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
314,315,Apt Pupil (1998),1998-10-23,,http://us.imdb.com/Title?Apt+Pupil+(1998),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
315,316,As Good As It Gets (1997),1997-12-23,,http://us.imdb.com/Title?As+Good+As+It+Gets+(1...,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
327,328,Conspiracy Theory (1997),1997-08-08,,http://us.imdb.com/M/title-exact?Conspiracy+Th...,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0
346,347,Wag the Dog (1997),1998-01-09,,http://us.imdb.com/M/title-exact?imdb-title-12...,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0


In [97]:
# details of the top 5 recommended movies for userid == 2

items.loc[items.mid.isin(top5)]

Unnamed: 0,mid,title,release_date,vd_release_date,url,unkown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
56,57,Priest (1994),1994-01-01,,http://us.imdb.com/M/title-exact?Priest%20(1994),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
317,318,Schindler's List (1993),1993-01-01,,http://us.imdb.com/M/title-exact?Schindler's%2...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
371,372,Jeffrey (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Jeffrey%20(1995),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
720,721,Mallrats (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Mallrats%20(1...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
903,904,Ma vie en rose (My Life in Pink) (1997),1997-12-26,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
