In [66]:
def implicit_als(sparse_data, alpha_val=40, iterations=10, lambda_val=0.1, features=10):
 
    """
    Args:
        sparse_data (csr_matrix): 유저와 아이템 행렬 (training set)
 
        alpha_val (int): 신뢰도 값 조정에 필요한 값.
 
        iterations (int): 반복 횟수
 
        lambda_val (float): 정규화에 필요한 값. 정규화 값이 증가 할 수록 편향도 증가 Default is 0.1.
 
        features (int): 계산할 latent features의 값
    
    Returns:     
        X (csr_matrix): user vectors of size users-by-features
        
        Y (csr_matrix): item vectors of size items-by-features
     """

    # 신뢰도값 = training * alpha (최적값 40)
    confidence = sparse_data * alpha_val
    
    # 유저와 아이템의 사이즈
    user_size, item_size = sparse_data.shape
    
    # X(user matrix) , Y(item matrix) 랜덤값으로 초기화
    X = sparse.csr_matrix(np.random.normal(size = (user_size, features)))
    Y = sparse.csr_matrix(np.random.normal(size = (item_size, features)))
    
    #Precompute I and lambda * I
    # I는 identifier matrix
    X_I = sparse.eye(user_size)
    Y_I = sparse.eye(item_size)
    
    I = sparse.eye(features)
    lI = lambda_val * I

Still inside our implicit_als function we start the main iteration loop. Here we first precompute X-transpose-X and Y-transpose-Y as discussed earlier. We then have two inner loops where we first iterate over all users and update X and then do the same for all items and update Y.

In [75]:
""" Continuation of implicit_als function"""

# loop을 통해 유저와 아이템을 반복적으로 최적화
for i in xrange(iterations):
    print( 'iteration %d of %d' % (i+1, iterations))

    # Precompute Y-transpose-Y and X-transpose-X
    yTy = Y.T.dot(Y)
    xTx = X.T.dot(X)

    # 유저와 아이템을 loop을 통해 최적화
    for u in xrange(user_size):

        # 유저 값을 구함
        u_row = confidence[u,:].toarray() 

        # 선호도 p(u)값을 계산
        p_u = u_row.copy()
        p_u[p_u != 0] = 1.0

        
        # Calculate Cu and Cu - I
        
        CuI = sparse.diags(u_row, [0])
        Cu = CuI + Y_I

        # Put it all together and compute the final formula
        yT_CuI_y = Y.T.dot(CuI).dot(Y)
        yT_Cu_pu = Y.T.dot(Cu).dot(p_u.T)
        X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)


    for i in xrange(item_size):

        # 아이템 컬럼을 구하고 전치
        i_row = confidence[:,i].T.toarray()

        # Calculate the binary preference p(i)
        p_i = i_row.copy()
        p_i[p_i != 0] = 1.0

        # Calculate Ci and Ci - I
        CiI = sparse.diags(i_row, [0])
        Ci = CiI + X_I

        # Put it all together and compute the final formula
        xT_CiI_x = X.T.dot(CiI).dot(X)
        xT_Ci_pi = X.T.dot(Ci).dot(p_i.T)
        Y[i] = spsolve(xTx + xT_CiI_x + lI, xT_Ci_pi)

return X, Y

NameError: name 'xrange' is not defined

In [16]:
artist_rank = ap.groupby(['name']) \
  .agg({'userID' : 'count', 'playCount' : 'sum'}) \
  .rename(columns={"userID" : 'totalUniqueUsers', "playCount" : "totalArtistPlays"}) \
  .sort_values(['totalArtistPlays'], ascending=False)

artist_rank['avgUserPlays'] = artist_rank['totalArtistPlays'] / artist_rank['totalUniqueUsers']

In [28]:
artist_rank

Unnamed: 0_level_0,totalUniqueUsers,totalArtistPlays,avgUserPlays
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Britney Spears,522,2393140,4584.559387
Depeche Mode,282,1301308,4614.567376
Lady Gaga,611,1291387,2113.563011
Christina Aguilera,407,1058405,2600.503686
Paramore,399,963449,2414.659148
...,...,...,...
Morris,1,1,1.000000
Eddie Kendricks,1,1,1.000000
Excess Pressure,1,1,1.000000
My Mine,1,1,1.000000


In [29]:
ap = ap.join(artist_rank, on="name", how="inner") \
  .sort_values(['playCount'], ascending=False)

In [30]:
ap

Unnamed: 0,id,name,userID,artistID,playCount,totalUniqueUsers,totalArtistPlays,avgUserPlays
2800,72,Depeche Mode,1642,72,352698,282,1301308,4614.567376
35843,792,Thalía,2071,792,324663,26,350035,13462.884615
27302,511,U2,1094,511,320725,185,493024,2664.994595
8152,203,Blur,1905,203,257978,114,318221,2791.412281
26670,498,Paramore,1664,498,227829,399,963449,2414.659148
...,...,...,...,...,...,...,...,...
38688,913,Destiny's Child,1810,913,1,83,34746,418.626506
32955,697,Sia,1290,697,1,56,27597,492.803571
71811,4988,Chris Spheeris,510,4988,1,5,3106,621.200000
91319,17080,Haylie Duff,1851,17080,1,1,1,1.000000


In [32]:
pc = ap.playCount
play_count_scaled = (pc - pc.min()) / (pc.max() - pc.min())

ap = ap.assign(playCountScaled=play_count_scaled)

In [34]:
ratings_df = ap.pivot(
    index='userID',
    columns='artistID',
    values='playCountScaled')

ratings = ratings_df.fillna(0).values

In [35]:
ratings

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00115397, ..., 0.        , 0.        ,
        0.        ]])

In [36]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print('{:.2f}%'.format(sparsity))

0.28%


In [52]:
train, val = train_test_split(ratings)

In [53]:
MIN_USER_RATINGS = 35
DELETE_RATING_COUNT = 15

def train_test_split(ratings):

    validation = np.zeros(ratings.shape)
    train = ratings.copy()

    for user in np.arange(ratings.shape[0]):
        if len(ratings[user,:].nonzero()[0]) >= MIN_USER_RATINGS:
            val_ratings = np.random.choice(
              ratings[user, :].nonzero()[0],
              size=DELETE_RATING_COUNT,
              replace=False
            )
            train[user, val_ratings] = 0
            validation[user, val_ratings] = ratings[user, val_ratings]
    return train, validation

In [54]:
def predict(self, X_train, user_index):
    y_hat = self.predictions(self.P, self.Q)
    predictions_index = np.where(X_train[user_index, :] == 0)[0]
    return y_hat[user_index, predictions_index].flatten()

In [55]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [56]:
def fit(self, X_train, X_val):
    m, n = X_train.shape

    self.P = 3 * np.random.rand(self.n_latent_features, m)
    self.Q = 3 * np.random.rand(self.n_latent_features, n)

    self.train_error = []
    self.val_error = []

    users, items = X_train.nonzero()

    for epoch in range(self.n_epochs):
        for u, i in zip(users, items):
            error = X_train[u, i] - self.predictions(self.P[:,u], self.Q[:,i])
            self.P[:, u] += self.learning_rate * \
            (error * self.Q[:, i] - self.lmbda * self.P[:, u])
            self.Q[:, i] += self.learning_rate * \
            (error * self.P[:, u] - self.lmbda * self.Q[:, i])

        train_rmse = rmse(self.predictions(self.P, self.Q), X_train)
        val_rmse = rmse(self.predictions(self.P, self.Q), X_val)
        self.train_error.append(train_rmse)
        self.val_error.append(val_rmse)

In [57]:
def predict(self, X_train, user_index):
    y_hat = self.predictions(self.P, self.Q)
    predictions_index = np.where(X_train[user_index, :] == 0)[0]
    return y_hat[user_index, predictions_index].flatten()

In [59]:
user_id = 1236
user_index = ratings_df.index.get_loc(user_id)
predictions_index = np.where(train[user_index, :] == 0)[0]

rating_predictions = recommender.predict(train, user_index)

NameError: name 'recommender' is not defined

### movies

In [60]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import SVD, SVDpp, NMF

In [61]:
data = Dataset.load_builtin('ml-100k')
df = pd.DataFrame(data.raw_ratings, columns=["user", "item", "rate", "id"])
df.head()

Unnamed: 0,user,item,rate,id
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [64]:
df.rename(columns={'user':'user_id','id':'movie_id'},inplace=True)

In [91]:
df = df[['movie_id','user_id','rate']]

In [92]:
user_movie_rating = df.pivot_table('rate', index = 'movie_id', columns='user_id').fillna(0)

In [93]:
user_movie_rating

user_id,1,10,100,101,102,103,104,105,106,107,...,94,940,941,942,943,95,96,97,98,99
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
874724710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
874724727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
874724754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
874724781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
874724843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893286550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
893286584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
893286603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
893286637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

In [95]:
def make_recommendations(self, fav_movie, n_recommendations):
    """
    make top n movie recommendations
    Parameters
    ----------
    fav_movie: str, name of user input movie
    n_recommendations: int, top n recommendations
    """
    # get data
    movie_user_mat_sparse, hashmap = self._prep_data()
    # get recommendations
    raw_recommends = self._inference(
        self.model, movie_user_mat_sparse, hashmap,
        fav_movie, n_recommendations)
    # print results
    reverse_hashmap = {v: k for k, v in hashmap.items()}
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance '
              'of {2}'.format(i+1, reverse_hashmap[idx], dist))