In [1]:
import pandas as pd
import numpy as np

recipesdf = pd.read_csv("recipes.csv")
reviewsdf = pd.read_csv("reviews.csv")
reviewsdf = reviewsdf.head(10000)

In [2]:
recept_names = recipesdf.set_index('RecipeId')['Name'].to_dict()
n_users = len(reviewsdf.AuthorId.unique())
n_items = len(reviewsdf.RecipeId.unique())
print("Number of unique users:", n_users)
print("Number of unique recipes:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(reviewsdf))
print("Therefore: ", len(reviewsdf) / (n_users*n_items) * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
print("And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")

Number of unique users: 3356
Number of unique recipes: 5514
The full rating matrix will have: 18504984 elements.
----------
Number of ratings: 10000
Therefore:  0.05403949552185509 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [3]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm.notebook import tqdm 

class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors) # think of this as a lookup table for the input.
        # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors) # think of this as a lookup table for the input.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)
        
    def forward(self, data):
        # matrix multiplication
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)
    # def forward(self, user, item):
    # 	# matrix multiplication
    #     return (self.user_factors(user)*self.item_factors(item)).sum(1)
    
    def predict(self, user, item):
        return self.forward(user, item)

In [4]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't 'good' practice, in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
    def __init__(self):
        self.ratings = reviewsdf.copy()
        
        # Extract all user IDs and recipe IDs
        users = reviewsdf.AuthorId.unique()
        recipes = reviewsdf.RecipeId.unique()
        
        #--- Producing new continuous IDs for users and recipes ---
        
        # Unique values : index
        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.recipeid2idx = {o:i for i,o in enumerate(recipes)}
        
        # Obtained continuous ID for users and recipes
        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2recipeid = {i:o for o,i in self.recipeid2idx.items()}
        
        # return the id from the indexed values as noted in the lambda function down below.
        self.ratings.RecipeId = reviewsdf.RecipeId.apply(lambda x: self.recipeid2idx[x])
        self.ratings.AuthorId = reviewsdf.AuthorId.apply(lambda x: self.userid2idx[x])
        
        
        self.x = self.ratings.drop(['ReviewId','AuthorName','Rating','Review','DateSubmitted','DateModified'], axis=1).values
        self.y = self.ratings['Rating'].values
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y) # Transforms the data to tensors (ready for torch models.)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)


In [5]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizier
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(3356, 8)
  (item_factors): Embedding(5514, 8)
)
user_factors.weight tensor([[0.0122, 0.0230, 0.0450,  ..., 0.0181, 0.0164, 0.0018],
        [0.0048, 0.0212, 0.0240,  ..., 0.0223, 0.0300, 0.0148],
        [0.0105, 0.0299, 0.0164,  ..., 0.0186, 0.0245, 0.0118],
        ...,
        [0.0145, 0.0172, 0.0006,  ..., 0.0293, 0.0052, 0.0471],
        [0.0274, 0.0072, 0.0164,  ..., 0.0276, 0.0046, 0.0170],
        [0.0405, 0.0024, 0.0397,  ..., 0.0323, 0.0071, 0.0018]])
item_factors.weight tensor([[0.0045, 0.0021, 0.0212,  ..., 0.0477, 0.0295, 0.0098],
        [0.0230, 0.0036, 0.0030,  ..., 0.0093, 0.0137, 0.0215],
        [0.0182, 0.0427, 0.0451,  ..., 0.0040, 0.0495, 0.0362],
        ...,
        [0.0449, 0.0314, 0.0461,  ..., 0.0358, 0.0101, 0.0259],
        [0.0241, 0.0185, 0.0013,  ..., 0.0193, 0.0016, 0.0032],
        [0.0203, 0.0014, 0.0200,  ..., 0.0225, 0.0183, 0.0302]])


In [6]:
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
         if cuda:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()            
    #print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

  0%|          | 0/128 [00:00<?, ?it/s]

In [7]:
# By training the model, we will have tuned latent factors for recipes and users.
c = 0
uw = 0
iw = 0 
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
          uw = param.data
          c +=1
        else:
          iw = param.data
        #print('param_data', param_data)

trained_recipe_embeddings = model.item_factors.weight.data.cpu().numpy()

len(trained_recipe_embeddings) # unique recipe factor weights

user_factors.weight tensor([[0.0122, 0.0230, 0.0450,  ..., 0.0181, 0.0164, 0.0018],
        [0.0048, 0.0212, 0.0240,  ..., 0.0223, 0.0300, 0.0148],
        [0.0105, 0.0299, 0.0164,  ..., 0.0186, 0.0245, 0.0118],
        ...,
        [0.0145, 0.0172, 0.0006,  ..., 0.0293, 0.0052, 0.0471],
        [0.0274, 0.0072, 0.0164,  ..., 0.0276, 0.0046, 0.0170],
        [0.0405, 0.0024, 0.0397,  ..., 0.0323, 0.0071, 0.0018]])
item_factors.weight tensor([[0.0045, 0.0021, 0.0212,  ..., 0.0477, 0.0295, 0.0098],
        [0.0230, 0.0036, 0.0030,  ..., 0.0093, 0.0137, 0.0215],
        [0.0182, 0.0427, 0.0451,  ..., 0.0040, 0.0495, 0.0362],
        ...,
        [0.0449, 0.0314, 0.0461,  ..., 0.0358, 0.0101, 0.0259],
        [0.0241, 0.0185, 0.0013,  ..., 0.0193, 0.0016, 0.0032],
        [0.0203, 0.0014, 0.0200,  ..., 0.0225, 0.0183, 0.0302]])


5514

In [8]:
from sklearn.cluster import KMeans
# Fit the clusters based on the recipe weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_recipe_embeddings)

'''It can be seen here that the recipes that are in the same cluster tend to have
similar genres. Also note that the algorithm is unfamiliar with the recipe name
and only obtained the relationships by looking at the numbers representing how
users have responded to the recipe selections.'''
for cluster in range(10):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    movid = train_set.idx2recipeid[movidx]
    rat_count = reviewsdf.loc[reviewsdf['RecipeId']==movid].count()[0]
    movs.append((recept_names[movid], rat_count))
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Belly Buster
	 Pecan Pie Muffins
	 Melt in Your Mouth Banana Bread
	 Crock Pot Garlic Brown Sugar Chicken
	 Baked Pasta with Asparagus (Pasta al Forno con Asparagi)
	 Easy Pot Roast
	 Quick and Easy No Bake Chocolate Cookies
	 Pepsi Pork Roast
	 BBQ Ribs
	 Crock Pot Maple Country Ribs
Cluster #1
	 Steven's World Famous To-Die-For Sour Cream Chicken
	 Slow-Cooker Cheesy Chicken
	 Dutch Cucumber Salad
	 Chicken Tortilla Soup II
	 Creamy Taco Casserole
	 Bleu Cheese Beef Tenderloin
	 Mary's Best Zucchini Bread
	 Creamy Beef Stroganoff Over Rice
	 No Fuss No Mess Potato Pancakes
	 Hot German Potato Salad
Cluster #2
	 Yummy Banana Bread
	 The Bomb Burgers
	 Garlic Chicken Breasts in Balsamic Vinegar
	 Betty White's Chicken Wings
	 Asparagus with Toasted Pine Nuts & Lemon Vinaigrette
	 Easy Chicken & Potato Dinner
	 Depression Fudge Cake
	 Fantasy Fudge
	 Creamy Chicken and Penne
	 Spinach and Strawberry Salad
Cluster #3
	 Cracker Barrel's Hashbrowns Casserole - Copycat
	 Chicke