## Import Statements

In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import pickle

In [2]:
# Load in data from csv files
movies_df = pd.read_csv("./Data/ml-latest-small/movies.csv")
ratings_df = pd.read_csv("./Data/ml-latest-small/ratings.csv")

print(f"Movie dataframe dimensions: {movies_df.shape}")
print(f"Ratings dataframe dimensions: {ratings_df.shape}")

# get number of unique users and movies
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())

ratings_df.head()

Movie dataframe dimensions: (9742, 3)
Ratings dataframe dimensions: (100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Relevant Classes Needed for PyTorch

In [4]:
class Model(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user and item embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.movie_factors = torch.nn.Embedding(n_items, n_factors)
        # fills weights with values from a uniform distribution [0, 0.5]
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.movie_factors.weight.data.uniform_(0, 0.05)
    
    def forward(self, data):
        # matrix multiplication between user and item factors, and then concatenates them to one column
        return (self.user_factors(data[:,0])*self.movie_factors(data[:,1])).sum(1)

In [5]:
class MovieDataset(Dataset):
    def __init__(self, ratings):
        self.ratings = ratings

        le = preprocessing.LabelEncoder()
        le.fit(self.ratings.movieId.values)
        self.lookup = dict(zip(le.transform(self.ratings.movieId.values), self.ratings.movieId.values))

        self.ratings.userId = preprocessing.LabelEncoder().fit_transform(self.ratings.userId.values)
        self.ratings.movieId = preprocessing.LabelEncoder().fit_transform(self.ratings.movieId.values)

        self.x = torch.tensor(self.ratings.drop(['rating', 'timestamp'], axis=1).values)
        self.y = torch.tensor(self.ratings['rating'].values)
    
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, item):
        return (self.x[item], self.y[item])

## Train Model

In [6]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = Model(n_users, n_items, n_factors=8)

for name, param in model.named_parameters():
    # prints the parameters who's changes will be recorded
    if param.requires_grad:
        print(name, param.data)

# enable GPU if you have a GPU
if cuda:
    model = model.cuda()

# MSE loss function
loss_fn = torch.nn.MSELoss()

# ADAM optimizier
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data
train_set = MovieDataset(ratings_df)
train_loader = DataLoader(train_set, 128, shuffle=True)

for it in range(num_epochs):
    losses = []
    for x, y in train_loader:
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Is running on GPU: False
user_factors.weight tensor([[0.0456, 0.0344, 0.0102,  ..., 0.0156, 0.0445, 0.0421],
        [0.0237, 0.0284, 0.0106,  ..., 0.0487, 0.0370, 0.0430],
        [0.0291, 0.0203, 0.0149,  ..., 0.0246, 0.0217, 0.0102],
        ...,
        [0.0142, 0.0446, 0.0113,  ..., 0.0331, 0.0417, 0.0308],
        [0.0208, 0.0301, 0.0235,  ..., 0.0450, 0.0350, 0.0035],
        [0.0247, 0.0423, 0.0334,  ..., 0.0352, 0.0226, 0.0256]])
movie_factors.weight tensor([[0.0127, 0.0288, 0.0076,  ..., 0.0290, 0.0072, 0.0496],
        [0.0424, 0.0048, 0.0441,  ..., 0.0243, 0.0435, 0.0201],
        [0.0066, 0.0483, 0.0083,  ..., 0.0437, 0.0146, 0.0002],
        ...,
        [0.0420, 0.0465, 0.0042,  ..., 0.0113, 0.0116, 0.0366],
        [0.0461, 0.0219, 0.0415,  ..., 0.0111, 0.0074, 0.0330],
        [0.0483, 0.0250, 0.0116,  ..., 0.0144, 0.0110, 0.0437]])
iter #0 Loss: 11.073272115688033
iter #1 Loss: 4.743582609946353
iter #2 Loss: 2.4732775609505357
iter #3 Loss: 1.7201407274921534
iter #4

Save Model

In [7]:
with open('recSys.pkl', 'wb') as f:
    pickle.dump(model, f)

In [8]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

user_factors.weight tensor([[ 1.4406,  1.5085,  1.3647,  ...,  1.3247,  1.1655,  1.8264],
        [ 0.9310,  1.1197, -0.0258,  ...,  1.1496,  1.8208,  1.7693],
        [ 1.1364,  1.7194,  0.7117,  ..., -2.6151,  1.3925, -0.2541],
        ...,
        [ 0.3648,  0.5481,  1.3855,  ..., -0.6589,  0.6806,  1.1321],
        [ 0.5329,  1.3394,  1.0249,  ...,  0.6849,  1.2605,  1.1007],
        [ 1.8096,  1.5428,  1.3131,  ...,  0.4340,  0.8188,  0.4304]])
movie_factors.weight tensor([[0.5541, 0.4625, 0.3197,  ..., 0.6063, 0.3152, 0.3232],
        [0.3978, 0.4120, 0.4124,  ..., 0.6047, 0.3821, 0.6489],
        [0.6360, 0.3837, 0.3564,  ..., 0.3592, 0.7338, 0.6272],
        ...,
        [0.4305, 0.4024, 0.4302,  ..., 0.4395, 0.4208, 0.3973],
        [0.4053, 0.4127, 0.4042,  ..., 0.4353, 0.4248, 0.3825],
        [0.4924, 0.5075, 0.4913,  ..., 0.4938, 0.5000, 0.3327]])


In [9]:
movie_names = movies_df.set_index('movieId')['title'].to_dict()
trained_movie_embeddings = model.movie_factors.weight.data.cpu().numpy()
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)
for cluster in range(10):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    movid = train_set.lookup[movidx]
    # print(ratings_df.loc[ratings_df['movieId']==movid].count())
    rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()["userId"]
    movs.append((movie_names[movid], rat_count))
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Go Fish (1994)
	 Chungking Express (Chung Hing sam lam) (1994)
	 On Her Majesty's Secret Service (1969)
	 Three Colors: Blue (Trois couleurs: Bleu) (1993)
	 Bubba Ho-tep (2002)
	 Substitute, The (1996)
	 Insider, The (1999)
	 Money Train (1995)
	 Emma (1996)
	 Death and the Maiden (1994)
Cluster #1
	 Poetic Justice (1993)
	 Lost Weekend, The (1945)
	 Don Juan DeMarco (1995)
	 Being Human (1993)
	 For Whom the Bell Tolls (1943)
	 Bread and Chocolate (Pane e cioccolata) (1973)
	 Kissed (1996)
	 Philadelphia (1993)
	 Inspector General, The (1949)
	 Jefferson in Paris (1995)
Cluster #2
	 Philadelphia Story, The (1940)
	 Afterglow (1997)
	 Unforgiven (1992)
	 Candleshoe (1977)
	 Empire (2002)
	 Blue in the Face (1995)
	 Penny Serenade (1941)
	 Untouchables, The (1987)
	 Battle Royale (Batoru rowaiaru) (2000)
	 Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
Cluster #3
	 Secret of Roan Inish, The (1994)
	 Dangerous Minds (1995)
	 Run Silent Run Deep (1958)
	 National Lampoon's Sen