## Import Statements

pip3 install ipykernel torch torchvision torchaudio pandas scikit-learn flask cloudpickle

In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import pickle
import heapq

## Load in Data

In [2]:
# Load in data from csv files
movies_df = pd.read_csv("./Data/ml-latest-small/movies.csv")
ratings_df = pd.read_csv("./Data/ml-latest-small/ratings.csv")

print(f"Movie dataframe dimensions: {movies_df.shape}")
print(f"Ratings dataframe dimensions: {ratings_df.shape}")

# get number of unique users and movies
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())

ratings_df.head()

Movie dataframe dimensions: (9742, 3)
Ratings dataframe dimensions: (100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Relevant Classes Needed for PyTorch

In [3]:
class Model(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user and item embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.movie_factors = torch.nn.Embedding(n_items, n_factors)
        # fills weights with values from a uniform distribution [0, 0.5]
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.movie_factors.weight.data.uniform_(0, 0.05)
    
    def forward(self, data):
        # matrix multiplication between user and item factors, and then concatenates them to one column
        return (self.user_factors(data[:,0])*self.movie_factors(data[:,1])).sum(1)
    
    def predict(self, user, item):
        return (self.user_factors(user)*self.movie_factors(item)).sum(1)

In [4]:
class MovieDataset(Dataset):
    def __init__(self, ratings):
        self.ratings = ratings

        le = preprocessing.LabelEncoder()
        le.fit(self.ratings.movieId.values)
        self.lookup = dict(zip(le.transform(self.ratings.movieId.values), self.ratings.movieId.values))

        self.ratings.userId = preprocessing.LabelEncoder().fit_transform(self.ratings.userId.values)
        self.ratings.movieId = preprocessing.LabelEncoder().fit_transform(self.ratings.movieId.values)

        self.x = torch.tensor(self.ratings.drop(['rating', 'timestamp'], axis=1).values)
        self.y = torch.tensor(self.ratings['rating'].values)
    
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, item):
        return (self.x[item], self.y[item])

## Train Model

In [5]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = Model(n_users, n_items, n_factors=8)

for name, param in model.named_parameters():
    # prints the parameters who's changes will be recorded
    if param.requires_grad:
        print(name, param.data)

# enable GPU if you have a GPU
if cuda:
    model = model.cuda()

# MSE loss function
loss_fn = torch.nn.MSELoss()

# ADAM optimizier
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data
train_set = MovieDataset(ratings_df)
train_loader = DataLoader(train_set, 128, shuffle=True)

for it in range(num_epochs):
    losses = []
    for x, y in train_loader:
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Is running on GPU: False
user_factors.weight tensor([[0.0414, 0.0116, 0.0377,  ..., 0.0225, 0.0053, 0.0193],
        [0.0230, 0.0121, 0.0220,  ..., 0.0478, 0.0443, 0.0261],
        [0.0487, 0.0319, 0.0065,  ..., 0.0160, 0.0081, 0.0450],
        ...,
        [0.0448, 0.0048, 0.0022,  ..., 0.0335, 0.0283, 0.0262],
        [0.0305, 0.0256, 0.0112,  ..., 0.0350, 0.0448, 0.0079],
        [0.0027, 0.0127, 0.0356,  ..., 0.0005, 0.0011, 0.0232]])
movie_factors.weight tensor([[0.0440, 0.0252, 0.0293,  ..., 0.0262, 0.0207, 0.0034],
        [0.0412, 0.0053, 0.0209,  ..., 0.0428, 0.0488, 0.0186],
        [0.0180, 0.0401, 0.0086,  ..., 0.0015, 0.0429, 0.0120],
        ...,
        [0.0090, 0.0492, 0.0409,  ..., 0.0138, 0.0240, 0.0168],
        [0.0492, 0.0129, 0.0296,  ..., 0.0206, 0.0291, 0.0290],
        [0.0319, 0.0390, 0.0051,  ..., 0.0097, 0.0243, 0.0490]])
iter #0 Loss: 11.062116707642065
iter #1 Loss: 4.739447764938858
iter #2 Loss: 2.473496480036508
iter #3 Loss: 1.7203255179267245
iter #4 

In [7]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

user_factors.weight tensor([[ 1.5459,  1.5626,  0.3918,  ...,  1.3266,  1.2053,  1.6495],
        [ 1.4505,  0.5052,  1.0755,  ...,  0.2588,  0.9375,  1.3744],
        [ 1.4311, -0.3158, -0.4011,  ..., -0.8710, -2.2422,  2.6228],
        ...,
        [ 1.5472,  2.0237,  0.2342,  ...,  1.4624,  0.1823,  0.4366],
        [ 0.7654,  1.7095,  0.6038,  ...,  0.7168,  0.8237,  0.9143],
        [ 1.1587,  0.0038,  1.3299,  ...,  1.0626,  1.2879,  1.7076]])
movie_factors.weight tensor([[0.4595, 0.2566, 0.7599,  ..., 0.4822, 0.8101, 0.1779],
        [0.5607, 0.1511, 0.6720,  ..., 0.0382, 0.3733, 0.4972],
        [0.2896, 0.7683, 0.5279,  ..., 0.2558, 0.1718, 0.4468],
        ...,
        [0.4181, 0.4432, 0.4126,  ..., 0.4293, 0.4070, 0.4155],
        [0.4134, 0.4182, 0.4095,  ..., 0.4095, 0.4085, 0.3807],
        [0.4662, 0.5197, 0.5108,  ..., 0.5193, 0.5188, 0.4895]])


## Save Model

In [7]:
with open('recSys.pkl', 'wb') as f:
    pickle.dump(model, f)

### Function for getting top n recommendations for a certain user

In [39]:
def get_top_n_recommendations(i, n):
    # Load in data from csv files
    actual_ratings = pd.read_csv("./Data/ml-latest-small/ratings.csv")
    actual_ratings = actual_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(-1)
    predicted_ratings = torch.matmul(model.user_factors.weight.data, model.movie_factors.weight.data.T)

    # i < actual_ratings.shape[0]

    recommendations = []
    rated = []
    for j in range(actual_ratings.shape[1]):
        if actual_ratings.iloc[i][actual_ratings.columns[j]] > 0:
            rated.append((actual_ratings.columns[j], actual_ratings.iloc[i][actual_ratings.columns[j]], float(predicted_ratings[i][j])))
        else:
            heapq.heappush(recommendations, (-float(predicted_ratings[i][j]), actual_ratings.columns[j]))
    # difference between predicted and rated: sum(list(map(lambda x: (x[1] - x[2])**2, rated)))
    movie_names = movies_df.set_index('movieId')['title'].to_dict()
    for x in range(n):
        print(movie_names[recommendations[x][1]])

Last Tango in Paris (Ultimo tango a Parigi) (1972)
Sausage Party (2016)
Galaxy of Terror (Quest) (1981)
Repulsion (1965)
Half Nelson (2006)
Kung Pow: Enter the Fist (2002)
Looker (1981)
Alien Contamination (1980)
Master of the Flying Guillotine (Du bi quan wang da po xue di zi) (1975)
Shall We Dance (1937)


### Example: Running a clustering algorithm on our result

In [29]:
movie_names = movies_df.set_index('movieId')['title'].to_dict()
trained_movie_embeddings = model.movie_factors.weight.data.cpu().numpy()
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)
for cluster in range(10):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    movid = train_set.lookup[movidx]
    # print(ratings_df.loc[ratings_df['movieId']==movid].count())
    rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()["userId"]
    movs.append((movie_names[movid], rat_count))
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Swimming with Sharks (1995)
	 My Fair Lady (1964)
	 Philadelphia Story, The (1940)
	 High Noon (1952)
	 Maltese Falcon, The (1941)
	 Snow White and the Seven Dwarfs (1937)
	 Real Genius (1985)
	 Safe (1995)
	 Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
	 Speechless (1994)
Cluster #1
	 Restoration (1995)
	 What's Eating Gilbert Grape (1993)
	 Dangerous Minds (1995)
	 Puppet Masters, The (1994)
	 Some Kind of Wonderful (1987)
	 Sudden Death (1995)
	 New Guy, The (2002)
	 Gaslight (1944)
	 Jennifer 8 (1992)
	 Specialist, The (1994)
Cluster #2
	 Hate (Haine, La) (1995)
	 Chungking Express (Chung Hing sam lam) (1994)
	 Singin' in the Rain (1952)
	 Orlando (1992)
	 Afterglow (1997)
	 Unforgiven (1992)
	 Candleshoe (1977)
	 Empire (2002)
	 Penny Serenade (1941)
	 North by Northwest (1959)
Cluster #3
	 Miracle on 34th Street (1994)
	 Just Cause (1995)
	 Go Fish (1994)
	 How to Make an American Quilt (1995)
	 Piano, The (1993)
	 Mixed Nuts (1994)
	 NeverEnding Story III, The (199