<a href="https://colab.research.google.com/github/rencdr/python.movie.recommendation/blob/master/python_movie_recom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

In [56]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [57]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [58]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        # matrix multiplication
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)


    def predict(self, user, item):
        return self.forward(user, item)

In [59]:
movie_names = movies_df.set_index('movieId')['title'].to_dict()
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())

In [60]:
# Creating the dataloader
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
class Loader(Dataset):
    def __init__(self):
        self.ratings = ratings_df.copy()

        users = ratings_df.userId.unique()
        movies = ratings_df.movieId.unique()

        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.movieid2idx = {o:i for i,o in enumerate(movies)}

        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}

        self.ratings.movieId = ratings_df.movieId.apply(lambda x: self.movieid2idx[x])
        self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])


        self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
        self.y = self.ratings['rating'].values
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)


In [61]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
if cuda:
    model = model.cuda()

loss_fn = torch.nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0409, 0.0155, 0.0320,  ..., 0.0246, 0.0038, 0.0483],
        [0.0270, 0.0033, 0.0049,  ..., 0.0399, 0.0488, 0.0301],
        [0.0239, 0.0366, 0.0025,  ..., 0.0177, 0.0393, 0.0410],
        ...,
        [0.0435, 0.0494, 0.0281,  ..., 0.0302, 0.0138, 0.0031],
        [0.0393, 0.0315, 0.0318,  ..., 0.0052, 0.0117, 0.0216],
        [0.0119, 0.0091, 0.0318,  ..., 0.0362, 0.0254, 0.0321]])
item_factors.weight tensor([[0.0136, 0.0154, 0.0363,  ..., 0.0126, 0.0293, 0.0127],
        [0.0065, 0.0295, 0.0451,  ..., 0.0456, 0.0428, 0.0489],
        [0.0285, 0.0448, 0.0443,  ..., 0.0004, 0.0029, 0.0140],
        ...,
        [0.0247, 0.0477, 0.0014,  ..., 0.0136, 0.0240, 0.0372],
        [0.0157, 0.0305, 0.0380,  ..., 0.0008, 0.0297, 0.0132],
        [0.0485, 0.0418, 0.0314,  ..., 0.0472, 0.0440, 0.0317]])


In [63]:
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
         if cuda:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
    #print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

In [64]:
c = 0
uw = 0
iw = 0
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
          uw = param.data
          c +=1
        else:
          iw = param.data


user_factors.weight tensor([[0.0409, 0.0155, 0.0320,  ..., 0.0246, 0.0038, 0.0483],
        [0.0270, 0.0033, 0.0049,  ..., 0.0399, 0.0488, 0.0301],
        [0.0239, 0.0366, 0.0025,  ..., 0.0177, 0.0393, 0.0410],
        ...,
        [0.0435, 0.0494, 0.0281,  ..., 0.0302, 0.0138, 0.0031],
        [0.0393, 0.0315, 0.0318,  ..., 0.0052, 0.0117, 0.0216],
        [0.0119, 0.0091, 0.0318,  ..., 0.0362, 0.0254, 0.0321]])
item_factors.weight tensor([[0.0136, 0.0154, 0.0363,  ..., 0.0126, 0.0293, 0.0127],
        [0.0065, 0.0295, 0.0451,  ..., 0.0456, 0.0428, 0.0489],
        [0.0285, 0.0448, 0.0443,  ..., 0.0004, 0.0029, 0.0140],
        ...,
        [0.0247, 0.0477, 0.0014,  ..., 0.0136, 0.0240, 0.0372],
        [0.0157, 0.0305, 0.0380,  ..., 0.0008, 0.0297, 0.0132],
        [0.0485, 0.0418, 0.0314,  ..., 0.0472, 0.0440, 0.0317]])


In [65]:
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [66]:
len(trained_movie_embeddings)


9724

In [67]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)



In [68]:
for cluster in range(10):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    movid = train_set.idx2movieid[movidx]
    rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
    movs.append((movie_names[movid], rat_count))
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Saving Private Ryan (1998)
	 Beauty and the Beast (1991)
	 Mrs. Doubtfire (1993)
	 Stargate (1994)
	 Pretty Woman (1990)
	 One Flew Over the Cuckoo's Nest (1975)
	 Monsters, Inc. (2001)
	 Godfather: Part II, The (1974)
	 Goodfellas (1990)
Cluster #1
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Batman (1989)
	 Men in Black (a.k.a. MIB) (1997)
	 Finding Nemo (2003)
	 Léon: The Professional (a.k.a. The Professional) (Léon) (1994)
	 Ocean's Eleven (2001)
	 Breakfast Club, The (1985)
	 Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)
	 Indiana Jones and the Temple of Doom (1984)
	 American Pie (1999)
Cluster #2
	 Silence of the Lambs, The (1991)
	 Schindler's List (1993)
	 Usual Suspects, The (1995)
	 Independence Day (a.k.a. ID4) (1996)
	 Star Wars: Episode VI - Return of the Jedi (1983)
	 Sixth Sense, The (1999)
	 Groundhog Day (1993)
	 Indiana Jones and the Last Crusade (1989