In [1]:
# import the dataset
import pandas as pd
problems_df = pd.read_csv('ppp.csv')
target_df = pd.read_csv('tar.csv')

In [2]:
print('The dimensions of movies dataframe are:', problems_df.shape,'\nThe dimensions of ratings dataframe are:', target_df.shape)

The dimensions of movies dataframe are: (20114, 12) 
The dimensions of ratings dataframe are: (8270, 2)


In [3]:
problems_df.drop(['Submission_Id','User_Rating', 'User_Name'], axis=1, inplace=True)
problems_df.drop(['Problem_Rating','Unnamed: 7','Unnamed: 8','Unnamed: 9','Unnamed: 10','Unnamed: 11'],axis=1,inplace=True)
result_df = problems_df.drop_duplicates(subset=['Problem Name'], keep='first')

In [4]:
print('The dimensions of movies dataframe are:', problems_df.shape,'\nThe dimensions of ratings dataframe are:', target_df.shape)


The dimensions of movies dataframe are: (20114, 3) 
The dimensions of ratings dataframe are: (8270, 2)


In [5]:
# Take a look at movies_df
problems_df.head()

Unnamed: 0,prob_id,Problem Name,Tags
0,1,1037CEqualize,"['dp', 'greedy', 'strings']"
1,2,266BQueue at the School,"['constructive algorithms', 'graph matchings',..."
2,3,1583BOmkar and Heavenly Tree,"['constructive algorithms', 'trees']"
3,4,112APetya and Strings,"['implementation', 'strings']"
4,5,443AAnton and Letters,"['constructive algorithms', 'implementation']"


In [6]:
# Take a look at ratings_df
target_df.head()

Unnamed: 0,tar_id,Target Variable
0,1,0.255814
1,2,0.353846
2,3,0.4
3,4,0.261538
4,5,0.473684


In [7]:
# Movie ID to movie name mapping
problem_name = problems_df.set_index('prob_id')['Problem Name'].to_dict()

n_probs = len(problems_df.prob_id.unique())#not sure whats happening here  #n_user changed to n_probs

n_items = len(target_df.tar_id.unique())
print("Number of unique users:", n_probs)
print("Number of unique problems:", n_items)
print("The full rating matrix will have:", n_probs*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(target_df))
print("Therefore: ", len(target_df) / (n_probs*n_items) * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
print("And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")

Number of unique users: 20114
Number of unique problems: 8270
The full rating matrix will have: 166342780 elements.
----------
Number of ratings: 8270
Therefore:  0.004971661529283087 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [8]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors) # think of this as a lookup table for the input.
        # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors) # think of this as a lookup table for the input.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)
        
    def forward(self, data):
        # matrix multiplication
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)
    # def forward(self, user, item):
    # 	# matrix multiplication
    #     return (self.user_factors(user)*self.item_factors(item)).sum(1)
    
    def predict(self, user, item):
        return self.forward(user, item)

In [13]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't 'good' practice, in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
    def __init__(self):
        self.ratings = target_df.copy()#changed ratings to target
        
        # Extract all user IDs and movie IDs
        users = target_df.tar_id.unique()      ###changine user_id to tar_id
        movies = target_df.tar_id.unique()      ##i changed somethng
        
        #--- Producing new continuous IDs for users and movies ---
        
        # Unique values : index
        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.movieid2idx = {o:i for i,o in enumerate(movies)}
        
        # Obtained continuous ID for users and movies
        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}
        
        # return the id from the indexed values as noted in the lambda function down below.
        self.ratings.tar_id = target_df.tar_id.apply(lambda x: self.movieid2idx[x])
        self.ratings.tar_id = target_df.tar_id.apply(lambda x: self.userid2idx[x])
        
        
        self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
        self.y = self.ratings['rating'].values
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y) # Transforms the data to tensors (ready for torch models.)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)

In [14]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_probs, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizier
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(20114, 8)
  (item_factors): Embedding(8270, 8)
)
user_factors.weight tensor([[0.0407, 0.0143, 0.0188,  ..., 0.0125, 0.0058, 0.0408],
        [0.0157, 0.0352, 0.0325,  ..., 0.0050, 0.0321, 0.0381],
        [0.0238, 0.0409, 0.0358,  ..., 0.0265, 0.0004, 0.0125],
        ...,
        [0.0399, 0.0190, 0.0267,  ..., 0.0036, 0.0069, 0.0227],
        [0.0097, 0.0033, 0.0093,  ..., 0.0449, 0.0358, 0.0267],
        [0.0380, 0.0348, 0.0399,  ..., 0.0048, 0.0436, 0.0139]])
item_factors.weight tensor([[0.0117, 0.0311, 0.0090,  ..., 0.0458, 0.0016, 0.0005],
        [0.0375, 0.0067, 0.0199,  ..., 0.0250, 0.0211, 0.0147],
        [0.0273, 0.0055, 0.0250,  ..., 0.0063, 0.0220, 0.0184],
        ...,
        [0.0060, 0.0489, 0.0070,  ..., 0.0148, 0.0471, 0.0071],
        [0.0285, 0.0274, 0.0446,  ..., 0.0132, 0.0328, 0.0493],
        [0.0074, 0.0485, 0.0483,  ..., 0.0483, 0.0458, 0.0333]])


KeyError: "['rating' 'timestamp'] not found in axis"

In [None]:
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
         if cuda:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))

iter #0 Loss: 11.064539746584625
iter #1 Loss: 4.749444664734875
iter #2 Loss: 2.4776223112786484
iter #3 Loss: 1.72177541634153
iter #4 Loss: 1.3460204232919035
iter #5 Loss: 1.1284338527827094
iter #6 Loss: 0.9915084930392086
iter #7 Loss: 0.9002359872542057
iter #8 Loss: 0.8372054895395555
iter #9 Loss: 0.7921712103909647
iter #10 Loss: 0.7592169020306035
iter #11 Loss: 0.734483423155879
iter #12 Loss: 0.7160191048825453
iter #13 Loss: 0.7014374917912
iter #14 Loss: 0.690310317020731
iter #15 Loss: 0.6816033662348835
iter #16 Loss: 0.6750705706589113
iter #17 Loss: 0.669739288727039
iter #18 Loss: 0.6658609225816533
iter #19 Loss: 0.6628223651935001
iter #20 Loss: 0.6604955623370742
iter #21 Loss: 0.6587029967059944
iter #22 Loss: 0.6573294878459824
iter #23 Loss: 0.6564567382565609
iter #24 Loss: 0.6555627360183576
iter #25 Loss: 0.6549196255388599
iter #26 Loss: 0.6539314403978701
iter #27 Loss: 0.6528688056517373
iter #28 Loss: 0.6521614116006696
iter #29 Loss: 0.6506869314890827

In [None]:
# By training the model, we will have tuned latent factors for movies and users.
c = 0
uw = 0
iw = 0 
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
          uw = param.data
          c +=1
        else:
          iw = param.data
        #print('param_data', param_data)

user_factors.weight tensor([[ 0.6636,  1.7670,  1.5851,  ...,  1.5591,  0.9846,  1.3279],
        [ 0.2886,  0.8389,  0.4740,  ...,  0.0211,  1.7432,  1.8250],
        [-0.1276,  0.3505, -1.8636,  ..., -1.8244,  2.3191,  2.1920],
        ...,
        [ 1.5623,  0.0149,  0.6837,  ...,  2.2128,  1.5504, -0.0838],
        [ 0.8281,  0.7045,  0.6922,  ...,  1.0350,  0.5839,  1.2660],
        [ 0.8088,  1.1780,  0.7822,  ...,  0.9027,  2.0788,  1.3200]],
       device='cuda:0')
item_factors.weight tensor([[ 0.4925,  0.3693,  0.7865,  ...,  0.1419,  0.4939,  0.3609],
        [ 0.1813,  0.2992,  0.4251,  ...,  0.5503, -0.0829,  0.6948],
        [ 0.5640,  0.3030,  0.5318,  ...,  0.5683,  0.7503,  0.3950],
        ...,
        [ 0.3246,  0.3380,  0.3319,  ...,  0.3477,  0.3367,  0.3458],
        [ 0.3952,  0.4012,  0.3834,  ...,  0.3810,  0.3876,  0.3640],
        [ 0.4014,  0.4191,  0.4042,  ...,  0.3881,  0.4108,  0.4209]],
       device='cuda:0')


In [None]:
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [None]:
len(trained_movie_embeddings) # unique movie factor weights

9724

In [None]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)

In [None]:
'''It can be seen here that the movies that are in the same cluster tend to have
similar genres. Also note that the algorithm is unfamiliar with the movie name
and only obtained the relationships by looking at the numbers representing how
users have responded to the movie selections.'''
for cluster in range(10):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    movid = train_set.idx2movieid[movidx]
    rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
    movs.append((movie_names[movid], rat_count))
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Star Wars: Episode IV - A New Hope (1977)
	 Terminator 2: Judgment Day (1991)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Usual Suspects, The (1995)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Lord of the Rings: The Fellowship of the Ring, The (2001)
Cluster #1
	 Godzilla (1998)
	 Super Mario Bros. (1993)
	 Honey, I Blew Up the Kid (1992)
	 Battlefield Earth (2000)
	 Mighty Morphin Power Rangers: The Movie (1995)
	 Superman IV: The Quest for Peace (1987)
	 Next Karate Kid, The (1994)
	 Volcano (1997)
	 Karate Kid, Part III, The (1989)
	 Rambo III (1988)
Cluster #2
	 Alien (1979)
	 Die Hard (1988)
	 Groundhog Day (1993)
	 Terminator, The (1984)
	 Aliens (1986)
	 Austin Powers: The Spy Who Shagged Me (1999)
	 Clerks (1994)
	 American Pie (1999)
	 Heat (1995)
	 Austin Powers: International Man of Mystery (1997)
C