In [69]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
import pandas as pd

In [71]:
anime_df = pd.read_csv('drive/MyDrive/anime_data/anime.csv')
ratings_df = pd.read_csv('drive/MyDrive/anime_data/rating_complete.csv')[:100000]
#lowered ratings dataframe length to free RAM

In [72]:
print(anime_df.shape)
print(ratings_df.shape)

(17562, 35)
(100000, 3)


In [73]:
anime_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [74]:
ratings_df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


In [75]:
anime_names = anime_df.set_index('MAL_ID')['Name'].to_dict()

n_users = len(ratings_df.user_id.unique())
n_items = len(anime_df.MAL_ID.unique())

print(n_users)
print(n_items)

575
17562


In [76]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

In [77]:
class MatrixFactorization(torch.nn.Module):
  def __init__(self, n_users, n_items, n_factors = 20):
    super().__init__()
    self.user_factors = torch.nn.Embedding(n_users, n_factors)
    self.item_factors = torch.nn.Embedding(n_items, n_factors)
    self.user_factors.weight.data.uniform_(0, 0.05)
    self.item_factors.weight.data.uniform_(0, 0.05)

  def forward(self,data):
    #matrix algebra
    users, items = data[:,0], data[:,1]
    return self.user_factors(users)* self.item_factors((items)).sum(1)

  def predict(self, user, item):
    return self.forward(user, item)

In [80]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
  def __init__(self):
    self.ratings = ratings_df.copy()

    users = ratings_df.user_id.unique()
    anime = ratings_df.anime_id.unique()

    self.userid2idx = {o: i for i, o in enumerate(users)}
    self.animeid2idx = {o: i for i, o in enumerate(anime)}

    self.idx2userid = {i: o for o, i in self.userid2idx.items()}
    self.idx2animeid = {i: o for o, i in self.animeid2idx.items()}

    self.ratings.animeId = ratings_df.anime_id.apply(lambda x: self.animeid2idx[x])
    self.ratings.userId = ratings_df.user_id.apply(lambda x: self.userid2idx[x])

    self.x = self.ratings.drop(['rating'], axis = 1).values
    self.y = self.ratings['rating'].values
    self.x, self.y = torch.tensor(self.x), torch.tensor(self.y)

  def __getitem__(self, index):
    return (self.x[index], self.y[index])

  def __len__(self):
    return len(self.ratings)

In [81]:
num_epochs = 32
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=4)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizier
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, 64, shuffle=True)

Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(575, 4)
  (item_factors): Embedding(17562, 4)
)
user_factors.weight tensor([[0.0460, 0.0433, 0.0179, 0.0005],
        [0.0291, 0.0361, 0.0432, 0.0290],
        [0.0295, 0.0234, 0.0034, 0.0010],
        ...,
        [0.0049, 0.0221, 0.0073, 0.0242],
        [0.0332, 0.0296, 0.0164, 0.0472],
        [0.0076, 0.0196, 0.0171, 0.0452]])
item_factors.weight tensor([[0.0470, 0.0193, 0.0132, 0.0471],
        [0.0339, 0.0015, 0.0028, 0.0223],
        [0.0371, 0.0338, 0.0380, 0.0165],
        ...,
        [0.0304, 0.0469, 0.0153, 0.0136],
        [0.0449, 0.0353, 0.0118, 0.0284],
        [0.0387, 0.0179, 0.0164, 0.0275]])


In [82]:
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
         if cuda:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/32 [00:00<?, ?it/s]

In [83]:
# By training the model, we will have tuned latent factors for movies and users.
c = 0
uw = 0
iw = 0
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
          uw = param.data
          c +=1
        else:
          iw = param.data
        #print('param_data', param_data)

user_factors.weight tensor([[0.0460, 0.0433, 0.0179, 0.0005],
        [0.0291, 0.0361, 0.0432, 0.0290],
        [0.0295, 0.0234, 0.0034, 0.0010],
        ...,
        [0.0049, 0.0221, 0.0073, 0.0242],
        [0.0332, 0.0296, 0.0164, 0.0472],
        [0.0076, 0.0196, 0.0171, 0.0452]])
item_factors.weight tensor([[0.0470, 0.0193, 0.0132, 0.0471],
        [0.0339, 0.0015, 0.0028, 0.0223],
        [0.0371, 0.0338, 0.0380, 0.0165],
        ...,
        [0.0304, 0.0469, 0.0153, 0.0136],
        [0.0449, 0.0353, 0.0118, 0.0284],
        [0.0387, 0.0179, 0.0164, 0.0275]])


In [84]:
trained_anime_embeddings = model.item_factors.weight.data.cpu().numpy()

In [85]:
len(trained_anime_embeddings)

17562

In [87]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_anime_embeddings)

for cluster in range(8):
  print("Cluster #{}".format(cluster))
  anims = []
  for anime_idx in np.where(kmeans.labels_ == cluster)[0]:
    anime_id = train_set.idx2animeid[anime_idx]
    rat_count = ratings_df.loc[ratings_df['anime_id']==anime_id].count()[0]
    anims.append((anime_names[anime_id], rat_count))
  for anim in sorted(anims, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", anim[0])

  super()._check_params_vs_input(X, default_n_init=10)


Cluster #0


  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['anime_id']==movid].count()[0]
  rat_

KeyError: 7134