In [4]:
import numpy as np 
import pandas as pd
import matplotlib.pylab as plt
import torch 
import torch.nn as nn
import torch.utils.data as data 
import random 
from tqdm import tqdm 
from collections import defaultdict 
import os 
from sklearn import preprocessing 
from sklearn.metrics import roc_auc_score 

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Preprocess with txt files

In [9]:
def my_txt_preprocess(ratings_df_path, social_df_path): 
  f = open(ratings_df_path, 'r')
  head = f.readline() 
  tmp_list = []
  for line in f:
      try : 
          u, i, r = line.split()
          tmp_list.append([u, i, r])
      except: 
          print(line)
          break
  ratings_df = pd.DataFrame(tmp_list, columns=['user_id', 'item_id', 'rating'])
  user_id_series = ratings_df['user_id']
  user_id_series = user_id_series.sample(n=int(len(set(user_id_series)) * 0.1), replace=False).reset_index(drop=True)
  item_id_series = ratings_df['item_id']
  item_id_series = item_id_series.sample(n=int(len(set(item_id_series)) * 0.1), replace=False).reset_index(drop=True)
  print(len(set(user_id_series)), len(set(item_id_series)) )
  ratings_df = ratings_df[(ratings_df['user_id'].isin(user_id_series.values)) & (ratings_df['item_id'].isin(item_id_series.values))].reset_index(drop=True)
  
  f = open(social_df_path, 'r')
  head = f.readline() 
  tmp_list = []
  for line in f:
      try : 
          u, i, r = line.split()
          tmp_list.append([u, i, r])
      except: 
          print(line)
          break
  social_df = pd.DataFrame(tmp_list, columns=['user_id','user_id2','trust'])
  social_df = social_df[(social_df['user_id'].isin(ratings_df['user_id'].values)) & (social_df['user_id2'].isin(ratings_df['user_id'].values))].reset_index(drop=True)
  ratings_df.to_csv(ratings_df_path, sep=' ', index=False)
  social_df.to_csv(social_df_path, sep=' ', index=False)
  return ratings_df, social_df

In [11]:
import os 
ratings_df_path = '/content/drive/MyDrive/epinions/ratings_df.txt' # FIXME
social_df_path = '/content/drive/MyDrive/epinions/social_df.txt' # FIXME
if os.path.exists(ratings_df_path) and os.path.exists(social_df_path):
  ratings_df = pd.read_table(ratings_df_path,sep=' ')
  social_df = pd.read_table(social_df_path,sep=' ')
else : 
  ratings_df, social_df = my_txt_preprocess(ratings_df_path, social_df_path)

In [12]:
ratings_df

Unnamed: 0,user_id,item_id,rating
0,1,106,5
1,1,107,5
2,1,108,5
3,1,115,5
4,1,128,5
...,...,...,...
79853,49154,6128,3
79854,49154,6147,5
79855,49174,20646,5
79856,49212,44371,1


In [13]:
social_df

Unnamed: 0,user_id,user_id2,trust
0,23298,18079,1
1,23298,21513,1
2,17630,731,1
3,17630,2175,1
4,17630,2088,1
...,...,...,...
42945,511,1285,1
42946,511,794,1
42947,511,444,1
42948,511,2920,1


# DataLoader with train valid set

In [14]:
# f(x)=(x − 1)/(Rmax − 1)
#ratings_df['rating'] = (ratings_df['rating'] - 1) / 4
ratings_df['rating'] /= 5
set(ratings_df['rating'])

{0.2, 0.4, 0.6, 0.8, 1.0}

In [15]:
le = preprocessing.LabelEncoder()
le.fit(ratings_df['user_id'])
ratings_df['user_id']=le.transform(ratings_df['user_id'])
social_df['user_id']=le.transform(social_df['user_id'])
social_df['user_id2']=le.transform(social_df['user_id2'])
le.fit(ratings_df['item_id'])
ratings_df['item_id']=le.transform(ratings_df['item_id'])

In [16]:
ratings_df

Unnamed: 0,user_id,item_id,rating
0,0,5,1.0
1,0,6,1.0
2,0,7,1.0
3,0,8,1.0
4,0,9,1.0
...,...,...,...
79853,3164,1152,0.6
79854,3164,1167,1.0
79855,3165,3103,1.0
79856,3166,5029,0.2


In [17]:
social_user = defaultdict(set) 
for (user, user2, record) in social_df.itertuples(index=False): 
  social_user.setdefault(user, set()) 
  social_user[user].add(user2)

In [18]:
#예를 들어 181번 유저가 trust하는 유저들 
print(social_user[181])

{4, 136, 268, 142, 400, 149, 151, 162, 553, 63, 192, 320, 1093, 859, 221, 228, 231, 112, 241, 627, 628, 891, 126}


In [54]:
class Config: 
  trainset_rate = 0.8
  batch_size = 32
  device = device
  epochs = 5
  learning_rate = 0.01

In [21]:
df_train = ratings_df.sample(n=int(len(ratings_df) * Config.trainset_rate), replace=False)

In [22]:
df_train

Unnamed: 0,user_id,item_id,rating
34589,708,726,1.0
53208,1345,4617,0.8
45576,1056,2417,0.8
21346,397,2109,1.0
57929,1562,7935,0.2
...,...,...,...
78140,2954,1147,0.8
28860,562,1250,0.8
37776,789,54,0.8
59748,1647,851,1.0


In [23]:
df_test = ratings_df.drop(df_train.index, axis=0)
df_test

Unnamed: 0,user_id,item_id,rating
1,0,6,1.0
4,0,9,1.0
6,0,11,1.0
7,0,12,1.0
13,0,17,1.0
...,...,...,...
79810,3154,1218,1.0
79811,3155,227,1.0
79820,3156,700,1.0
79824,3156,1263,1.0


In [24]:
ratings_df

Unnamed: 0,user_id,item_id,rating
0,0,5,1.0
1,0,6,1.0
2,0,7,1.0
3,0,8,1.0
4,0,9,1.0
...,...,...,...
79853,3164,1152,0.6
79854,3164,1167,1.0
79855,3165,3103,1.0
79856,3166,5029,0.2


In [25]:
n_users = max(set(ratings_df['user_id'].values)) + 1
n_items = max(set(ratings_df['item_id'].values)) + 1
print(n_users, n_items)

3168 8075


In [26]:
class my_dataset(data.Dataset): 
  def __init__(self, df): 
    df.index = range(len(df)) 
    self.df= df
    self.user_item = {}
    for (user, item, record) in df.itertuples(index=False): 
      self.user_item.setdefault(user-1, {}) 
      self.user_item[user-1][item-1] = record 

  def __getitem__(self, index): 
    user = int(self.df.loc[index]['user_id']) 
    item = int(self.df.loc[index]['item_id']) 
    rating = float(self.df.loc[index]['rating']) 
    return (user, item), rating 
  
  def __len__(self): 
    return len(self.df) 

In [52]:
train_dataset = my_dataset(df_train)
test_dataset = my_dataset(df_test)
train_loader = data.DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=Config.batch_size, shuffle=False)

In [69]:
class SoRec(torch.nn.Module): 
  def __init__(self, n_users, n_items, social_user, n_factors=10, lr=0.01, lambda_1=10, lambda_2=0.001, sparse=False, device=torch.device("cuda")): 
    super(SoRec, self).__init__() 

    self.n_users = n_users 
    self.n_items = n_items 
    self.social_user = social_user 
    self.n_factors = n_factors 
    self.lr = lr 
    self.lambda_1 = lambda_1 
    self.lambda_2 = lambda_2 
    self.sparse = sparse 
    self.device = device 

    self.user_embeddings = nn.Embedding(self.n_users, self.n_factors, sparse=self.sparse) 
    self.item_embeddings = nn.Embedding(self.n_items, self.n_factors, sparse=self.sparse) 
    self.social_embeddings = nn.Embedding(self.n_items, self.n_factors, sparse=self.sparse)

    nn.init.normal_(self.user_embeddings.weight, std=0.01)
    nn.init.normal_(self.item_embeddings.weight, std=0.01)
    nn.init.normal_(self.social_embeddings.weight, std=0.01)
    self.optimizer = torch.optim.Adam(self.parameters(), lr=lr) 
    self = self.to(self.device)

  def forward(self, users, items): 
    users = users.to(self.device) 
    items = items.to(self.device) 
    ues = self.user_embeddings(users) 
    uis = self.item_embeddings(items) 
    preds = (ues * uis).sum(dim=1, keepdim=True) 
    return preds.squeeze() 

  def social_forward(self, users, users_2): 
    users = users.to(self.device) 
    users_2 = users_2.to(self.device) 
    ues = self.user_embeddings(users) 
    ues_2 = self.user_embeddings(users_2) 
    preds = (ues * ues_2).sum(dim=1, keepdim=True) 
    return preds.squeeze() 

  def fit(self, train_loader): 
    for epoch in range(Config.epochs): 
      train_losses = 0. 
      optimizer = torch.optim.Adam(self.parameters(), lr=Config.learning_rate)

      model.train() 
      pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc="({0}: {1:^3}".format('train', epoch + 1)) 
      for idx, ((row, col), val) in pbar: 
        optimizer.zero_grad() 
        row = row.long() 
        col = col.long() 
        val = val.float().to(device) 
        preds = model.forward(row, col) 
        sim_user = []
        sim_val = []
        row_2 = [] 
        for i in range(row.shape[0]): 
          if len(social_user[int(row[i])]) > 0: 
            sim_user += random.sample(social_user[int(row[i])], 1)
            sim_val.append(1/len(social_user[int(row[i])]))
            row_2.append(row[i])
        sim_user = torch.tensor(sim_user).long().to(device) 
        row_2 = torch.tensor(row_2).long().to(device) 
        sim_val = torch.tensor(sim_val).float().to(device) 
        sim_preds = model.social_forward(row_2, sim_user) 
        loss = nn.MSELoss(reduction='sum')(preds, val) 
        loss += self.lambda_1 * nn.MSELoss(reduction='sum')(sim_preds, sim_val) 
        loss += self.lambda_2 * (self.item_embeddings.weight.norm() + self.user_embeddings.weight.norm() + self.social_embeddings.weight.norm()) 
        train_losses += loss.item() 
        batch_loss = loss.item() / row.size()[0] 
        pbar.set_postfix(train_loss = batch_loss) 

        with torch.set_grad_enabled(True): 
          loss.backward() 
          self.optimizer.step()  

      with torch.no_grad(): 
        self.eval() 

        y_pred, y_true = [], [] 
        for ((row, col), val) in test_loader: 
          row = row.long() 
          col = col.long() 
          val = val.float() 
          preds = self.forward(row, col) 
          y_pred += preds.tolist() 
          y_true += val.tolist() 
        y_pred, y_true = np.array(y_pred), np.array(y_true) 

        epoch_score = np.sqrt(sum([(y-x)**2 for x, y in zip(y_true, y_pred)]) / len(y_pred)) 
        score = 'rmse' 
        epoch_score1 = sum([np.abs(y-x) for x, y in zip(y_true, y_pred)]) / len(y_pred) 
        score1 = 'mae'
        print(
            f'epoch {epoch + 1} train loss : {train_losses:.3f} {score}: {epoch_score:.3f} {score1}: {epoch_score1:.3f}'
        )      

model = SoRec(n_users, n_items, social_user)
model.fit(train_loader)

(train:  1 : 100%|██████████| 1997/1997 [00:32<00:00, 60.70it/s, train_loss=0.395]


epoch 1 train loss : 44628.149 rmse: 0.474 mae: 0.387


(train:  2 : 100%|██████████| 1997/1997 [00:30<00:00, 65.84it/s, train_loss=0.147]


epoch 2 train loss : 21730.856 rmse: 0.445 mae: 0.355


(train:  3 : 100%|██████████| 1997/1997 [00:30<00:00, 66.18it/s, train_loss=0.523]


epoch 3 train loss : 18944.834 rmse: 0.446 mae: 0.353


(train:  4 : 100%|██████████| 1997/1997 [00:30<00:00, 66.12it/s, train_loss=0.289]


epoch 4 train loss : 18008.390 rmse: 0.444 mae: 0.348


(train:  5 : 100%|██████████| 1997/1997 [00:30<00:00, 65.75it/s, train_loss=0.455]


epoch 5 train loss : 17885.560 rmse: 0.447 mae: 0.352
