In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
class Loader():
    """Load a user/item/plays table and derive train/test/negative samples.

    The default input is the tab-separated play-count file referenced by
    ``DEFAULT_DATA_PATH``. One interaction per user is held out for testing
    and the remainder is used for training.
    """

    DEFAULT_DATA_PATH = 'data/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv'

    def __init__(self):
        pass

    def load_dataset(self, path=DEFAULT_DATA_PATH, sample_num=10000):
        """Read the play-count file and build all training/evaluation inputs.

        Args:
            path: Location of the headerless, tab-separated input file. The
                third column is discarded; the remaining three are treated
                as ``user``, ``item`` and ``plays``.
            sample_num: Maximum number of distinct users to sample. If the
                file holds fewer users, all of them are used.

        Returns:
            Tuple ``(uids, iids, train, test, df_neg, users, items,
            item_lookup)`` where ``uids``/``iids`` are integer arrays of the
            training interactions, ``train``/``test`` are DataFrames,
            ``df_neg`` holds the sampled negatives per test row,
            ``users``/``items`` are sorted lists of the integer ids and
            ``item_lookup`` maps stringified ``item_id`` to the raw item.
        """
        data = pd.read_csv(path, delimiter='\t', header=None)
        data.drop(data.columns[2], axis=1, inplace=True)
        data.columns = ['user', 'item', 'plays']

        data = data.loc[data.plays != 0]
        # dropna() returns a new frame; the result has to be kept.
        data = data.dropna()

        # Sample users without replacement; clamp so small files do not
        # make np.random.choice raise.
        unique_users = np.unique(data['user'])
        n_sample = min(sample_num, len(unique_users))
        picked = np.random.choice(len(unique_users), n_sample, replace=False)
        data = data[data['user'].isin(unique_users[picked])]
        data = data.reset_index(drop=True)

        # Keep only users with more than one interaction so that one row
        # can be held out while at least one stays in train.
        data['count'] = data.groupby('user')['user'].transform('count')
        # .copy() so the column assignments below act on an owned frame.
        data = data[data['count'] > 1].copy()

        data['user_id'] = data['user'].astype('category').cat.codes
        data['item_id'] = data['item'].astype('category').cat.codes

        item_lookup = data[['item_id', 'item']].drop_duplicates()
        item_lookup['item_id'] = item_lookup['item_id'].astype(str)

        data = data[['user_id', 'item_id', 'plays']]
        train, test = self.train_test_split(data)

        users = list(np.sort(data.user_id.unique()))
        items = list(np.sort(data.item_id.unique()))

        uids = np.array(train['user_id'].astype(int).tolist())
        iids = np.array(train['item_id'].astype(int).tolist())
        df_neg = self.get_negatives(uids, iids, items, test)

        return uids, iids, train, test, df_neg, users, items, item_lookup

    def train_test_split(self, data):
        """Hold out the first row of every user as test; the rest is train.

        Args:
            data: DataFrame with at least ``user_id``, ``item_id`` and
                ``plays`` columns.

        Returns:
            ``(train, test)``. ``test`` has one row per user (columns
            ``user_id``, ``item_id``, ``plays``), ordered by ``user_id`` with
            a fresh index. ``train`` is ``data`` without those rows.
        """
        keep_in_train = data.groupby(['user_id'])['user_id'].transform(
            self.mask_first
        ).astype(bool)

        # Use the same mask for both halves so the held-out row is exactly
        # the row removed from train (GroupBy.first() would instead pick the
        # first non-null value of each column independently).
        test = data.loc[~keep_in_train, ['user_id', 'item_id', 'plays']]
        test = test.sort_values('user_id').reset_index(drop=True)
        train = data.loc[keep_in_train]
        return train, test

    def get_negatives(self, uids, iids, items, test, num_negatives=100):
        """Sample negative item ids for every held-out (user, item) pair.

        Candidates are drawn uniformly from ``range(len(items))``. An item is
        rejected if the user has it in the training pairs or if it is the
        held-out positive itself. The same negative may be drawn repeatedly.

        Note: sampling is by rejection and does not terminate if a user has
        no eligible item left.

        Args:
            uids: User ids of the training interactions.
            iids: Item ids of the training interactions, aligned with uids.
            items: Collection of all item ids; only its length is used.
            test: DataFrame with ``user_id`` and ``item_id`` columns.
            num_negatives: Number of negatives to draw per test row.

        Returns:
            DataFrame with one row per test pair: column 0 holds the
            ``(user, item)`` tuple, the remaining columns the negatives.
        """
        n_items = len(items)
        train_pairs = set(zip(uids, iids))

        test_users = test['user_id'].values.tolist()
        test_items = test['item_id'].values.tolist()

        rows = []
        for u, i in zip(test_users, test_items):
            row = [(u, i)]
            for _ in range(num_negatives):
                j = np.random.randint(n_items)
                while j == i or (u, j) in train_pairs:
                    j = np.random.randint(n_items)
                row.append(j)
            rows.append(row)

        return pd.DataFrame(rows)

    def mask_first(self, x):
        """Return an array of ones shaped like ``x`` with position 0 zeroed.

        Used as a per-group transform: the zero marks the first row of the
        group, the ones mark all later rows.
        """
        result = np.ones_like(x)
        result[0] = 0
        return result

    def get_train_instances(self, uids, iids, num_reg, num_items):
        """Build training triples with sampled negatives.

        For every positive ``(u, i)`` pair one instance labelled 1 is
        emitted, followed by ``num_reg`` instances labelled 0 whose items are
        drawn uniformly from ``range(num_items)`` and rejected while the
        pair is a known positive.

        Args:
            uids: User ids of the positive interactions.
            iids: Item ids of the positive interactions, aligned with uids.
            num_reg: Number of negatives to sample per positive.
            num_items: Size of the item id range to sample from.

        Returns:
            ``(user_input, item_input, labels)`` as three parallel lists.
        """
        user_input, item_input, labels = [], [], []
        positives = set(zip(uids, iids))

        for u, i in zip(uids, iids):
            user_input.append(u)
            item_input.append(i)
            labels.append(1)

            for _ in range(num_reg):
                j = np.random.randint(num_items)
                while (u, j) in positives:
                    j = np.random.randint(num_items)

                user_input.append(u)
                item_input.append(j)
                labels.append(0)

        # Return only after every positive pair has been processed.
        return user_input, item_input, labels