In [None]:
# Install the third-party packages used by the import cell of this notebook.
# NOTE(review): scikit-learn is pinned, presumably because the import cell uses
# `sklearn.cross_validation`; lightfm is not pinned - confirm the intended version.
!pip install scikit-learn==0.19.2
!pip install lightfm

In [None]:
# Download the MovieLens 10M and Tag Genome archives and unpack them into the
# working directory. The reader functions further down open files under the
# relative directories `ml-10M100K/` and `tag-genome/`.
!wget -q --show-progress http://files.grouplens.org/datasets/movielens/ml-10m.zip
!wget -q --show-progress http://files.grouplens.org/datasets/tag-genome/tag-genome.zip
!unzip ml-10m.zip
!unzip tag-genome.zip

In [None]:
import array
import collections
import numpy as np
import os
import re
import scipy.sparse as sp
import subprocess
import itertools

import logging
import logging.handlers
import logging.config

import json
from pprint import pformat
import sys

from lightfm import LightFM

# from sklearn.model_selection import ShuffleSplit
from sklearn.cross_validation import ShuffleSplit
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import normalize

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
sns.set_palette('Set1')
sns.set_style('white')
%matplotlib inline

In [None]:
class StratifiedSplit(object):
    """Iterable of ``(train, test)`` index splits over user-item interactions.

    Two modes are supported:

    * warm start (``cold_start=False``): interactions are split at random with
      ``ShuffleSplit``; test interactions are kept only when both their user
      and their item also occur in the training part.
    * cold start (``cold_start=True``): a fraction ``test_size`` of the distinct
      items is held out, and every interaction with such an item goes to the
      test part; test interactions are kept only when their user occurs in the
      training part.

    Parameters
    ----------
    user_ids, item_ids : integer arrays of equal length
        User and item index of every interaction.
    n_iter : int
        Number of splits to generate.
    test_size : float
        Fraction of interactions (warm) or of distinct items (cold) held out.
    cold_start : bool
        Selects the cold-start mode described above.
    random_seed : int or None
        Seed for reproducible splits. ``None`` keeps using NumPy's global
        random state.
    """

    def __init__(self, user_ids, item_ids, n_iter=10,
                 test_size=0.2, cold_start=False, random_seed=None):
        self.user_ids = user_ids
        self.item_ids = item_ids
        self.no_interactions = len(self.user_ids)
        self.n_iter = n_iter
        self.test_size = test_size
        self.cold_start = cold_start
        self.random_seed = random_seed
        # Use a private generator only when a seed was requested; otherwise
        # fall back to the module-level NumPy generator so that callers who
        # seed it globally keep their behaviour.
        self._rng = (np.random.RandomState(random_seed)
                     if random_seed is not None else np.random)
        self.shuffle_split = ShuffleSplit(self.no_interactions,
                                          n_iter=self.n_iter,
                                          test_size=self.test_size,
                                          random_state=random_seed)

    def _cold_start_iterations(self):
        """Yield ``n_iter`` raw ``(train, test)`` splits that hold out whole items."""
        item_ids = np.asarray(self.item_ids)
        # The set of distinct items does not change between iterations.
        individual_items = np.unique(item_ids)
        test_no = int(self.test_size * len(individual_items))

        for _ in range(self.n_iter):
            # Sample WITHOUT replacement: with replacement duplicates would be
            # collapsed and fewer than `test_no` items would be held out.
            test_items = self._rng.choice(individual_items, size=test_no,
                                          replace=False)
            in_test = np.isin(item_ids, test_items)
            train = np.flatnonzero(~in_test)
            test = np.flatnonzero(in_test)
            self._rng.shuffle(train)
            self._rng.shuffle(test)
            yield train, test

    def __iter__(self):
        """Yield ``(train, test)`` index arrays with unusable test rows removed."""
        if self.cold_start:
            splits = self._cold_start_iterations()
        else:
            splits = self.shuffle_split

        user_ids = np.asarray(self.user_ids)
        item_ids = np.asarray(self.item_ids)

        for train, test in splits:
            test = np.asarray(test)
            # A test interaction is only usable if its user was seen in training.
            keep = np.isin(user_ids[test], user_ids[train])
            if not self.cold_start:
                # In the warm setting the item must have been seen as well.
                keep &= np.isin(item_ids[test], item_ids[train])
            # Boolean masking keeps an integer dtype even when nothing is left,
            # so the result can always be used as an index array.
            yield train, test[keep]


def stratified_roc_auc_score(ju, at, user_indices):
    """Return the mean of per-user ROC AUC scores.

    ``ju`` holds the true labels, ``at`` the predicted scores and
    ``user_indices`` the user of each entry (all three are indexed alike).
    A user contributes only when exactly two distinct label values are present
    for that user. The number of contributing users is printed.
    """
    labels_by_user = {}
    scores_by_user = {}
    for position, user in enumerate(user_indices):
        labels_by_user.setdefault(user, []).append(float(ju[position]))
        scores_by_user.setdefault(user, []).append(float(at[position]))

    per_user_scores = []
    for user, user_labels in labels_by_user.items():
        user_y = np.array(user_labels, dtype=np.float64)
        user_yhat = np.array(scores_by_user[user], dtype=np.float64)
        if user_y.size == 0 or user_yhat.size == 0:
            continue
        # ROC AUC is only computed when two distinct label values are present.
        if np.unique(user_y).size != 2:
            continue
        per_user_scores.append(roc_auc_score(user_y, user_yhat))

    print(f'{len(per_user_scores)} users in stratified ROC AUC evaluation.')
    return np.mean(per_user_scores)


def build_user_feature_matrix(user_ids):
    """Build a CSR indicator matrix with one row per entry of ``user_ids``.

    Row ``k`` contains a single 1 (int32) in column ``user_ids[k]``.
    """
    n_rows = len(user_ids)
    row_index = np.arange(n_rows)
    ones = np.ones(n_rows, dtype=np.int32)
    indicator = sp.coo_matrix((ones, (row_index, user_ids)))
    return indicator.tocsr()

def fit_model(interactions, item_features_matrix,
              n_iter, epochs, modelfnc, test_size,
              cold_start, user_features_matrix=None):
    """Train and evaluate a model over several train/test splits.

    Parameters
    ----------
    interactions : object exposing ``user_id``, ``item_id`` and ``data`` arrays
        plus ``user_ids`` / ``item_ids`` mappings (see ``Interactions``).
    item_features_matrix : sparse matrix of item features.
    n_iter : number of train/test splits.
    epochs : maximum number of training epochs per split (LightFM branch).
    modelfnc : zero-argument callable returning a fresh model instance.
    test_size : held-out fraction passed to ``StratifiedSplit``.
    cold_start : whether to build cold-start splits.
    user_features_matrix : optional user feature matrix; when omitted one is
        built from ``interactions.user_id``.

    Returns
    -------
    (model, mean_auc) : the model of the last split (``None`` if there was no
    split) and the mean of the per-split test AUC values.
    """
    kf = StratifiedSplit(interactions.user_id, interactions.item_id,
                         n_iter=n_iter, test_size=test_size, cold_start=cold_start)

    print(f'Interaction density across all data: {(float(len(interactions.data)) / (len(interactions.user_ids)* len(interactions.item_ids)))}')
    print('Training model')

    # Store ROC AUC scores for all iterations.
    aucs = []
    # Keeps the return statement valid even when no split is produced.
    model = None

    # Iterate over train-test splits.
    for i, (train, test) in enumerate(kf):

        print(f'Split no {i}')
        print(f'{len(train)} examples in training set, {len(test)} in test set. Interaction density: {(float(len(train)) / (len(interactions.user_ids)* len(interactions.item_ids)))}')

        # For every split, get a new model instance.
        model = modelfnc()

        if isinstance(model, CFModel):
            print('Evaluating a CF model')
            test_auc, train_auc = evaluate_cf_model(model,
                                                    item_features_matrix,
                                                    interactions.user_id[train],
                                                    interactions.item_id[train],
                                                    interactions.data[train],
                                                    interactions.user_id[test],
                                                    interactions.item_id[test],
                                                    interactions.data[test])
            print(f'CF model test AUC {test_auc}, train AUC {train_auc}')
            aucs.append(test_auc)
        else:
            # LightFM and MF models using the LightFM implementation.
            if user_features_matrix is not None:
                user_features = user_features_matrix
            else:
                # NOTE(review): this builds one row per *interaction*, not per
                # user; confirm that this is the user-feature layout the model
                # is meant to receive.
                user_features = build_user_feature_matrix(interactions.user_id)

            item_features = item_features_matrix

            previous_auc = 0.0

            # Work on a private copy with negatives recoded as -1 so the
            # caller's `interactions.data` is not modified as a side effect.
            labels = interactions.data.copy()
            labels[labels == 0] = -1

            train_interactions = sp.coo_matrix((labels[train],
                                                (interactions.user_id[train],
                                                 interactions.item_id[train])))

            # Run for a maximum of epochs epochs.
            # Stop if the test score starts falling, take the best result.
            for x in range(epochs):
                model.fit_partial(train_interactions,
                                  item_features=item_features,
                                  user_features=user_features,
                                  epochs=1, num_threads=1)

                train_predictions = model.predict(interactions.user_id[train],
                                                  interactions.item_id[train],
                                                  user_features=user_features,
                                                  item_features=item_features,
                                                  num_threads=4)
                test_predictions = model.predict(interactions.user_id[test],
                                                 interactions.item_id[test],
                                                 user_features=user_features,
                                                 item_features=item_features,
                                                 num_threads=4)

                train_auc = stratified_roc_auc_score(labels[train],
                                                     train_predictions,
                                                     interactions.user_id[train])
                test_auc = stratified_roc_auc_score(labels[test],
                                                    test_predictions,
                                                    interactions.user_id[test])
                print(f'Epoch {x}, test AUC {test_auc}, train AUC {train_auc}')

                if previous_auc > test_auc:
                    break

                previous_auc = test_auc

            aucs.append(previous_auc)

    return model, np.mean(aucs)

In [None]:
class IncrementalCOOMatrix(object):
    """Sparse COO matrix that is built one entry at a time.

    Entries are accumulated in compact ``array.array`` buffers and converted
    to a ``scipy.sparse.coo_matrix`` on demand with :meth:`tocoo`.

    Parameters
    ----------
    dtype : one of int32, int64, float32, float64 (anything ``np.dtype``
        accepts for these types). Any other dtype raises ``Exception``.

    Attributes
    ----------
    shape : ``None`` or ``(n_rows, n_cols)``. When ``None`` the shape is
        inferred from the largest indices every time :meth:`tocoo` is called.
    """

    # array.array type codes whose item size matches the NumPy dtype.
    # 'q' (long long) is used for int64 because 'l' (long) is not guaranteed
    # to be 8 bytes wide on every platform.
    _TYPE_FLAGS = {
        np.dtype(np.int64): 'q',
        np.dtype(np.float64): 'd',
        np.dtype(np.int32): 'i',
        np.dtype(np.float32): 'f',
    }

    def __init__(self, dtype):
        try:
            type_flag = self._TYPE_FLAGS[np.dtype(dtype)]
        except (KeyError, TypeError):
            raise Exception('Dtype not supported.')

        self.shape = None
        self.dtype = dtype
        self.rows = array.array('i')
        self.cols = array.array('i')
        self.data = array.array(type_flag)

    def append(self, m, n, o):
        """Record value ``o`` at row ``m``, column ``n``."""
        self.rows.append(m)
        self.cols.append(n)
        self.data.append(o)

    def tocoo(self):
        """Return the accumulated entries as a ``scipy.sparse.coo_matrix``.

        Raises ``ValueError`` when no shape was set and no entry was added,
        because the shape cannot be inferred in that case.
        """
        # Copy out of the buffers: arrays that merely view an array.array keep
        # it locked, and any later append() would raise BufferError.
        rows = np.array(self.rows, dtype=np.int32)
        cols = np.array(self.cols, dtype=np.int32)
        data = np.array(self.data, dtype=self.dtype)

        shape = self.shape
        if not shape:
            if len(rows) == 0:
                raise ValueError('Cannot infer the shape of an empty matrix.')
            # Inferred shapes are not stored, so the matrix can keep growing
            # between calls.
            shape = (np.max(rows) + 1, np.max(cols) + 1)

        return sp.coo_matrix((data, (rows, cols)), shape=shape)

    def __len__(self):
        """Number of stored entries."""
        return len(self.data)

def getTags():
  """Return the hard-coded scores that ``plot`` draws for the 'tags' series."""
  tag_scores = (
      0.704372098505,
      0.709307102734,
      0.709289841805,
      0.712067755765,
      0.709597701289,
      0.708835406618,
      0.70647266307,
  )
  return list(tag_scores)

def getTagsIds():
  """Return the hard-coded scores that ``plot`` draws for the 'tags+ids' series."""
  tag_id_scores = (
      0.711942735506908,
      0.7149000802216459,
      0.7167310734865299,
      0.7171892930759424,
      0.7213203725473676,
      0.7185056619895422,
      0.7164144704514184,
  )
  return list(tag_id_scores)


In [None]:
class Features(object):
    """Item feature store: maps external ids to indices and collects features.

    Attributes
    ----------
    item_ids : dict mapping external item id -> internal item index.
    feature_ids : dict mapping feature name -> feature column index.
    title_mapping : dict mapping internal item index -> title.
    mat : IncrementalCOOMatrix of (item index, feature index) indicator entries.
    lrepr, inverse_title_mapping : set by ``add_latent_representations``.
    """

    def __init__(self):
        self.title_mapping = {}
        self.item_ids = {}
        self.mat = IncrementalCOOMatrix(np.int32)
        self.feature_ids = {}

    def set_shape(self):
        """Fix the feature matrix shape to (number of items, number of features)."""
        self.mat.shape = len(self.item_ids), len(self.feature_ids)

    def add_item(self, item_id):
        """Register ``item_id``, assigning the next free index if it is new."""
        self.item_ids.setdefault(item_id, len(self.item_ids))

    def add_title(self, item_id, title):
        """Register ``item_id`` if needed and remember its title."""
        iid = self.item_ids.setdefault(item_id, len(self.item_ids))
        self.title_mapping[iid] = title

    def add_feature(self, item_id, feature):
        """Mark ``feature`` as present for ``item_id`` (both registered if new)."""
        feature_id = self.feature_ids.setdefault(feature, len(self.feature_ids))
        iid = self.item_ids.setdefault(item_id, len(self.item_ids))
        self.mat.append(iid, feature_id, 1)

    def add_latent_representations(self, latent_representations):
        """Compute one latent vector per item as the sum of its feature vectors.

        ``latent_representations`` is indexed by feature index along axis 0.
        Must be called before ``most_similar_movie``.
        """
        dim = latent_representations.shape[1]
        feature_rows = self.mat.tocoo().tocsr()
        # Size by the number of items (not titles) so every matrix row and
        # every titled item index has a slot, even for items without a title.
        n_items = max(len(self.item_ids), feature_rows.shape[0])
        lrepr = np.zeros((n_items, dim), dtype=np.float32)

        for i in range(feature_rows.shape[0]):
            start, stop = feature_rows.indptr[i], feature_rows.indptr[i + 1]
            active = feature_rows.indices[start:stop]
            lrepr[i] = np.sum(latent_representations[active], axis=0)

        self.inverse_title_mapping = {v: k for k, v in self.title_mapping.items()}
        self.lrepr = lrepr

    def most_similar_movie(self, title, number=5):
        """Return up to ``number`` ``(title, cosine similarity)`` pairs.

        Items are ranked by cosine similarity of their latent vector to the
        one of ``title``; items without a title are dropped from the top
        ``number`` candidates. Raises ``KeyError`` for an unknown title.
        """
        # The index has to be resolved before it can be used for the lookup.
        iid = self.inverse_title_mapping[title]
        vector = self.lrepr[iid]
        # Items with an all-zero vector produce NaN; argsort places them last.
        with np.errstate(divide='ignore', invalid='ignore'):
            dst = (np.dot(self.lrepr, vector)
                   / np.linalg.norm(self.lrepr, axis=1) / np.linalg.norm(vector))
        movie_ids = np.argsort(-dst)
        return [(self.title_mapping[x], dst[x]) for x in movie_ids[:number]
                if x in self.title_mapping]


In [None]:
class Interactions(object):
    """Collect per-user positive/negative item interactions.

    ``add`` records raw interactions; ``fit`` flattens them into the parallel
    int32 arrays ``user_id``, ``item_id`` and ``data`` (1 = positive,
    0 = negative).

    Parameters
    ----------
    item_ids : dict mapping external item id -> internal item index.
    """

    def __init__(self, item_ids):
        self._data = array.array('i')
        self._item_id = array.array('i')
        self._user_id = array.array('i')
        self.user_data = collections.defaultdict(lambda: {1: array.array('i'),
                                                          0: array.array('i')})
        # list(...) is required: np.array() on a dict view creates a 0-d
        # object array that cannot be sampled from.
        self.iids_sample_pool = np.array(list(item_ids.values()))
        self.user_ids = {}
        self.item_ids = item_ids

    def add(self, user_id, item_id, value):
        """Record one interaction; ``value`` must equal 1 (positive) or 0 (negative).

        Raises ``KeyError`` when ``item_id`` is unknown.
        """
        user_id = self.user_ids.setdefault(user_id, len(self.user_ids))
        iid = self.item_ids[item_id]
        self.user_data[user_id][value].append(iid)

    def fit(self, min_positives=1, sampled_negatives_ratio=0, use_observed_negatives=True):
        """Build the flat ``user_id`` / ``item_id`` / ``data`` arrays.

        Parameters
        ----------
        min_positives : users with fewer positives are skipped entirely.
        sampled_negatives_ratio : when non-zero, draw this many random items
            per positive as additional negatives (positives and already
            observed negatives are removed from the draw).
        use_observed_negatives : whether recorded negatives are emitted.
        """
        # Start from fresh buffers so that calling fit() again rebuilds the
        # arrays instead of appending to (or failing on) the previous result.
        self._data = array.array('i')
        self._item_id = array.array('i')
        self._user_id = array.array('i')

        for user_id, user_data in self.user_data.items():
            ngtvs = user_data.get(0, [])
            pstvs = user_data.get(1, [])

            if len(pstvs) < min_positives:
                continue

            if use_observed_negatives:
                # An item rated both ways counts as positive only.
                observed_negatives = list(set(ngtvs) - set(pstvs))
            else:
                observed_negatives = []

            if sampled_negatives_ratio:
                n_samples = int(len(pstvs) * sampled_negatives_ratio)
                drawn = np.random.choice(self.iids_sample_pool, size=n_samples)
                # Drop positives and negatives that are emitted anyway, so no
                # (user, item) pair appears twice.
                sampled_negatives = list(set(drawn.tolist())
                                         - set(pstvs)
                                         - set(observed_negatives))
            else:
                sampled_negatives = []

            for value, pids in zip((1, 0, 0), (pstvs, observed_negatives, sampled_negatives)):
                for pid in pids:
                    self._data.append(value)
                    self._item_id.append(pid)
                    self._user_id.append(user_id)

        self.data = np.frombuffer(self._data, dtype=np.int32)
        self.item_id = np.frombuffer(self._item_id, dtype=np.int32)
        self.user_id = np.frombuffer(self._user_id, dtype=np.int32)

def plot(args):
    """Draw the hard-coded cold-start scores, or print 'No Data' if plotting is off.

    Only ``args.plot`` is read. The two series come from ``getTags`` and
    ``getTagsIds`` and are drawn against the dimensions 2**3 .. 2**9.
    """
    if not args.plot:
        print('No Data')
        return

    series = {'tags': getTags(), 'tags+ids': getTagsIds()}
    dimensions = [2 ** exponent for exponent in range(3, 10)]

    plt.plot(dimensions, series['tags'], label='tags', marker='o', color='blue')
    plt.plot(dimensions, series['tags+ids'], label='tags+ids', marker='o', color='red')
    plt.title('Cold Start Scenario', fontweight='bold')
    plt.ylabel('ROC AUC', fontweight='bold')
    plt.xlabel('dimensions', fontweight='bold')
    plt.legend(loc="upper right")
    plt.grid()
    plt.show()

In [None]:
def read_genome_tags(min_popularity=20):
    """Yield ``(item id, tag, relevance)`` for sufficiently popular genome tags.

    Reads ``tag-genome/tags.dat`` (tab separated: tag id, tag, popularity) and
    keeps tags whose popularity is at least ``min_popularity``; then streams
    ``tag-genome/tag_relevance.dat`` (tab separated: item id, tag id,
    relevance) and yields the rows that refer to a kept tag. The item id is
    yielded as the string read from the file, the relevance as a float.
    """
    tags_path = os.path.join('tag-genome', 'tags.dat')
    relevance_path = os.path.join('tag-genome', 'tag_relevance.dat')

    popular_tags = {}
    with open(tags_path, 'r') as handle:
        for record in handle:
            tag_id, tag, popularity = record.split('\t')
            if int(popularity) < min_popularity:
                continue
            popular_tags[int(tag_id)] = tag

    with open(relevance_path, 'r') as handle:
        for record in handle:
            iid, tag_id, relevance = record.split('\t')
            tag_key = int(tag_id)
            if tag_key not in popular_tags:
                continue
            yield iid, popular_tags[tag_key], float(relevance)

In [None]:
def read_tags():
    """Yield ``(item id, normalised tag, tag count)`` for every user tag.

    Reads ``ml-10M100K/tags.dat`` (``::`` separated: user id, item id, tag,
    timestamp). Tags are lower-cased and every run of non-letters is replaced
    by a single space. The count is the number of rows in the file that share
    the same normalised tag.
    """
    tags_path = os.path.join('ml-10M100K', 'tags.dat')

    # Counts are keyed by the normalised tag, the same key used when yielding.
    tag_counts = collections.Counter()
    records = []
    with open(tags_path, 'r') as tagfile:
        for line in tagfile:
            uid, iid, tag, timestamp = line.split('::')
            processed = re.sub('[^a-zA-Z]+', ' ', tag.lower()).strip()
            tag_counts[processed] += 1
            records.append((iid, processed))

    # Totals are only known after the whole file was read, so the rows are
    # emitted afterwards from the in-memory list.
    for iid, processed in records:
        yield iid, processed, tag_counts[processed]

In [None]:
def read_movie_features(titles=False, genome_tag_threshold=1.0):
    """Build a ``Features`` object from the movie list and the genome tags.

    Every row of ``ml-10M100K/movies.dat`` (``::`` separated: item id, title,
    genres) registers an item and its title; with ``titles`` set, a
    ``'title:<lower-cased title>'`` feature is added as well. Genome tags from
    ``read_genome_tags`` are added as ``'genome:<tag>'`` features when their
    relevance reaches ``genome_tag_threshold`` and the item is known.
    """
    features = Features()
    movies_path = os.path.join('ml-10M100K', 'movies.dat')

    with open(movies_path, 'r') as moviefile:
        for record in moviefile:
            # The genre column is parsed but not used as a feature.
            iid, title, genre_list = record.split('::')
            features.add_item(iid)
            if titles:
                features.add_feature(iid, 'title:' + title.lower())
            features.add_title(iid, title)

    for iid, tag, relevance in read_genome_tags():
        relevant = relevance >= genome_tag_threshold
        if relevant and iid in features.item_ids:
            features.add_feature(iid, 'genome:' + tag.lower())

    # Freeze the matrix shape once all items and features are known.
    features.set_shape()
    return features

In [None]:
class CFModel(object):
    """Per-user logistic regression on SVD-derived item latent features.

    ``dim`` is the number of SVD components. ``model`` holds the fitted
    ``TruncatedSVD`` and ``item_latent_features`` the transformed item feature
    matrix; both are ``None`` until the corresponding fit method is called.
    """

    def __init__(self, dim=64):
        self.dim = dim
        self.model = None
        self.item_latent_features = None

    def fit_svd(self, mat):
        """Fit a ``TruncatedSVD`` with ``dim`` components on ``mat`` and store it."""
        svd = TruncatedSVD(n_components=self.dim)
        svd.fit(mat)
        # Stored only after a successful fit.
        self.model = svd

    def fit_latent_features(self, feature_matrix):
        """Project ``feature_matrix`` with the fitted SVD and keep the result."""
        projected = self.model.transform(feature_matrix)
        self.item_latent_features = projected

    def fit_user(self, item_ids, y):
        """Fit and return a ``LogisticRegression`` for one user.

        ``item_ids`` index into the item latent features; ``y`` are the labels.
        """
        user_classifier = LogisticRegression()
        user_inputs = self.item_latent_features[item_ids]
        user_classifier.fit(user_inputs, y)
        return user_classifier

    def predict_user(self, model, item_ids):
        """Return ``model.decision_function`` scores for the given items."""
        user_inputs = self.item_latent_features[item_ids]
        return model.decision_function(user_inputs)

def _group_interactions_by_user(user_ids, item_ids, labels):
    """Group item ids and labels per user, keeping first-seen user order."""
    grouped = {}
    for uid, iid, y in zip(user_ids, item_ids, labels):
        user_items, user_labels = grouped.setdefault(uid, ([], []))
        user_items.append(iid)
        user_labels.append(y)
    return grouped


def evaluate_cf_model(model, feature_matrix, train_user_ids, train_item_ids, train_data,
                      test_user_ids, test_item_ids, test_data):
    """Fit the CF model and return ``(mean test AUC, mean train AUC)``.

    The SVD is fitted on the feature rows of the items seen in training and
    then applied to all items. For every user that has two distinct label
    values in both the training and the test data, a per-user model is fitted
    on the training interactions and scored with ROC AUC on both parts.

    Parameters
    ----------
    model : object providing ``fit_svd``, ``fit_latent_features``,
        ``fit_user`` and ``predict_user`` (see ``CFModel``).
    feature_matrix : item feature matrix, indexable by item index.
    train_user_ids, train_item_ids, train_data : parallel training arrays.
    test_user_ids, test_item_ids, test_data : parallel test arrays.

    Returns
    -------
    (test_auc, train_auc) : means over the evaluated users (NaN if none).
    """
    rocstrain = []
    rocstest = []

    train_by_user = _group_interactions_by_user(train_user_ids, train_item_ids, train_data)
    test_by_user = _group_interactions_by_user(test_user_ids, test_item_ids, test_data)

    model.fit_svd(feature_matrix[np.unique(train_item_ids)])
    model.fit_latent_features(feature_matrix)

    for uid, (train_items, train_labels) in train_by_user.items():
        # Users without any test interaction cannot be evaluated.
        if uid not in test_by_user:
            continue
        test_items, test_labels = test_by_user[uid]

        item_ids_train = np.asarray(train_items, dtype=np.int32)
        ytrain = np.asarray(train_labels, dtype=np.float64)
        item_ids_test = np.asarray(test_items, dtype=np.int32)
        ytest = np.asarray(test_labels, dtype=np.float64)

        # ROC AUC needs both classes, in the training and in the test labels.
        if len(np.unique(ytest)) != 2 or len(np.unique(ytrain)) != 2:
            continue

        user_model = model.fit_user(item_ids_train, ytrain)
        # Training scores must be predicted for the TRAINING items so that
        # they line up with `ytrain`.
        ytrainhat = model.predict_user(user_model, item_ids_train)
        ytesthat = model.predict_user(user_model, item_ids_test)

        rocstrain.append(roc_auc_score(ytrain, ytrainhat))
        rocstest.append(roc_auc_score(ytest, ytesthat))

    return np.mean(rocstest), np.mean(rocstrain)

In [None]:
def doit(features,
        item_features_matrix,
        interactions,
        args,
        no_components):
    """Fit one model configuration via ``fit_model`` and return its average AUC.

    ``args.cf`` selects the CF model; otherwise a LightFM model is used.
    ``args.niter``, ``args.split`` and ``args.cold`` are forwarded as the
    number of splits, the test fraction and the cold-start flag.
    ``no_components`` is the latent dimensionality of either model.
    """
    # Must stay the first statement: it reports exactly the call arguments.
    print(f'Fitting the model with {locals()}')

    def make_cf_model():
        return CFModel(dim=no_components)

    def make_lightfm_model():
        return LightFM(learning_rate=0.05,
                       no_components=no_components,
                       item_alpha=0.0,
                       user_alpha=0.0)

    if args.cf:
        print('Fitting the CF model')
        model_factory = make_cf_model
    else:
        print('Fitting the LightFM model')
        model_factory = make_lightfm_model

    model, auc = fit_model(interactions=interactions,
                           item_features_matrix=item_features_matrix,
                           n_iter=args.niter,
                           epochs=30,
                           modelfnc=model_factory,
                           test_size=args.split,
                           cold_start=args.cold)
    print(f'Average AUC: {auc}')

    return auc

In [None]:
class Args:
    """Default experiment settings, read as attributes by ``main``/``doit``/``plot``."""

    # Model selection: True -> CF model, False -> LightFM model.
    cf = False
    # Evaluation protocol: cold-start splits, test fraction, number of splits.
    cold = True
    split = 0.2
    niter = 5
    # Latent dimensionalities to evaluate, one run per entry.
    dim = (4, 8, 16, 32, 64, 128, 256, 512)
    # Feature switches: `ids` adds per-movie title features in `main`.
    ids = True
    tags = True
    # Whether `main` calls `plot` after the runs.
    plot = False

In [None]:
def main(args):
    """Run the experiment described by ``args`` for every dimension in ``args.dim``.

    Loads movie features and ratings, turns ratings into binary interactions
    (rating >= 4.0 is positive), evaluates one model per dimension with
    ``doit``, optionally plots, and writes the ``{dimension: AUC}`` results as
    JSON to standard output.
    """
    print('Reading features')
    # `args.ids` decides whether per-movie title features are included.
    features = read_movie_features(titles=bool(args.ids), genome_tag_threshold=0.8)
    item_features_matrix = features.mat.tocoo().tocsr()

    print('Reading interactions')
    interactions = Interactions(features.item_ids)
    ratings_path = os.path.join('ml-10M100K', 'ratings.dat')
    with open(ratings_path, 'r') as ratingfile:
        for record in ratingfile:
            uid, iid, rating, timestamp = record.split('::')
            if float(rating) >= 4.0:
                value = 1.0
            else:
                value = 0.0
            interactions.add(uid, iid, value)
    interactions.fit(min_positives=1, sampled_negatives_ratio=0, use_observed_negatives=True)

    print(f'{len(interactions.user_ids)} users, {len(features.item_ids)} items, {len(interactions.data)} interactions, {len(features.feature_ids)} item features in the dataset')

    results = {}

    for dim in args.dim:
        n_components = int(dim)
        roc_auc = doit(features,
                       item_features_matrix,
                       interactions,
                       args,
                       no_components=n_components)

        results[n_components] = roc_auc
        print(f'ROC_AUC for configuration {pformat(args)} is {roc_auc}')

    if args.plot:
        plot(args)
    sys.stdout.write(json.dumps(results))

In [None]:
# Run every experiment: {LightFM, CF} x {cold start, warm} x {tags, tags+ids}.
for use_cf, model_banner in ((False, 'Running LightFM Model'),
                             (True, 'Running CF Model')):
    print(model_banner)
    for cold, scenario_banner in ((True, 'Handiling Cold Start Scenario'),
                                  (False, 'Handiling Warm Scenario')):
        print(scenario_banner)
        for feature_set in ('tags', 'tags+ids'):
            args = Args()
            args.cf = use_cf
            args.cold = cold
            # Test the current feature-set NAME for 'ids'. Testing the list of
            # names is always False, which silently skipped the tags+ids runs.
            args.ids = 'ids' in feature_set
            if args.ids:
                print('Working with tags+ids')
                # The plot is requested only for the LightFM cold-start run.
                if not use_cf and cold:
                    args.plot = True
            else:
                print('Working with tags')
            main(args)