In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import implicit
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler
from numpy.core.umath_tests import inner1d
from sklearn.decomposition import NMF

In [2]:
class cdfTransformer(object):
    """Rescale raw ratings to their per-item empirical CDF value.

    For every item, the distinct raw ratings seen during ``fitTransform`` are
    sorted and each one is replaced by the fraction of that item's rows whose
    rating is less than or equal to it. ``Transform`` applies the same lookup
    to new data, using the closest fitted rating that does not exceed the
    new one.
    """

    # Default for instances that have not been fitted yet. fitTransform rebinds
    # this per instance to a Series indexed by (item id, raw rating) that holds
    # the cumulative fraction.
    transformerSeries = {}

    def __init__(self, useridcolname, itemidcolname, ratingcolname):
        """Store the names of the user, item and rating columns."""
        self.userid_colname = useridcolname
        self.itemid_colname = itemidcolname
        self.rating_colname = ratingcolname

    def getLowerOrEqualIndex(self, playtimeList, playtime):
        """Return the position of the largest entry that is <= ``playtime``.

        Args:
            playtimeList: ascending sequence of ratings (list, array or Index).
            playtime: value to locate.

        Returns:
            Integer position in ``playtimeList``; 0 when ``playtime`` is
            smaller than every entry.

        Raises:
            IndexError: if ``playtimeList`` is empty.
        """
        if len(playtimeList) == 0:
            raise IndexError("playtimeList is empty; there is no position to return")
        # side="right" yields the insertion point after any equal entry, so the
        # element just before it is the last one that is <= playtime.
        insert_at = int(np.searchsorted(np.asarray(playtimeList), playtime, side="right"))
        return max(insert_at - 1, 0)

    def getNearestCdf(self, appid, playtime):
        """Look up the fitted CDF value for ``playtime`` on item ``appid``.

        Raises:
            KeyError: if ``appid`` was not present in the fitted data.
        """
        # Selecting on the first index level leaves a Series indexed by rating.
        item_cdf = self.transformerSeries[appid]
        bestpos = self.getLowerOrEqualIndex(item_cdf.index, playtime)
        return item_cdf.iloc[bestpos]

    def fitTransform(self, tupledata):
        """Fit the per-item CDF on ``tupledata`` and return the rescaled frame.

        Args:
            tupledata: DataFrame containing the user, item and rating columns.

        Returns:
            A new DataFrame with one row per input row in which the rating
            column holds the CDF value. ``tupledata`` itself is not modified.
        """
        keys = [self.itemid_colname, self.rating_colname]
        # Rows per (item, rating), accumulated within each item in ascending
        # rating order (groupby sorts its keys).
        counts = tupledata.groupby(keys).count()
        cumulative = counts.groupby(level=[0]).cumsum()
        # The largest cumulative count of an item is its total row count;
        # transform() broadcasts it back so both operands share one index.
        totals = cumulative.groupby(level=[0]).transform('max')
        withcdf = cumulative / totals

        self.transformerSeries = withcdf[self.userid_colname]

        withcdf_df = withcdf.reset_index().rename(columns={self.userid_colname: 'temp_rating'})
        finaltuple = pd.merge(withcdf_df, tupledata, on=keys, how='inner', suffixes=('_newdf', ''))
        finaltuple = finaltuple.drop(self.rating_colname, axis=1)
        return finaltuple.rename(columns={'temp_rating': self.rating_colname})

    def Transform(self, tupledata):
        """Rescale ``tupledata`` with the CDF learned by ``fitTransform``.

        Args:
            tupledata: DataFrame containing the user, item and rating columns.

        Returns:
            A new DataFrame with one row per input row in which the rating
            column holds the looked-up CDF value.

        Raises:
            KeyError: if an item in ``tupledata`` was not seen while fitting.
        """
        keys = [self.itemid_colname, self.rating_colname]
        # One lookup per distinct (item, rating) pair instead of one per row.
        ansdata = tupledata.groupby(keys).count().reset_index()
        ansdata = ansdata.drop(self.userid_colname, axis=1)
        # Iterating the two columns keeps each value's own dtype; a row-wise
        # apply would upcast integer item ids to float alongside float ratings.
        ansdata['rating_temp'] = [
            self.getNearestCdf(item, rating)
            for item, rating in zip(ansdata[self.itemid_colname], ansdata[self.rating_colname])
        ]
        ansdata = pd.merge(ansdata, tupledata, on=keys, how='inner', suffixes=('_newdf', ''))
        ansdata = ansdata.drop(self.rating_colname, axis=1)
        return ansdata.rename(columns={"rating_temp": self.rating_colname})

In [3]:
def train_test_split(dff, split_ratio = 0.8, random_state = None):
    """Randomly split every user's rows into a train part and a test part.

    Args:
        dff: DataFrame with a 'sid' column identifying the user of each row.
        split_ratio: fraction of each user's rows that goes to the train set.
            The per-user train count is ``round(split_ratio * rows_of_user)``.
        random_state: optional integer seed for a reproducible split. When
            None, numpy's global random state is used.

    Returns:
        (train, test): two DataFrames with the same columns as ``dff`` and a
        fresh 0..n-1 index. Every input row lands in exactly one of them and
        both keep the row order of ``dff``.

    Raises:
        ValueError: if ``split_ratio`` is outside [0, 1].
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError("split_ratio must be between 0 and 1, got {}".format(split_ratio))

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Give every row a random key; ranking the keys within a user yields a
    # uniformly random ordering of that user's rows. Working positionally
    # (rather than through index labels) keeps this correct even when the
    # index of dff contains duplicates.
    user_ids = dff['sid'].values
    shuffle_key = pd.Series(rng.rand(len(dff)))
    by_user = shuffle_key.groupby(user_ids)
    position = by_user.rank(method='first')
    n_train = np.round(split_ratio * by_user.transform('count'))
    is_train = (position <= n_train).values

    tr_sample = dff[is_train].reset_index(drop = True)
    te_sample = dff[~is_train].reset_index(drop = True)

    return tr_sample, te_sample

In [4]:
class matrixFactorizer(object):
    """Matrix factorization fitted with alternating least squares.

    Ratings are approximated by ``U[user, :] . V[:, item]`` where ``U`` is
    (users x latent) and ``V`` is (latent x items). Each sweep solves a small
    ridge-regularised least-squares problem per user and then per item.
    """

    def __init__(self, latentFactors = 3, max_iterations = 20, reg = 0.01):
        """
        Args:
            latentFactors: number of latent dimensions.
            max_iterations: number of full ALS sweeps (users, then items).
            reg: ridge term added to the diagonal of every normal-equation matrix.
        """
        self.numLatent = latentFactors
        self.max_iterations = max_iterations
        self.reg = reg

    def _solve_factors(self, ratings, other_factors):
        """Solve the regularised least-squares problem for one user or item.

        Args:
            ratings: iterable of (index, rating) pairs, where index refers to
                a row of ``other_factors``.
            other_factors: (n, latent) array of the factors held fixed.

        Returns:
            The latent vector that minimises the regularised squared error.
        """
        pairs = list(ratings)
        idx = np.array([p[0] for p in pairs], dtype=np.intp)
        vals = np.array([p[1] for p in pairs], dtype=float)
        basis = other_factors[idx]  # (n_observed, latent)
        gram = basis.T.dot(basis) + self.reg * np.eye(self.numLatent)
        rhs = basis.T.dot(vals)
        return np.linalg.solve(gram, rhs)

    def fit(self, train_ratingsbyuser, train_ratingsbyitem, total_users, total_items):
        """Learn ``self.U`` and ``self.V`` from the observed ratings.

        Args:
            train_ratingsbyuser: mapping user index -> iterable of (item index, rating).
            train_ratingsbyitem: mapping item index -> iterable of (user index, rating).
            total_users: number of rows of ``U``.
            total_items: number of columns of ``V``.

        Users or items that are absent from the mappings keep their random
        initial factors.
        """
        self.U = np.random.randn(total_users, self.numLatent) / self.numLatent
        self.V = np.random.randn(self.numLatent, total_items) / self.numLatent

        for _ in range(self.max_iterations):
            # Update U with V held fixed.
            for i in range(total_users):
                if i in train_ratingsbyuser:
                    self.U[i, :] = self._solve_factors(train_ratingsbyuser[i], self.V.T)

            # Update V with U held fixed.
            for j in range(total_items):
                if j in train_ratingsbyitem:
                    self.V[:, j] = self._solve_factors(train_ratingsbyitem[j], self.U)

    def predict(self, user_u, item_i):
        """Return the predicted rating(s) for user(s) ``user_u`` and item(s) ``item_i``.

        Both arguments may be scalars or equally long index arrays; the
        result is a scalar or a 1-D array of row-wise inner products.
        """
        # Row-wise inner product with public numpy operations only (the
        # original relied on the private numpy.core.umath_tests.inner1d).
        return np.sum(self.U[user_u, :] * self.V[:, item_i].T, axis=-1)

In [5]:
def constructCodes(df) :
    """
    Replace the raw 'steamid' / 'appid' identifiers with integer category codes.

    'steamid' becomes the code column 'sid' and 'appid' becomes 'aid'; the
    two raw columns are removed from the returned copy. The input frame is
    left untouched.

    Returns a 5-tuple :
        1. Copy of df with 'steamid'/'appid' dropped and 'sid'/'aid' appended
        2. Dictionary mapping 'steamid' to 'sid'
        3. Dictionary mapping 'sid' to 'steamid'
        4. Dictionary mapping 'appid' to 'aid'
        5. Dictionary mapping 'aid' to 'appid'
    """
    steam_categorical = df["steamid"].astype("category")
    app_categorical = df["appid"].astype("category")

    # Code k corresponds to the k-th entry of the category list.
    sid_to_steamid = {code : raw for code, raw in enumerate(steam_categorical.cat.categories)}
    aid_to_appid = {code : raw for code, raw in enumerate(app_categorical.cat.categories)}
    steamid_to_sid = {raw : code for code, raw in sid_to_steamid.items()}
    appid_to_aid = {raw : code for code, raw in aid_to_appid.items()}

    coded_df = df.copy(deep = True)
    coded_df["sid"] = steam_categorical.cat.codes
    coded_df["aid"] = app_categorical.cat.codes
    coded_df = coded_df.drop(["steamid", "appid"], axis = 1)

    return (coded_df, steamid_to_sid, sid_to_steamid, appid_to_aid, aid_to_appid)

In [6]:
def constructSparseMatrices(df) :
    """
    Build the two sparse rating matrices used by the factorization models.

    Input : Dataframe with columns - 'sid', 'aid', 'playtime_forever'

    Returns :
        1. User x Item Sparse Matrix (CSR), entry [sid, aid] = playtime_forever
        2. Item x User Sparse Matrix (CSR), entry [aid, sid] = playtime_forever

    The matrix shape is inferred from the largest code present, so the
    assertions fail unless the number of distinct 'sid' / 'aid' values equals
    the inferred number of rows / columns.
    """
    users = df["sid"]
    items = df["aid"]
    ratings = df["playtime_forever"]

    data_useritem = sparse.csr_matrix((ratings, (users, items)))
    data_itemuser = sparse.csr_matrix((ratings, (items, users)))

    n_users, n_items = users.nunique(), items.nunique()
    assert data_useritem.shape == (n_users, n_items)
    assert data_itemuser.shape == (n_items, n_users)

    return (data_useritem, data_itemuser)

In [8]:
def trainModel(data, factors, epochs, conf_func, alpha, lmbda) :
    """
    Create an implicit-feedback ALS model and fit it on confidence-scaled data.

    Input :
        1. data - Item x User Sparse Matrix
        2. factors - Number of latent factors
        3. epochs - Number of iterations of ALS over the training data
        4. conf_func - Confidence function; only "linear" is supported
        5. alpha - Confidence parameter; the data is multiplied by it before fitting
        6. lmbda - Regularization parameter

    Output : the fitted model, or None (after printing a message) when
             conf_func is not a supported choice
    """
    model = implicit.als.AlternatingLeastSquares(
        factors = factors,
        regularization = lmbda,
        iterations = epochs,
    )

    # Guard clause: reject unknown confidence functions before fitting.
    if conf_func != "linear" :
        print("{} is not a valid choice for conf_func. Choose one of the following : 'linear'".format(conf_func))
        return None

    model.fit(alpha * data)
    return model

In [9]:
def evaluateValidationLoss(item_factors, user_factors, testtuples, itemidcolname, useridcolname, ratingcolname):
    """
    Root-mean-square error of factor-model predictions on held-out tuples.

    Input :
        1. item_factors - items * latentsize array
        2. user_factors - users * latentsize array
        3. testtuples - DataFrame with one (item, user, rating) observation per row
        4. itemidcolname / useridcolname - columns holding row indices into
           item_factors / user_factors
        5. ratingcolname - column holding the observed rating
    Output : RMSE between the observed ratings and the inner product of the
             matching item and user factor rows (nan for an empty frame)
    """
    item_idx = np.asarray(testtuples[itemidcolname]).astype(np.intp)
    user_idx = np.asarray(testtuples[useridcolname]).astype(np.intp)
    observed = np.asarray(testtuples[ratingcolname], dtype=float)

    # Row-wise inner product using public numpy API only (the original relied
    # on the private numpy.core.umath_tests.inner1d).
    predictions = np.einsum('ij,ij->i', item_factors[item_idx, :], user_factors[user_idx, :])
    return np.sqrt(np.mean((observed - predictions) ** 2))


In [10]:
class MF():
    """Biased matrix factorization trained with stochastic gradient descent.

    A rating of user ``u`` on item ``i`` is modelled as
    ``b + b_u[u] + b_i[i] + P[u] . Q[i]``.
    """

    def __init__(self, K, alpha, beta, iterations, ratingsbyuser, ratingsbyitem, traintuples, ratingcolname, itemcolname, usercolname):
        """
        Perform matrix factorization to predict empty
        entries in a matrix.

        Arguments
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        - iterations (int) : number of passes over the training tuples
        - ratingsbyuser : mapping with one key per user (only its size is used)
        - ratingsbyitem : mapping with one key per item (only its size is used)
        - traintuples   : DataFrame with one (user, item, rating) row per observation
        - ratingcolname / itemcolname / usercolname : column names in traintuples

        The user and item ids in traintuples index directly into arrays of
        length len(ratingsbyuser) and len(ratingsbyitem), so they must be
        integers below those sizes.
        """

        self.num_users, self.num_items = len(ratingsbyuser.keys()), len(ratingsbyitem.keys())
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.ratingsbyuser = ratingsbyuser
        self.ratingsbyitem = ratingsbyitem
        self.traintuples = traintuples
        self.ratingcolname = ratingcolname
        self.itemcolname = itemcolname
        self.usercolname = usercolname

    def _training_arrays(self):
        """Return (user ids, item ids, ratings) of the training tuples as numpy arrays."""
        frame = self.traintuples
        users = frame[self.usercolname].values.astype(np.intp)
        items = frame[self.itemcolname].values.astype(np.intp)
        ratings = frame[self.ratingcolname].values.astype(float)
        return users, items, ratings

    def train(self):
        """Run SGD for ``self.iterations`` epochs.

        Returns a list of (epoch index, training RMSE) tuples, one per epoch.
        """
        # Initialize user and item latent feature matrices
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.traintuples[self.ratingcolname])

        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.iterations):
            # sample() returns a new frame, so the shuffled order only takes
            # effect if it is stored; the caller's frame is not modified.
            self.traintuples = self.traintuples.sample(frac=1).reset_index(drop=True)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    def mse(self):
        """
        Compute the root-mean-square error over all training tuples
        (the method name is historical; the value returned is the RMSE).
        """
        users, items, ratings = self._training_arrays()
        predictions_dot = np.einsum('ij,ij->i', self.Q[items, :], self.P[users, :])
        biases = self.b_i[items] + self.b_u[users] + self.b
        predictions = predictions_dot + biases
        return np.sqrt(np.mean((ratings - predictions) ** 2))

    def sgd(self):
        """
        Perform one stochastic gradient descent pass over the training
        tuples, in their current row order.
        """
        users, items, ratings = self._training_arrays()
        for i, j, r in zip(users, items, ratings):
            # Compute prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Update user and item latent feature matrices. The item step must
            # use the user vector from before this step, so keep a copy.
            p_before = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * p_before - self.beta * self.Q[j, :])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def full_matrix(self):
        """
        Compute the full (users x items) prediction matrix from the
        resultant biases, P and Q
        """
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)

Create train-test split of the data

In [11]:
# Load the raw interaction table; the first CSV column is used as the index.
full_data = pd.read_csv('data/final_data.csv', index_col=0)

# Replace steamid/appid with integer codes and keep the four lookup tables.
(coded_data,
 steamidtosid, sidtosteamid,
 appidtoaid, aidtoappid) = constructCodes(full_data)

# Per-user random split with the function's default ratio, then fit the
# per-item CDF rescaling on the training part.
train, test = train_test_split(coded_data)
transformer1 = cdfTransformer(useridcolname='sid', itemidcolname='aid', ratingcolname='playtime_forever')
traintuples = transformer1.fitTransform(train)

# Display the three shapes as the cell output.
(train.shape, test.shape, traintuples.shape)

  mask |= (ar1 == a)


((2643226, 3), (660690, 3), (2643226, 3))

In [12]:
# Rescale the held-out playtimes with the CDF that transformer1 learned from
# the training split, then preview the first rows.
testtuples = transformer1.Transform(test)
testtuples.head()

Unnamed: 0,aid,playtime_forever,sid
0,0,0.100205,73
1,0,0.100205,87
2,0,0.100205,90
3,0,0.100205,99
4,0,0.100205,102


In [13]:
# Sanity check: test-tuple shape, then distinct item ('aid') and user ('sid')
# counts in the train tuples followed by the same two counts in the test tuples.
testtuples.shape, traintuples['aid'].nunique(), traintuples['sid'].nunique(), testtuples['aid'].nunique(), testtuples['sid'].nunique()

((660690, 3), 12982, 13845, 12859, 13845)

In [15]:
# Preview the CDF-rescaled training tuples.
traintuples.head()

Unnamed: 0,aid,playtime_forever,sid
0,0,0.100205,1
1,0,0.100205,30
2,0,0.100205,58
3,0,0.100205,74
4,0,0.100205,75


In [41]:
# Build the user x item and item x user sparse matrices from the training tuples.
data_useritem, data_itemuser = constructSparseMatrices(traintuples)

In [43]:
# Shape is (number of users, number of items).
data_useritem.shape

(13845, 12982)

Train the model

In [44]:
# Grid to search: number of latent factors x NMF `alpha` setting.
latentfactors = [5, 10, 20, 40, 50]
alpha = [0.0001, 0.001, 0.01, 0.1, 0, 1, 10]

# Running best: lowest error seen so far and the (factors, alpha) pair behind it.
besterr = 10000
bestconfig = {}

# diction[latent factors][alpha] -> RMSE returned by evaluateValidationLoss
diction = {}

# NOTE(review): the configuration is selected on `testtuples`, so `besterr`
# is not an independent estimate of generalisation error.
for lf in latentfactors:
    for alp in alpha:
        model = NMF(n_components = lf, alpha = alp)
        # The model is fitted on the item x user matrix, so W has one row per
        # item and the transpose of H has one row per user.
        W = model.fit_transform(data_itemuser)
        H = model.components_
        valerr = evaluateValidationLoss(W, H.T, testtuples, 'aid', 'sid', 'playtime_forever')

        diction.setdefault(lf, {})[alp] = valerr

        if valerr < besterr:
            besterr, bestconfig = valerr, (lf, alp)

print(besterr)
print(bestconfig)

0.4355493840574516
(50, 0.1)


In [54]:
# Refit NMF with the best configuration printed by the grid search above,
# (50, 0.1). The values are hard-coded, so they must be updated by hand if the
# search is re-run and selects something else.
bestmodel = NMF(n_components=50, alpha=0.1)
# W: one row of latent factors per item; H: latent factors x users.
W = bestmodel.fit_transform(data_itemuser)
H = bestmodel.components_

In [55]:
# Predicted rating for every test tuple: inner product of the item's row of W
# with the user's column of H. Uses public numpy API only instead of the
# private numpy.core.umath_tests.inner1d.
predictions = np.einsum('ij,ij->i', W[testtuples['aid'], :], np.transpose(H)[testtuples['sid'], :])

In [59]:
# Attach the predictions as a new column. The Series gets a default 0..n-1
# index and column assignment aligns on index labels.
# NOTE(review): this assumes testtuples also carries a default 0..n-1 index — confirm.
testtuples['pred']= pd.Series(predictions)

In [61]:
# Compare observed (playtime_forever) and predicted (pred) values on the first 500 rows.
testtuples.head(500)

Unnamed: 0,aid,playtime_forever,sid,pred
0,0,0.100205,73,0.085595
1,0,0.100205,87,0.096466
2,0,0.100205,90,0.109805
3,0,0.100205,99,0.094200
4,0,0.100205,102,0.093939
5,0,0.100205,103,0.098537
6,0,0.100205,104,0.086934
7,0,0.100205,106,0.095059
8,0,0.100205,107,0.091801
9,0,0.100205,108,0.100698


In [49]:
# Append the grid-search results (diction) to the results file.
# The context manager closes the file even if the write raises.
with open("sklearnmodel.txt", "a") as f:
    f.write(str(diction))

925