In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler

In [11]:
# Read data/final_data.csv, using the file's first column as the row index.
full_data = pd.read_csv(filepath_or_buffer = 'data/final_data.csv', index_col = 0)

  mask |= (ar1 == a)


In [12]:
class cdfTransformer(object):
    """Convert raw playtimes into per-app empirical CDF values.

    For every ``appid`` the fitted transformer stores, for each observed
    ``playtime_forever`` value, the fraction of that app's counted rows whose
    playtime is less than or equal to that value.  Input frames are expected
    to carry the columns ``steamid``, ``appid`` and ``playtime_forever``.
    """

    # Default for an unfitted transformer.  fitTransform rebinds this attribute
    # on the instance (it is never mutated in place), so instances do not share
    # fitted state through the class.
    transformerSeries = {}

    def getLowerOrEqualIndex(self,playtimeList, playtime):
        """Binary-search an ascending sequence for the last entry <= playtime.

        Parameters
        ----------
        playtimeList : indexable sequence sorted in ascending order.
        playtime : value to locate.

        Returns
        -------
        int
            Position of the largest element that is <= ``playtime``; 0 when
            ``playtime`` is smaller than every element.
        """
        if playtime < playtimeList[0]:
            return 0
        ans = 0
        low, high = 0, len(playtimeList) - 1
        while low <= high:
            mid = low + (high - low)//2
            candidate = playtimeList[mid]
            if playtime == candidate:
                return mid
            if playtime > candidate:
                # candidate is a valid lower bound; look for a closer one to the right
                ans = mid
                low = mid + 1
            else:
                high = mid - 1
        return ans

    def getNearestCdf(self,appid, playtime):
        """Return the fitted CDF value for ``appid`` at (or just below) ``playtime``.

        Playtimes below the smallest fitted playtime of the app map to the CDF
        value of that smallest playtime.  Raises ``KeyError`` when ``appid`` was
        not present in the data passed to ``fitTransform``.
        """
        # Selecting the first level of the (appid, playtime_forever) index
        # leaves a Series indexed by playtime only.
        app_cdf = self.transformerSeries[appid]
        bestpos = self.getLowerOrEqualIndex(app_cdf.index, playtime)
        return app_cdf.iloc[bestpos]

    def fitTransform(self,tupledata):
        """Fit the per-app CDF on ``tupledata`` and return it joined to the input.

        Parameters
        ----------
        tupledata : pandas.DataFrame
            Must contain ``steamid``, ``appid`` and ``playtime_forever``.

        Returns
        -------
        pandas.DataFrame
            The rows of ``tupledata`` inner-joined on
            ``(appid, playtime_forever)`` with a new ``playtime_cdf`` column.
            Any other non-key column of ``tupledata`` additionally appears
            with a ``_newdf`` suffix holding its own cumulative fraction.
        """
        counts = tupledata.groupby(["appid","playtime_forever"]).count()
        cumulative = counts.groupby(level=[0]).cumsum()
        # transform keeps the (appid, playtime_forever) index, so the division
        # below is a plain same-index operation with no level broadcasting.
        totals = cumulative.groupby(level=[0]).transform('max')
        withcdf = cumulative/totals
        self.transformerSeries = withcdf['steamid']
        withcdf_df = withcdf.reset_index(level=[0,1]).rename(columns={"steamid":"playtime_cdf"})
        # Join back onto the frame that was passed in (the original referenced
        # an unrelated name, `data`, that this method never receives).
        finaltuple = pd.merge(withcdf_df,tupledata, on=['appid','playtime_forever'],how='inner',suffixes=('_newdf',''))
        return finaltuple

    def Transform(self,tupledata):
        """Look up fitted CDF values for the (appid, playtime) pairs of ``tupledata``.

        Returns one row per distinct ``(appid, playtime_forever)`` pair with a
        ``playtime_cdf`` column; the ``steamid`` count column is dropped.
        Requires a prior call to ``fitTransform``; an ``appid`` that was not
        seen during fitting raises ``KeyError``.
        """
        ansdata = tupledata.groupby(["appid","playtime_forever"]).count().reset_index()
        ansdata = ansdata.drop('steamid', axis = 1)
        # Iterate the two key columns directly: a row-wise apply would upcast
        # both keys to one dtype and returns a frame (not a column) when
        # ansdata is empty.
        ansdata['playtime_cdf'] = [
            self.getNearestCdf(appid, playtime)
            for appid, playtime in zip(ansdata['appid'], ansdata['playtime_forever'])
        ]
        return ansdata

In [13]:
def train_test_split(dff, split_ratio = 0.8):
    """Split a ratings frame into train/test parts, stratified per user.

    For every ``steamid`` a random fraction ``split_ratio`` of that user's rows
    goes to the training set; all remaining rows (including rows whose
    ``steamid`` is missing) go to the test set.

    Parameters
    ----------
    dff : pandas.DataFrame
        Must contain a ``steamid`` column.
    split_ratio : float, default 0.8
        Fraction of each user's rows to place in the training set (0..1).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, both with a fresh 0..n-1 index and the same columns
        as ``dff``.

    Raises
    ------
    ValueError
        If ``split_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError("split_ratio must be between 0 and 1, got {!r}".format(split_ratio))

    # Sample row *positions* rather than index labels, so the split stays
    # correct even when dff's index contains duplicate labels.
    row_positions = pd.Series(np.arange(len(dff)))
    sampled = row_positions.groupby(dff['steamid'].values).apply(
        lambda rows: rows.sample(frac = split_ratio)
    )
    train_positions = np.asarray(sampled.values, dtype = int)

    in_train = np.zeros(len(dff), dtype = bool)
    in_train[train_positions] = True

    tr_sample = dff.iloc[train_positions].reset_index(drop = True)
    te_sample = dff[~in_train].reset_index(drop = True)

    return tr_sample, te_sample

In [24]:
# Hold out 20% of every user's rows for evaluation.
train_data, test_data = train_test_split(full_data, split_ratio = 0.8)

In [34]:
# N = number of distinct appid values, M = number of distinct steamid values
N, M = (full_data[column].nunique() for column in ('appid', 'steamid'))

# K = number of latent factors (a hyperparameter of the factorization)
K = 10
N,M,K
#-------------------------
# LOAD AND PREP THE DATA
#-------------------------
 
#  
 # Convert artist names into numerical IDs
 #data['user_id'] = data['user'].astype("category").cat.codes
 #data['artist_id'] = data['artist'].astype("category").cat.codes
 
 # Create a lookup frame so we can get the artist names back in 
 # readable form later.
 #item_lookup = data[['artist_id', 'artist']].drop_duplicates()
 #item_lookup['artist_id'] = item_lookup.artist_id.astype(str)
 
 #data = data.drop(['user', 'artist'], axis=1)

(12982, 13845, 10)

In [None]:
def implicit_als(iterations, total_users, total_items, ratingsbyuser, ratingsbyitem,
                 latent_factors = 10, reg = 0.01, random_state = None):
    """Fit a biased matrix factorization with alternating least squares.

    The model predicts ``r[i, j] ~ U[i, :].dot(V[:, j]) + B[i] + C[j] + mu``.
    Each pass updates, in order, the user biases ``B``, the user factors ``U``,
    the item biases ``C`` and the item factors ``V``; every update is the
    closed-form ridge-regularized least-squares solution with the other
    parameters held fixed.

    Parameters
    ----------
    iterations : int
        Number of full alternating passes.
    total_users, total_items : int
        Number of user rows / item columns; user and item ids must be
        integers in ``range(total_users)`` / ``range(total_items)``.
    ratingsbyuser : dict
        ``user_id -> iterable of (item_id, rating)``.
    ratingsbyitem : dict
        ``item_id -> iterable of (user_id, rating)``.
    latent_factors : int, default 10
        Dimensionality of the latent space.
    reg : float, default 0.01
        L2 regularization strength.
    random_state : int or None
        Seed for the random initialization of ``U`` and ``V``.

    Returns
    -------
    (U, V, B, C, mu)
        ``U`` is ``(total_users, latent_factors)``, ``V`` is
        ``(latent_factors, total_items)``, ``B`` and ``C`` are 1-D bias
        vectors and ``mu`` is the mean of all ratings in ``ratingsbyuser``
        (0.0 when there are none).
    """
    rng = np.random.RandomState(random_state)
    U = rng.randn(total_users, latent_factors) / latent_factors
    V = rng.randn(latent_factors, total_items) / latent_factors
    B = np.zeros(total_users)
    C = np.zeros(total_items)

    all_ratings = [r for user_ratings in ratingsbyuser.values() for _, r in user_ratings]
    mu = float(np.mean(all_ratings)) if all_ratings else 0.0

    ridge = reg*np.eye(latent_factors)

    for _ in range(iterations):

        # update B (one bias per *user*, hence the loop over total_users)
        for i in range(total_users):
            user_ratings = ratingsbyuser.get(i)
            if not user_ratings:
                continue
            accum = 0
            for j, r in user_ratings:
                accum += (r - U[i,:].dot(V[:,j]) - C[j] - mu)
            B[i] = accum / (len(user_ratings) + reg)

        # update U
        for i in range(total_users):
            user_ratings = ratingsbyuser.get(i)
            if not user_ratings:
                continue
            matrix = ridge.copy()
            vector = np.zeros(latent_factors)
            for j, r in user_ratings:
                matrix += np.outer(V[:,j], V[:,j])
                vector += (r - B[i] - C[j] - mu)*V[:,j]
            U[i,:] = np.linalg.solve(matrix, vector)

        # update C
        for j in range(total_items):
            item_ratings = ratingsbyitem.get(j)
            if not item_ratings:
                continue
            accum = 0
            for i, r in item_ratings:
                accum += (r - U[i,:].dot(V[:,j]) - B[i] - mu)
            C[j] = accum / (len(item_ratings) + reg)

        # update V
        for j in range(total_items):
            item_ratings = ratingsbyitem.get(j)
            if not item_ratings:
                continue
            matrix = ridge.copy()
            vector = np.zeros(latent_factors)
            for i, r in item_ratings:
                matrix += np.outer(U[i,:], U[i,:])
                vector += (r - B[i] - C[j] - mu)*U[i,:]
            V[:,j] = np.linalg.solve(matrix, vector)

    return U, V, B, C, mu

In [None]:
class matrixFactorizer(object):
    
    def __init__(self, latentFactors = 3, max_iterations = 20, reg = 0.01):        
        self.numLatent = latentFactors
        self.max_iterations = max_iterations
        self.reg = reg
    
    def fit(self,train_ratingsbyuser, train_ratingsbyitem, total_users, total_items):
        self.U = np.random.randn(total_users, self.numLatent) / self.numLatent
        self.V = np.random.randn(self.numLatent, total_items) / self.numLatent
        
        for t in xrange(self.max_iterations):
          # update U
          for i in xrange(total_users):
            if i in train_ratingsbyuser:
              matrix = np.zeros((K, K)) + self.reg*np.eye(K)
              vector = np.zeros(K)
              for j, r in train_ratingsbyuser[i]:
                matrix += np.outer(self.V[:,j], self.V[:,j])
                vector += (r)*V[:,j]
              self.U[i,:] = np.linalg.solve(matrix, vector)

          # update V
          for j in xrange(total_items):
            if j in train_ratingsbyitem:
              matrix = np.zeros((K, K)) + self.reg*np.eye(K)
              vector = np.zeros(K)
              for i, r in train_ratingsbyitem[j]:
                matrix += np.outer(self.U[i,:], self.U[i,:])
                vector += (r)*U[i,:]
              self.V[:,j] = np.linalg.solve(matrix, vector)
    def predict()