In [12]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sparse

In [13]:
def constructCodes(df) :
    """
    Maps 'steamid' to categorical code 'sid'
    Maps 'appid' to categorical code 'aid'
    Returns :
        1. Dataframe with columns 'sid', 'aid', 'playtime_forever'
        2. Dictionary mapping 'steamid' to 'sid'
        3. Dictionary mapping 'sid' to 'steamid'
        4. Dictionary mapping 'appid' to 'aid'
        5. Dictionary mapping 'aid' to 'appid'
    """
    coded_df = df.copy(deep = True)
    coded_df["steamid"] = coded_df["steamid"].astype("category")
    coded_df["appid"] = coded_df["appid"].astype("category")
    coded_df["sid"] = coded_df["steamid"].cat.codes
    coded_df["aid"] = coded_df["appid"].cat.codes
    
    sid_to_steamid = dict(enumerate(coded_df["steamid"].cat.categories))
    aid_to_appid = dict(enumerate(coded_df["appid"].cat.categories))
    steamid_to_sid = {v : k for k, v in sid_to_steamid.items()}
    appid_to_aid = {v : k for k, v in aid_to_appid.items()}
    
    coded_df.drop(["steamid", "appid"], axis = 1, inplace = True)
    
    return(coded_df, steamid_to_sid, sid_to_steamid, appid_to_aid, aid_to_appid)

In [31]:
def getSplit(df, train_size, validation_size) :
    """
    Returns Train, Validation and Test DataFrames
    """
    train = df.copy(deep = True)
    val = df.copy(deep = True)
    test = df.copy(deep = True)
    
    assert train_size + validation_size < 1, "train_size + validation_size should be less than 1"
    test_size = 1 - train_size - validation_size
    
    test_sample = df.groupby("sid").apply(lambda x : x.sample(frac = test_size))
    test_sample.reset_index(level = 0, drop = True, inplace = True)
    test_idx = test_sample.index
    train.drop(test_idx, inplace = True)
    val.drop(test_idx, inplace = True)
    
    val_sample = train.groupby("sid").apply(lambda x : x.sample(frac = validation_size / (1 - test_size)))
    val_sample.reset_index(level = 0, drop = True, inplace = True)
    val_idx = val_sample.index
    train.drop(val_idx, inplace = True)
    
    assert train["sid"].nunique() == df["sid"].nunique()
    assert train["aid"].nunique() == df["aid"].nunique()
    assert val["sid"].nunique() == df["sid"].nunique()
    assert val["aid"].nunique() == df["aid"].nunique()
    
    return(train, val, test)

In [15]:
def constructSparseMatrices(df) :
    """
    Constructs sparse matrices that will be used in ALS optimization
    Input : Dataframe with columns - 'sid', 'aid', 'playtime_forever'
    
    Returns :
        1. User x Item Sparse Matrix
        2. Item x User Sparse Matrix
    """
    data_useritem = sparse.csr_matrix((df["playtime_forever"], (df["sid"], df["aid"])))
    data_itemuser = sparse.csr_matrix((df["playtime_forever"], (df["aid"], df["sid"])))
    
    sid_unique = df["sid"].nunique()
    aid_unique = df["aid"].nunique()
    
    assert data_useritem.shape == (sid_unique, aid_unique)
    assert data_itemuser.shape == (aid_unique, sid_unique)
    
    return(data_useritem, data_itemuser)

In [16]:
def trainModel(data, factors, epochs, conf_func, alpha, lmbda) :
    """
    Builds and trains Implicit Matrix Factorization model.
    Input :
        1. data - Item x User Sparse Matrix
        2. factors - Number of latent factors
        3. epochs - Number of iterations of ALS over the training data
        4. conf_func - Confidence function
        5. alpha - Confidence parameter
        6. lmbda - Regularization parameter
        
    Output : model
    """
    model = implicit.als.AlternatingLeastSquares(factors = factors, regularization = lmbda, iterations = epochs)
    if conf_func == "linear" :
        model.fit(alpha * data)
    else :
        print("{} is not a valid choice for conf_func. Choose one of the following : 'linear'".format(conf_func))
        return(None)
    return(model)

In [17]:
def getPrecisionRecall(model, sid, N, train, test, data_useritem) :
    """
    Returns the precision@N and recall@N for given userid : 'sid'
    Input :
        1. model - Implicit Matrix Factorization trained model
        2. sid - User id whose precision and recall is to be calculated
        3. N - N parameter in precision@N and recall@N
        4. train
        5. test
        6. data_useritem - User x Item Sparse Matrix (implicit library requires this to discard already played itemids) 
    """
    #masked_aid, full_aid = getMaskedAid(sid, train, test)
    #recommendations = model.recommend(sid, data_useritem, N)
    #recommended_aid = [aid for aid, score in recommendations]
    #precision = len(set(masked_aid) & set(recommended_aid)) / N
    #recall = len(set(masked_aid) & set(recommended_aid)) / len(masked_aid)
    #return(precision, recall)

In [18]:
def getMeanPrecisionRecall(model, N, train, test, data_useritem) :
    """
    Returns the User-Average Precision@N and User-Average Recall@N
    """
    pass

In [19]:
def gridSearch(params) :
    """
    Performs a grid-search on input params
    """
    pass

#### Coarse Grid Search
We perform a coarse grid search on the parameters. Based on the results, we fine-tune the parameters manually.

#### Grid Search Observations :

#### Choice of Parameters :

#### Mean User-Average Precision@N and Recall@N vs Factors :

#### Mean User-Average Precision@N and Recall@N for Optimal Choice of Parameters vs N :

#### Game Similarity Observations :

#### Explanation for Recommendations :

In [38]:
df = pd.read_csv("data/final_data.csv", index_col = 0)

  mask |= (ar1 == a)


In [47]:
coded_df, steamid_to_sid, sid_to_steamid, appid_to_aid, aid_to_appid = constructCodes(df)
train, val, test = getSplit(coded_df, 0.45, 0.35)

In [48]:
len(train) / len(df)

0.4499851085802424

In [49]:
len(test) / len(df)

1.0

In [50]:
len(val) / len(df)

0.8000282089496222