In [1]:
import numpy as np
import pandas as pd
from scipy import spatial

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
mn = train["rating"].mean()

In [4]:
itemr = (train.groupby('book'))['rating'].mean()
trainmn = pd.read_csv('train.csv')
trainmn['mean'] = mn
rmse(trainmn['rating'], trainmn['mean'])

1.7813364070210418

In [3]:
def rmse(A, P):
    return np.sqrt((np.square(A - P)).mean())

In [5]:
def tomatrix(data):
    rows = data.user_id.unique()
    cols = data['book'].unique()
    data = data[['user_id', 'book', 'rating']]
    idict = dict(zip(cols, range(len(cols))))
    udict = dict(zip(rows, range(len(rows))))
    data.user_id = [ udict[i] for i in data.user_id ]
    data['book'] = [ idict[i] for i in data['book'] ]
    nmat = data.as_matrix()
    return nmat


In [6]:
trainmat = tomatrix(train)
testmat = tomatrix(test)
trainspar = np.count_nonzero(trainmat==0)/np.prod(trainmat.shape)*100
testspar = np.count_nonzero(testmat==0)/np.prod(testmat.shape)*100

In [7]:
rows = train.user_id.unique()
cols = train['book'].unique()
naive = np.zeros((len(rows),len(cols)))
for row in trainmat:
    naive[row[0], row[1]] = row[2]
amean1 = np.mean(naive[naive!=0])
umean1 = itemr = sum(naive.T) / sum((naive!=0).T)
imean1 = itemr = sum(naive) / sum((naive!=0))

In [8]:
def predict_naive(user, item):
    prediction = imean1[item] + umean1[user] - amean1
    return prediction


In [9]:
predictions = []
targets = []
for row in trainmat:
    user, item, actual = row[0], row[1], row[2]
    predictions.append(predict_naive(user, item))
    targets.append(actual)
rmse(np.array(targets), np.array(predictions))

1.1444196071250665

In [10]:
def cos(mat, a, b):
    if a == b:
        return 1
    aval = mat.T[a].nonzero()
    bval = mat.T[b].nonzero()
    corated = np.intersect1d(aval, bval)
    if len(corated) == 0:
        return 0
    avec = np.take(mat.T[a], corated)
    bvec = np.take(mat.T[b], corated)
    val = 1 - spatial.distance.cosine(avec, bvec)
    if np.isnan(val):
        return 0
    return val

def pr(mat, a, b, imean):
    if a == b:
        return 1
    aval = mat.T[a].nonzero()
    bval = mat.T[b].nonzero()
    corated = np.intersect1d(aval, bval)
    if len(corated) < 2:
        return 0
    avec = np.take(mat.T[a], corated)
    bvec = np.take(mat.T[b], corated)
    avec1 = avec - imean[a]
    bvec1 = bvec - imean[b]
    val = 1 - spatial.distance.cosine(avec1, bvec1)
    if np.isnan(val):
        return 0
    return val

In [11]:
def itemsimilar(mat, option):
    n = mat.shape[1]
    sim_mat = np.zeros((n,n))
    if option == 'pr':
        print("PR")
        for i in range(n):
            for j in range(n):
                sim_mat[i][j] = pr(mat, i, j, imean1)
        sim_mat = (sim_mat + 1)/2
    elif option == 'cos':
        print("COS")
        for i in range(n):
            for j in range(n):
                sim_mat[i][j] = cos(mat, i, j)
    return sim_mat, amean1, umean1, imean1


In [12]:
def predict(user, item, mat, item_similarity, amean, umean, imean, k=20):
    nzero = mat[user].nonzero()[0]
    if len(nzero) == 0:
        return amean
    baseline = imean + umean[user] - amean
    choice = nzero[item_similarity[item, nzero].argsort()[::-1][:k]]
    prediction = ((mat[user, choice] - baseline[choice]).dot(item_similarity[item, choice])/ sum(item_similarity[item, choice])) + baseline[item]
    if np.isnan(prediction):
        prediction = amean
    if prediction > 10:
        prediction = 10
    if prediction < 1:
        prediction = 1
    return prediction

In [13]:
def get_results(train_data, test_data, option, rows, cols, k):
    full_mat = np.zeros((len(rows),len(cols)))
    for row in train_data:
        full_mat[row[0], row[1]] = row[2]
    item_similarity, amean, umean, imean = itemsimilar(full_mat, option)
    preds = []
    real = []
    for row in train_data:
        user, item, actual = row[0], row[1], row[2]
        preds.append(predict_naive(user, item))
        real.append(actual)
    err1 = rmse(np.array(real), np.array(preds))
    preds = []
    real = []
    for row in test_data:
        user, item, actual = row[0], row[1], row[2]
        preds.append(predict_naive(user, item))
        real.append(actual)
    err2 = rmse(np.array(real), np.array(preds))
    print("Train Error")
    print(err1)
    print("Test Error")
    print(err2)
get_results(trainmat, testmat, "cos", rows, cols, 5)

COS
Train Error
1.1444196071250665
Test Error
2.2847566119422518


In [None]:
#