In [67]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.decomposition import PCA
%matplotlib inline
import random
from sklearn import preprocessing
import scipy.sparse as sparse
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Load the ratings table; later cells read its itemID, userID and rating columns.
df = pd.read_csv('datasets/reviews_simpleInt.csv')

## Note to team: I renamed the rating column header from `overall` to `rating`

## Creating a Sparse Matrix

In [3]:
# Show the table dimensions, then preview the first rows.
print(df.shape)
df.head()

(982619, 3)


Unnamed: 0,itemID,userID,rating
0,0,7773,5
1,0,61894,4
2,0,53977,4
3,0,8128,5
4,0,50527,4


In [19]:
# Distinct user / item counts; these are used below as the sparse-matrix dimensions.
# NOTE(review): that only works if IDs are contiguous and zero-based — confirm upstream.
users = df.userID.nunique()
items = df.itemID.nunique()

In [20]:
# Ratings matrix: one row per userID, one column per itemID, cell value = rating.
mtx = sparse.csr_matrix(
    (df.rating, (df.userID, df.itemID)),
    shape=(users, items),
)

In [21]:
mtx

<68223x61934 sparse matrix of type '<type 'numpy.int64'>'
	with 982619 stored elements in Compressed Sparse Row format>

In [22]:
# NOTE(review): todense() materializes every cell of the users x items matrix
# (68223 x 61934 per the repr of mtx) — consider viewing a small slice such as mtx[:5, :5] instead.
mtx.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [23]:
mtx.data

array([5, 5, 5, ..., 5, 5, 4])

In [24]:
mtx.indices

array([34942, 45499, 52773, ..., 30595, 31186, 33550], dtype=int32)

In [25]:
mtx.indptr

array([     0,      6,     12, ..., 982592, 982601, 982619], dtype=int32)

**Determine Avg for Item**

In [36]:
mtx[:,0].sum()

34

In [39]:
len(mtx[:,0].nonzero()[0])

8

**Determine Avg for User**

In [53]:
mtx[0,:].sum()

30

In [54]:
len(mtx[0,:].nonzero()[0])

6

**Determine Avg for Population**

In [55]:
mtx.sum()

4272232

In [56]:
len(mtx.nonzero()[0])

982619

In [59]:
# Mean over nonzero cells only, so unrated (zero) cells do not drag the average down.
nPopulationRatings = len(mtx.nonzero()[0])
yAvg_Population = mtx.sum() / float(nPopulationRatings)
yAvg_Population

4.347801131466011

## Simple Averaging

In [44]:
# Last column is the target rating; the remaining columns (itemID, userID) are the features.
x, y = df.values[:, :-1], df.values[:, -1]

In [45]:
# Hold out half of the ratings for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, random_state=42
)

In [51]:
# Training-only ratings matrix: rows come from x column 1 (userID), columns from x column 0 (itemID).
mtxTrain = sparse.csr_matrix(
    (y_train, (x_train[:, 1], x_train[:, 0])),
    shape=(users, items),
)

In [60]:
# Mean training rating, averaged over the nonzero cells of the training matrix.
nTrainRatings = len(mtxTrain.nonzero()[0])
yAvg_Train = mtxTrain.sum() / float(nTrainRatings)
yAvg_Train

4.3452592971022312

In [40]:
def RMSE(y_actual, y_predicted):
    """Return the root-mean-squared error between actual and predicted values."""
    mse = mean_squared_error(y_actual, y_predicted)
    return sqrt(mse)

In [52]:
x_test

array([[54296, 65709],
       [47453,  2144],
       [10421, 63024],
       ..., 
       [57463, 21521],
       [25928, 64254],
       [40094, 31300]])

In [96]:
x_test.shape

(491310, 2)

In [97]:
def alwaysPredict(x_test, val):
    """Constant baseline: return one float copy of `val` per row of `x_test`."""
    nRows = x_test.shape[0]
    return np.full(nRows, float(val), dtype=float)

In [85]:
def obviPredict(x_test, wItem = .5, mtx = None, fallback = None):
    """Predict ratings by blending per-item and per-user mean training ratings.

    For every (itemID, userID) row the prediction is:
      * wItem * itemAvg + (1 - wItem) * userAvg when both have nonzero ratings,
      * the user's mean when only the user has nonzero ratings,
      * the item's mean when only the item has nonzero ratings,
      * `fallback` when neither has any.

    Parameters
    ----------
    x_test : 2-D integer array
        Column 0 holds itemIDs, column 1 holds userIDs.
    wItem : float
        Weight of the item mean; the user mean gets 1 - wItem.
    mtx : scipy sparse matrix, optional
        Ratings matrix indexed [userID, itemID]. Defaults to the
        notebook-level `mtxTrain`.
    fallback : float, optional
        Prediction for pairs with no usable history. Defaults to the
        notebook-level `yAvg_Train`.

    Returns
    -------
    1-D float numpy array with one prediction per row of `x_test`.
    """
    if mtx is None:
        mtx = mtxTrain
    if fallback is None:
        fallback = yAvg_Train
    wUser = float(1 - wItem)

    x_test = np.asarray(x_test)
    itemIDs = x_test[:, 0].astype(np.intp)
    userIDs = x_test[:, 1].astype(np.intp)

    # Aggregate once for the whole matrix instead of slicing the sparse
    # matrix (a full scan for column slices of CSR) on every test row.
    ratedUsers, ratedItems = mtx.nonzero()
    nUsers, nItems = mtx.shape
    userCounts = np.bincount(ratedUsers, minlength=nUsers)
    itemCounts = np.bincount(ratedItems, minlength=nItems)
    userSums = np.asarray(mtx.sum(axis=1)).ravel()
    itemSums = np.asarray(mtx.sum(axis=0)).ravel()

    userN = userCounts[userIDs]
    itemN = itemCounts[itemIDs]
    hasUser = userN > 0
    hasItem = itemN > 0

    # Rows with a zero count produce inf/nan here; they are never selected
    # by the masks below, so the warnings are silenced rather than acted on.
    with np.errstate(divide='ignore', invalid='ignore'):
        userAvg = userSums[userIDs] / userN.astype(float)
        itemAvg = itemSums[itemIDs] / itemN.astype(float)

    predict = np.full(len(userIDs), float(fallback), dtype=float)
    onlyUser = hasUser & ~hasItem
    onlyItem = hasItem & ~hasUser
    both = hasUser & hasItem
    predict[onlyUser] = userAvg[onlyUser]
    predict[onlyItem] = itemAvg[onlyItem]
    predict[both] = wItem * itemAvg[both] + wUser * userAvg[both]

    return predict

In [86]:
# Quick smoke test: score only the first 10 test rows.
obviPredictions = obviPredict(x_test[:10])

In [87]:
RMSE(y_test[:10], obviPredictions)

0.7817177707636676

In [91]:
# Evaluate the blended-average predictor on the first nPredict test rows only.
nPredict = 1000
x_eval, y_eval = x_test[:nPredict], y_test[:nPredict]
obviPredictions = obviPredict(x_eval)
obviTestRMSE = RMSE(y_eval, obviPredictions)


In [93]:
print("Obvi Algo Test RSME:     %0.2f" % obviTestRMSE)

Obvi Algo Test RSME:     0.80


In [98]:
# Baseline for comparison: predict the mean training rating for the same first nPredict rows.
nPredict = 1000
x_eval, y_eval = x_test[:nPredict], y_test[:nPredict]
always_yAvg_predictions = alwaysPredict(x_eval, yAvg_Train)
always_yAvg_RMSE = RMSE(y_eval, always_yAvg_predictions)


In [99]:
print("Always Predict Y_Train_Avg Test RSME:     %0.2f" % always_yAvg_RMSE)

Always Predict Y_Train_Avg Test RSME:     0.92
