In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the ratings CSV into a DataFrame; the path is relative to the notebook's working directory.
data = pd.read_csv('./data/ml-small/ratings.csv')

In [3]:
# Show the (rows, columns) dimensions of the loaded table.
print(data.shape)

(100004, 4)


In [5]:
# Print the first row of the table as a NumPy array. Note that the printed row
# shows every value as a float, including the id columns.
print(data.values[0])

[  1.00000000e+00   3.10000000e+01   2.50000000e+00   1.26075914e+09]


In [7]:
## Access a particular column in a DataFrame: indexing by column name returns a Series
movie_ids = data['movieId']
print(type(movie_ids))
# Use positional indexing for "the first element": `movie_ids[0]` is a label
# lookup and only works while the index happens to contain the label 0.
print(movie_ids.iloc[0])  # Get the first movie id

<class 'pandas.core.series.Series'>
31


In [11]:
# Create a user - movie rating matrix (work in progress: currently only previews the data)
def create_matrix(filename, num_users, num_movies):
    """Read a ratings CSV and print the ``movieId`` of its first rows.

    Despite its name, this function does not build or return a matrix yet;
    it only prints a preview of the file it reads.

    Parameters
    ----------
    filename : str
        Path of a CSV file that contains a ``movieId`` column.
    num_users, num_movies : int
        Intended matrix dimensions. Currently unused.

    Returns
    -------
    None
    """
    df = pd.read_csv(filename)

    # TODO: allocate the matrix. The shape must be passed as a single tuple:
    # matrix = np.empty((num_users, num_movies))

    # Preview at most 10 rows, so a file with fewer rows does not raise IndexError.
    for row_num in range(min(10, len(df))):
        print(df.iloc[row_num]['movieId'])
    
# Run the preview on the ratings file; 671 and 9125 are passed as the user and movie counts.
# NOTE(review): a later cell counts 9066 unique movies in this file, so 9125 presumably
# comes from another source (e.g. a movies table) - confirm.
create_matrix('./data/ml-small/ratings.csv', 671, 9125)

    


31.0
1029.0
1061.0
1129.0
1172.0
1263.0
1287.0
1293.0
1339.0
1343.0


In [5]:
# Read the ratings file into `df` and preview its first rows.
# (Unused alternative column names, kept for reference.)
#names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('./data/ml-small/ratings.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [12]:
# Count the distinct user ids and movie ids that appear in the ratings table.
n_users, n_items = (len(df[column].unique()) for column in ('userId', 'movieId'))
print("Number of unique users: %d" % n_users)
print("Number of unique movies: %d" % n_items)

Number of unique users: 671
Number of unique movies: 9066


In [18]:
# Create a dictionary from movieId to column index, plus the inverse list
# (movie_list[index] -> movieId). Indices follow first appearance in `df`.
movie_dict = {}
movie_list = []

for ind, item in enumerate(df['movieId'].unique()):
    # Append rather than assign by index: `movie_list[ind] = item` raises
    # IndexError because the list starts out empty.
    movie_list.append(item)
    movie_dict[item] = ind

In [19]:
# Build the dense user x movie ratings matrix; cells without a rating stay 0.
ratings = np.zeros(shape=(n_users, n_items))

# Walk the three relevant columns in lockstep. The row is `userId - 1` and the
# column comes from the movieId -> index dictionary.
for user_id, movie_id, rating in zip(df['userId'], df['movieId'], df['rating']):
    ratings[user_id - 1, movie_dict[movie_id]] = rating

#print(ratings)

[[ 2.5  3.   3.  ...,  0.   0.   0. ]
 [ 0.   0.   0.  ...,  0.   0.   0. ]
 [ 0.   0.   0.  ...,  0.   0.   0. ]
 ..., 
 [ 0.   0.   0.  ...,  0.   0.   0. ]
 [ 0.   0.   0.  ...,  0.   0.   0. ]
 [ 0.   0.   0.  ...,  0.   0.   0. ]]


In [21]:
# Percentage of user/movie cells that actually hold a rating
# (the notebook reports this fill ratio under the label "Sparsity").
sparsity = float(np.count_nonzero(ratings)) / ratings.size * 100
print('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 1.64%


In [23]:
# Split data into training and test sets by moving up to `n_test` ratings per user from the training set to the test set
def train_test_split(ratings, n_test=10, seed=None):
    """Hold out a random sample of each user's ratings as a test set.

    Parameters
    ----------
    ratings : np.ndarray
        2-D array with one row per user; a value of 0 means "no rating".
    n_test : int, optional
        Maximum number of ratings to hold out per user (default 10). A user
        with fewer than ``n_test`` ratings has all of them moved to the test
        set instead of triggering a ValueError in the sampler.
    seed : int or None, optional
        When given, sampling uses a private ``RandomState(seed)`` so the split
        is reproducible. When None, NumPy's global random state is used.

    Returns
    -------
    (train, test) : tuple of np.ndarray
        Arrays with the shape of ``ratings``. Each held-out rating is zeroed
        in ``train`` and copied into ``test``.

    Raises
    ------
    ValueError
        If ``n_test`` is negative.
    """
    if n_test < 0:
        raise ValueError("n_test must be non-negative, got %r" % (n_test,))

    # `np.random` (module) and `RandomState` both expose `.choice`.
    rng = np.random if seed is None else np.random.RandomState(seed)

    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        rated_items = ratings[user, :].nonzero()[0]
        # Never ask for more samples than the user has ratings.
        n_held_out = min(n_test, len(rated_items))
        if n_held_out == 0:
            continue
        test_ratings = rng.choice(rated_items, size=n_held_out, replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]

    # Test and training are truly disjoint
    assert np.all((train * test) == 0)
    return train, test

# Fix NumPy's global seed so the random hold-out, and every number derived
# from it further down, is reproducible across kernel restarts.
np.random.seed(0)
train, test = train_test_split(ratings)

In [24]:
def similarity(ratings, kind='user', epsilon=1e-9):
    """Cosine similarity between the rows or the columns of ``ratings``.

    Parameters
    ----------
    ratings : np.ndarray
        2-D ratings array (rows are users, columns are items).
    kind : {'user', 'item'}
        'user' compares rows with each other, 'item' compares columns.
    epsilon : float
        Small number added to every dot product so that all-zero rows or
        columns do not cause a divide-by-zero when normalising.

    Returns
    -------
    np.ndarray
        Square matrix of pairwise similarities with ones on the diagonal.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item'.
    """
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        # Without this branch `sim` is never bound and the code below fails
        # with a confusing UnboundLocalError.
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))
    # The diagonal holds the squared vector norms (plus epsilon).
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [27]:
# Compute both similarity matrices from the training ratings only, then
# inspect the top-left 4x4 corner of the item-item matrix.
user_similarity = similarity(train, kind='user')
item_similarity = similarity(train, kind='item')
print(item_similarity[:4, :4])

[[ 1.          0.11256286  0.17489071  0.13705608]
 [ 0.11256286  1.          0.18030721  0.2053659 ]
 [ 0.17489071  0.18030721  1.          0.23087476]
 [ 0.13705608  0.2053659   0.23087476  1.        ]]


In [29]:
# Row-wise sums of the absolute item similarities, kept 2-D as a single row,
# to check the shape used when normalising predictions.
a = np.abs(item_similarity).sum(axis=1)[np.newaxis, :]
a.shape

(1, 9066)

In [36]:
def predict_fast_simple(ratings, similarity, kind='user'):
    """Predict every rating as a similarity-weighted average of known ratings.

    Parameters
    ----------
    ratings : np.ndarray
        2-D ratings array (rows are users, columns are items).
    similarity : np.ndarray
        Square similarity matrix: user-user for ``kind='user'``, item-item
        for ``kind='item'``.
    kind : {'user', 'item'}
        Which axis the similarity matrix refers to.

    Returns
    -------
    np.ndarray
        Predicted ratings with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item' (previously this silently
        returned None).
    """
    # Normaliser: the sum of absolute similarities along each row.
    weights = np.abs(similarity).sum(axis=1)
    if kind == 'user':
        # Column vector, so row u of the result is divided by weights[u].
        return similarity.dot(ratings) / np.array([weights]).T
    elif kind == 'item':
        # Row vector, so column i of the result is divided by weights[i].
        # Row sums are used as column normalisers, which relies on the
        # similarity matrix being symmetric.
        return ratings.dot(similarity) / np.array([weights])
    raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))

In [33]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    """Mean squared error between ``pred`` and ``actual`` on rated cells only.

    Only positions where ``actual`` is nonzero are compared; zero entries in
    ``actual`` are ignored.
    """
    rated = actual.nonzero()
    predicted_values = pred[rated].flatten()
    actual_values = actual[rated].flatten()
    return mean_squared_error(predicted_values, actual_values)

In [37]:
# Predict ratings with item-based collaborative filtering and score them on
# the held-out test ratings.
item_prediction = predict_fast_simple(train, item_similarity, kind='item')
# The user-based variant is currently disabled:
#user_prediction = predict_fast_simple(train, user_similarity, kind='user')

#print('User-based CF MSE: ' + str(get_mse(user_prediction, test)))
print('Item-based CF MSE: ' + str(get_mse(item_prediction, test)))

Item-based CF MSE: 13.5567387802


In [39]:
# Experimenting with numpy's argsort(): print the sorting indices for the
# default axis, then along axis 0 (columns) and axis 1 (rows).
x = np.array([[0, 3], [2, 2]])
for axis_kwargs in ({}, {'axis': 0}, {'axis': 1}):
    y = np.argsort(x, **axis_kwargs)
    print(y)

[[0 1]
 [0 1]]
[[0 1]
 [1 0]]
[[0 1]
 [0 1]]
