In [1]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from ntflib import betantf

%matplotlib inline
sns.set(style="white")

## Defining functions for mapping and error

In [2]:
def mapper(array):
    array = np.sort(array)
    int_map = np.arange(len(np.unique(array))).astype(int)
    dict_map = dict(zip(np.sort(np.unique(array)), int_map))
    tmp = pd.Series(array)
    res = tmp.map(lambda x: dict_map[x])
    
    inv_dict_map = {v: k for k, v in dict_map.items()}
    return res.values, inv_dict_map

def rmse(x, y):
    return np.sqrt((x - y)**2.0).sum()

## Grabbing Movie Lens data

In [2]:
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

--2015-06-25 10:25:08--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org... 128.101.34.146
Connecting to files.grouplens.org|128.101.34.146|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917392 (5.6M) [application/zip]
Saving to: 'ml-1m.zip'


2015-06-25 10:25:22 (437 KB/s) - 'ml-1m.zip' saved [5917392/5917392]



## Parsing data and cleaning it up for NTFLib

In [3]:
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
ratings.Timestamp = ratings.Timestamp.map(lambda x: datetime.datetime.fromtimestamp(x).strftime('%Y-%m'))
# movies = pd.read_table('ml-1m/movies.dat', sep='::', names=['MovieID', 'Title', 'Genres'])
# users = pd.read_table('ml-1m/users.dat', sep='::', names=['UserID' ,'Gender', 'Age', 'Occupation::Zip-code'])



In [4]:
# Converting dates to integers
ratings['UserID'], inv_uid_dict = mapper(ratings['UserID'])
ratings['MovieID'], inv_mid_dict = mapper(ratings['MovieID'])
ratings['Timestamp'], inv_ts_dict = mapper(ratings['Timestamp'])

In [5]:
x_indices = ratings[['UserID', 'MovieID', 'Timestamp']].copy()
x_indices['UserID'] = x_indices['UserID'] - x_indices['UserID'].min() 
x_indices['MovieID'] = x_indices['MovieID'] - x_indices['MovieID'].min()
x_indices['Timestamp'] = x_indices['Timestamp'] - x_indices['Timestamp'].min() 

print x_indices.min()
x_indices = x_indices.values
x_vals = ratings['Rating'].values

UserID       0
MovieID      0
Timestamp    0
dtype: int64


In [6]:
print 'Number of unique movie IDs: {0}'.format(len(ratings['MovieID'].unique()))
print 'Max movie ID: {0}'.format(ratings['MovieID'].max())

Number of unique movie IDs: 3706
Max movie ID: 3705


In [7]:
indices_train, indices_test, val_train, val_test = train_test_split(
    x_indices, x_vals, test_size=0.40, random_state=42)

shape_uid = len(np.unique(x_indices[:,0]))
shape_mid = len(np.unique(x_indices[:,1]))
shape_ts = len(np.unique(x_indices[:,2]))
shape = [shape_uid, shape_mid, shape_ts]
shape

[6040, 3706, 35]

In [8]:
indices_train

array([[6022, 3683,   33],
       [5529, 3261,    9],
       [2908, 1544,    6],
       ..., 
       [ 853,  467,    2],
       [4032, 2268,    7],
       [ 785,  440,    2]])

In [9]:
# shape = [len(np.unique(ratings[x])) for x in ['UserID', 'MovieID', 'Timestamp']]
bnf = betantf.BetaNTF(shape, n_components=5, n_iters=10)
before = bnf.score(indices_train, val_train)
initial = bnf.impute(x_indices)
reconstructed = bnf.fit(indices_train, val_train)
after = bnf.score()
assert(after < before)

TypeError: not all arguments converted during string formatting

In [None]:
debug

> [0;32m/Users/eli/github/NTFLib/ntflib/betantf.py[0m(52)[0;36m_check_input[0;34m()[0m
[0;32m     51 [0;31m            [0mrank[0m [0;34m=[0m [0mx_indices[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mcol[0m[0;34m][0m[0;34m[0m[0m
[0m[0;32m---> 52 [0;31m            [0mmsg[0m [0;34m=[0m [0mmsg[0m [0;34m%[0m [0mcol[0m[0;34m[0m[0m
[0m[0;32m     53 [0;31m            [0;32mif[0m [0mrank[0m[0;34m.[0m[0mmax[0m[0;34m([0m[0;34m)[0m [0;34m+[0m [0;36m1[0m [0;34m!=[0m [0mnp[0m[0;34m.[0m[0munique[0m[0;34m([0m[0mrank[0m[0;34m)[0m[0;34m.[0m[0mshape[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m:[0m[0;34m[0m[0m
[0m
ipdb> p col
1
ipdb> p msg
'Rank did not match shape; is column 0 starting with zero and strictly contiguous integers?'
ipdb> l
[1;32m     47 [0m        we cannot tolerate a whole dimension with no data."""
[1;32m     48 [0m        [0mmsg[0m [0;34m=[0m [0;34m"Rank did not match shape; is column %i "[0m[0;34m[0

In [83]:
prediction = bnf.impute(indices_test)

In [85]:
rmse(prediction, val_test) / float(prediction.shape[0])

0.82303123616696161

In [11]:
!cat ml-1m/README

SUMMARY

These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 
made by 6,040 MovieLens users who joined MovieLens in 2000.

USAGE LICENSE

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set.  The data set may be used for any research
purposes under the following conditions:

     * The user may not state or imply any endorsement from the
       University of Minnesota or the GroupLens Research Group.

     * The user must acknowledge the use of the data set in
       publications resulting from the use of the data set, and must
       send us an electronic or paper copy of those publications.

     * The user may not redistribute the data without separate
       permission.

     * The user may not use this information for any commercial or
       revenue-bearing purposes 