In [21]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# train_test_split moved to sklearn.model_selection in scikit-learn 0.18 and the
# old sklearn.cross_validation module was removed in 0.20; try the current
# location first and fall back for older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from ntflib import betantf

%matplotlib inline
sns.set(style="white")

In [83]:
def mapper(array):
    """Encode the values of ``array`` as dense, 0-based integer codes.

    Codes follow the sorted order of the distinct values (as returned by
    ``np.unique``): the smallest value maps to 0, the largest to
    ``n_unique - 1``.

    Parameters
    ----------
    array : 1-D array-like (e.g. ``numpy.ndarray`` or ``pandas.Series``)
        Values to encode.

    Returns
    -------
    codes : numpy.ndarray of int
        Integer code of every element, in input order.
    inv_dict_map : dict
        Maps each integer code back to the original value.
    """
    # return_inverse gives the code of every element in a single vectorised
    # pass, replacing a per-element Python dict lookup.
    uniques, codes = np.unique(np.asarray(array), return_inverse=True)
    inv_dict_map = dict(enumerate(uniques))
    return codes, inv_dict_map

def rmse(x, y):
    """Return the root-mean-square error between ``x`` and ``y``.

    Parameters
    ----------
    x, y : array-like of numbers with broadcast-compatible shapes

    Returns
    -------
    float
        ``sqrt(mean((x - y) ** 2))``.
    """
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    # The mean has to be taken *inside* the square root; applying sqrt to each
    # squared difference and summing gives the sum of absolute errors instead.
    return np.sqrt(np.mean(diff ** 2.0))

In [2]:
# !wget http://files.grouplens.org/datasets/movielens/ml-1m.zip

--2015-06-25 10:25:08--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org... 128.101.34.146
Connecting to files.grouplens.org|128.101.34.146|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917392 (5.6M) [application/zip]
Saving to: 'ml-1m.zip'


2015-06-25 10:25:22 (437 KB/s) - 'ml-1m.zip' saved [5917392/5917392]



In [3]:
# !unzip ml-1m.zip

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [99]:
# Load the MovieLens-1M ratings file (records are '::'-separated, no header row).
# A multi-character separator is only handled by pandas' python parser engine;
# requesting it explicitly avoids the ParserWarning about the implicit fallback.
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', engine='python',
                        names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

# Bucket the epoch-second timestamps by calendar month ('YYYY-MM').
# pd.to_datetime(unit='s') interprets the values as UTC, so the buckets no
# longer depend on the local time zone of the machine running the notebook.
ratings['Timestamp'] = (pd.to_datetime(ratings['Timestamp'], unit='s')
                          .map(lambda stamp: stamp.strftime('%Y-%m')))

# movies = pd.read_table('ml-1m/movies.dat', sep='::', engine='python', names=['MovieID', 'Title', 'Genres'])
# users = pd.read_table('ml-1m/users.dat', sep='::', engine='python', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])

In [100]:
# Replace user ids, movie ids and year-month labels by dense 0-based integer
# codes, keeping for each column the dictionary that maps a code back to the
# original value.
inverse_maps = {}
for column in ('UserID', 'MovieID', 'Timestamp'):
    ratings[column], inverse_maps[column] = mapper(ratings[column])

inv_uid_dict = inverse_maps['UserID']
inv_mid_dict = inverse_maps['MovieID']
inv_ts_dict = inverse_maps['Timestamp']

In [102]:
# Multiplicative normalisation of the ratings:
#   1. divide each rating by its user's mean rating,
#   2. divide the result by the movie's mean of those user-normalised ratings.
# 'Rating_multiplier' is the factor that turns a normalised value back into a
# rating on the original scale (Rating == Rating_normed * Rating_multiplier).

# Start from the raw columns only, so the cell can be re-run safely: merging
# onto a frame that already carries the derived columns would create suffixed
# duplicates ('UserID_Average_x', ...) and the lookups below would raise KeyError.
ratings = ratings[['UserID', 'MovieID', 'Rating', 'Timestamp']]

user_average = (ratings.groupby('UserID')['Rating']
                       .mean()
                       .reset_index()
                       .rename(columns={'Rating': 'UserID_Average'}))
ratings = user_average.merge(ratings, on='UserID')
ratings['Rating_normed'] = ratings['Rating'] / ratings['UserID_Average']

movie_average = (ratings.groupby('MovieID')['Rating_normed']
                        .mean()
                        .reset_index()
                        .rename(columns={'Rating_normed': 'MovieID_Average'}))
ratings = movie_average.merge(ratings, on='MovieID')
ratings['Rating_normed'] = ratings['Rating_normed'] / ratings['MovieID_Average']
ratings['Rating_multiplier'] = ratings['Rating'] / ratings['Rating_normed']

In [103]:
# Peek at the first rows to check the derived average / normalised columns.
ratings.head(n=5)

Unnamed: 0,MovieID,MovieID_Average,UserID,UserID_Average,Rating,Timestamp,Rating_normed,Rating_multiplier
0,0,1.137663,0,4.188679,5,9,1.049251,4.765305
1,0,1.137663,5,3.901408,4,8,0.901208,4.438488
2,0,1.137663,7,3.884892,4,8,0.905039,4.419698
3,0,1.137663,8,3.735849,5,8,1.176433,4.250137
4,0,1.137663,9,4.114713,5,8,1.068112,4.681157


In [104]:
# Collect the (user, movie, month) index triples and the matching values as
# plain NumPy arrays for the split and the factorisation further down.
index_columns = ['UserID', 'MovieID', 'Timestamp']
index_frame = ratings[index_columns].copy()

# Shift every index column so that its smallest value is 0.
for column in index_columns:
    index_frame[column] = index_frame[column] - index_frame[column].min()

print(index_frame.min())

x_indices = index_frame.values
x_vals = ratings['Rating_normed'].values
x_multiplier = ratings['Rating_multiplier'].values

UserID       0
MovieID      0
Timestamp    0
dtype: int64


In [92]:
# Number of distinct movie codes.
ratings['MovieID'].unique().size

3706

In [93]:
# Largest movie code; for dense 0-based codes this is the distinct count minus one.
ratings['MovieID'].values.max()

3705

In [105]:
# Hold out 33% of the observations; indices, normalised values and rescaling
# multipliers are split together so that their rows stay aligned.
(indices_train, indices_test,
 val_train, val_test,
 multiplier_train, multiplier_test) = train_test_split(
    x_indices, x_vals, x_multiplier, test_size=0.33, random_state=30)

# Extent of each axis = number of distinct codes found in that index column.
shape_uid, shape_mid, shape_ts = [np.unique(x_indices[:, axis]).size
                                  for axis in range(3)]
shape = [shape_uid, shape_mid, shape_ts]
shape

[6040, 3706, 35]

In [109]:
# Create the BetaNTF model over the (user, movie, month) shape computed above
# and run a first fit on the training split.
# (Alternative way of deriving the shape directly from the frame:)
# shape = [len(np.unique(ratings[x])) for x in ['UserID', 'MovieID', 'Timestamp']]
bnf = betantf.BetaNTF(shape, n_components=20, n_iters=100)
# Score of the freshly created model on the training data, taken before fitting.
before = bnf.score(indices_train, val_train)
# Model output for every observed index before fitting.
# NOTE(review): `initial` is not used again in the visible cells.
initial = bnf.impute(x_indices)
reconstructed = bnf.fit(indices_train, val_train)
# NOTE(review): score() is called without arguments here; presumably it reuses
# the data given to fit() -- confirm against ntflib.
after = bnf.score()
# Sanity check: fitting must have lowered the score (the fit log shows the
# score decreasing, so lower appears to be better).
assert(after < before)

Update Iter 0 Factor 0 Score 49621.9
Update Iter 0 Factor 1 Score 37133.8
Update Iter 0 Factor 2 Score 36319.3
Update Iter 1 Factor 0 Score 34952.7
Update Iter 1 Factor 1 Score 33749.6
Update Iter 1 Factor 2 Score 33516.3
Update Iter 2 Factor 0 Score 32768.5
Update Iter 2 Factor 1 Score 31947.7
Update Iter 2 Factor 2 Score 31823.3
Update Iter 3 Factor 0 Score 31298.4
Update Iter 3 Factor 1 Score 30696.7
Update Iter 3 Factor 2 Score 30617.6
Update Iter 4 Factor 0 Score 30220.4
Update Iter 4 Factor 1 Score 29758.5
Update Iter 4 Factor 2 Score 29704.4
Update Iter 5 Factor 0 Score 29389.1
Update Iter 5 Factor 1 Score 29021.9
Update Iter 5 Factor 2 Score 28983.3
Update Iter 6 Factor 0 Score 28723.8
Update Iter 6 Factor 1 Score 28423.8
Update Iter 6 Factor 2 Score 28395.2
Update Iter 7 Factor 0 Score 28175.5
Update Iter 7 Factor 1 Score 27924.9
Update Iter 7 Factor 2 Score 27903.1
Update Iter 8 Factor 0 Score 27712.9
Update Iter 8 Factor 1 Score 27499.7
Update Iter 8 Factor 2 Score 27482.7
U

In [115]:
# Call fit again on the same model object.
# NOTE(review): the logged score starts far below the starting score of the
# first fit, which suggests fit() continues from the current factors rather
# than re-initialising -- confirm against ntflib.
reconstructed = bnf.fit(indices_train, val_train)

Update Iter 0 Factor 0 Score 16824.3
Update Iter 0 Factor 1 Score 16820.2
Update Iter 0 Factor 2 Score 16819.9
Update Iter 1 Factor 0 Score 16813.8
Update Iter 1 Factor 1 Score 16809.8
Update Iter 1 Factor 2 Score 16809.5
Update Iter 2 Factor 0 Score 16803.5
Update Iter 2 Factor 1 Score 16799.5
Update Iter 2 Factor 2 Score 16799.1
Update Iter 3 Factor 0 Score 16793.2
Update Iter 3 Factor 1 Score 16789.3
Update Iter 3 Factor 2 Score 16788.9
Update Iter 4 Factor 0 Score 16783.0
Update Iter 4 Factor 1 Score 16779.1
Update Iter 4 Factor 2 Score 16778.8
Update Iter 5 Factor 0 Score 16772.9
Update Iter 5 Factor 1 Score 16769.1
Update Iter 5 Factor 2 Score 16768.7
Update Iter 6 Factor 0 Score 16762.9
Update Iter 6 Factor 1 Score 16759.1
Update Iter 6 Factor 2 Score 16758.8
Update Iter 7 Factor 0 Score 16753.0
Update Iter 7 Factor 1 Score 16749.3
Update Iter 7 Factor 2 Score 16748.9
Update Iter 8 Factor 0 Score 16743.2
Update Iter 8 Factor 1 Score 16739.5
Update Iter 8 Factor 2 Score 16739.2
U

In [118]:
# Raise the iteration budget on the existing model and fit once more.
# NOTE(review): assumes fit() reads n_iters at call time -- confirm against ntflib.
bnf.n_iters = 1000
reconstructed = bnf.fit(indices_train, val_train)

Update Iter 0 Factor 0 Score 16118.7
Update Iter 0 Factor 1 Score 16117.0
Update Iter 0 Factor 2 Score 16116.8
Update Iter 1 Factor 0 Score 16114.1
Update Iter 1 Factor 1 Score 16112.3
Update Iter 1 Factor 2 Score 16112.2
Update Iter 2 Factor 0 Score 16109.4
Update Iter 2 Factor 1 Score 16107.7
Update Iter 2 Factor 2 Score 16107.6
Update Iter 3 Factor 0 Score 16104.9
Update Iter 3 Factor 1 Score 16103.2
Update Iter 3 Factor 2 Score 16103.0
Update Iter 4 Factor 0 Score 16100.3
Update Iter 4 Factor 1 Score 16098.6
Update Iter 4 Factor 2 Score 16098.4
Update Iter 5 Factor 0 Score 16095.8
Update Iter 5 Factor 1 Score 16094.1
Update Iter 5 Factor 2 Score 16093.9
Update Iter 6 Factor 0 Score 16091.3
Update Iter 6 Factor 1 Score 16089.6
Update Iter 6 Factor 2 Score 16089.5
Update Iter 7 Factor 0 Score 16086.8
Update Iter 7 Factor 1 Score 16085.2
Update Iter 7 Factor 2 Score 16085.0
Update Iter 8 Factor 0 Score 16082.4
Update Iter 8 Factor 1 Score 16080.7
Update Iter 8 Factor 2 Score 16080.6
U

In [119]:
# Model output at the held-out index triples; like val_test, these values are
# on the normalised scale (the model was fitted on Rating_normed).
prediction = bnf.impute(indices_test)

In [126]:
# Multiply by the per-row multiplier to bring both the held-out values and the
# predictions back to the original rating scale, then report MSE and MAE there.
rating_true = val_test * multiplier_test
rating_pred = prediction * multiplier_test

print(mean_squared_error(rating_true, rating_pred))
print(mean_absolute_error(rating_true, rating_pred))

5.12925551358
0.767176846831


In [124]:
# Held-out values rescaled to the original rating scale.
multiplier_test * val_test

array([ 4.,  2.,  5., ...,  5.,  3.,  3.])

In [125]:
# Predictions rescaled to the original rating scale.
multiplier_test * prediction

array([ 4.31813305,  4.14295961,  4.87178581, ...,  3.85228047,
        3.17982888,  3.9550476 ])

In [129]:
# Root-mean-square error on the original rating scale.
scaled_error = (val_test - prediction) * multiplier_test
np.sqrt(np.mean(np.square(scaled_error)))

2.2647859752256396

In [11]:
# Print the dataset README (summary and usage licence) shipped in the archive.
!cat ml-1m/README 

SUMMARY

These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 
made by 6,040 MovieLens users who joined MovieLens in 2000.

USAGE LICENSE

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set.  The data set may be used for any research
purposes under the following conditions:

     * The user may not state or imply any endorsement from the
       University of Minnesota or the GroupLens Research Group.

     * The user must acknowledge the use of the data set in
       publications resulting from the use of the data set, and must
       send us an electronic or paper copy of those publications.

     * The user may not redistribute the data without separate
       permission.

     * The user may not use this information for any commercial or
       revenue-bearing purposes 