In [1]:
import numpy as np
import pandas as pd
from time import time, ctime
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
movies = pd.read_csv('./Boltzmann_Machines/ml-1m/movies.dat',
                 sep='::', header = None, 
                 engine='python', encoding='latin-1')
print movies.shape
movies.head()

(3883, 3)


Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
movies.tail()

Unnamed: 0,0,1,2
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller


In [4]:
users = pd.read_csv('./Boltzmann_Machines/ml-1m/users.dat',
                    sep='::', header = None, 
                 engine='python', encoding='latin-1')
print users.shape
users.head()

(6040, 5)


Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
ratings = pd.read_csv('./Boltzmann_Machines/ml-1m/ratings.dat',
                      sep='::', header = None, 
                 engine='python', encoding='latin-1')
print ratings.shape
ratings.head()

(1000209, 4)


Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


### training, test sets

In [6]:
training_set = pd.read_csv('./Boltzmann_Machines/ml-100k/u1.base',
                           delimiter='\t', header = None)
print training_set.shape
training_set.head()

(80000, 4)


Unnamed: 0,0,1,2,3
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [7]:
df=training_set.copy()
training_set = np.array(training_set, dtype='int')
training_set[:2]

array([[        1,         1,         5, 874965758],
       [        1,         2,         3, 876893171]])

In [8]:
test_set = pd.read_csv('./Boltzmann_Machines/ml-100k/u1.test',
                           delimiter='\t', header = None)
print test_set.shape
test_set.head()

(20000, 4)


Unnamed: 0,0,1,2,3
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


In [9]:
test_set = np.array(test_set, dtype='int')
test_set[:2]

array([[        1,         6,         5, 887431973],
       [        1,        10,         3, 875693118]])

In [10]:
nb_users  = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))
nb_users, nb_movies

(943, 1682)

In [11]:
len(set(np.concatenate((test_set[:,0], training_set[:,0])))), len(set(np.concatenate((test_set[:,1], training_set[:,1]))))

(943, 1682)

In [12]:
training_set[:,2][training_set[:,0]==1]

array([5, 3, 4, 3, 3, 4, 1, 5, 2, 5, 5, 5, 4, 5, 1, 4, 4, 3, 4, 1, 3, 5, 2,
       1, 2, 3, 3, 2, 5, 4, 5, 4, 5, 5, 4, 5, 5, 4, 5, 2, 4, 4, 3, 4, 4, 4,
       3, 5, 4, 5, 5, 2, 4, 3, 2, 2, 4, 5, 1, 5, 5, 3, 5, 3, 4, 5, 2, 5, 1,
       4, 4, 3, 5, 1, 3, 3, 2, 4, 4, 3, 2, 5, 3, 4, 3, 4, 5, 5, 2, 5, 5, 5,
       5, 5, 5, 3, 5, 4, 4, 5, 4, 4, 5, 5, 5, 4, 4, 5, 3, 5, 3, 5, 3, 3, 5,
       1, 4, 2, 4, 4, 3, 2, 2, 5, 1, 4, 4, 4, 4, 1, 1, 5, 5, 5, 2])

In [13]:
df.columns=[['user','movie','rating','time']]

In [14]:
df.head()

Unnamed: 0,user,movie,rating,time
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [108]:
{3:5}

{3: 5}

In [112]:
{3:5,6:9}

{3: 5, 6: 9}

In [125]:
# df[df['user']==1][['movie','rating']].transpose().to_dict()

In [15]:
tmp = df.groupby('user').apply(lambda x: zip(list(x['movie']),list(x['rating'])))
tmp = tmp.apply(lambda x: {i[0]:i[1] for i in x})
tmp.head()

user
1    {1: 5, 2: 3, 3: 4, 4: 3, 5: 3, 7: 4, 8: 1, 9: ...
2    {1: 4, 258: 3, 10: 2, 269: 4, 14: 4, 272: 5, 2...
3    {258: 2, 260: 4, 268: 3, 271: 3, 288: 2, 302: ...
4    {258: 5, 359: 5, 324: 5, 358: 2, 327: 5, 328: ...
5    {21: 3, 25: 3, 29: 4, 50: 4, 63: 1, 66: 1, 70:...
dtype: object

In [16]:
len(tmp.tolist())

943

In [17]:
len(tmp)

943

In [18]:
pd.DataFrame([{1:2},{3:4}],columns=range(1,3953), index=[0,1])

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,2.0,,,,,,,,,,...,,,,,,,,,,
1,,,4.0,,,,,,,,...,,,,,,,,,,


In [19]:
pd.DataFrame([{1:2},{3:4},{5:6}],columns=range(1,3953), index=range(3))

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,2.0,,,,,,,,,,...,,,,,,,,,,
1,,,4.0,,,,,,,,...,,,,,,,,,,
2,,,,,6.0,,,,,,...,,,,,,,,,,


In [30]:
mapped = pd.DataFrame(tmp.tolist(),columns=range(1,3953))
#                       , index=range(1,len(tmp)+1))
mapped.shape

(943, 3952)

In [31]:
mapped.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,5.0,3.0,4.0,3.0,3.0,,4.0,1.0,5.0,,...,,,,,,,,,,
1,4.0,,,,,,,,,2.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [174]:
from scipy import sparse

In [181]:
sparse_matrix = sparse.csr_matrix(mapped.values)
sparse_matrix

<943x3952 sparse matrix of type '<type 'numpy.float64'>'
	with 3726736 stored elements in Compressed Sparse Row format>

In [182]:
mapped.fillna(0)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
len(set(np.concatenate((test_set[:,0], training_set[:,0])))), len(set(np.concatenate((test_set[:,1], training_set[:,1]))))

(943, 1682)

### df based on train, test data

In [33]:
mapped2 = pd.DataFrame(tmp.tolist(),columns=set(np.concatenate((test_set[:,1], training_set[:,1])))).fillna(0)
mapped2.shape

(943, 1682)

In [43]:
# mapped2.values.tolist()