In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from d2l import torch as d2l

In [9]:
import os
os.chdir('/nfs/volume-885-1/huangchen_i/ctr/deepFM-torch/data/ml-1m/data')
os.getcwd()

'/nfs/volume-885-1/huangchen_i/ctr/deepFM-torch/data/ml-1m/data'

In [10]:
user_columns = ['userId', 'gender', 'age', 'occupation', 'zipCode']
users = pd.read_table('users.dat', sep='::', header=None, 
                      names = user_columns, engine='python', encoding='utf-8')
users['zipCode'] = users['zipCode'].apply(lambda x: x[:3])

lbe = LabelEncoder()
for col in user_columns:
    users[col] = lbe.fit_transform(users[col])
users

Unnamed: 0,userId,gender,age,occupation,zipCode
0,0,0,0,10,348
1,1,1,6,16,485
2,2,1,2,15,400
3,3,1,4,7,17
4,4,1,2,20,402
...,...,...,...,...,...
6035,6035,0,2,15,245
6036,6036,0,4,1,517
6037,6037,0,6,1,121
6038,6038,0,4,0,3


In [11]:
users = users[users.columns[:4]]
users

Unnamed: 0,userId,gender,age,occupation
0,0,0,0,10
1,1,1,6,16
2,2,1,2,15
3,3,1,4,7
4,4,1,2,20
...,...,...,...,...
6035,6035,0,2,15
6036,6036,0,4,1
6037,6037,0,6,1
6038,6038,0,4,0


In [12]:
ratings_columns = ['userId', 'movieId', 'rating', 'timeStamp']
ratings = pd.read_table('ratings.dat', sep='::', header=None, 
                        names=ratings_columns, engine='python', encoding='utf-8')
ratings = ratings[ratings_columns[:3]]

ratings['rating'] = ratings['rating'].apply(lambda x: 1 if x > 3 else 0)

for col in ratings_columns[:2]:
    ratings[col] = lbe.fit_transform(ratings[col])
ratings

Unnamed: 0,userId,movieId,rating
0,0,1104,1
1,0,639,0
2,0,853,0
3,0,3177,1
4,0,2162,1
...,...,...,...
1000204,6039,1019,0
1000205,6039,1022,1
1000206,6039,548,1
1000207,6039,1024,1


In [13]:
movies_columns = ['movieId', 'title', 'genres']
movies = pd.read_table('movies.dat', sep='::', header=None, 
                        names=movies_columns, engine='python', encoding='utf-8')

movies['year'] = movies['title'].apply(lambda x:x[-5:-1])
movies['title'] = movies['title'].apply(lambda x:x[0:-6].strip())
movies['title'] = movies['title'].apply(lambda x:x.split(' '))

def remove_comma(t):
    if len(t) >= 2 and t[-2].find(',') != -1:
        t[-2] = t[-2].strip(',')
        pre = t.pop()
        t.insert(0, pre)
    return t

movies['title'] = movies['title'].apply(lambda x:remove_comma(x))

movies['genres'] = movies['genres'].apply(lambda x:x.split('|'))

In [14]:
movies

Unnamed: 0,movieId,title,genres,year
0,1,"[Toy, Story]","[Animation, Children's, Comedy]",1995
1,2,[Jumanji],"[Adventure, Children's, Fantasy]",1995
2,3,"[Grumpier, Old, Men]","[Comedy, Romance]",1995
3,4,"[Waiting, to, Exhale]","[Comedy, Drama]",1995
4,5,"[Father, of, the, Bride, Part, II]",[Comedy],1995
...,...,...,...,...
3878,3948,"[Meet, the, Parents]",[Comedy],2000
3879,3949,"[Requiem, for, a, Dream]",[Drama],2000
3880,3950,[Tigerland],[Drama],2000
3881,3951,"[Two, Family, House]",[Drama],2000


In [15]:
import collections
titles_vocab = d2l.Vocab(movies['title'].values)
movies['title'] = movies['title'].apply(lambda x:[titles_vocab[token] for token in x])

def pad_or_truncate(x, maxlen=8, pad = 0):
    if len(x) < maxlen:
        x = x + [pad] * (maxlen - len(x))
    else:
        x = x[0:maxlen]
    return x

print(max(len(item) for item in movies['title'].values))
movies['title'] = movies['title'].apply(lambda x:pad_or_truncate(x))
movies

15


Unnamed: 0,movieId,title,genres,year
0,1,"[739, 19, 0, 0, 0, 0, 0, 0]","[Animation, Children's, Comedy]",1995
1,2,"[1359, 0, 0, 0, 0, 0, 0, 0]","[Adventure, Children's, Fantasy]",1995
2,3,"[1360, 149, 37, 0, 0, 0, 0, 0]","[Comedy, Romance]",1995
3,4,"[358, 7, 1361, 0, 0, 0, 0, 0]","[Comedy, Drama]",1995
4,5,"[359, 3, 2, 128, 31, 15, 0, 0]",[Comedy],1995
...,...,...,...,...
3878,3948,"[210, 2, 4939, 0, 0, 0, 0, 0]",[Comedy],2000
3879,3949,"[1333, 23, 9, 117, 0, 0, 0, 0]",[Drama],2000
3880,3950,"[4940, 0, 0, 0, 0, 0, 0, 0]",[Drama],2000
3881,3951,"[38, 138, 29, 0, 0, 0, 0, 0]",[Drama],2000


In [16]:
from functools import reduce
all_genres = set(reduce(lambda x,y: x + y, list(movies['genres'].values)))
genres_dict = dict(zip(all_genres, range(len(all_genres))))
movies['genres'] = movies['genres'].apply(lambda x: [genres_dict[token] for token in x])

In [10]:
movies

Unnamed: 0,movieId,title,genres,year
0,1,"[739, 19, 0, 0, 0, 0, 0, 0]","[6, 4, 15]",1995
1,2,"[1359, 0, 0, 0, 0, 0, 0, 0]","[3, 4, 0]",1995
2,3,"[1360, 149, 37, 0, 0, 0, 0, 0]","[15, 7]",1995
3,4,"[358, 7, 1361, 0, 0, 0, 0, 0]","[15, 2]",1995
4,5,"[359, 3, 2, 128, 31, 15, 0, 0]",[15],1995
...,...,...,...,...
3878,3948,"[210, 2, 4939, 0, 0, 0, 0, 0]",[15],2000
3879,3949,"[1333, 23, 9, 117, 0, 0, 0, 0]",[2],2000
3880,3950,"[4940, 0, 0, 0, 0, 0, 0, 0]",[2],2000
3881,3951,"[38, 138, 29, 0, 0, 0, 0, 0]",[2],2000


In [17]:
movies['year'] = lbe.fit_transform(movies['year'])
movies['movieId'] = lbe.fit_transform(movies['movieId'])

In [18]:
movies

Unnamed: 0,movieId,title,genres,year
0,0,"[739, 19, 0, 0, 0, 0, 0, 0]","[14, 6, 8]",75
1,1,"[1359, 0, 0, 0, 0, 0, 0, 0]","[3, 6, 7]",75
2,2,"[1360, 149, 37, 0, 0, 0, 0, 0]","[8, 17]",75
3,3,"[358, 7, 1361, 0, 0, 0, 0, 0]","[8, 10]",75
4,4,"[359, 3, 2, 128, 31, 15, 0, 0]",[8],75
...,...,...,...,...
3878,3878,"[210, 2, 4939, 0, 0, 0, 0, 0]",[8],80
3879,3879,"[1333, 23, 9, 117, 0, 0, 0, 0]",[10],80
3880,3880,"[4940, 0, 0, 0, 0, 0, 0, 0]",[10],80
3881,3881,"[38, 138, 29, 0, 0, 0, 0, 0]",[10],80


In [19]:
data = pd.merge(users, ratings, on = ['userId'])
data

Unnamed: 0,userId,gender,age,occupation,movieId,rating
0,0,0,0,10,1104,1
1,0,0,0,10,639,0
2,0,0,0,10,853,0
3,0,0,0,10,3177,1
4,0,0,0,10,2162,1
...,...,...,...,...,...,...
1000204,6039,1,2,6,1019,0
1000205,6039,1,2,6,1022,1
1000206,6039,1,2,6,548,1
1000207,6039,1,2,6,1024,1


In [20]:
data = pd.merge(data, movies, on=['movieId'])
data

Unnamed: 0,userId,gender,age,occupation,movieId,rating,title,genres,year
0,0,0,0,10,1104,1,"[1, 176, 1050, 2409, 2410, 0, 0, 0]",[10],76
1,1,1,6,16,1104,1,"[1, 176, 1050, 2409, 2410, 0, 0, 0]",[10],76
2,11,1,2,12,1104,1,"[1, 176, 1050, 2409, 2410, 0, 0, 0]",[10],76
3,14,1,2,7,1104,1,"[1, 176, 1050, 2409, 2410, 0, 0, 0]",[10],76
4,16,1,5,1,1104,1,"[1, 176, 1050, 2409, 2410, 0, 0, 0]",[10],76
...,...,...,...,...,...,...,...,...,...
1000204,5948,1,1,17,2017,1,"[54, 321, 171, 0, 0, 0, 0, 0]","[10, 7]",65
1000205,5674,1,3,14,2498,0,"[3605, 0, 0, 0, 0, 0, 0, 0]",[8],79
1000206,5779,1,1,17,2638,0,"[3757, 252, 0, 0, 0, 0, 0, 0]",[12],79
1000207,5850,0,1,20,3367,1,"[4470, 97, 0, 0, 0, 0, 0, 0]","[10, 17]",71


In [21]:
data.to_pickle('ml-1m.pkl')

In [16]:
data

Unnamed: 0,userId,gender,age,occupation,movieId,rating,title,genres,year
0,0,0,0,10,1104,1,"[1, 176, 1050, 2409, 2410, 0, 0, 0]",[2],76
1,1,1,6,16,1104,1,"[1, 176, 1050, 2409, 2410, 0, 0, 0]",[2],76
2,11,1,2,12,1104,1,"[1, 176, 1050, 2409, 2410, 0, 0, 0]",[2],76
3,14,1,2,7,1104,1,"[1, 176, 1050, 2409, 2410, 0, 0, 0]",[2],76
4,16,1,5,1,1104,1,"[1, 176, 1050, 2409, 2410, 0, 0, 0]",[2],76
...,...,...,...,...,...,...,...,...,...
1000204,5948,1,1,17,2017,1,"[54, 321, 171, 0, 0, 0, 0, 0]","[2, 0]",65
1000205,5674,1,3,14,2498,0,"[3605, 0, 0, 0, 0, 0, 0, 0]",[15],79
1000206,5779,1,1,17,2638,0,"[3757, 252, 0, 0, 0, 0, 0, 0]",[11],79
1000207,5850,0,1,20,3367,1,"[4470, 97, 0, 0, 0, 0, 0, 0]","[2, 7]",71
