In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import RegexpTokenizer

In [2]:
# Load rating file
df_ratings = pd.read_csv('./../datahub/movielens1M/ratings.dat', sep='::', engine='python', names=['user_id', 'item_id', 'rating', 'time_stamp'])
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Load user file
df_user = pd.read_csv('./../datahub/movielens1M/users.dat', sep='::', engine='python', names=['user_id', 'gender', 'age', 'occupation', 'zip-code'])
df_user.head()

Unnamed: 0,user_id,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
# Load item file
df_item = pd.read_csv('./../datahub/movielens1M/movies.dat', sep='::', engine='python', encoding='ISO-8859-1', names=['item_id', 'title', 'genres'])
df_item.tail()

Unnamed: 0,item_id,title,genres
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller


In [5]:
# join
df_ratings = df_ratings.join(df_user.set_index('user_id'), on='user_id')
df_ratings = df_ratings.join(df_item.set_index('item_id'), on='item_id')
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [6]:
# Preprocess gender
df_ratings['gender'] = df_ratings['gender'].map(lambda x: 0 if x == 'F' else 1)
# Preprocess zip-code
zip_codes = list(set(df_ratings['zip-code']))
print("number of zip code: {}, number of users: {}".format(len(zip_codes), len(set(df_ratings['user_id']))))
zip_code2idx = {}
for idx, value in enumerate(zip_codes):
    zip_code2idx[value] = idx
df_ratings['zip-code'] = df_ratings['zip-code'].map(lambda x: zip_code2idx[x])

number of zip code: 3439, number of users: 6040


In [7]:
# Preprocess year
titles = df_ratings['title']

years = [title[-5:-1] for title in titles]
year_set = list(set(years))
year_set.sort()
year2idx = {y: idx for idx, y in enumerate(year_set)}
years = [year2idx[y] for y in years]
df_ratings['year'] = years

In [8]:
# Preprocess titles
tokenizer = RegexpTokenizer(r'\w+')
titles = df_ratings['title']
tokenized_titles = []
titles = [tokenizer.tokenize(str(title[:-6])) for title in list(titles)]
vocab = []
max_title_len = 0
for title in titles:
    vocab += title
    max_title_len = max_title_len if max_title_len > len(title) else len(title)
vocab = set(vocab)
word2idx = {word: idx for idx, word in enumerate(vocab)}
res_title = []
for title in titles:
    padding_title = [word2idx[word] for word in title]
    padding_title.extend([0 for i in range(max_title_len - len(title))])
    res_title.append(padding_title)
df_ratings['title'] = res_title
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres,year
1000204,6040,1091,1,956716541,1,25,6,1292,"[4267, 1187, 3195, 3505, 0, 0, 0, 0, 0, 0, 0, ...",Comedy,69
1000205,6040,1094,5,956704887,1,25,6,1292,"[3682, 638, 2582, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",Drama|Romance|War,72
1000206,6040,562,5,956704746,1,25,6,1292,"[4325, 135, 132, 811, 0, 0, 0, 0, 0, 0, 0, 0, ...",Comedy|Drama,75
1000207,6040,1096,4,956715648,1,25,6,1292,"[429, 3505, 303, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",Drama,62
1000208,6040,1097,4,956715569,1,25,6,1292,"[1155, 2947, 132, 562, 4084, 0, 0, 0, 0, 0, 0,...",Children's|Drama|Fantasy|Sci-Fi,62


In [9]:
# Preprocess genre
genres = df_ratings['genres']
genre2idx = {}
cnt = 1
res_genre = []
max_genre_len = 0
for genre in list(genres):
    l = genre.split('|')
    max_genre_len = max_genre_len if max_genre_len > len(l) else len(l)
    for e in l:
        if e not in genre2idx:
            genre2idx[e] = cnt
            cnt += 1
    res_genre.append([genre2idx[e] for e in l])
for genre in res_genre:
    genre.extend([0 for i in range(max_genre_len - len(genre))])
df_ratings['genres'] = res_genre
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres,year
1000204,6040,1091,1,956716541,1,25,6,1292,"[4267, 1187, 3195, 3505, 0, 0, 0, 0, 0, 0, 0, ...","[6, 0, 0, 0, 0, 0]",69
1000205,6040,1094,5,956704887,1,25,6,1292,"[3682, 638, 2582, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 5, 11, 0, 0, 0]",72
1000206,6040,562,5,956704746,1,25,6,1292,"[4325, 135, 132, 811, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 1, 0, 0, 0, 0]",75
1000207,6040,1096,4,956715648,1,25,6,1292,"[429, 3505, 303, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 0, 0, 0, 0, 0]",62
1000208,6040,1097,4,956715569,1,25,6,1292,"[1155, 2947, 132, 562, 4084, 0, 0, 0, 0, 0, 0,...","[3, 1, 9, 10, 0, 0]",62


In [10]:
# Preprocess timestamp
min_time_stamp = np.min(df_ratings['time_stamp'])
max_time_stamp = np.max(df_ratings['time_stamp'])

df_ratings['time_stamp'] = df_ratings['time_stamp'].map(lambda x: (x - min_time_stamp)/(max_time_stamp - min_time_stamp))
df_ratings = df_ratings.sort_values(by='time_stamp')
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres,year
1000138,6040,858,4,0.0,1,25,6,1292,"[1012, 2582, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[7, 13, 1, 0, 0, 0]",52
1000153,6040,2384,4,2.451236e-07,1,25,6,1292,"[154, 2767, 4077, 132, 3946, 0, 0, 0, 0, 0, 0,...","[3, 6, 0, 0, 0, 0]",78
999873,6040,593,5,2.451236e-07,1,25,6,1292,"[3000, 2678, 132, 3349, 2582, 0, 0, 0, 0, 0, 0...","[1, 12, 0, 0, 0, 0]",71
1000007,6040,1961,4,5.013891e-07,1,25,6,1292,"[1533, 3767, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0]",68
1000192,6040,2019,5,5.013891e-07,1,25,6,1292,"[2330, 1798, 2582, 3826, 2330, 2513, 2757, 337...","[7, 1, 0, 0, 0, 0]",34


In [11]:
# Preprocess label
df_ratings['watch'] = df_ratings['rating'].map(lambda x: 0 if x < 4 else 1)
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres,year,watch
1000138,6040,858,4,0.0,1,25,6,1292,"[1012, 2582, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[7, 13, 1, 0, 0, 0]",52,1
1000153,6040,2384,4,2.451236e-07,1,25,6,1292,"[154, 2767, 4077, 132, 3946, 0, 0, 0, 0, 0, 0,...","[3, 6, 0, 0, 0, 0]",78,1
999873,6040,593,5,2.451236e-07,1,25,6,1292,"[3000, 2678, 132, 3349, 2582, 0, 0, 0, 0, 0, 0...","[1, 12, 0, 0, 0, 0]",71,1
1000007,6040,1961,4,5.013891e-07,1,25,6,1292,"[1533, 3767, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0]",68,1
1000192,6040,2019,5,5.013891e-07,1,25,6,1292,"[2330, 1798, 2582, 3826, 2330, 2513, 2757, 337...","[7, 1, 0, 0, 0, 0]",34,1


In [12]:
# Reorder the columns 
orders = ['watch', 'rating', 'user_id', 'gender', 'age', 'occupation', 'zip-code', 'time_stamp', 'item_id', 'year', 'title', 'genres']
df_ratings = df_ratings[orders]
description = [
    ('user_id', 1 + np.max(df_ratings['user_id']), 'spr'),
    ('gender', 1 + np.max(df_ratings['gender']), 'spr'),
    ('age', 1 + np.max(df_ratings['age']), 'spr'),
    ('occupation', 1 + np.max(df_ratings['occupation']), 'spr'),
    ('zip-code', 1 + np.max(df_ratings['zip-code']), 'spr'),
    ('item_id', 1 + np.max(df_ratings['item_id']), 'spr'),
    ('year', 1 + np.max(df_ratings['year']), 'spr'),
    ('title', 1 + np.max(list(df_ratings['title'])), 'seq'),
    ('genres', 1 + np.max(list(df_ratings['genres'])), 'seq'),
    ('time_stamp', -1, 'ctn'),
    ('rating', np.max(list(df_ratings['rating'])), 'label'),
    ('watch', 2, 'label')
]
description

[('user_id', 6041, 'spr'),
 ('gender', 2, 'spr'),
 ('age', 57, 'spr'),
 ('occupation', 21, 'spr'),
 ('zip-code', 3439, 'spr'),
 ('item_id', 3953, 'spr'),
 ('year', 81, 'spr'),
 ('title', 4373, 'seq'),
 ('genres', 19, 'seq'),
 ('time_stamp', -1, 'ctn'),
 ('rating', 5, 'label'),
 ('watch', 2, 'label')]

In [13]:
def split(df_ratings, description):
    df_ratings = df_ratings.sample(frac=1).reset_index(drop=True)
    train_df = df_ratings.iloc[:int(len(df_ratings)*0.8), :]
    val_df = df_ratings.iloc[int(len(df_ratings)*0.8):int(len(df_ratings)*0.9), :]
    test_df = df_ratings.iloc[int(len(df_ratings)*0.9):, :]
    save_dic = {
        'train': train_df,
        'val': val_df,
        'test': test_df,
        'description': description
    }
    with open('./../data/preprocess_ml-1M.pkl', 'bw+') as f:
        pickle.dump(save_dic, f)

In [14]:
split(df_ratings, description)

In [15]:
df_ratings.head()

Unnamed: 0,watch,rating,user_id,gender,age,occupation,zip-code,time_stamp,item_id,year,title,genres
1000138,1,4,6040,1,25,6,1292,0.0,858,52,"[1012, 2582, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[7, 13, 1, 0, 0, 0]"
1000153,1,4,6040,1,25,6,1292,2.451236e-07,2384,78,"[154, 2767, 4077, 132, 3946, 0, 0, 0, 0, 0, 0,...","[3, 6, 0, 0, 0, 0]"
999873,1,5,6040,1,25,6,1292,2.451236e-07,593,71,"[3000, 2678, 132, 3349, 2582, 0, 0, 0, 0, 0, 0...","[1, 12, 0, 0, 0, 0]"
1000007,1,4,6040,1,25,6,1292,5.013891e-07,1961,68,"[1533, 3767, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0]"
1000192,1,5,6040,1,25,6,1292,5.013891e-07,2019,34,"[2330, 1798, 2582, 3826, 2330, 2513, 2757, 337...","[7, 1, 0, 0, 0, 0]"
