In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import RegexpTokenizer

In [2]:
# Load rating file
df_ratings = pd.read_csv('./../datahub/movielens1M/ratings.dat', sep='::', engine='python', names=['user_id', 'item_id', 'rating', 'time_stamp'])
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Load user file
df_user = pd.read_csv('./../datahub/movielens1M/users.dat', sep='::', engine='python', names=['user_id', 'gender', 'age', 'occupation', 'zip-code'])
df_user.head()

Unnamed: 0,user_id,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
# Load item file
df_item = pd.read_csv('./../datahub/movielens1M/movies.dat', sep='::', engine='python', encoding='ISO-8859-1', names=['item_id', 'title', 'genres'])
df_item.tail()

Unnamed: 0,item_id,title,genres
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller


In [5]:
# join
df_ratings = df_ratings.join(df_user.set_index('user_id'), on='user_id')
df_ratings = df_ratings.join(df_item.set_index('item_id'), on='item_id')
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [6]:
# Preprocess gender
df_ratings['gender'] = df_ratings['gender'].map(lambda x: 0 if x == 'F' else 1)
# Preprocess zip-code
zip_codes = list(set(df_ratings['zip-code']))
print("number of zip code: {}, number of users: {}".format(len(zip_codes), len(set(df_ratings['user_id']))))
zip_code2idx = {}
for idx, value in enumerate(zip_codes):
    zip_code2idx[value] = idx
df_ratings['zip-code'] = df_ratings['zip-code'].map(lambda x: zip_code2idx[x])

number of zip code: 3439, number of users: 6040


In [7]:
# Preprocess year
titles = df_ratings['title']

years = [title[-5:-1] for title in titles]
year_set = list(set(years))
year_set.sort()
year2idx = {y: idx for idx, y in enumerate(year_set)}
years = [year2idx[y] for y in years]
df_ratings['year'] = years

In [8]:
# Preprocess titles
tokenizer = RegexpTokenizer(r'\w+')
titles = df_ratings['title']
tokenized_titles = []
titles = [tokenizer.tokenize(str(title[:-6])) for title in list(titles)]
vocab = []
max_title_len = 0
for title in titles:
    vocab += title
    max_title_len = max_title_len if max_title_len > len(title) else len(title)
vocab = set(vocab)
word2idx = {word: idx for idx, word in enumerate(vocab)}
res_title = []
for title in titles:
    padding_title = [word2idx[word] for word in title]
    padding_title.extend([0 for i in range(max_title_len - len(title))])
    res_title.append(','.join(str(i) for i in padding_title))
df_ratings['title'] = res_title
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres,year
1000204,6040,1091,1,956716541,1,25,6,487,38005002478340000000000000,Comedy,69
1000205,6040,1094,5,956704887,1,25,6,487,40433710848000000000000,Drama|Romance|War,72
1000206,6040,562,5,956704746,1,25,6,487,2683312644180300000000000,Comedy|Drama,75
1000207,6040,1096,4,956715648,1,25,6,487,265234004176000000000000,Drama,62
1000208,6040,1097,4,956715569,1,25,6,487,22196934462920020000000000,Children's|Drama|Fantasy|Sci-Fi,62


In [9]:
# Preprocess genre
genres = df_ratings['genres']
genre2idx = {}
cnt = 1
res_tmp_genre = []
max_genre_len = 0
for genre in list(genres):
    l = genre.split('|')
    max_genre_len = max_genre_len if max_genre_len > len(l) else len(l)
    for e in l:
        if e not in genre2idx:
            genre2idx[e] = cnt
            cnt += 1
    res_tmp_genre.append([genre2idx[e] for e in l])
res_genre = []   
for genre in res_tmp_genre:
    genre.extend([0 for i in range(max_genre_len - len(genre))])
    res_genre.append(','.join(str(i) for i in genre))
    
df_ratings['genres'] = res_genre
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres,year
1000204,6040,1091,1,956716541,1,25,6,487,38005002478340000000000000,600000,69
1000205,6040,1094,5,956704887,1,25,6,487,40433710848000000000000,1511000,72
1000206,6040,562,5,956704746,1,25,6,487,2683312644180300000000000,610000,75
1000207,6040,1096,4,956715648,1,25,6,487,265234004176000000000000,100000,62
1000208,6040,1097,4,956715569,1,25,6,487,22196934462920020000000000,3191000,62


In [10]:
# Preprocess timestamp
min_time_stamp = np.min(df_ratings['time_stamp'])
max_time_stamp = np.max(df_ratings['time_stamp'])

df_ratings['time_stamp'] = df_ratings['time_stamp'].map(lambda x: (x - min_time_stamp)/(max_time_stamp - min_time_stamp))
df_ratings = df_ratings.sort_values(by='time_stamp')
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres,year
825793,4958,2399,1,0.999997,1,18,7,825,23881677848279400000000000,839000,65
825438,4958,1407,5,0.999998,1,18,7,825,151900000000000000,16120000,76
825724,4958,3264,4,1.0,1,18,7,825,1741441052364500000000000,6160000,72
825731,4958,2634,3,1.0,1,18,7,825,32158480000000000000,1600000,39
825603,4958,1924,4,1.0,1,18,7,825,19213153425919372810000000000,16100000,38


In [14]:
# Preprocess label
df_ratings['watch'] = df_ratings['rating'].map(lambda x: 0 if x < 4 else 1)
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres,year,watch
1000138,6040,858,4,0.0,1,25,6,487,15888480000000000000,7131000,52,1
1000153,6040,2384,4,2.451236e-07,1,25,6,487,2774247535574412470000000000,360000,78,1
999873,6040,593,5,2.451236e-07,1,25,6,487,147736844420168480000000000,1120000,71,1
1000007,6040,1961,4,5.013891e-07,1,25,6,487,6726290000000000000,100000,68,1
1000192,6040,2019,5,5.013891e-07,1,25,6,487,"2549,3792,848,2676,2549,38,3600,3914,0,0,0,0,0...",710000,34,1


In [15]:
# Reorder the columns 
orders = ['watch', 'rating', 'user_id', 'gender', 'age', 'occupation', 'zip-code', 'time_stamp', 'item_id', 'year', 'title', 'genres']
df_ratings = df_ratings[orders]
df_ratings.tail()

In [17]:
#split data
#df_ratings = df_ratings.sample(frac=1).reset_index(drop=True)
train_df = df_ratings.iloc[:int(len(df_ratings)*0.8), :]
val_df = df_ratings.iloc[int(len(df_ratings)*0.8):int(len(df_ratings)*0.9), :]
test_df = df_ratings.iloc[int(len(df_ratings)*0.9):, :]

In [18]:
#save csv
train_df.head()
train_df.to_csv('./../data/train_data.csv', sep = ',', index = False)

In [19]:
#save csv
val_df.head()
val_df.to_csv('./../data/val_data.csv', sep = ',', index = False)

In [20]:
#save csv
test_df.head()
test_df.to_csv('./../data/test_data.csv', sep = ',', index = False)

In [21]:
test_df = test_df.drop(['time_stamp'], axis=1)
test_df.to_csv('./../data/test/test_data.txt', sep = ' ', index = False, header = None)

In [22]:
train_df = train_df.drop(['time_stamp'], axis=1)
train_df.to_csv('./../data/train/train_data.txt', sep = ' ', index = False, header = None)

In [23]:
val_df = val_df.drop(['time_stamp'], axis=1)
val_df.to_csv('./../data/val/val_data.txt', sep = ' ', index = False, header = None)