In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.tokenize import RegexpTokenizer

In [2]:
# Load rating file
df_ratings = pd.read_csv('./../datahub/movielens1M/ratings.dat', sep='::', engine='python', names=['user_id', 'item_id', 'rating', 'time_stamp'])
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Load user file
df_user = pd.read_csv('./../datahub/movielens1M/users.dat', sep='::', engine='python', names=['user_id', 'gender', 'age', 'occupation', 'zip-code'])
df_user.head()

Unnamed: 0,user_id,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
# Load item file
df_item = pd.read_csv('./../datahub/movielens1M/movies.dat', sep='::', engine='python', encoding='ISO-8859-1', names=['item_id', 'title', 'genres'])
df_item.tail()

Unnamed: 0,item_id,title,genres
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller


In [5]:
# join
df_ratings = df_ratings.join(df_user.set_index('user_id'), on='user_id')
df_ratings = df_ratings.join(df_item.set_index('item_id'), on='item_id')
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [6]:
# Preprocess gender
df_ratings['gender'] = df_ratings['gender'].map(lambda x: 0 if x == 'F' else 1)
# Preprocess zip-code
zip_codes = list(set(df_ratings['zip-code']))
print("number of zip code: {}, number of users: {}".format(len(zip_codes), len(set(df_ratings['user_id']))))
zip_code2idx = {}
for idx, value in enumerate(zip_codes):
    zip_code2idx[value] = idx
df_ratings['zip-code'] = df_ratings['zip-code'].map(lambda x: zip_code2idx[x])

number of zip code: 3439, number of users: 6040


In [7]:
# Preprocess year
titles = df_ratings['title']

years = [title[-5:-1] for title in titles]
year_set = list(set(years))
year_set.sort()
year2idx = {y: idx for idx, y in enumerate(year_set)}
years = [year2idx[y] for y in years]
df_ratings['year'] = years

In [8]:
# Preprocess titles
tokenizer = RegexpTokenizer(r'\w+')
titles = df_ratings['title']
tokenized_titles = []
titles = [tokenizer.tokenize(str(title[:-6])) for title in list(titles)]
vocab = []
max_title_len = 0
for title in titles:
    vocab += title
    max_title_len = max_title_len if max_title_len > len(title) else len(title)
vocab = set(vocab)
word2idx = {word: idx for idx, word in enumerate(vocab)}
res_title = []
for title in titles:
    padding_title = [word2idx[word] for word in title]
    padding_title.extend([0 for i in range(max_title_len - len(title))])
    res_title.append(','.join(str(i) for i in padding_title))
df_ratings['title'] = res_title
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres,year
1000204,6040,1091,1,956716541,1,25,6,3127,432243083043281500000000000,Comedy,69
1000205,6040,1094,5,956704887,1,25,6,3127,4478243685000000000000,Drama|Romance|War,72
1000206,6040,562,5,956704746,1,25,6,3127,27384231354146900000000000,Comedy|Drama,75
1000207,6040,1096,4,956715648,1,25,6,3127,5728153562000000000000,Drama,62
1000208,6040,1097,4,956715569,1,25,6,3127,4193157354113516160000000000,Children's|Drama|Fantasy|Sci-Fi,62


In [9]:
# Preprocess genre
genres = df_ratings['genres']
genre2idx = {}
cnt = 1
res_tmp_genre = []
max_genre_len = 0
for genre in list(genres):
    l = genre.split('|')
    max_genre_len = max_genre_len if max_genre_len > len(l) else len(l)
    for e in l:
        if e not in genre2idx:
            genre2idx[e] = cnt
            cnt += 1
    res_tmp_genre.append([genre2idx[e] for e in l])
res_genre = []   
for genre in res_tmp_genre:
    genre.extend([0 for i in range(max_genre_len - len(genre))])
    res_genre.append(','.join(str(i) for i in genre))
    
df_ratings['genres'] = res_genre
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres,year
1000204,6040,1091,1,956716541,1,25,6,3127,432243083043281500000000000,600000,69
1000205,6040,1094,5,956704887,1,25,6,3127,4478243685000000000000,1511000,72
1000206,6040,562,5,956704746,1,25,6,3127,27384231354146900000000000,610000,75
1000207,6040,1096,4,956715648,1,25,6,3127,5728153562000000000000,100000,62
1000208,6040,1097,4,956715569,1,25,6,3127,4193157354113516160000000000,3191000,62


In [10]:
# Preprocess label
df_ratings['watch'] = df_ratings['rating'].map(lambda x: 0 if x < 4 else 1)
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,time_stamp,gender,age,occupation,zip-code,title,genres,year,watch
0,1,1193,5,978300760,0,1,10,1318,"4188,2254,1828,3541,3633,2815,3880,0,0,0,0,0,0...",100000,55,1
1,1,661,3,978302109,0,1,10,1318,43020943541239911000000000000,234000,76,0
2,1,914,3,978301968,0,1,10,1318,349011752000000000000000,450000,44,0
3,1,3408,4,978300275,0,1,10,1318,183925910000000000000,100000,80,1
4,1,2355,5,978824291,0,1,10,1318,2390281542631000000000000,236000,78,1


In [11]:
# Reorder the columns 
orders = ['watch', 'rating', 'user_id', 'gender', 'age', 'occupation', 'zip-code', 'time_stamp', 'item_id', 'year', 'title', 'genres']
df_ratings = df_ratings[orders]
df_ratings.tail()

Unnamed: 0,watch,rating,user_id,gender,age,occupation,zip-code,time_stamp,item_id,year,title,genres
1000204,0,1,6040,1,25,6,3127,956716541,1091,69,432243083043281500000000000,600000
1000205,1,5,6040,1,25,6,3127,956704887,1094,72,4478243685000000000000,1511000
1000206,1,5,6040,1,25,6,3127,956704746,562,75,27384231354146900000000000,610000
1000207,1,4,6040,1,25,6,3127,956715648,1096,62,5728153562000000000000,100000
1000208,1,4,6040,1,25,6,3127,956715569,1097,62,4193157354113516160000000000,3191000


In [12]:
description = [
    ('user_id', 1 + np.max(df_ratings['user_id']), 'spr'),
    ('gender', 1 + np.max(df_ratings['gender']), 'spr'),
    ('age', 1 + np.max(df_ratings['age']), 'spr'),
    ('occupation', 1 + np.max(df_ratings['occupation']), 'spr'),
    ('zip-code', 1 + np.max(df_ratings['zip-code']), 'spr'),
    ('item_id', 1 + np.max(df_ratings['item_id']), 'spr'),
    ('year', 1 + np.max(df_ratings['year']), 'spr'),
    ('title', 5000, 'seq'),
    ('genres', 20, 'seq'),
    ('rating', np.max(list(df_ratings['rating'])), 'label'),
    ('watch', 2, 'label')
]
description

[('user_id', 6041, 'spr'),
 ('gender', 2, 'spr'),
 ('age', 57, 'spr'),
 ('occupation', 21, 'spr'),
 ('zip-code', 3439, 'spr'),
 ('item_id', 3953, 'spr'),
 ('year', 81, 'spr'),
 ('title', 5000, 'seq'),
 ('genres', 20, 'seq'),
 ('rating', 5, 'label'),
 ('watch', 2, 'label')]

In [13]:
#process feat hash code 
df_ratings['user_id'] = df_ratings['user_id'].map(lambda x: x+0)
df_ratings['gender'] = df_ratings['gender'].map(lambda x: x+7000)
df_ratings['age'] = df_ratings['age'].map(lambda x: x+8000)
df_ratings['occupation'] = df_ratings['occupation'].map(lambda x: x+9000)
df_ratings['zip-code'] = df_ratings['zip-code'].map(lambda x: x+10000)
df_ratings['item_id'] = df_ratings['item_id'].map(lambda x: x+14000)
df_ratings['year'] = df_ratings['year'].map(lambda x: x+19000)
df_ratings.head()

Unnamed: 0,watch,rating,user_id,gender,age,occupation,zip-code,time_stamp,item_id,year,title,genres
0,1,5,1,7000,8001,9010,11318,978300760,15193,19055,"4188,2254,1828,3541,3633,2815,3880,0,0,0,0,0,0...",100000
1,0,3,1,7000,8001,9010,11318,978302109,14661,19076,43020943541239911000000000000,234000
2,0,3,1,7000,8001,9010,11318,978301968,14914,19044,349011752000000000000000,450000
3,1,4,1,7000,8001,9010,11318,978300275,17408,19080,183925910000000000000,100000
4,1,5,1,7000,8001,9010,11318,978824291,16355,19078,2390281542631000000000000,236000


In [14]:
#split data
df_ratings = df_ratings.sample(frac=1).reset_index(drop=True)
train_df = df_ratings.iloc[:int(len(df_ratings)*0.8), :]
val_df = df_ratings.iloc[int(len(df_ratings)*0.8):int(len(df_ratings)*0.9), :]
test_df = df_ratings.iloc[int(len(df_ratings)*0.9):, :]

In [16]:
#save csv
train_df.head()
train_df.to_csv('./../data_shuffle/train_data.csv', sep = ',', index = False)

In [17]:
#save csv
val_df.head()
val_df.to_csv('./../data_shuffle/val_data.csv', sep = ',', index = False)

In [18]:
#save csv
test_df.head()
test_df.to_csv('./../data_shuffle/test_data.csv', sep = ',', index = False)

In [19]:
test_df = test_df.drop(['time_stamp'], axis=1)
test_df.to_csv('./../data_shuffle/test/test_data.txt', sep = ' ', index = False, header = None)

In [20]:
train_df = train_df.drop(['time_stamp'], axis=1)
train_df.to_csv('./../data_shuffle/train/train_data.txt', sep = ' ', index = False, header = None)

In [21]:
val_df = val_df.drop(['time_stamp'], axis=1)
val_df.to_csv('./../data_shuffle/val/val_data.txt', sep = ' ', index = False, header = None)

In [22]:
df_ratings.head()

Unnamed: 0,watch,rating,user_id,gender,age,occupation,zip-code,time_stamp,item_id,year,title,genres
0,0,3,824,7001,8035,9012,11920,975378026,15203,19037,40883126282000000000000,100000
1,0,3,1284,7001,8045,9016,10546,974843269,15479,19077,334536850000000000000,7512000
2,1,5,685,7001,8025,9004,13373,975601687,14266,19074,2007188354171200000000000,15111500
3,0,3,1143,7001,8050,9007,10963,974874361,15231,19063,295620313685000000000000,100000
4,1,4,1127,7001,8025,9000,11763,974910681,16476,19066,401641520000000000000,7110000
