In [1]:
import pandas as pd
import numpy as np
import os
import json
import sys 
import traceback

In [2]:
movies_path = os.path.join(os.path.join(os.getcwd(),'Datasets'),'movies_dataset')

metadata_path = os.path.join(movies_path,'movies_metadata.csv')
ratings_path = os.path.join(movies_path,'ratings.csv')

In [3]:
metadata = pd.read_csv(metadata_path)
ratings = pd.read_csv(ratings_path)

ratings = ratings.drop(columns='timestamp')

In [4]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0
...,...,...,...
26024284,270896,58559,5.0
26024285,270896,60069,5.0
26024286,270896,63082,4.5
26024287,270896,64957,4.5


In [5]:
nrows = metadata.shape[0]

movies = {}
for index in range(nrows):

    # get the id of the movie
    try:
        id_row = int(metadata.iloc[index,:].id)
    except:
        print('-'*60)
        traceback.print_exc(file=sys.stdout)
        print('-'*60)
        continue


    # get the genres of the movie
    try:
         genres_row = eval(metadata.iloc[index,:].genres)
         genres_list = []
         for genres_dict in genres_row:
            id_ = genres_dict['id']
            genres_list.append(id_)
    except:
        print('-'*60)
        traceback.print_exc(file=sys.stdout)
        print('-'*60)
        continue

    # get original language
    try:
         original_row = metadata.iloc[index,:].original_language
    except:
        print('-'*60)
        traceback.print_exc(file=sys.stdout)
        print('-'*60)
        continue

    
    # get the production companies
    try:
        prod_comp_list = []
        prod_comp_row = eval(metadata.iloc[index,:].production_companies)
        for prod_comp_dict in prod_comp_row:
            id_ = prod_comp_dict['id']
            prod_comp_list.append(id_)
    except:
        print('-'*60)
        traceback.print_exc(file=sys.stdout)
        print('-'*60)
        continue

    # get the production countries
    try:
        prod_countries_list = []
        prod_countries_row = eval(metadata.iloc[index,:].production_countries)
        for prod_countries_dict in prod_countries_row:
            country = prod_countries_dict['iso_3166_1']
            prod_countries_list.append(country)
    except:
        print('-'*60)
        traceback.print_exc(file=sys.stdout)
        print('-'*60)
        continue

    # get the production countries
    try:
        spoken_languages_list = []
        spoken_languages_row = eval(metadata.iloc[index,:].spoken_languages)
        for spoken_languages_dict in spoken_languages_row:
            spoken_language = spoken_languages_dict['iso_639_1']
            spoken_languages_list.append(spoken_language)
    except:
        print('-'*60)
        traceback.print_exc(file=sys.stdout)
        print('-'*60)
        continue

    movies[id_row] = {'genres':genres_list,
                    'original_language':original_row,
                    'production_companies':prod_comp_list,
                    'production_countries': prod_countries_list,
                    'spoken_language':spoken_languages_list}
   

------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-5-44df0911440c>", line 42, in <module>
    prod_comp_row = eval(metadata.iloc[index,:].production_companies)
TypeError: eval() arg 1 must be a string, bytes or code object
------------------------------------------------------------
------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-5-44df0911440c>", line 8, in <module>
    id_row = int(metadata.iloc[index,:].id)
ValueError: invalid literal for int() with base 10: '1997-08-20'
------------------------------------------------------------
------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-5-44df0911440c>", line 42, in <module>
    prod_comp_row = eval(metadata.iloc[index,:].production_companies)
TypeError: eval() arg 1 must be a string, bytes or code object
------------------------------

In [6]:
len(movies)

45430

In [7]:
with open('./movies_metadata.txt','w') as fw:
    json.dump(movies, fw)

In [8]:
with  open('./movies_metadata.txt','r') as fr:
    data = json.load(fr)
print(len(data))

45430


In [7]:
def filter_users(ratings):
    # count occurrences of same userId
    print(ratings.shape)
    ids = ratings.userId
    counts = ids.value_counts()

    # select ids to keep (users with #ratings >= 5)
    to_keep = counts[counts.values >= 10].index
    mask = [(x in to_keep) for x in ratings.userId]
    ratings_5 = ratings[mask]
    return ratings_5

In [8]:
def filter_movies(ratings):
    # count occurrences of same movieId
    print(ratings.shape)
    ids = ratings.movieId
    counts = ids.value_counts()

    # select ids to keep (movies with #ratings >= 20)
    to_keep = counts[counts.values >= 20].index
    mask = [(x in to_keep) for x in ratings.movieId]
    ratings_20 = ratings[mask]
    return ratings_20


In [9]:
def filter(ratings):
    num_rows = ratings.shape[0]

    tmp = ratings
    while True:
        tmp_5 = filter_users(tmp)
        if tmp_5.shape[0] == tmp.shape[0]:
            print('convergence has been reached')
            return tmp_5
        else:
            print('users filtered')
            tmp = tmp_5
        tmp_20 = filter_movies(tmp)
        if tmp_20.shape[0] == tmp.shape[0]:
            print('convergence has been reached')
            return tmp_20
        else:
            print('movie filtered')
            tmp = tmp_20



In [20]:
final_ratings = filter(ratings)

(26024289, 3)
users filtered
(25847473, 3)
movie filtered
(25711452, 3)
users filtered
(25709850, 3)
movie filtered
(25709736, 3)
users filtered
(25709727, 3)
convergence has been reached


In [21]:
def build_user_based_dictionary(unique_users, ratings):
    print('\nunique users in this batch are from {} to {}'.format(unique_users[0], unique_users[-1]))
    print('\nthe shape of ratings is {}'.format(ratings.shape))
    sparse_user_based = {}

    for user in unique_users:
        user_ratings = {}
        user_subtable = ratings[ratings.userId == user]
        for index,row in user_subtable.iterrows():
            user_ratings[str(row['movieId'])] = str(row['rating'])
        sparse_user_based[str(user)] = user_ratings
        if user%1000 in [0,1]:
            print('user is {}'.format(user))
    return sparse_user_based

In [22]:
def build_item_based_dictionary(unique_movies, ratings):
    print('\nunique movies in this batch are from {} to {}'.format(unique_movies[0], unique_movies[-1]))
    print('\nthe shape of ratings is {}'.format(ratings.shape))
    sparse_item_based = {}

    for movie in unique_movies:
        movie_ratings = {}
        movie_subtable = ratings[ratings.movieId == movie]
        for index,row in movie_subtable.iterrows():
            movie_ratings[str(row['userId'])] = str(row['rating'])
        sparse_item_based[str(movie)] = movie_ratings
        if movie%500 in [0,1]:
            print('movie is {}'.format(movie))
    return sparse_item_based

In [23]:
from threading import Thread

class CreateUtilityThread(Thread):
    def __init__(self, name = None, unique_users = None, unique_items = None, ratings = None, job = None):
        Thread.__init__(self)
        self.name = name
        self.unique_users = unique_users
        self.unique_items = unique_items
        self.ratings = ratings
        self.job = job
        self._return = None

    def run(self):
        print(self.name)
        if self.name is not None:
            if self.job == 'item_based':
                self._return = build_item_based_dictionary(self.unique_items, self.ratings)
            elif self.job == 'user_based':
                self._return = build_user_based_dictionary(self.unique_users, self.ratings)
            else:
                print('Error, you have to specify a job')

    def join(self):
        Thread.join(self)
        if self._return is not None:
            return self._return
        else:
            print('Error using threading')

In [24]:
def split_list(l):
    n = len(l) // 2          # length of smaller half
    
    l1 = l[:n]
    l2 = l[n:]

    return l1, l2

In [13]:
unique_users = final_ratings.userId.unique()

# split the dictionary in 2
r1, r2 = split_list(unique_users)

# split the two dictionaries in 4
r11, r12 = split_list(r1)
r21, r22 = split_list(r2)

# split the 4 dictionaries in 8
r1, r2 = split_list(r11)
r3, r4 = split_list(r12)
r5, r6 = split_list(r21)
r7, r8 = split_list(r22)

thread1 = CreateUtilityThread('thread-1', unique_users = r1, ratings = final_ratings, job = 'user_based')
thread2 = CreateUtilityThread('thread-2', unique_users = r2, ratings = final_ratings, job = 'user_based')
thread3 = CreateUtilityThread('thread-3', unique_users = r3, ratings = final_ratings, job = 'user_based')
thread4 = CreateUtilityThread('thread-4', unique_users = r4, ratings = final_ratings, job = 'user_based')
thread5 = CreateUtilityThread('thread-5', unique_users = r5, ratings = final_ratings, job = 'user_based')
thread6 = CreateUtilityThread('thread-6', unique_users = r6, ratings = final_ratings, job = 'user_based')
thread7 = CreateUtilityThread('thread-7', unique_users = r7, ratings = final_ratings, job = 'user_based')
thread8 = CreateUtilityThread('thread-8', unique_users = r8, ratings = final_ratings, job = 'user_based')

threads = [thread1,thread2,thread3,thread4,thread5,thread6,thread7,thread8]

[thread.start() for thread in threads]
print('============= all threads started =============')

dictionaries = [thread.join() for thread in threads]
print('============= all threads joined =============')

thread-1thread-2
thread-3

unique users in this batch are from 67770 to 101724

the shape of ratings is (25709727, 3)
thread-4

unique users in this batch are from 101725 to 135636

the shape of ratings is (25709727, 3)thread-5

unique users in this batch are from 135637 to 169389

the shape of ratings is (25709727, 3)



unique users in this batch are from 1 to 34009

the shape of ratings is (25709727, 3)
thread-6thread-7

unique users in this batch are from 203208 to 236978

the shape of ratings is (25709727, 3)
thread-8

unique users in this batch are from 236979 to 270896

the shape of ratings is (25709727, 3)



unique users in this batch are from 169394 to 203207

the shape of ratings is (25709727, 3)


the shape of ratings is (25709727, 3)
user is 1
user is 237000
user is 237001
user is 68000
user is 68001
user is 102000
user is 102001
user is 136000
user is 136001
user is 170001
user is 204000
user is 204001
user is 35000
user is 35001
user is 1001
user is 238000
user is 238001

In [14]:
utility_matrix_user_based = {**dictionaries[0],**dictionaries[1],**dictionaries[2],**dictionaries[3],
                            **dictionaries[4],**dictionaries[5],**dictionaries[6],**dictionaries[7]}
print('the number of unique_users is {} and the sanity check is {}'.format(len(unique_users),len(utility_matrix_user_based) == len(unique_users)))

the number of unique_users is 233714 and the sanity check is True


In [None]:
with open('./utility_matrix_user_based.txt','w') as fw:
    json.dump(utility_matrix_user_based, fw)

In [None]:
with  open('./utility_matrix_user_based.txt','r') as fr:
    data = json.load(fr)
print(len(data), len(data['2']))

In [25]:
unique_movies = np.sort(final_ratings.movieId.unique())

# split the dictionary in 2
r1, r2 = split_list(unique_movies)

# split the two dictionaries in 4
r11, r12 = split_list(r1)
r21, r22 = split_list(r2)

# split the 4 dictionaries in 8
r1, r2 = split_list(r11)
r3, r4 = split_list(r12)
r5, r6 = split_list(r21)
r7, r8 = split_list(r22)

thread1 = CreateUtilityThread('thread-1', unique_items = r1, ratings = final_ratings, job = 'item_based')
thread2 = CreateUtilityThread('thread-2', unique_items = r2, ratings = final_ratings, job = 'item_based')
thread3 = CreateUtilityThread('thread-3', unique_items = r3, ratings = final_ratings, job = 'item_based')
thread4 = CreateUtilityThread('thread-4', unique_items = r4, ratings = final_ratings, job = 'item_based')
thread5 = CreateUtilityThread('thread-5', unique_items = r5, ratings = final_ratings, job = 'item_based')
thread6 = CreateUtilityThread('thread-6', unique_items = r6, ratings = final_ratings, job = 'item_based')
thread7 = CreateUtilityThread('thread-7', unique_items = r7, ratings = final_ratings, job = 'item_based')
thread8 = CreateUtilityThread('thread-8', unique_items = r8, ratings = final_ratings, job = 'item_based')

threads = [thread1,thread2,thread3,thread4,thread5,thread6,thread7,thread8]

[thread.start() for thread in threads]
print('============= all threads started =============')

dictionaries = [thread.join() for thread in threads]
print('============= all threads joined =============')

thread-1thread-2

unique movies in this batch are from 2218 to 4393

the shape of ratings is (25709727, 3)thread-3

unique movies in this batch are from 4394 to 6615

the shape of ratings is (25709727, 3)


unique movies in this batch are from 1 to 2216
thread-4

unique movies in this batch are from 6616 to 26693

the shape of ratings is (25709727, 3)

the shape of ratings is (25709727, 3)thread-5

unique movies in this batch are from 26694 to 54372

the shape of ratings is (25709727, 3)


thread-6

unique movies in this batch are from 54378 to 81639

the shape of ratings is (25709727, 3)
thread-7

unique movies in this batch are from 81641 to 110451

the shape of ratings is (25709727, 3)


unique movies in this batch are from 110453 to 175281

the shape of ratings is (25709727, 3)
movie is 110501
movie is 1
movie is 112501
movie is 4500
movie is 4501
movie is 56001
movie is 113501
movie is 114001
movie is 86000
movie is 31000
movie is 89000
movie is 7000
movie is 7001
movie is 59000
m

In [26]:
utility_matrix_item_based = {**dictionaries[0],**dictionaries[1],**dictionaries[2],**dictionaries[3],
                            **dictionaries[4],**dictionaries[5],**dictionaries[6],**dictionaries[7]}
print('the number of unique_items is {} and the sanity check is {}'.format(len(unique_movies),len(utility_matrix_item_based) == len(unique_movies)))

the number of unique_items is 16770 and the sanity check is True


In [27]:
with open('./utility_matrix_item_based.txt','w') as fw:
    json.dump(utility_matrix_item_based, fw)

In [2]:
with  open('./utility_matrix_item_based.txt','r') as fr:
    data = json.load(fr)
print(len(data))

16770
