In [None]:
import numpy as np
from scipy.sparse import csr_matrix, save_npz
import operator
import time

In [None]:
def read_netflix_dataset(*files, n_films = None):
    """Build a sparse one-hot (movie, user) feature matrix from Netflix-format rating files.

    Every file is read as a sequence of blocks::

        <movie_id>:
        <user_id>,<rating>,...
        <user_id>,<rating>,...

    Blank lines are ignored. The files are read twice: once to count ratings
    and index users, once to fill the output arrays.

    Parameters
    ----------
    *files : str
        Paths of the rating files, processed in the given order.
    n_films : int or None, optional
        Keep only the ``n_films`` movies with the most ratings (ties keep the
        order in which the movies first appear). ``None`` keeps every movie.
        A value larger than the number of movies found is clamped to it.

    Returns
    -------
    features : scipy.sparse.csr_matrix, shape (n_ratings, n_users + n_films)
        One row per kept rating holding exactly two ones:

        * column ``j`` in ``[0, n_films)`` -- the rated movie; kept movies are
          numbered by ascending movie id;
        * column ``n_films + u`` -- the rating user; users are numbered in
          order of first appearance over *all* ratings, including ratings of
          movies that were not kept.
    target : numpy.ndarray, shape (n_ratings,)
        The integer rating of each row.

    Raises
    ------
    ValueError
        If ``n_films`` is negative, or a rating line appears before any
        ``<movie_id>:`` header.
    """
    if n_films is not None and n_films < 0:
        raise ValueError("n_films must be non-negative, got %r" % (n_films,))

    user_index = {}     # original user id -> 0-based index (first-appearance order)
    movies_count = {}   # movie id -> number of rating lines

    # Pass 1: count ratings per movie and index every user.
    movie_id = None
    for file in files:
        with open(file, 'r') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                if line[-1] == ':':
                    movie_id = int(line[:-1])
                    # setdefault (not "= 0") so a repeated header keeps the
                    # ratings already counted for that movie.
                    movies_count.setdefault(movie_id, 0)
                    continue
                if movie_id is None:
                    raise ValueError(
                        "rating line %r in %r appears before any movie header"
                        % (line, file))
                user_id = int(line.split(',', 1)[0])
                if user_id not in user_index:
                    user_index[user_id] = len(user_index)
                movies_count[movie_id] += 1

    # Select the movies to keep.
    if n_films is None:
        kept_movies = list(movies_count)
    else:
        ranked = sorted(movies_count.items(), key = operator.itemgetter(1), reverse = True)
        kept_movies = [movie for movie, _ in ranked[:n_films]]
    n_films = len(kept_movies)
    n_users = len(user_index)
    num_lines = sum(movies_count[movie] for movie in kept_movies)

    # Compact column index per kept movie. Raw movie ids must not be used as
    # columns: a sparse or high id would land in the user column range.
    movie_col = {movie: j for j, movie in enumerate(sorted(kept_movies))}

    # COO-style triplets: the first half of `col` holds movie columns, the
    # second half holds user columns; both halves share the same row indices.
    row = np.hstack((np.arange(num_lines), np.arange(num_lines)))
    col = np.zeros((2 * num_lines,), dtype = int)
    data = np.ones((2 * num_lines,), dtype = int)
    target = np.zeros((num_lines,), dtype = int)

    # Pass 2: emit one row per rating of a kept movie.
    line_i = 0
    current_col = None  # column of the movie being read, None if it is dropped
    for file in files:
        with open(file, 'r') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                if line[-1] == ':':
                    # Resolved once per movie instead of once per rating line.
                    current_col = movie_col.get(int(line[:-1]))
                elif current_col is not None:
                    arr = line.split(',')
                    col[line_i] = current_col
                    col[num_lines + line_i] = n_films + user_index[int(arr[0])]
                    target[line_i] = int(arr[1])
                    line_i += 1

    features = csr_matrix((data, (row, col)), shape = (num_lines, n_users + n_films))

    return (features, target)

In [None]:
# Quick run: keep only the 10 most-rated films of combined_data_1.txt and
# report how long the read takes.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
start = time.perf_counter()
X, y = read_netflix_dataset("combined_data_1.txt", n_films = 10)
stop = time.perf_counter()
print('Reading time: %.2f min' % ((stop - start) / 60))

In [None]:
# Read the 100 most-rated films of combined_data_1.txt; this rebinds X and y.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
start = time.perf_counter()
X, y = read_netflix_dataset("combined_data_1.txt", n_films = 100)
stop = time.perf_counter()
print('Reading time: %.2f min' % ((stop - start) / 60))

In [None]:
# Persist the current sparse feature matrix X in SciPy's .npz format.
save_npz(file = 'sparse_X_100_films.npz', matrix = X)

In [None]:
# Persist the current target vector y; it is stored in the archive under the key 'arr_0'.
np.savez('y_100_films.npz', arr_0 = y)

In [None]:
# Read every film of combined_data_1.txt (n_films left at None); this rebinds X and y.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
start = time.perf_counter()
X, y = read_netflix_dataset("combined_data_1.txt")
stop = time.perf_counter()
print('Reading time: %.2f min' % ((stop - start) / 60))

In [None]:
# Persist the current sparse feature matrix X in SciPy's .npz format.
save_npz(file = 'sparse_X_1_file.npz', matrix = X)

In [None]:
# Persist the current target vector y; it is stored in the archive under the key 'arr_0'.
np.savez('y_1_file.npz', arr_0 = y)