In [1]:
from scipy import sparse
from scipy.sparse import csr_matrix
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
sample_train_sparse_matrix = sparse.load_npz("sample/small/sample_train_sparse_matrix.npz")
sample_test_sparse_matrix = sparse.load_npz("sample/small/sample_test_sparse_matrix.npz")

In [3]:
# get the user averages in dictionary (key: user_id/movie_id, value: avg rating)

def get_average_ratings(sparse_matrix, of_users):
    
    # average ratings of user/axes
    ax = 1 if of_users else 0 # 1 - User axes,0 - Movie axes

    # ".A1" is for converting Column_Matrix to 1-D numpy array 
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1
    # Boolean matrix of ratings ( whether a user rated that movie or not)
    is_rated = sparse_matrix!=0
    # no of ratings that each user OR movie..
    no_of_ratings = is_rated.sum(axis=ax).A1
    
    # max_user  and max_movie ids in sparse matrix 
    u,m = sparse_matrix.shape
    # creae a dictonary of users and their average ratigns..
    average_ratings = { i : sum_of_ratings[i]/no_of_ratings[i]
                                 for i in range(u if of_users else m) 
                                    if no_of_ratings[i] !=0}

    # return that dictionary of average ratings
    return average_ratings

sample_train_averages = dict()

# get the global average of ratings in our train set.
global_average = sample_train_sparse_matrix.sum()/sample_train_sparse_matrix.count_nonzero()
sample_train_averages['global'] = global_average
sample_train_averages

sample_train_averages['user'] = get_average_ratings(sample_train_sparse_matrix, of_users=True)
print('\nAverage rating of user 1515220 :',sample_train_averages['user'][1515220])

sample_train_averages['movie'] =  get_average_ratings(sample_train_sparse_matrix, of_users=False)
print('\n AVerage rating of movie 15153 :',sample_train_averages['movie'][15153])

print('\n No of ratings in Our Sampled train matrix is : {}\n'.format(sample_train_sparse_matrix.count_nonzero()))
print('\n No of ratings in Our Sampled test  matrix is : {}\n'.format(sample_train_sparse_matrix.count_nonzero()))


Average rating of user 1515220 : 3.923076923076923

 AVerage rating of movie 15153 : 2.752

 No of ratings in Our Sampled train matrix is : 856986


 No of ratings in Our Sampled test  matrix is : 856986



In [4]:
# get users, movies and ratings from our samples train sparse matrix
sample_train_users, sample_train_movies, sample_train_ratings = sparse.find(sample_train_sparse_matrix)

In [6]:
from datetime import datetime


def multi_thread(sample_train_users, sample_train_movies, sample_train_ratings, thread):
    start = datetime.now()
    with open('sample/small/multi/train_reg_file_{}.csv'.format(thread), mode='w') as reg_data_file:
        count = 0
        for (user, movie, rating)  in zip(sample_train_users, sample_train_movies, sample_train_ratings):
            st = datetime.now()
        #     print(user, movie)    
            #--------------------- Ratings of "movie" by similar users of "user" ---------------------
            # compute the similar Users of the "user"        
            user_sim = cosine_similarity(sample_train_sparse_matrix[user], sample_train_sparse_matrix).ravel()
            top_sim_users = user_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
            # get the ratings of most similar users for this movie
            top_ratings = sample_train_sparse_matrix[top_sim_users, movie].toarray().ravel()
            # we will make it's length "5" by adding movie averages to .
            top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_users_ratings.extend([sample_train_averages['movie'][movie]]*(5 - len(top_sim_users_ratings)))
        #     print(top_sim_users_ratings, end=" ")    


            #--------------------- Ratings by "user"  to similar movies of "movie" ---------------------
            # compute the similar movies of the "movie"        
            movie_sim = cosine_similarity(sample_train_sparse_matrix[:,movie].T, sample_train_sparse_matrix.T).ravel()
            top_sim_movies = movie_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
            # get the ratings of most similar movie rated by this user..
            top_ratings = sample_train_sparse_matrix[user, top_sim_movies].toarray().ravel()
            # we will make it's length "5" by adding user averages to.
            top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_movies_ratings.extend([sample_train_averages['user'][user]]*(5-len(top_sim_movies_ratings))) 
        #     print(top_sim_movies_ratings, end=" : -- ")

            #-----------------prepare the row to be stores in a file-----------------#
            df_list = list()
            row = list()
            row.append(user)
            row.append(movie)
            # Now add the other features to this data...
            row.append(sample_train_averages['global']) # first feature
            # next 5 features are similar_users "movie" ratings
            row.extend(top_sim_users_ratings)
            # next 5 features are "user" ratings for similar_movies
            row.extend(top_sim_movies_ratings)
            # Avg_user rating
            row.append(sample_train_averages['user'][user])
            # Avg_movie rating
            row.append(sample_train_averages['movie'][movie])

            # finalley, The actual Rating of this user-movie pair...
            row.append(rating)

            df_list.append(row)
            count = count + 1

            # add rows to the file opened..
            reg_data_file.write(','.join(map(str, row)))
            reg_data_file.write('\n')        
            # print(','.join(map(str, row)))
            print("Done for {} rows----- {}".format(count, datetime.now() - start))

    print("Time taken to compute 1000 rows: \n")
    print(datetime.now() - start)

In [7]:
import threading   
if __name__ == "__main__": 
    #Creates threads
    t1 = threading.Thread(target=multi_thread, args=(sample_train_users[0:100], sample_train_movies[0:100], sample_train_ratings[0:100], 't1')) 
    t2 = threading.Thread(target=multi_thread, args=(sample_train_users[100:200], sample_train_movies[100:200], sample_train_ratings[100:200], 't2'))
    t3 = threading.Thread(target=multi_thread, args=(sample_train_users[200:300], sample_train_movies[200:300], sample_train_ratings[200:300], 't3')) 
    t4 = threading.Thread(target=multi_thread, args=(sample_train_users[300:400], sample_train_movies[300:400], sample_train_ratings[300:400], 't4'))
    t5 = threading.Thread(target=multi_thread, args=(sample_train_users[400:500], sample_train_movies[400:500], sample_train_ratings[400:500], 't5'))
    t6 = threading.Thread(target=multi_thread, args=(sample_train_users[500:600], sample_train_movies[500:600], sample_train_ratings[500:600], 't6'))
    t7 = threading.Thread(target=multi_thread, args=(sample_train_users[600:700], sample_train_movies[600:700], sample_train_ratings[600:700], 't7'))
    t8 = threading.Thread(target=multi_thread, args=(sample_train_users[700:800], sample_train_movies[700:800], sample_train_ratings[700:800], 't8'))    
    
    
    #Start threads
    t1.start() 
    t2.start()
    t3.start() 
    t4.start()
    t5.start() 
    t6.start()
    t7.start() 
    t8.start()

  
    #Join the threads
    t1.join() 
    t2.join()
    t3.join() 
    t4.join()
    t5.join() 
    t6.join()
    t7.join() 
    t8.join()

Done for 1 rows----- 0:00:02.142174
Done for 1 rows----- 0:00:02.303772
Done for 1 rows----- 0:00:02.269345
Done for 1 rows----- 0:00:02.269689
Done for 1 rows----- 0:00:02.439762
Done for 1 rows----- 0:00:02.573578
Done for 1 rows----- 0:00:02.767233
Done for 1 rows----- 0:00:02.761190
Done for 2 rows----- 0:00:04.206008
Done for 2 rows----- 0:00:04.314338
Done for 2 rows----- 0:00:04.382328
Done for 2 rows----- 0:00:04.498751
Done for 2 rows----- 0:00:04.732900
Done for 2 rows----- 0:00:04.763383
Done for 2 rows----- 0:00:05.025320
Done for 2 rows----- 0:00:05.065704
Done for 3 rows----- 0:00:06.135287
Done for 3 rows----- 0:00:06.183237
Done for 3 rows----- 0:00:06.432673
Done for 3 rows----- 0:00:06.645565
Done for 3 rows----- 0:00:06.681815
Done for 3 rows----- 0:00:06.872872
Done for 3 rows----- 0:00:07.143130
Done for 3 rows----- 0:00:07.590905
Done for 4 rows----- 0:00:08.203963
Done for 4 rows----- 0:00:08.139543
Done for 4 rows----- 0:00:08.196451
Done for 4 rows----- 0:00:08

Done for 30 rows----- 0:00:55.215419
Done for 29 rows----- 0:00:55.432994
Done for 29 rows----- 0:00:55.469020
Done for 30 rows----- 0:00:55.766971
Done for 29 rows----- 0:00:55.778667
Done for 29 rows----- 0:00:56.038049
Done for 29 rows----- 0:00:56.687426
Done for 28 rows----- 0:00:56.772800
Done for 30 rows----- 0:00:57.023443
Done for 31 rows----- 0:00:57.145151
Done for 30 rows----- 0:00:57.371696
Done for 31 rows----- 0:00:57.513010
Done for 30 rows----- 0:00:57.772902
Done for 30 rows----- 0:00:57.914578
Done for 32 rows----- 0:00:58.554109
Done for 30 rows----- 0:00:58.644609
Done for 31 rows----- 0:00:58.782767
Done for 29 rows----- 0:00:58.831852
Done for 31 rows----- 0:00:59.277303
Done for 32 rows----- 0:00:59.508493
Done for 31 rows----- 0:00:59.662717
Done for 31 rows----- 0:00:59.685861
Done for 33 rows----- 0:01:00.187042
Done for 32 rows----- 0:01:00.567090
Done for 31 rows----- 0:01:00.554203
Done for 30 rows----- 0:01:00.682521
Done for 33 rows----- 0:01:00.984282
D

Done for 54 rows----- 0:01:46.981017
Done for 55 rows----- 0:01:47.505527
Done for 58 rows----- 0:01:47.639458
Done for 58 rows----- 0:01:47.999996
Done for 59 rows----- 0:01:48.119350
Done for 54 rows----- 0:01:48.040801
Done for 59 rows----- 0:01:48.056967
Done for 58 rows----- 0:01:48.367568
Done for 55 rows----- 0:01:48.839795
Done for 59 rows----- 0:01:49.532134
Done for 60 rows----- 0:01:49.550633
Done for 60 rows----- 0:01:49.778929
Done for 56 rows----- 0:01:49.821634
Done for 59 rows----- 0:01:50.021244
Done for 59 rows----- 0:01:50.179494
Done for 55 rows----- 0:01:50.104455
Done for 56 rows----- 0:01:50.951613
Done for 60 rows----- 0:01:51.457853
Done for 61 rows----- 0:01:51.673217
Done for 61 rows----- 0:01:51.750848
Done for 57 rows----- 0:01:51.780811
Done for 60 rows----- 0:01:51.839440
Done for 60 rows----- 0:01:52.156202
Done for 56 rows----- 0:01:52.404606
Done for 57 rows----- 0:01:52.516870
Done for 61 rows----- 0:01:53.142696
Done for 62 rows----- 0:01:53.501890
D

Done for 86 rows----- 0:02:39.486292
Done for 88 rows----- 0:02:39.432028
Done for 87 rows----- 0:02:39.649781
Done for 86 rows----- 0:02:39.756902
Done for 81 rows----- 0:02:40.193228
Done for 88 rows----- 0:02:40.687263
Done for 82 rows----- 0:02:40.658923
Done for 81 rows----- 0:02:40.731571
Done for 89 rows----- 0:02:40.999400
Done for 87 rows----- 0:02:41.303752
Done for 88 rows----- 0:02:41.590790
Done for 87 rows----- 0:02:42.073974
Done for 82 rows----- 0:02:42.296315
Done for 89 rows----- 0:02:42.389270
Done for 83 rows----- 0:02:42.516632
Done for 82 rows----- 0:02:42.690262
Done for 90 rows----- 0:02:42.787765
Done for 88 rows----- 0:02:43.229637
Done for 89 rows----- 0:02:43.375623
Done for 88 rows----- 0:02:43.760859
Done for 90 rows----- 0:02:44.238998
Done for 91 rows----- 0:02:44.385634
Done for 83 rows----- 0:02:44.564675
Done for 84 rows----- 0:02:44.699109
Done for 83 rows----- 0:02:44.782580
Done for 90 rows----- 0:02:44.994664Done for 89 rows----- 0:02:45.008068

D

In [9]:
import numpy as np
import pandas as pd
df1 = pd.read_csv('sample/small/multi/train_reg_file_t1.csv', header=None, names = np.arange(0,16))
df2 = pd.read_csv('sample/small/multi/train_reg_file_t2.csv', header=None, names = np.arange(0,16))
df3 = pd.read_csv('sample/small/multi/train_reg_file_t3.csv', header=None, names = np.arange(0,16))
df4 = pd.read_csv('sample/small/multi/train_reg_file_t4.csv', header=None, names = np.arange(0,16))
df5 = pd.read_csv('sample/small/multi/train_reg_file_t5.csv', header=None, names = np.arange(0,16))
df6 = pd.read_csv('sample/small/multi/train_reg_file_t6.csv', header=None, names = np.arange(0,16))
df7 = pd.read_csv('sample/small/multi/train_reg_file_t7.csv', header=None, names = np.arange(0,16))
df8 = pd.read_csv('sample/small/multi/train_reg_file_t8.csv', header=None, names = np.arange(0,16))

In [10]:
df = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8])

In [11]:
df.shape

(800, 16)

In [13]:
df.to_csv("sample/small/multi/train_reg_multi_thread")