# SHORYA SETHIA [ 22B2725 ]


### Data
Using data from : https://www.kaggle.com/netflix-inc/netflix-prize-data/data
It contains:
1. combined_data_1.txt
2. combined_data_2.txt
3. combined_data_3.txt
4. combined_data_4.txt
5. movie_titles.csv

### Data Overview
The first line of each file combined_data_{i}.txt contains the movie id followed by a colon. Each subsequent line in the file corresponds to a rating from a customer and its date in the format: CustomerID,Rating,Date

- MovieIDs range from 1 to 17770 sequentially.
- CustomerIDs range from 1 to 2649429, with gaps. There are 480189 users.
- Ratings are on a five star (integral) scale from 1 to 5.
- Dates have the format YYYY-MM-DD.

In [1]:
# this is just to know how much time will it take to run this entire ipython notebook 
from datetime import datetime

In [2]:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('nbagg')

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})

import seaborn as sns
sns.set_style('whitegrid')
import os
from scipy import sparse
from scipy.sparse import csr_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import random

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

# Exploratory Data Analysis
### Converting entire data to following format:
u_i,m_j,r_ij

In [3]:
start = datetime.now()
if not os.path.isfile('data.csv'):
    # Create a file 'data.csv' before reading it
    # Read all the files in Netflix Prize Data and store them in one big file('data.csv')
    # I am Re-reading from each of the four files and appendig each rating to a global file 'train.csv'
    data = open('data.csv', mode='w')
    
    row = list()
    files=['data/combined_data_1.txt','data/combined_data_2.txt', 
           'data/combined_data_3.txt', 'data/combined_data_4.txt']
    for file in files:
        print("Reading ratings from {}...".format(file))
        with open(file) as f:
            for line in f: 
                del row[:] # you don't have to do this.
                line = line.strip()
                if line.endswith(':'):
                    # All below are ratings for this movie, until another movie appears.
                    movie_id = line.replace(':', '')
                else:
                    row = [x for x in line.split(',')]
                    row.insert(0, movie_id)
                    data.write(','.join(row))
                    data.write('\n')
        print("Done.\n")
    data.close()
print('Time taken :', datetime.now() - start)

Time taken : 0:00:00


In [10]:
start = datetime.now()

if not os.path.isfile('sorted_data.csv'):
  print("creating the dataframe from data.csv file..")
  df = pd.read_csv('data.csv', sep=',', names=['movie', 'user','rating','date'])
  df.date = pd.to_datetime(df.date)
  print('Done.\n')

  # we are arranging the ratings according to time.
  print('Sorting the dataframe by date..')
  df.sort_values(by='date', inplace=True)
  print('Done..')

  output_filename = 'sorted_data.csv'
  df.to_csv(output_filename, index=False)

else:
  print("File already exists. Reading it...")
  df = pd.read_csv('sorted_data.csv')
  
print('Time taken :', datetime.now() - start)

creating the dataframe from data.csv file..
Done.

Sorting the dataframe by date..
Done..
Time taken : 0:03:26.270092


In [11]:
df.head()

Unnamed: 0,movie,user,rating,date
56431994,10341,510180,4,1999-11-11
9056171,1798,510180,5,1999-11-11
58698779,10774,510180,3,1999-11-11
48101611,8651,510180,2,1999-11-11
81893208,14660,510180,2,1999-11-11


In [12]:
df.describe()['rating']

count    1.004805e+08
mean     3.604290e+00
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      4.000000e+00
max      5.000000e+00
std      1.085219e+00
Name: rating, dtype: float64

### Checking for NaN values

In [None]:
print("Number of Nan values in our dataframe : ", sum(df.isnull().any()))

Number of Nan values in our dataframe :  0


### Deleting Duplicates either movie_id, user/customer_id, ratings, date

In [None]:
dup_bool = df.duplicated(['movie','user','rating'])
dups = sum(dup_bool) 
print("There are {} duplicate rating entries in the data..".format(dups))

There are 0 duplicate rating entries in the data..


### Number of Users, movies and ratings in data.csv

In [None]:
print("Total No of Users   :", len(np.unique(df.user)))
print("Total No of movies  :", len(np.unique(df.movie)))
print("Total no of ratings :",df.shape[0]) #total rows == no. of ratings

Total No of Users   : 480189
Total No of movies  : 17770
Total no of ratings : 100480507


### Spliting data into Train and Test (0.80 : 0.20 respectively)

In [13]:
if not os.path.isfile('train.csv'):
    # create the dataframe and store it as csv for further purposes
    df.iloc[:int(df.shape[0]*0.80)].to_csv("train.csv", index=False)
    print("train.csv formed.")
else :
    print("train.csv exists")

if not os.path.isfile('test.csv'):
    # create the dataframe and store it as csv for further purposes
    df.iloc[int(df.shape[0]*0.80):].to_csv("test.csv", index=False)
    print("test.csv formed.")
else :
    print("test.csv exists")

start = datetime.now()
train_df = pd.read_csv("train.csv", parse_dates=['date'])
test_df = pd.read_csv("test.csv")
print("read both csv")
print('Time taken :', datetime.now() - start)


train.csv exists
test.csv exists
read both csv
Time taken : 0:00:33.163274


### Number of Users, Movies and ratings in train.csv and test.csv

In [None]:
print("Numbers for train.csv")
print("Total No of Users   :", len(np.unique(train_df.user)))
print("Total No of movies  :", len(np.unique(train_df.movie)))
print("Total no of ratings :",train_df.shape[0])

print("\nNumbers for test.csv")
print("Total No of Users   :", len(np.unique(test_df.user)))
print("Total No of movies  :", len(np.unique(test_df.movie)))
print("Total no of ratings :",test_df.shape[0])

Numbers for train.csv


Total No of Users   : 405041
Total No of movies  : 17424
Total no of ratings : 80384405

Numbers for test.csv
Total No of Users   : 349312
Total No of movies  : 17757
Total no of ratings : 20096102


### EDA on trian_df

In [14]:
# method to make y-axis more readable
def human(num, units = 'M'):
    units = units.lower()
    num = float(num)
    if units == 'k':
        return str(num/10**3) + " K"
    elif units == 'm':
        return str(num/10**6) + " M"
    elif units == 'b':
        return str(num/10**9) +  " B"

In [None]:
#Ratind Distribution ploting was taking very long time

# fig, ax = plt.subplots()
# plt.title('Distribution of ratings over Training dataset', fontsize=15)
# sns.countplot(train_df.rating)
# ax.set_yticklabels([human(item, 'M') for item in ax.get_yticks()])
# ax.set_ylabel('No. of Ratings(Millions)')

# plt.savefig('img/rating-distribution-train_df')

In [None]:
start = datetime.now()
rating_counts = train_df['rating'].value_counts()
print("Distribution of ratings over Training dataset:")
print(rating_counts)
print('Time taken:', datetime.now() - start)

Distribution of ratings over Training dataset:
rating
4    27161596
3    23339084
5    17772845
2     8369795
1     3741085
Name: count, dtype: int64
Time taken: 0:00:00.425591


In [15]:
# # Add new column (week day) to the data
# train_df['day_of_week'] = train_df.date.dt.weekday_name
# train_df.head()

# Add new column (week day) to the data
train_df['day_of_week'] = train_df['date'].dt.day_name()
train_df.head()

Unnamed: 0,movie,user,rating,date,day_of_week
0,10341,510180,4,1999-11-11,Thursday
1,1798,510180,5,1999-11-11,Thursday
2,10774,510180,3,1999-11-11,Thursday
3,8651,510180,2,1999-11-11,Thursday
4,14660,510180,2,1999-11-11,Thursday


In [None]:
avg_week_df = train_df.groupby(by=['day_of_week'])['rating'].mean()
print("Average ratings")
print(avg_week_df)

Average ratings
day_of_week
Friday       3.585274
Monday       3.577250
Saturday     3.591791
Sunday       3.594144
Thursday     3.582463
Tuesday      3.574438
Wednesday    3.583751
Name: rating, dtype: float64


In [None]:
fig, ax = plt.subplots()
sns.countplot(x='day_of_week', data=train_df, ax=ax)
plt.title('No of ratings on each day.')
plt.ylabel('Total no of ratings')
plt.xlabel('')
ax.set_yticklabels([human(item, 'M') for item in ax.get_yticks()])
plt.savefig('img/no.-of-rating-on-each-day_of_week-train_df.png')

  ax.set_yticklabels([human(item, 'M') for item in ax.get_yticks()])


In [None]:
ax = train_df.resample('m', on='date')['rating'].count().plot()
ax.set_title('No of ratings per month (Training data)')
plt.xlabel('Month')
plt.ylabel('No of ratings(per month)')
ax.set_yticklabels([human(item, 'M') for item in ax.get_yticks()])
plt.savefig('img/no.-of-ratings-per-month-train_df.png')

### Analysis on ratings given by a user

In [16]:
no_of_rated_movies_per_user = train_df.groupby(by='user')['rating'].count().sort_values(ascending=False)
no_of_rated_movies_per_user.head()

user
305344     17112
2439493    15896
387418     15402
1639792     9767
1461435     9447
Name: rating, dtype: int64

In [None]:
fig = plt.figure(figsize=plt.figaspect(.5))

ax1 = plt.subplot(121)
sns.kdeplot(no_of_rated_movies_per_user, shade=True, ax=ax1)
plt.xlabel('No of ratings by user')
plt.title("PDF")

ax2 = plt.subplot(122)
sns.kdeplot(no_of_rated_movies_per_user, shade=True, cumulative=True,ax=ax2)
plt.xlabel('No of ratings by user')
plt.title('CDF')

plt.savefig('img/pdf-cdf-rating-by-user-train_df.png')


`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(no_of_rated_movies_per_user, shade=True, ax=ax1)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(no_of_rated_movies_per_user, shade=True, cumulative=True,ax=ax2)


Above warning is just about to use "fill" in place of "shade"

In [None]:
no_of_rated_movies_per_user.describe()

count    405041.000000
mean        198.459921
std         290.793238
min           1.000000
25%          34.000000
50%          89.000000
75%         245.000000
max       17112.000000
Name: rating, dtype: float64

In [17]:
quantiles = no_of_rated_movies_per_user.quantile(np.arange(0,1.01,0.01), interpolation='higher')
quantiles

0.00        1
0.01        1
0.02        2
0.03        4
0.04        5
        ...  
0.96      829
0.97      934
0.98     1079
0.99     1341
1.00    17112
Name: rating, Length: 101, dtype: int64

In [None]:
plt.title("Quantiles and their Values")
quantiles.plot()
# quantiles with 0.05 difference
plt.scatter(x=quantiles.index[::5], y=quantiles.values[::5], c='orange', label="quantiles with 0.05 intervals")
# quantiles with 0.25 difference
plt.scatter(x=quantiles.index[::25], y=quantiles.values[::25], c='m', label = "quantiles with 0.25 intervals")
plt.ylabel('No of ratings by user')
plt.xlabel('Value at the quantile')
plt.legend(loc='best')

# annotate the 25th, 50th, 75th and 100th percentile values....
for x,y in zip(quantiles.index[::25], quantiles[::25]):
    s= s="({} , {})".format(x,y)
    plt.annotate(s, xy=(x,y), xytext=(x-0.05, y+500)
                ,fontweight='bold')

plt.savefig('img/quantiles.png')

In [None]:
quantiles[::5]

0.00        1
0.05        7
0.10       15
0.15       21
0.20       27
0.25       34
0.30       41
0.35       50
0.40       60
0.45       73
0.50       89
0.55      109
0.60      133
0.65      163
0.70      199
0.75      245
0.80      307
0.85      392
0.90      520
0.95      749
1.00    17112
Name: rating, dtype: int64

In [None]:
no_of_ratings_per_movie = train_df.groupby(by='movie')['rating'].count().sort_values(ascending=False)

fig = plt.figure(figsize=plt.figaspect(.5))
ax = plt.gca()
plt.plot(no_of_ratings_per_movie.values)
plt.title('# RATINGS per Movie')
plt.xlabel('Movie')
plt.ylabel('No of Users who rated a movie')
ax.set_xticklabels([])

plt.savefig('img/per-movie-ratings-train_df.png')
# plt.show()

- There are some (<10%) movies which are rated by huge number of users.
- But majority movies exists which are rated by some hundereds of users. 

## Building sparse matrices from data

- Present data has 3 columns, user, movie, ratings; for each movie there are many users and each user gives rating.
- This takes lot of memory.
- To minimize usage of memory, I am creating two arrays, one for movies(m_i's) and one for users(u_j's), by some matrix operation (generally dot product) would give me rating (r_ij's)

In [18]:
start = datetime.now()
if os.path.isfile('train_sparse_matrix.npz'):
    print("It is present in pwd, loading it")
    train_sparse_matrix = sparse.load_npz('train_sparse_matrix.npz')
    print('Done. It\'s shape is : (user, movie) : ',train_sparse_matrix.shape)
else: 
    print("Building sparse_matrix from the dataframe...")
    # create sparse_matrix and store it for after usage.
    # csr_matrix(data_values, (row_index, col_index), shape_of_matrix)
    # It should be in such a way that, MATRIX[row, col] = data
    train_sparse_matrix = sparse.csr_matrix((train_df.rating.values, (train_df.user.values,
                                               train_df.movie.values)),)
    
    print('Done. It\'s shape is : (user, movie) : ',train_sparse_matrix.shape)
    print('Saving it into pwd for further usages...')

    sparse.save_npz("train_sparse_matrix.npz", train_sparse_matrix)
    print('Done.\n')

print(datetime.now() - start)

It is present in pwd, loading it
Done. It's shape is : (user, movie) :  (2649430, 17771)
0:00:02.215208


In [19]:
start = datetime.now()
if os.path.isfile('test_sparse_matrix.npz'):
    print("It is present in pwd, loading it.")
    # just get it from the disk instead of computing it
    test_sparse_matrix = sparse.load_npz('test_sparse_matrix.npz')
    print('Done. It\'s shape is : (user, movie) : ',test_sparse_matrix.shape)
else: 
    print("Building sparse_matrix from the dataframe...")
    # create sparse_matrix and store it for after usage.
    # csr_matrix(data_values, (row_index, col_index), shape_of_matrix)
    # It should be in such a way that, MATRIX[row, col] = data
    test_sparse_matrix = sparse.csr_matrix((test_df.rating.values, (test_df.user.values,
                                               test_df.movie.values)))
    
    print('Done. It\'s shape is : (user, movie) : ',test_sparse_matrix.shape)
    print('Saving it into pwd for further usages...')

    sparse.save_npz("test_sparse_matrix.npz", test_sparse_matrix)
    print('Done.')
    
print(datetime.now() - start)

It is present in pwd, loading it.
Done. It's shape is : (user, movie) :  (2649430, 17771)
0:00:00.585268


### Sparsity = (Number of Zero enteries/Number of total enteries)*100

In [None]:
us,mv = train_sparse_matrix.shape
elem = train_sparse_matrix.count_nonzero()
print("Sparsity Of Train matrix : {} % ".format((1-(elem/(us*mv)))*100))

Sparsity Of Train matrix : 99.8292709259195 % 


In [None]:
us,mv = test_sparse_matrix.shape
elem = test_sparse_matrix.count_nonzero()
print("Sparsity Of Test matrix : {} % ".format(  (1-(elem/(us*mv))) * 100) )

Sparsity Of Test matrix : 99.95731772988694 % 


### Calculating Average rating globally, per movie and per user

In [None]:
def get_average_ratings(sparse_matrix, of_users):  # of_users is boolean flag (1: users, 0:movies)
    
    # selecting axes of sparse matrix
    ax = 1 if of_users else 0
    
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1     # ".A1" for converting Column_Matrix to 1-D numpy array 
    
    # Boolean matrix of ratings (whether a user rated that movie or not)
    is_rated = sparse_matrix!=0
    
    # no of ratings that each user OR movie..
    no_of_ratings = is_rated.sum(axis=ax).A1
    
    u,m = sparse_matrix.shape     # max_user(u)  and max_movie(m) id's in sparse matrix 

    # average_rating = sum of ratings/sum of non-zero entries
    average_ratings = { i : sum_of_ratings[i]/no_of_ratings[i]            
                                 for i in range(u if of_users else m) 
                                    if no_of_ratings[i] !=0}  
    
    return average_ratings # returns dict

In [None]:
train_averages = dict()

train_global_average = train_sparse_matrix.sum()/train_sparse_matrix.count_nonzero()
train_averages['global'] = train_global_average
print(f"Global Average of Ratings in training data is {train_averages}")


Global Average of Ratings in training data is {'global': 3.582890686321557}


In [None]:
train_averages['user'] = get_average_ratings(train_sparse_matrix, of_users=True)
# user = random.randint(1,train_sparse_matrix.shape [0])
# print(user)

# Generate a random user ID within the valid range
valid_users = list(train_averages['user'].keys())  # Get the list of valid user IDs
user = random.choice(valid_users) 
print(f'Average rating of user {user} :',train_averages['user'][user])

Average rating of user 573242 : 4.138339920948616


In [None]:
train_averages['movie'] =  get_average_ratings(train_sparse_matrix, of_users=False)

valid_movies = list(train_averages['movie'].keys())
movie = random.choice(valid_movies)
print(f'Average rating of movie {movie} :',train_averages['movie'][movie])

Average rating of movie 12910 : 2.7708333333333335


PDF and CDF of avg rating of user and movie in train_df

In [None]:
start = datetime.now()

# Draw PDFs for average rating per user and per movie
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=plt.figaspect(.5))
fig.suptitle('Avg Ratings per User and per Movie', fontsize=15)

ax1.set_title('Users-Avg-Ratings')
# Get the list of average user ratings from the averages dictionary
user_averages = [rat for rat in train_averages['user'].values()]
sns.kdeplot(user_averages, cumulative=True, ax=ax1, label='Cdf')
sns.kdeplot(user_averages, ax=ax1, label='Pdf')

ax2.set_title('Movies-Avg-Rating')
# Get the list of movie average ratings from the dictionary
movie_averages = [rat for rat in train_averages['movie'].values()]
sns.kdeplot(movie_averages, cumulative=True, ax=ax2, label='Cdf')
sns.kdeplot(movie_averages, ax=ax2, label='Pdf')

plt.savefig('img/pdf-cdf-avg-rating-user&movie.png')
print(datetime.now() - start)

0:00:05.615467


### How many new users and movies would I encounter in test_csv ?

In [None]:
total_users = len(np.unique(df.user))
users_train = len(train_averages['user'])
new_users = total_users - users_train

print('Total number of Users  :', total_users)
print('Number of Users in Train data :', users_train)
print("No of Users that didn't appear in train data: {} ({} %) \n ".format(new_users,(new_users/total_users)*100))

Total number of Users  : 480189
Number of Users in Train data : 405041
No of Users that didn't appear in train data: 75148 (15.649671275268695 %) 
 


In [None]:
total_movies = len(np.unique(df.movie))
movies_train = len(train_averages['movie'])
new_movies = total_movies - movies_train

print('Total number of Movies  :', total_movies)
print('Number of Users in Train data :', movies_train)
print("No of Movies that didn't appear in train data: {} ({} %) \n ".format(new_movies,(new_movies/total_movies)*100))

Total number of Movies  : 17770
Number of Users in Train data : 17424
No of Movies that didn't appear in train data: 346 (1.9471018570624647 %) 
 


# Computing similarity matrix

### user - user collaborative filtering

In [33]:
from sklearn.metrics.pairwise import cosine_similarity


def compute_user_similarity(sparse_matrix, compute_for_few=False, top = 100, verbose=False, verb_for_n_rows = 20,
                            draw_time_taken=True):
    no_of_users = sparse_matrix.shape[0]
    # get the indices of  non zero rows (users) from our sparse matrix
    row_ind, col_ind = sparse_matrix.nonzero()
    row_ind = sorted(set(row_ind)) # we don't have to
    time_taken = list() #  time taken for finding similar users for an user
    
    # Create rows, cols, and data lists.., which can be used to create sparse matrices
    rows, cols, data = list(), list(), list()
    if verbose: print("Computing strted for top",top,"similarities for each user...")
    
    start = datetime.now()
    temp = 0
    
    for row in row_ind[:top] if compute_for_few else row_ind:
        temp = temp+1
        prev = datetime.now()
        
        # get the similarity row for this user with all other users
        sim = cosine_similarity(sparse_matrix.getrow(row), sparse_matrix).ravel()
        # I will consider only the top 10/20/40/100 etc  most similar users and ignore rest of them..
        top_sim_ind = sim.argsort()[-top:]
        top_sim_val = sim[top_sim_ind]
        
        # add them to our rows, cols and data
        rows.extend([row]*top)
        cols.extend(top_sim_ind)
        data.extend(top_sim_val)
        time_taken.append(datetime.now().timestamp() - prev.timestamp())
        if verbose:
            if temp%verb_for_n_rows == 0:
                print("Computing done for {} users [  time elapsed : {}  ]"
                      .format(temp, datetime.now()-start))
            
        
    # lets create sparse matrix out of these and return it
    if verbose: print('Creating Sparse matrix from the computed similarities')
    #return rows, cols, data
    
    if draw_time_taken:
        plt.plot(time_taken, label = 'time taken for each user')
        plt.plot(np.cumsum(time_taken), label='Total time')
        plt.legend(loc='best')
        plt.xlabel('User')
        plt.ylabel('Time (seconds)')
        plt.savefig('img/u-u-cf-17k-dim-per-user.png')
        
    return sparse.csr_matrix((data, (rows, cols)), shape=(no_of_users, no_of_users)), time_taken 

In [None]:
start = datetime.now()
u_u_sim_sparse, _ = compute_user_similarity(train_sparse_matrix, compute_for_few=True, top = 200,
                                                     verbose=True)
print("Time taken for user-user cf with 17k dimensions per user :",datetime.now()-start)

Computing strted for top 200 similarities for each user...
Computing done for 20 users [  time elapsed : 0:01:04.300235  ]
Computing done for 40 users [  time elapsed : 0:01:57.593116  ]
Computing done for 60 users [  time elapsed : 0:02:48.430230  ]
Computing done for 80 users [  time elapsed : 0:03:38.995090  ]
Computing done for 100 users [  time elapsed : 0:04:30.595433  ]
Computing done for 120 users [  time elapsed : 0:05:25.550533  ]
Computing done for 140 users [  time elapsed : 0:06:17.773366  ]
Computing done for 160 users [  time elapsed : 0:07:08.075599  ]
Computing done for 180 users [  time elapsed : 0:08:07.425267  ]
Computing done for 200 users [  time elapsed : 0:09:08.188622  ]
Creating Sparse matrix from the computed similarities
Time taken for user-user cf with 17k dimensions per user : 0:09:13.713449


- Calculating user-user Similarity_Matrix (user-user collaborative filtering) is not an easy task
- For top 200 users it took **0:09:13.713449** time, and as users count increases, complexity increases as one could find more and more similarities. 

* On avg per time consumed for searching similarity for one user = (9*60 + 13.71)/200 = **2.76 seconds**
* training data have 405041 users, so approximately it would take **405041*2.76 = 1117913 seconds = 12.93 days**
* It will take almost **13** days to just find similarities !

- Hence, i would try to find user-user similarity via reduced dimensions

### Truncated SVD for reducing the dimesnion of user vector
- SVD basically is a factorization of that matrix into three smaller matrices.
- The SVD of mxn matrix A is given by the formula A = U Σ V^T 
- Where
   - U is m*m matrix of orthonormal eigen vectors of AA^T
   - V^T is n*n matrix of orthonormal eigen vectors of (A^T)A
   - Σ is diagonal matrix with r elements, r = square root of positive eigen values of AA^T (or (A^T)A)

In [20]:
from datetime import datetime
from sklearn.decomposition import TruncatedSVD

start = datetime.now()

# All parameters are default except n_components. n_itr is for Randomized SVD solver.
netflix_svd = TruncatedSVD(n_components=100, algorithm='randomized', random_state=42)
print("Fitting started...")
trunc_svd = netflix_svd.fit_transform(train_sparse_matrix)

# num_iterations = 10
# for i in range(num_iterations):
#     # Fit the TruncatedSVD model for each iteration
#     trunc_svd = netflix_svd.fit_transform(train_sparse_matrix)
    
#     # Print progress update
#     print(f"Iteration {i+1}/{num_iterations} completed")

print(datetime.now()-start)

Fitting started...
0:02:09.566720


In [21]:
expl_var = np.cumsum(netflix_svd.explained_variance_ratio_)
expl_var

array([0.23362135, 0.26270872, 0.28323418, 0.29936103, 0.31129667,
       0.32272449, 0.33168545, 0.33816688, 0.34421001, 0.34939129,
       0.35412811, 0.35790579, 0.36145969, 0.36481079, 0.36796535,
       0.3709693 , 0.37381048, 0.37654066, 0.37892266, 0.38128434,
       0.38355732, 0.38573246, 0.38787214, 0.38996681, 0.39201513,
       0.39393495, 0.39577018, 0.39753914, 0.39924786, 0.40091947,
       0.40251418, 0.40408101, 0.40563205, 0.40715363, 0.40864418,
       0.41009275, 0.41151715, 0.41291575, 0.41428276, 0.4156209 ,
       0.41692581, 0.41818944, 0.41941626, 0.4206308 , 0.42183602,
       0.42301205, 0.42417439, 0.42530795, 0.42642577, 0.42753769,
       0.42862981, 0.4297012 , 0.43074982, 0.43178112, 0.43281107,
       0.43382522, 0.434825  , 0.43580279, 0.43677693, 0.43773492,
       0.43868205, 0.43962079, 0.44054526, 0.44145286, 0.44236078,
       0.44325366, 0.44413545, 0.4450134 , 0.44587698, 0.44672994,
       0.44757815, 0.44841497, 0.44924205, 0.45006438, 0.45087

- It basically is the gain of variance explained, if we add one additional latent factor to it via np.cumsum()
- By adding one by one latent factore to it,___gain in explained variance__ is decreasing.
- To take it to greter than 0.60, we have to take almost 400-500+ latent factors. It's totally us-less (more compute power and memory loss)

In [23]:
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(10, 12))

ax1.set_ylabel("Cummulative Variance Explained")
ax1.set_xlabel("Number of Latent Facors")
ax1.plot(expl_var)
# annote some (latentfactors, expl_var) to make it clear
ind = [1, 2, 4, 8, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
ax1.scatter(x = [i-1 for i in ind], y = expl_var[[i-1 for i in ind]], c='#ff3300')
for i in ind:
    ax1.annotate("({}, {})".format(i, np.round(expl_var[i-1], 2)), xy=(i-1, expl_var[i-1]),
                xytext = ( i+20, expl_var[i-1] - 0.01),fontweight='bold')

change_in_expl_var = [expl_var[i+1] - expl_var[i] for i in range(len(expl_var)-1)]
ax2.plot(change_in_expl_var)

ax2.set_ylabel("Increment in Cummulative Variance with One Additional Latent Factor", fontsize=10)
ax2.yaxis.set_label_position("right")
ax2.set_xlabel("Number of Latent Factor")

plt.savefig('img/netflix_svd-expl-var.png')

We are not getting benifitted from adding one latent factor each time. This is what is shown in the plots (specially the bottom plot, it gets almost flatten after that knee).

In [24]:
for i in ind:
    print("({}, {})".format(i, np.round(expl_var[i-1], 2)))

(1, 0.23)
(2, 0.26)
(4, 0.3)
(8, 0.34)
(10, 0.35)
(20, 0.38)
(30, 0.4)
(40, 0.42)
(50, 0.43)
(60, 0.44)
(70, 0.45)
(80, 0.45)
(90, 0.46)
(100, 0.47)


In [25]:
# Project Original U_M matrix into into 100 Dimensional space...
start = datetime.now()
trunc_matrix = train_sparse_matrix.dot(netflix_svd.components_.T)
print(datetime.now()- start)

0:00:03.645870


In [26]:
type(trunc_matrix), trunc_matrix.shape

(numpy.ndarray, (2649430, 100))

In [28]:
if not os.path.isfile('trunc_sparse_matrix.npz'):
    trunc_sparse_matrix = sparse.csr_matrix(trunc_matrix)
    sparse.save_npz('trunc_sparse_matrix', trunc_sparse_matrix)
else:
    print("trunc_sparse_matrix.npz already exists. Loading it...")
    start = datetime.now()
    trunc_sparse_matrix = sparse.load_npz('trunc_sparse_matrix.npz')
    print(datetime.now()- start)

trunc_sparse_matrix.npz already exists. Loading it...
0:00:01.438214


In [29]:
trunc_sparse_matrix.shape

(2649430, 100)

In [32]:
start = datetime.now()
trunc_u_u_sim_matrix, _ = compute_user_similarity(trunc_sparse_matrix, compute_for_few=True, top=50, verbose=True, 
                                                 verb_for_n_rows=10)

print("time:",datetime.now()-start)

Computing strted for top 50 similarities for each user...
Computing done for 10 users [  time elapsed : 0:00:07.850645  ]
Computing done for 20 users [  time elapsed : 0:00:18.650507  ]
Computing done for 30 users [  time elapsed : 0:00:26.201450  ]
Computing done for 40 users [  time elapsed : 0:00:34.316376  ]
Computing done for 50 users [  time elapsed : 0:00:41.799720  ]
Creating Sparse matrix from the computed similarities
time: 0:00:44.379248


- Time taken per user = 0:00:44.379248 / 50 = **0.88 seconds**
-  We have total users = 405041, which means u-u similarity presize computation would take 405041*0.88 = 4.125 days
- No doubt, svd has decreased the time of computation, but 4+ days time is also a very long time. It would take lot of memory and computation power, which is very very hard to execute.

### Alternative/Modification to traditional SVD
But one drawback i noticed in my above method is, it re-calculate the similarities of a user with another user in some iterations.
To minimize/optimize it:
- I will maintain a binary Vector for users, which tells us whether program has already computed top(say, 100) similarities for a user or not.
-  **If not** : Compute top (say, 100) most similar users for this user, and add this to our datastructure, so that we can just access it(similar users) without recomputing it again. The way which i did above
- But **If It is already Computed** : Just get it directly from our datastructure. In due time,i might have to recompute similarities, if it is computed a long time ago. Because user preferences changes over time. 
- So, program could maintain some kind of **Timer**, which when expires, we have to update it ( recompute it ).


### Movie - Movie collaborative filtering

In [34]:
start = datetime.now()
if not os.path.isfile('m_m_sim_sparse.npz'):
    start = datetime.now()
    m_m_sim_sparse = cosine_similarity(X=train_sparse_matrix.T, dense_output=False)
    # store this sparse matrix in disk before using it. For future purposes.
    sparse.save_npz("m_m_sim_sparse.npz", m_m_sim_sparse)
    print("Done.")
else:
    print("m_m_sim_saprse.npz is there already, Loading it...")
    m_m_sim_sparse = sparse.load_npz("m_m_sim_sparse.npz")
    print("Done.")

# print("m_m_sim_sparse.npz is a ",m_m_sim_sparse.shape," dimensional matrix")

print(datetime.now() - start)

Done.
m_m_sim_sparse.npz is a  (17771, 17771)  dimensional matrix
0:06:13.093896


In [35]:
m_m_sim_sparse.shape

(17771, 17771)

- Even though we have similarity measure of each movie, with all other movies. But generally one don't care much about least similar movies.
- Most of the times platforms recommends only top_xx similar items (here, item = movie). It may be top 10 or 100.
- So, its better to take only top similar movie ratings and store them in a saperate dictionary.

In [None]:
movie_ids = np.unique(m_m_sim_sparse.nonzero()[1])

In [38]:
len(movie_ids)

17424

m_m_sim_sparse is based on training dataset, so 0.8*17771 = 17424

In [39]:
start = datetime.now()
similar_movies = dict() 
for movie in movie_ids:
    # get the top similar movies and store them in the dictionary
    sim_movies = m_m_sim_sparse[movie].toarray().ravel().argsort()[::-1][1:]
    similar_movies[movie] = sim_movies[:100]
print(datetime.now() - start)

# just testing similar movies for movie_id=
movie=random.choice(movie_ids)
print(f"Similar movies for movie id {movie} are :\n")
similar_movies[movie]

0:00:20.529888
Similar movies for movie id 12711 are :



array([ 9958, 10439, 11825,  8666,  5963, 15896,   471, 16157,  1708,
        9001, 14687,  9810, 13147,  5231, 13374, 15193,  9103,  7660,
       11111, 12871, 15359, 12061,   370,  8708, 16670, 11586,  9734,
       16500,  3387, 14828, 10476,  4802,  6604, 13096, 14372,   925,
        4804,  9334,  3987, 16238, 12585,  3855,  5742,  4116,   840,
        3554, 14183,  6669, 13742,  2525,  8626, 14704,  5948, 13647,
       13149, 17184,  4249, 14057, 17257,  5986,   990, 10360, 17001,
        3868,  6086, 10273,  7220,  4253, 12139, 14262, 12702, 12182,
        2644,  4004,  7006, 14063,  1327,   974, 17389,  2408,  8890,
        8864,  1074, 12250,  5605, 11020, 12645,  6438,  4419,  7960,
        8539, 11854,  6710, 14719,   261,  3164,  9373,  8522,  9106,
       17275], dtype=int64)

### To verify whether these movies are actually similar? 
 - #### I am using netflix's movie_titles.csv

In [58]:
movie_titles = pd.read_csv("data/movie_titles.csv", sep=',', header = None,
                           names=['movie_id', 'year_of_release', 'title'],
                           usecols=[0, 1, 2], verbose=True,
                      index_col = 'movie_id', encoding = "ISO-8859-1")  
#encoding necessary as movie_titles.csv has characters outside ASCII range

Tokenization took: 4.00 ms
Type conversion took: 5.57 ms
Parser memory cleanup took: 0.00 ms


  movie_titles = pd.read_csv("data/movie_titles.csv", sep=',', header = None,


In [45]:
movie_titles.head()

Unnamed: 0_level_0,year_of_release,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


### Check : I am searching similar movies for movie id = 67 

In [48]:
mv_id = 67

print(f"Movie id {mv_id} corresponds to ",movie_titles.loc[mv_id].values[1])

print("It has {} Ratings from users.".format(train_sparse_matrix[:,mv_id].getnnz()))

print(f"Movide id = {mv_id}" + " have {} movies which are similar to this and but only top 100 most similar ones are of interest.".format(m_m_sim_sparse[:,mv_id].getnnz()))

Movie id 67 corresponds to  Vampire Journals
It has 270 Ratings from users.


Movide id = 67 have 17284 movies which are similar to this and but only top 100 most similar ones are of interest.


In [49]:
similarities = m_m_sim_sparse[mv_id].toarray().ravel()
similar_indices = similarities.argsort()[::-1][1:]
similarities[similar_indices]

sim_indices = similarities.argsort()[::-1][1:] 
# It will sort and reverse the array and ignore its similarity (i.e. 1)
# and return its indices (movie_ids)

In [54]:
plt.plot(similarities[sim_indices], label='All the ratings')
plt.plot(similarities[sim_indices[:100]], label='top 100 similar movies')
plt.title("Similar Movies of {}(movie_id)".format(mv_id), fontsize=20)
plt.xlabel("Movies (Not Movie_Ids)", fontsize=15)
plt.ylabel("Cosine Similarity",fontsize=15)
plt.legend()
plt.savefig("img/similar-movies-for-movieId-67.png")

In [55]:
# Top 10 similar movies for moive_id = 67
movie_titles.loc[sim_indices[:10]]

Unnamed: 0_level_0,year_of_release,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
323,1999.0,Modern Vampires
4044,1998.0,Subspecies 4: Bloodstorm
1688,1993.0,To Sleep With a Vampire
13962,2001.0,Dracula: The Dark Prince
12053,1993.0,Dracula Rising
16279,2002.0,Vampires: Los Muertos
4667,1996.0,Vampirella
1900,1997.0,Club Vampire
13873,2001.0,The Breed
15867,2003.0,Dracula II: Ascension


### This same approach could be applied for **user-user** similarity too, then one could get top 10/100 etc similar users (except itself)