<a href="https://colab.research.google.com/github/silvhua/Netflix-Recommender-Engines-Challenge/blob/main/recommender_engines_II_2022_12_13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab stuff

In [None]:
# You can see what GPU you've been assigned at any time by executing the following cell
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# You can see how much memory you have available at any time by running the following code cell. 
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Wed Dec 14 01:18:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    27W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Load Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the movie catalogue. The file has no header row, so the column names are supplied here.
movies = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/movie_titles.csv', header=None, 
    encoding = "ISO-8859-1", # As per https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
    usecols=[0, 1, 2], # Required because some movie titles (column 2) have commas, causing parser error otherwise
    # NOTE(review): presumably titles containing commas are cut at their first comma as a result -- confirm
    names=['Movie_Id', 'Year', 'Name'])
movies.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
# Read the four raw rating files (combined_data_1..4) with identical parsing
# options; each file becomes its own dataframe df1..df4.
df1, df2, df3, df4 = (
    pd.read_csv(
        f'/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_{part}.txt',
        header=None,
        names=['Customer', 'Rating', 'Date'],
        usecols=[0, 1, 2],
    )
    for part in (1, 2, 3, 4)
)

In [None]:
# Used this as an example: https://www.kaggle.com/code/morrisb/how-to-recommend-anything-deep-recommender
from collections import deque 
def reshape_df(df):
    """
    Convert the raw "combined_data" layout into one row per rating.

    In the raw layout a row whose `Rating` is NaN is a movie header: its
    `Customer` value is the movie id followed by one trailing character
    (for example '1:'). Every row after a header, up to the next header,
    is a rating of that movie.

    Arguments:
    - df (dataframe): Raw ratings with columns `Customer`, `Rating`, `Date`.

    Returns: a new dataframe without the header rows. It keeps the original
        index labels and columns and adds an integer `Movie_ID` column.
        Rows located before the first header are dropped. An input without
        any header row returns an empty dataframe instead of raising.
    """
    is_header = df['Rating'].isna().to_numpy()

    # Parse the movie id on header rows only, then carry it forward onto the
    # rating rows below each header. This replaces one slice-and-copy per movie.
    header_ids = np.full(len(df), np.nan)
    header_ids[is_header] = [int(header[:-1]) for header in df.loc[is_header, 'Customer']]
    movie_id = pd.Series(header_ids).ffill().to_numpy()

    # Keep rating rows that belong to a movie; rows ahead of the first header
    # have no movie id and are excluded.
    keep = ~is_header & ~np.isnan(movie_id)
    reshaped = df.loc[keep].copy()
    reshaped['Movie_ID'] = movie_id[keep].astype('int64')
    print('Shape:', reshaped.shape)
    return reshaped


In [None]:
# Create a single dataframe with all the ratings.
# reset_index(drop=True) replaces the per-file row labels with one continuous
# index, so every row of the combined frame has a unique label.
df = pd.concat([df1, df2, df3, df4]).reset_index(drop=True)

In [None]:
# Reshape the combined frame to one row per rating with a Movie_ID column
df0 = reshape_df(df)
# Cache the result as CSV so later sessions can skip the raw files; the index
# is written as the first, unnamed column.
df0.to_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_reshaped.csv')
# Drop the per-file frames. Note that the combined `df` is not deleted here.
del df1, df2, df3, df4

## Load processed CSV

In [None]:
import pandas as pd
import numpy as np

def load_csv(filename,filepath,column1_as_index=False,truncate=None, usecols=None, sep=','):
    """
    Load a csv file as a dataframe using a directory path that may have been
    copied from Windows file explorer.

    Back slashes in `filepath` are converted to forward slashes.
    Arguments:
    - filename (string): File name. Used as the complete path when `filepath` is empty.
    - filepath (raw string): Directory, use the format r'<path>'. If empty or None,
        `filename` is used unchanged (previously a '/' was prepended, which turned a
        relative file name into a path under the filesystem root).
    - column1_as_index (bool): If true, take the first column as the index.
        Useful when importing CSV files from previously exported dataframes.
    - truncate (int): If given, return a random sample of this many rows (random_state=0).
    - usecols: Passed to `pd.read_csv`.
    - sep (string): Field separator passed to `pd.read_csv`.

    Returns: dataframe object.
    """
    if filepath:
        # Normalise separators and make sure exactly one '/' joins directory and file name.
        filename = filepath.replace('\\','/').rstrip('/') + '/' + filename
    df = pd.read_csv(filename, usecols=usecols, sep=sep)
    if column1_as_index:
        df = df.set_index(df.columns[0])
        df.index.name = None
    # The printed shape is that of the full file, before any sampling.
    print('Dataframe shape: ',df.shape)

    if truncate:
        return df.sample(n=truncate,random_state=0)
    return df

# Reload the cached, reshaped ratings. The complete path is passed as `filename`
# and `filepath` is left empty; the first CSV column is restored as the index.
df0 = load_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_reshaped.csv',filepath='',
               column1_as_index=True)
df0.head()

Dataframe shape:  (100480507, 4)


Unnamed: 0,Customer,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


# Plan
1. Predict Ratings
  *   Surprise algorithms with and without hyperparameter tuning
    * SVD
    * SVDpp
    * NMF
  * Linear regression
  * SVD plus bias with different `n_components` values

2. Make recommendations using the best estimator

# Scikit Surprise

In [None]:
! pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 7.6 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=2626468 sha256=39917d7dc7432b2961d923ff6a13b69987979ace435dcec062d899b68a80fc80
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [None]:
# Import the Surprise dataset helpers
from surprise import Dataset
from surprise import Reader

# The Reader declares the rating scale (1 to 5) of the ratings column
reader = Reader(rating_scale=(1, 5))
# Build the Surprise dataset from the dataframe; the columns are passed in
# the order user id, item id, rating.
data = Dataset.load_from_df(df0[['Customer', 'Movie_ID', 'Rating']], reader)

In [None]:
from surprise.model_selection import train_test_split
# Hold out 15% of the ratings for testing. The split is seeded so that
# re-running the notebook produces the same train and test sets.
trainset, testset = train_test_split(data, test_size=.15, random_state=0)
# The full dataset object is no longer needed once it has been split.
del data

In [None]:
# Save trainset and test set as pickles so later sessions can reload the same
# split instead of rebuilding it.
import pickle
with open('/content/drive/MyDrive/data exercises/W10/netflix-challenge/surprise_trainset.pickle', 'wb') as fh:
  pickle.dump(trainset, fh)
with open('/content/drive/MyDrive/data exercises/W10/netflix-challenge/surprise_testset.pickle', 'wb') as fh2:
  pickle.dump(testset, fh2)

## `surprise_predictions` function and SVD algorithm

In [None]:
# import SVD from surprise
from surprise import SVD

# import accuracy from surprise
from surprise import accuracy

# import GridSearchCV from surprise.model_selection
from surprise.model_selection import GridSearchCV
# import cross_validate from surprise.model_selection
from surprise.model_selection import cross_validate


def surprise_gridsearch_predictions(estimator, param_grid, data, pickle_name=None):
  """
  Perform gridsearch with surprise data set and evaluate the best model.

  Arguments:
  - estimator: Surprise algorithm class (not an instance), e.g. `SVD`.
  - param_grid (dict): Parameter names mapped to the lists of values to try.
  - data: Surprise dataset used for cross-validation and for the final refit.
  - pickle_name (string): If given, the best model and its predictions are
      pickled under this name in the `saved_models` folder.

  The module-level `testset` is used for the final evaluation.
  NOTE(review): the best model is refit on all of `data`; if `data` also
  contains the rows of `testset`, the reported RMSE is optimistic -- confirm
  what callers pass.

  Returns: tuple `(best_estimator, predictions)`.
  """
  # refit=True retrains the best parameter combination on the whole of `data`;
  # without it the selected estimator is left unfitted and cannot predict.
  gs = GridSearchCV(estimator, param_grid, measures=['rmse'], refit=True)
  gs.fit(data)
  print('Best grid search parameters:', gs.best_params['rmse'])
  # Surprise exposes the winners as a dict keyed by measure name.
  best_estimator = gs.best_estimator['rmse']
  predictions = best_estimator.test(testset)
  rmse = accuracy.rmse(predictions)
  print(f'Model RMSE: {rmse:.2f}')
  # Save only when a name was supplied (the old check tested the `pickle`
  # module itself, which is always truthy).
  if pickle_name:
    filepath = '/content/drive/MyDrive/data exercises/W10/netflix-challenge/saved_models/'
    with open(filepath+pickle_name+'_model.pickle', 'wb') as fh:
      pickle.dump(best_estimator, fh)
    # Write through fh2: fh is already closed at this point.
    with open(filepath+pickle_name+'_predictions.pickle', 'wb') as fh2:
      pickle.dump(predictions, fh2)
  return best_estimator, predictions

  
def surprise_predictions(algorithm, trainset=trainset, testset=testset, pickle_name=None):
  """
  Fit a Surprise algorithm, predict the test set and report the RMSE.

  Arguments:
  - algorithm: Surprise algorithm instance, e.g. `SVD()`. It is fitted in place.
  - trainset: Surprise trainset. Defaults to the module-level `trainset` as it
      exists when this function is defined.
  - testset: Test ratings. Defaults to the module-level `testset` as it exists
      when this function is defined.
  - pickle_name (string): If given, the fitted model and the predictions are
      pickled under this name in the `saved_models` folder. A failed save is
      reported but does not discard the results.

  Returns: tuple `(algorithm, predictions)`, whether or not anything was saved.
  """
  algorithm.fit(trainset)
  predictions = algorithm.test(testset)
  rmse = accuracy.rmse(predictions)
  print(f'Model RMSE: {rmse:.2f}')
  if pickle_name:
    filepath = '/content/drive/MyDrive/data exercises/W10/netflix-challenge/saved_models/'
    try:
      with open(filepath+pickle_name+'_model.pickle', 'wb') as fh:
        pickle.dump(algorithm, fh)
      print(f'Saved: {filepath+pickle_name}_model.pickle')
      # Write through fh2: fh is already closed at this point.
      with open(filepath+pickle_name+'_predictions.pickle', 'wb') as fh2:
        pickle.dump(predictions, fh2)
      print(f'Saved: {filepath+pickle_name}_predictions.pickle')
    except Exception as exc:
      # Saving is best effort; show the reason instead of hiding it.
      print(f'Outputs not saved: {exc!r}')
  # Returned outside the `if` so callers that do not save still get results.
  return algorithm, predictions


# Baseline: Surprise SVD with its default hyperparameters; surprise_predictions
# is asked to pickle the outputs under the name 'surpriseSVD'.
model_svd, predictions_svd = surprise_predictions(SVD(), pickle_name='surpriseSVD')

RMSE: 0.8333
Model RMSE: 0.83
Outputs not saved


In [None]:
# Manual save of the fitted SVD model (the run above printed 'Outputs not saved').
import pickle
filepath = '/content/drive/MyDrive/data exercises/W10/netflix-challenge/saved_models/'
pickle_name='surpriseSVD'
with open(filepath+pickle_name+'_model.pickle', 'wb') as fh:
  pickle.dump(model_svd, fh)


In [None]:
# Manual save of the SVD predictions; reuses `filepath` and `pickle_name` set in the previous cell.
with open(filepath+pickle_name+'_predictions.pickle', 'wb') as fh2:
  pickle.dump(predictions_svd, fh2)
print(f'Saved: {filepath+pickle_name}_predictions.pickle')

Saved: /content/drive/MyDrive/data exercises/W10/netflix-challenge/saved_models/surpriseSVD_predictions.pickle


In [None]:
# Candidate values for the number of latent factors and the learning rate.
# NOTE(review): not passed to any call in the visible cells; presumably meant
# for surprise_gridsearch_predictions with SVD -- confirm.
param_grid = {
    'n_factors': [75, 100, 125],
    'lr_all': [0.005, 0.01],
}


## SVDpp algorithm

In [None]:

from surprise import SVDpp
# SVD++ with default hyperparameters; outputs are requested under the name 'surpriseSVDpp'.
model_svdpp, predictions_svdpp = surprise_predictions(SVDpp(), pickle_name='surpriseSVDpp')
# Author's note: this run did not finish. After 8 h 37 min on Google Colab Pro+
# with a Premium GPU the cell failed with a "cell execution failed" error; cause unknown.

## NMF

In [None]:
! pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 4.4 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=2626486 sha256=ff4a9e4ddf15b2e32f3ceb791df029afc26c80e7986bb9d6876151d1e071b98a
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [None]:
import pickle
def savepickle(model,filename, ext='sav', path='/content/drive/MyDrive/data exercises/W10/netflix-challenge/',append_version=False):
    """
    Export object as a pickle.
    Parameters:
    - model: Object to save.
    - filename: Root of the filename.
    - ext: Extension to append (do not include dot as it will be added)
    - path (raw string): Directory, use the format r'<path>'. Back slashes are
        converted to forward slashes. If None or empty, the file is saved in
        the current working directory.
    - append_version (bool): If true, append date and time to end of filename.
    """
    # Imported here because no cell of the notebook imports `datetime`.
    from datetime import datetime

    if path:
        # Normalise separators and end the directory with exactly one '/'.
        path = path.replace('\\','/').rstrip('/') + '/'
    else:
        path = ''
    if append_version:
        filename+=datetime.now().strftime('%Y-%m-%d_%H%M')
    target = path+filename+'.'+ext
    with open(target, 'wb') as fh:
        pickle.dump(model, fh)
    print('File saved: ',target)

def loadpickle(filename,filepath='/content/drive/MyDrive/data exercises/W10/netflix-challenge/'):
    """
    Load a pickled object using specified file path copied from windows file explorer.
    Back slashes in file path will be converted to forward slashes.
    Arguments:
    - filename (string).
    - filepath (raw string): Use the format r'<path>'. It is joined to `filename`
        as-is, so it must end with a separator (or be empty).

    Returns saved object.
    """
    filename = (filepath or '').replace('\\','/') + filename
    # Unpickling can run arbitrary code: only load files from a trusted source.
    # The `with` block closes the file handle once the object has been read.
    with open(filename, 'rb') as fh:
        return pickle.load(fh)

# Reload the saved train/test split. This must run before the cell that defines
# surprise_predictions, whose default arguments read `trainset` and `testset`.
trainset = loadpickle('surprise_trainset.pickle')
testset = loadpickle('surprise_testset.pickle')

In [None]:
from surprise import accuracy
def surprise_predictions(algorithm, trainset=trainset, testset=testset, pickle_name=None):
  """
  Fit a Surprise algorithm, predict the test set and report the RMSE.

  Arguments:
  - algorithm: Surprise algorithm instance, e.g. `NMF()`. It is fitted in place.
  - trainset: Surprise trainset. Defaults to the module-level `trainset` as it
      exists when this function is defined.
  - testset: Test ratings. Defaults to the module-level `testset` as it exists
      when this function is defined.
  - pickle_name (string): If given, the fitted model and the predictions are
      pickled under this name in the `saved_models` folder. A failed save is
      reported but does not discard the results.

  Returns: tuple `(algorithm, predictions)`, whether or not anything was saved.
  """
  algorithm.fit(trainset)
  predictions = algorithm.test(testset)
  rmse = accuracy.rmse(predictions)
  print(f'Model RMSE: {rmse:.2f}')
  if pickle_name:
    filepath = '/content/drive/MyDrive/data exercises/W10/netflix-challenge/saved_models/'
    try:
      with open(filepath+pickle_name+'_model.pickle', 'wb') as fh:
        pickle.dump(algorithm, fh)
      print(f'Saved: {filepath+pickle_name}_model.pickle')
      with open(filepath+pickle_name+'_predictions.pickle', 'wb') as fh2:
        pickle.dump(predictions, fh2)
      print(f'Saved: {filepath+pickle_name}_predictions.pickle')
    except Exception as exc:
      # Saving is best effort; show the reason instead of hiding it.
      print(f'Outputs not saved: {exc!r}')
  # Returned outside the `if` so callers that do not save still get results.
  return algorithm, predictions

In [None]:
from surprise import NMF
# NMF with default hyperparameters; outputs are requested under the name 'surpriseNMF'.
model_NMF, predictions_NMF = surprise_predictions(NMF(), pickle_name='surpriseNMF')

RMSE: 0.9256
Model RMSE: 0.93
Saved: /content/drive/MyDrive/data exercises/W10/netflix-challenge/saved_models/surpriseNMF_model.pickle
Outputs not saved


In [None]:
# Manual save of the NMF predictions (the run above saved the model but then
# printed 'Outputs not saved').
# NOTE(review): the stored output of this cell is a NameError -- presumably
# `predictions_NMF` was not defined in that session; confirm.
import pickle
filepath = '/content/drive/MyDrive/data exercises/W10/netflix-challenge/saved_models/'
pickle_name='surpriseNMF'
with open(filepath+pickle_name+'_predictions.pickle', 'wb') as fh2:
      pickle.dump(predictions_NMF, fh2)
print(f'Saved: {filepath+pickle_name}_predictions.pickle')

NameError: ignored

# Randomized SVD
Predict ratings using the same formula as [scikit-surprise's SVD algorithm](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#matrix-factorization-based-algorithms):

$$\hat{r}_{ui} = \mu + b_u + b_i + q_i^T p_u$$

In [None]:
import pandas as pd
import numpy as np

def load_csv(filename,filepath,column1_as_index=False,truncate=None, usecols=None, sep=','):
    """
    Load a csv file as a dataframe using a directory path that may have been
    copied from Windows file explorer.

    Back slashes in `filepath` are converted to forward slashes.
    Arguments:
    - filename (string): File name. Used as the complete path when `filepath` is empty.
    - filepath (raw string): Directory, use the format r'<path>'. If empty or None,
        `filename` is used unchanged (previously a '/' was prepended, which turned a
        relative file name into a path under the filesystem root).
    - column1_as_index (bool): If true, take the first column as the index.
        Useful when importing CSV files from previously exported dataframes.
    - truncate (int): If given, return a random sample of this many rows (random_state=0).
    - usecols: Passed to `pd.read_csv`.
    - sep (string): Field separator passed to `pd.read_csv`.

    Returns: dataframe object.
    """
    if filepath:
        # Normalise separators and make sure exactly one '/' joins directory and file name.
        filename = filepath.replace('\\','/').rstrip('/') + '/' + filename
    df = pd.read_csv(filename, usecols=usecols, sep=sep)
    if column1_as_index:
        df = df.set_index(df.columns[0])
        df.index.name = None
    # The printed shape is that of the full file, before any sampling.
    print('Dataframe shape: ',df.shape)

    if truncate:
        return df.sample(n=truncate,random_state=0)
    return df
        
def create_matrix(df, index='Customer', columns='Movie_ID', values='Rating', pickle_name=None,
                  filepath='/content/drive/MyDrive/data exercises/W10/netflix-challenge/saved_models/'):
  """
  Create a utility matrix. This can then be used for randomizedSVD by converting into csr_matrix
  and filling nan with zero.

  Arguments:
  - df (dataframe): Long-format ratings, one row per rating.
  - index (string): Column whose values become the rows of the matrix.
  - columns (string): Column whose values become the columns of the matrix.
  - values (string): Column holding the cell values. Combinations that are
      absent from `df` are NaN in the result.
  - pickle_name (string): If given, the matrix is pickled to
      `<filepath><pickle_name>_sparse_utility_matrix.pickle`. A failed save is
      reported but the matrix is still returned.
  - filepath (string): Directory prefix for the pickle, joined as-is (it must
      end with a separator).

  Returns: the pivoted dataframe. It is a dense dataframe, so its size grows
      with (unique index values) x (unique column values).
  """
  import pickle  # imported here because this cell does not import pickle itself

  utility_matrix = df.pivot_table(index=index, columns=columns, values=values)

  if pickle_name:
    target = filepath+pickle_name+'_sparse_utility_matrix.pickle'
    try:
      # 'wb' is required: the default read mode made every save fail.
      with open(target, 'wb') as fh:
        pickle.dump(utility_matrix, fh)
      print(f'Saved: {target}')
    except Exception as exc:
      # Saving is best effort; show the reason instead of hiding it.
      print(f'Unable to save outputs: {exc!r}')
  return utility_matrix

# Reload the cached ratings and pivot them into a customer x movie utility matrix.
# NOTE(review): the stored output of this cell is a ValueError printed after the
# load message -- presumably raised by the pivot in create_matrix on the full
# dataset; confirm.
df0 = load_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_reshaped.csv',filepath='',
               column1_as_index=True)
utility_matrix = create_matrix(df0)

Dataframe shape:  (100480507, 4)


ValueError: ignored

In [None]:
from sklearn.utils.extmath import randomized_svd
from scipy.sparse import csr_matrix
def run_svd(utility_matrix, n_components_list, pickle_name=None):
  """
  Factorise a utility matrix with randomized SVD for several ranks and build a
  bias-adjusted score matrix for each rank.

  Arguments:
  - utility_matrix (dataframe): Utility matrix with NaN for missing ratings,
      as returned by `create_matrix`. NaN is replaced by 0 before factorising.
  - n_components_list (list of int): Ranks to compute.
  - pickle_name (string): If given, the four result dictionaries are pickled
      under this name in the `saved_models` folder. A failed save is reported
      but the results are still returned.

  Returns: tuple `(U_dict, S_dict, VT_dict, predictions_dict)`, each keyed by
      the number of components. A rank is missing from `predictions_dict` if
      its dense score matrix could not be built.
  """
  import pickle  # imported here because this cell does not import pickle itself

  # Report the shape of the argument (the old code read a global named `matrix`).
  print('Original array shape: ', utility_matrix.shape)
  U_dict = dict()
  S_dict = dict()
  VT_dict = dict()
  predictions_dict = dict()

  # Inputs that do not depend on the rank are prepared once, not once per rank.
  filled = utility_matrix.fillna(0)
  sparse_ratings = csr_matrix(filled)
  try:
    # Mean rating in array (nan values excluded)
    mu = utility_matrix.values.reshape(-1)[~np.isnan(utility_matrix.values.reshape(-1))].mean()

    # Bias per row: row mean minus the mean of all row means, as a column vector.
    row_means = utility_matrix.mean(axis=1)
    bu = (row_means - row_means.mean()).to_numpy().reshape(-1,1)

    # Bias per column: column mean minus the mean of all column means, as a row vector.
    column_means = utility_matrix.mean()
    bi = (column_means - column_means.mean()).to_numpy().reshape(1,-1)
    biases_ready = True
  except Exception as exc:
    print(f'Bias terms not computed, predictions are skipped: {exc!r}')
    biases_ready = False

  for n_components in n_components_list:
    U, S, VT = randomized_svd(sparse_ratings, n_components=n_components, random_state=0)
    U_dict[n_components], S_dict[n_components], VT_dict[n_components] = U, S, VT
    # Reconstruct the decomposed matrix
    reconst = U.dot(np.diag(S)).dot(VT)
    print('n_components:', n_components)
    print('Reconstructed array shape:', reconst.shape)
    print('\tU shape: ',U.shape)
    print('\tSigma shape: ', S.shape)
    print('\tV shape: ',VT.shape)
    if not biases_ready:
      continue
    try:
      # NOTE(review): the zero-filled ratings are subtracted from
      # mu + bu + bi + reconst, so the stored values are offsets from the
      # known ratings rather than the plain formula result -- confirm this
      # is intended. The arithmetic is kept as it was.
      predicted_ratings = (mu + bu + bi + reconst) - filled.values
      # Store first, then report: printing before the assignment raised a
      # KeyError that was silently swallowed, so nothing was ever stored.
      predictions_dict[n_components] = pd.DataFrame(
          predicted_ratings, index=utility_matrix.index, columns=utility_matrix.columns)
      print('Predictions matrix shape:', predictions_dict[n_components].shape)
    except Exception as exc:
      # Best effort (the dense matrix may not fit in memory); show the reason.
      print(f'Predictions not computed for n_components={n_components}: {exc!r}')

  if pickle_name:
    try:
      filepath = '/content/drive/MyDrive/data exercises/W10/netflix-challenge/saved_models/'
      with open(filepath+pickle_name+'_randomizedSVD_U_dict.pickle', 'wb') as fh:
        pickle.dump(U_dict, fh)
      print(f'Saved: {filepath+pickle_name}_randomizedSVD_U_dict.pickle')
      with open(filepath+pickle_name+'_randomizedSVD_S_dict.pickle', 'wb') as fh2:
        pickle.dump(S_dict, fh2)
      print(f'Saved: {filepath+pickle_name}_randomizedSVD_S_dict.pickle')
      with open(filepath+pickle_name+'_randomizedSVD_VT_dict.pickle', 'wb') as fh3:
        pickle.dump(VT_dict, fh3)
      print(f'Saved: {filepath+pickle_name}_randomizedSVD_VT_dict.pickle')
      with open(filepath+pickle_name+'_randomizedSVD_predictions_dict.pickle', 'wb') as fh4:
        pickle.dump(predictions_dict, fh4)
      # The message now names the predictions file (it used to repeat the VT file name).
      print(f'Saved: {filepath+pickle_name}_randomizedSVD_predictions_dict.pickle')
    except Exception as exc:
      print(f'Outputs not saved: {exc!r}')

  return U_dict, S_dict, VT_dict, predictions_dict

# Ranks to try for the randomized SVD
n_components_list = [20, 50]
# NOTE(review): `matrix` is not assigned in any visible cell; the utility matrix
# built above is named `utility_matrix` -- confirm which object is intended.
U_dict, S_dict, VT_dict, predictions_dict = run_svd(matrix, n_components_list, pickle_name='netflix')