In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from lightfm import LightFM
from time import time



In [2]:
movies = pd.read_csv(r"C:\Users\Sparsh Jain\Downloads\Movie Recommendation System\ml-latest-small\movies.csv")
ratings = pd.read_csv(r"C:\Users\Sparsh Jain\Downloads\Movie Recommendation System\ml-latest-small\ratings.csv")

In [3]:
def createMovieDictionary(movies):
    movie_dict = {}
    for i in range(len(movies)):
        movieId = int(movies.iloc[i:i+1,0])
        movieName = str(movies.iloc[i:i+1,1])
        # sclicing done on the string part to remove the prefix that contained indices
        sliceIndex = len(str(i))
        movieName = movieName[sliceIndex:]
        movieName = movieName.strip()
        movie_dict[movieId] = movieName 
    
    return movie_dict

In [4]:
movie_dict = createMovieDictionary(movies)

In [5]:
movie_dict

{1: 'Toy Story (1995)\nName: title, dtype: object',
 2: 'Jumanji (1995)\nName: title, dtype: object',
 3: 'Grumpier Old Men (1995)\nName: title, dtype: object',
 4: 'Waiting to Exhale (1995)\nName: title, dtype: object',
 5: 'Father of the Bride Part II (1995)\nName: title, dtype: object',
 6: 'Heat (1995)\nName: title, dtype: object',
 7: 'Sabrina (1995)\nName: title, dtype: object',
 8: 'Tom and Huck (1995)\nName: title, dtype: object',
 9: 'Sudden Death (1995)\nName: title, dtype: object',
 10: 'GoldenEye (1995)\nName: title, dtype: object',
 11: 'American President, The (1995)\nName: title, dtype: object',
 12: 'Dracula: Dead and Loving It (1995)\nName: title, dtype: object',
 13: 'Balto (1995)\nName: title, dtype: object',
 14: 'Nixon (1995)\nName: title, dtype: object',
 15: 'Cutthroat Island (1995)\nName: title, dtype: object',
 16: 'Casino (1995)\nName: title, dtype: object',
 17: 'Sense and Sensibility (1995)\nName: title, dtype: object',
 18: 'Four Rooms (1995)\nName: title, 

In [6]:
def createRatingsMatrix(movie_dict, ratings):
    columns = list(movie_dict.keys())
    userIds = np.unique(ratings.iloc[:,0])
    numRows = len(userIds)
    numColumns = len(columns)
    ratingsMatrix = pd.DataFrame(np.zeros((numRows, numColumns)), columns = columns, index = userIds)
    
    for i in range(len(ratings)):
        rowIndex = int(ratings.loc[i][0])
        columnIndex = int(ratings.loc[i][1])
        rating = float(ratings.loc[i][2])
        ratingsMatrix.loc[rowIndex][columnIndex] = rating    
    
    return ratingsMatrix

In [7]:
st = time()
ratingsMatrix = createRatingsMatrix(movie_dict, ratings)
et = time()
et-st

144.85875749588013

In [8]:
# random check
ratingsMatrix.loc[21][38038]

1.5

In [9]:
def create_model(ratingsMatrix):
    sparse_matrix = csr_matrix(ratingsMatrix.values)
    recommender_obj = LightFM(no_components=30, loss='warp', learning_schedule='adagrad')
    recommender_obj.fit(sparse_matrix, epochs=50)
    return recommender_obj

In [10]:
st = time()
model = create_model(ratingsMatrix)
et = time()
et-st

50.07336497306824

In [11]:
def recommendMovieToUser(model, ratingsMatrix, user_id, movie_dict, threshold = 0, nrec_items = 3, showKnownLikes = False, n_known_likes =5):
    n_items = ratingsMatrix.shape[1]

    pred = model.predict(user_id,np.arange(n_items))
    scores = pd.Series(pred)

    # scores.index is from [0,94]
    # data.columns is from [1,95]
    # So, updating indicies of scores  
    scores.index = ratingsMatrix.columns

    # "scores" is an object of type Series
    # sorting the key value pair of scores 
    scores = scores.sort_values(ascending=False)

    # saing only the indicies after sorting
    scores = list(scores.index)

    # elements of "scores" are of the type str
    # converting the type to int
    scores = [int(i) for i in scores]

    # retrieving the row of the user with userID = user_id
    userRow = ratingsMatrix.iloc[user_id,:]

    # keeping only those elements of the row that have value > 0 (Since threshold = 0)
    userRow = userRow[userRow > threshold]

    # sorting the userRow
    userRowSorted = userRow.sort_values(ascending=False)

    # Now, "userRowSorted" is an obj of type Series
    # saving only the indicies of "userRowSorted"
    userRowSortedIndex = userRowSorted.index

    # elements of "userRowSortedIndex" are of the type str
    # converting the type to int
    userRowSortedIndex = [int(i) for i in userRowSortedIndex]

    # Now, "scores" contains indicies of the scores for all the columns
    # The scores were in descending order
    # "userRowSortedIndex" contains the indicies of all those Movies which the user has already rated
    # Movies were sorted from max rating to min rating 

    # Now, we need to extract the scores of those Movies which the user has not rated yet
    # By doing so we will recommend only those Movies to the user that the user has not watched yet
    scores = [i for i in scores if i not in userRowSortedIndex]

    # Since "scores" was sorted before it will also be sorted now
    # scores[0] now contains the index of the Movies that the user has not watched yet 
    # and it is the best Movie that we can recommend to the user

    # selecting top "nrec_items" Movies to recommend to the user
    return_score_list = scores[0:nrec_items]

    # known_items will now conatin the names of those Movies that the user has already rated in decreasing order of Movie rating 
    known_items = []

    for i in userRowSortedIndex:
        temp = movie_dict[i]
        known_items.append(temp)

    # recommend_items will now contain the names of those Movies that the user has never watched 
    # and it is most likely that user will like these Movies
    recommend_items = []

    for i in return_score_list:
        temp = movie_dict[i]
        recommend_items.append(temp)

    # Printing the Known Likes
    if showKnownLikes == True:
        print("Known Likes:")
        counter = 1
        for i in known_items[0:n_known_likes]:
            print(str(counter) + ": " + i)
            counter+=1
        print("\n")

    # Printing the Recommended Movies
    print("Recommended Movies:")
    counter = 1
    for i in recommend_items:
        print(str(counter) + ": " + i)
        counter+=1

In [12]:
recommendMovieToUser(model, ratingsMatrix, 145, movie_dict, showKnownLikes = True)

Known Likes:
1: Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
Name: title, dtype: object
2: 3-Iron (Bin-jip) (2004)
Name: title, dtype: object
3: 50 First Dates (2004)
Name: title, dtype: object
4: Flatliners (1990)
Name: title, dtype: object
5: Illusionist, The (2006)
Name: title, dtype: object


Recommended Movies:
1: Lion King, The (1994)
Name: title, dtype: object
2: Forrest Gump (1994)
Name: title, dtype: object
3: Aladdin (1992)
Name: title, dtype: object


In [13]:
# References
# https://towardsdatascience.com/solving-business-usecases-by-recommender-system-using-lightfm-4ba7b3ac8e62

# How does Netflix recommend movies? Matrix Factorization

# https://www.youtube.com/watch?v=ZspR5PZemcs&t=661s