# Movie Recommender System
> Content-based and collaborative recommendation methods on MovieLens

- toc: true
- badges: true
- comments: true
- categories: [movie]
- image:

## Load data

In [1]:
!mkdir '/content/data'

from google_drive_downloader import GoogleDriveDownloader as gdd

gdd.download_file_from_google_drive(file_id='1Of9rK8ds1a1iyl1jFnf_7oRgPB-8bfdK',
                                    dest_path='/content/data/data.zip',
                                    unzip=True)

Downloading 1Of9rK8ds1a1iyl1jFnf_7oRgPB-8bfdK into /content/data/data.zip... Done.
Unzipping...Done.


## Clean data

In [2]:
import os
import numpy as np
import pandas as pd
from ast import literal_eval

In [3]:
#hide-output
md = pd.read_csv("/content/data/imdb/movies_metadata.csv")
credits = pd.read_csv('/content/data/imdb/credits.csv')
keywords = pd.read_csv('/content/data/imdb/keywords.csv')
links_small = pd.read_csv('/content/data/imdb/links_small.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
#hide-output
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
md.loc[:, 'genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md = md.drop([19730, 29503, 35587])
keywords.loc[:, 'id'] = keywords['id'].astype('int')
credits.loc[:, 'id'] = credits['id'].astype('int')
md.loc[:, 'id'] = md['id'].astype('int')

md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

smd = md[md['id'].isin(links_small)]

smd.loc[:, 'tagline'] = smd['tagline'].fillna('')

smd.loc[:,'cast'] = smd['cast'].apply(literal_eval)
smd.loc[:,'crew'] = smd['crew'].apply(literal_eval)
smd.loc[:,'keywords'] = smd['keywords'].apply(literal_eval)
smd.loc[:,'cast_size'] = smd['cast'].apply(lambda x: len(x))
smd.loc[:,'crew_size'] = smd['crew'].apply(lambda x: len(x))

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

smd.loc[:,'director'] = smd['crew'].apply(get_director)
smd.loc[:,'cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd.loc[:,'cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
smd.loc[:,'keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]

def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words
smd.loc[:,'keywords'] = smd['keywords'].apply(filter_keywords)
smd.drop_duplicates(subset ="title",
                     keep = 'first', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://p

In [6]:
out_df = smd[['id', 'title', 'year', 'director', 'cast',  'genres', 'vote_count', 'vote_average',  'overview', 'keywords']]
out_df.head()
out_df.to_csv('super_clean_data.csv', index=False)

In [7]:
out_df.head()

Unnamed: 0,id,title,year,director,cast,genres,vote_count,vote_average,overview,keywords
0,862,Toy Story,1995,John Lasseter,"[Tom Hanks, Tim Allen, Don Rickles]","[Animation, Comedy, Family]",5415.0,7.7,"Led by Woody, Andy's toys live happily in his ...","[jealousy, toy, boy, friendship, friends, riva..."
1,8844,Jumanji,1995,Joe Johnston,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[Adventure, Fantasy, Family]",2413.0,6.9,When siblings Judy and Peter discover an encha...,"[board game, disappearance, based on children'..."
2,15602,Grumpier Old Men,1995,Howard Deutch,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[Romance, Comedy]",92.0,6.5,A family wedding reignites the ancient feud be...,"[fishing, best friend, duringcreditsstinger]"
3,31357,Waiting to Exhale,1995,Forest Whitaker,"[Whitney Houston, Angela Bassett, Loretta Devine]","[Comedy, Drama, Romance]",34.0,6.1,"Cheated on, mistreated and stepped on, the wom...","[based on novel, interracial relationship, sin..."
4,11862,Father of the Bride Part II,1995,Charles Shyer,"[Steve Martin, Diane Keaton, Martin Short]",[Comedy],173.0,5.7,Just when George Banks has recovered from his ...,"[baby, midlife crisis, confidence, aging, daug..."


## Content-based Recommender

In [14]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [15]:
ori_df = pd.read_csv('/content/super_clean_data.csv')
df = ori_df.copy()
df.head()

Unnamed: 0,id,title,year,director,cast,genres,vote_count,vote_average,overview,keywords
0,862,Toy Story,1995,John Lasseter,"['Tom Hanks', 'Tim Allen', 'Don Rickles']","['Animation', 'Comedy', 'Family']",5415.0,7.7,"Led by Woody, Andy's toys live happily in his ...","['jealousy', 'toy', 'boy', 'friendship', 'frie..."
1,8844,Jumanji,1995,Joe Johnston,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...","['Adventure', 'Fantasy', 'Family']",2413.0,6.9,When siblings Judy and Peter discover an encha...,"['board game', 'disappearance', ""based on chil..."
2,15602,Grumpier Old Men,1995,Howard Deutch,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret']","['Romance', 'Comedy']",92.0,6.5,A family wedding reignites the ancient feud be...,"['fishing', 'best friend', 'duringcreditssting..."
3,31357,Waiting to Exhale,1995,Forest Whitaker,"['Whitney Houston', 'Angela Bassett', 'Loretta...","['Comedy', 'Drama', 'Romance']",34.0,6.1,"Cheated on, mistreated and stepped on, the wom...","['based on novel', 'interracial relationship',..."
4,11862,Father of the Bride Part II,1995,Charles Shyer,"['Steve Martin', 'Diane Keaton', 'Martin Short']",['Comedy'],173.0,5.7,Just when George Banks has recovered from his ...,"['baby', 'midlife crisis', 'confidence', 'agin..."


In [16]:
print(f"No of records: {len(df)}")

No of records: 8809


### Preprocess data

In [17]:
df.loc[:,'cast'] = df['cast'].apply(literal_eval)
df.loc[:,'genres'] = df['genres'].apply(literal_eval)
df.loc[:,'keywords'] = df['keywords'].apply(literal_eval)

stemmer = SnowballStemmer('english')

def preprocess(x, remove_spaces=False, stemming=False):
    if isinstance(x, list):
        y = []
        for i in x:
            token = preprocess(i, remove_spaces, stemming)
            if token is not None:
                y.append(token)
    else:
        
        y = str(x)

        # Lower all words
        y = str.lower(y)

        # Remove spaces (for person's name)
        if remove_spaces:
            y = y.replace(" ", "")

        # Remove digits
        y = ''.join([i for i in y if not i.isdigit()])

        # Stemming words
        if stemming:
            y = stemmer.stem(y)

        if len(y) <=1:
            return None

    return y


df.loc[:,'cast'] = df['cast'].apply(lambda x: preprocess(x, remove_spaces=True))
df.loc[:,'director'] = df['director'].astype('str').apply(lambda x: preprocess(x, remove_spaces=True))
df.loc[:, 'title'] = df['title'].apply(lambda x: preprocess(x, stemming=True))
df.loc[:, 'overview'] = df['overview'].apply(lambda x: preprocess(str.split(str(x)), stemming=True))
df.loc[:, 'genres'] = df['genres'].apply(lambda x: preprocess(x, stemming=True))
df.loc[:,'keywords'] = df['keywords'].apply(lambda x: preprocess(x, stemming=True))
df.head()

Unnamed: 0,id,title,year,director,cast,genres,vote_count,vote_average,overview,keywords
0,862,toy stori,1995,johnlasseter,"[tomhanks, timallen, donrickles]","[anim, comedi, famili]",5415.0,7.7,"[led, by, woody,, andi, toy, live, happili, in...","[jealousi, toy, boy, friendship, friend, rival..."
1,8844,jumanji,1995,joejohnston,"[robinwilliams, jonathanhyde, kirstendunst]","[adventur, fantasi, famili]",2413.0,6.9,"[when, sibl, judi, and, peter, discov, an, enc...","[board gam, disappear, based on children's boo..."
2,15602,grumpier old men,1995,howarddeutch,"[waltermatthau, jacklemmon, ann-margret]","[romanc, comedi]",92.0,6.5,"[famili, wed, reignit, the, ancient, feud, bet...","[fish, best friend, duringcreditssting]"
3,31357,waiting to exhal,1995,forestwhitaker,"[whitneyhouston, angelabassett, lorettadevine]","[comedi, drama, romanc]",34.0,6.1,"[cheat, on,, mistreat, and, step, on,, the, wo...","[based on novel, interracial relationship, sin..."
4,11862,father of the bride part ii,1995,charlesshyer,"[stevemartin, dianekeaton, martinshort]",[comedi],173.0,5.7,"[just, when, georg, bank, has, recov, from, hi...","[babi, midlife crisi, confid, age, daughter, m..."


In [18]:
df.shape

(8809, 10)

### Vectorize using TF-IDF

In [19]:
dictionary = []
for i, row in df.iterrows():
    item = [row.title, row.director] + row.cast + row.genres + row.keywords
    string = ' '.join([j for j in item if j is not None])
    dictionary.append(string)


tf = TfidfVectorizer(analyzer='word',min_df=2, stop_words='english')
tfidf_matrix = tf.fit_transform(dictionary)
print(tfidf_matrix.shape)
print(tf.get_feature_names()[:10])

(8809, 11312)
['aaliyah', 'aamirkhan', 'aaronabrams', 'aaroneckhart', 'aarontaylor', 'abandon', 'abandoned', 'abbaskiarostami', 'abbiecornish', 'abbott']


### Cosine similarity matrix

In [20]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

### Get recommendations

In [21]:
def get_recommendations(query_title, cosine_sim, df, top_k=10):
    df = df.reset_index()
    titles = df['title']
    indices = pd.Series(df.index, index=df['title'])

    # query_title = preprocess(query_title)
    query_idx = indices[query_title]

    # Get similarity score of current movie with others
    sim_scores = list(enumerate(cosine_sim[query_idx]))

    # Sort scores and get top k
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_k+1]

    movie_indices = [i[0] for i in sim_scores]
    movie_scores = [i[1] for i in sim_scores]
    result = titles.iloc[movie_indices].to_frame()
    result['matching_score'] = movie_scores
    return result

In [22]:
get_recommendations("The Dark Knight", cosine_sim, ori_df)

Unnamed: 0,title,matching_score
6001,Batman Begins,0.485814
7699,The Dark Knight Rises,0.481957
1110,Batman Returns,0.440829
7355,Batman: Under the Red Hood,0.414155
1236,Batman & Robin,0.356176
524,Batman,0.353583
8641,Batman v Superman: Dawn of Justice,0.331215
7989,"Batman: The Dark Knight Returns, Part 2",0.312213
132,Batman Forever,0.301016
2548,Batman: Mask of the Phantasm,0.297825


## Collaborative Filtering

### Item-based Recommender


In [23]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
ratings = pd.read_csv("/content/data/imdb/ratings_small.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [25]:
movie_data = pd.read_csv("/content/super_clean_data.csv")
movie_id_title = movie_data[['id', 'title']]
movie_id_title.head()

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [26]:
top_ratings = movie_data[['title', 'vote_count']]
top_ratings.sort_values('vote_count', ascending=False).head(10)

Unnamed: 0,title,vote_count
7346,Inception,14075.0
6726,The Dark Knight,12269.0
7198,Avatar,12114.0
8495,Deadpool,11444.0
8258,Interstellar,11187.0
7966,Django Unchained,10297.0
8354,Guardians of the Galaxy,10014.0
2343,Fight Club,9678.0
7697,The Hunger Games,9634.0
8490,Mad Max: Fury Road,9629.0


In [27]:
new_ratings = ratings.merge(movie_id_title, left_on='movieId', right_on='id')
new_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,id,title
0,1,1371,2.5,1260759135,1371,Rocky III
1,4,1371,4.0,949810302,1371,Rocky III
2,7,1371,3.0,851869160,1371,Rocky III
3,19,1371,4.0,855193404,1371,Rocky III
4,21,1371,3.0,853852263,1371,Rocky III


#### User-Item matrix

In [28]:
ui_matrix = new_ratings.pivot(index = 'userId', columns ='title', values = 'rating').fillna(0)
ui_matrix.head()

title,...And God Created Woman,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,13 Tzameti,1408,15 Minutes,16 Blocks,1984,"20,000 Leagues Under the Sea",2001: A Space Odyssey,2010,2046,21 Grams,24 Hour Party People,25th Hour,28 Days Later,28 Weeks Later,3 Ninjas: High Noon at Mega Mountain,3-Iron,300,"4 Months, 3 Weeks and 2 Days",40 Days and 40 Nights,42nd Street,48 Hrs.,50 First Dates,54,8 Mile,8 Women,88 Minutes,8½,A Beautiful Mind,A Bridge Too Far,A Brief History of Time,A Chorus Line,A Christmas Carol,A Christmas Story,A Clockwork Orange,...,We're No Angels,Wee Willie Winkie,Weekend at Bernie's,Westworld,Wet Hot American Summer,Whale Rider,What Lies Beneath,What Women Want,What's Eating Gilbert Grape,What's New Pussycat?,When Harry Met Sally...,While You Were Sleeping,White Lightning,Wilbur Wants to Kill Himself,Wild Orchid,Wild Strawberries,Wild Things,Wild Wild West,Wild at Heart,Willy Wonka & the Chocolate Factory,Wings of Desire,Women on the Verge of a Nervous Breakdown,Working Girl,World Trade Center,X-Men Origins: Wolverine,Y Tu Mamá También,Yankee Doodle Dandy,You Only Live Twice,"You, Me and Dupree",Young Frankenstein,Z,Zardoz,Zatoichi,Zazie dans le métro,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,eXistenZ,xXx,¡Three Amigos!
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
movie_title = ui_matrix.columns
index_movies = pd.Series(movie_title, index=(range(len(movie_title))))
movie_indices = pd.Series(range(len(movie_title)), index=movie_title)

In [30]:
movie_indices

title
...And God Created Woman         0
10 Items or Less                 1
10 Things I Hate About You       2
10,000 BC                        3
11'09''01 - September 11         4
                              ... 
Zombie Flesh Eaters           1502
Zombie Holocaust              1503
eXistenZ                      1504
xXx                           1505
¡Three Amigos!                1506
Length: 1507, dtype: int64

#### Mean rating of each movie

In [31]:
sum_ratings = ui_matrix.sum(axis=0)
num_ratings = ui_matrix[ui_matrix>0].count()
mean_ratings = sum_ratings/num_ratings
mean_ratings.head()

title
...And God Created Woman      4.000000
10 Items or Less              4.318182
10 Things I Hate About You    2.642857
10,000 BC                     3.833333
11'09''01 - September 11      2.000000
dtype: float64

#### Use k nearest neighbors to predict score

In [32]:
def predict_score(ui_matrix, user_name, movie_name, mean_ratings, k =2):
    
    movie_id = movie_indices[movie_name]
    ui_matrix_ = ui_matrix.dropna()
    cosine_sim = cosine_similarity(ui_matrix_.T, ui_matrix_.T)

    # nearest neighbors
    sim_scores = list(enumerate(cosine_sim[movie_id]))
    
    # Sort scores and get top k
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:k+1]

    # print(f"Nearest movies of {movie_name}:", end='')
    # nearest_neighor_movies = [index_movies[i[0]] for i in sim_scores]
    # print(nearest_neighor_movies)

    r_ui = mean_ratings[movie_name]

    total_scores = sum([i[1] for i in sim_scores])
    for movie_j, score_ij in sim_scores:
        r_uj = ui_matrix.loc[user_name, index_movies[movie_j]]
        rmean_j = mean_ratings.iloc[movie_j]
        r_ui += ((score_ij*(r_uj - rmean_j))/total_scores)

    return r_ui

In [33]:
user_id = 4
movie_name = "Young Frankenstein"
num_neighbors = 10

score_4yf = ui_matrix.loc[user_id, movie_name]
print(f"True real rating of user {user_id} for movie {movie_name} is {score_4yf}")

pred_4yf = predict_score(ui_matrix, user_id, movie_name, mean_ratings, k=num_neighbors)
print(f"True predicted rating of {user_id} for movie {movie_name} is {pred_4yf}")

True real rating of user 4 for movie Young Frankenstein is 5.0
True predicted rating of 4 for movie Young Frankenstein is 4.536436084266795


### Model-based Recommender

In [34]:
import pandas as pd
import numpy as np
from ast import literal_eval
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
ratings = pd.read_csv("/content/data/imdb/ratings_small.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [36]:
movie_data = pd.read_csv("/content/super_clean_data.csv")
movie_id_title = movie_data[['id', 'title']]
movie_id_title.head()

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [37]:
new_ratings = ratings.merge(movie_id_title, left_on='movieId', right_on='id')
new_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,id,title
0,1,1371,2.5,1260759135,1371,Rocky III
1,4,1371,4.0,949810302,1371,Rocky III
2,7,1371,3.0,851869160,1371,Rocky III
3,19,1371,4.0,855193404,1371,Rocky III
4,21,1371,3.0,853852263,1371,Rocky III


#### User-Item matrix

In [38]:
ui_matrix = new_ratings.pivot(index = 'userId', columns ='title', values = 'rating').fillna(0)
ui_matrix.head()

title,...And God Created Woman,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,13 Tzameti,1408,15 Minutes,16 Blocks,1984,"20,000 Leagues Under the Sea",2001: A Space Odyssey,2010,2046,21 Grams,24 Hour Party People,25th Hour,28 Days Later,28 Weeks Later,3 Ninjas: High Noon at Mega Mountain,3-Iron,300,"4 Months, 3 Weeks and 2 Days",40 Days and 40 Nights,42nd Street,48 Hrs.,50 First Dates,54,8 Mile,8 Women,88 Minutes,8½,A Beautiful Mind,A Bridge Too Far,A Brief History of Time,A Chorus Line,A Christmas Carol,A Christmas Story,A Clockwork Orange,...,We're No Angels,Wee Willie Winkie,Weekend at Bernie's,Westworld,Wet Hot American Summer,Whale Rider,What Lies Beneath,What Women Want,What's Eating Gilbert Grape,What's New Pussycat?,When Harry Met Sally...,While You Were Sleeping,White Lightning,Wilbur Wants to Kill Himself,Wild Orchid,Wild Strawberries,Wild Things,Wild Wild West,Wild at Heart,Willy Wonka & the Chocolate Factory,Wings of Desire,Women on the Verge of a Nervous Breakdown,Working Girl,World Trade Center,X-Men Origins: Wolverine,Y Tu Mamá También,Yankee Doodle Dandy,You Only Live Twice,"You, Me and Dupree",Young Frankenstein,Z,Zardoz,Zatoichi,Zazie dans le métro,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,eXistenZ,xXx,¡Three Amigos!
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### SVD Decomposition: Low rank factorization

In [39]:
# Singular Value Decomposition
U, sigma, Vt = svds(ui_matrix, k = 600)

# Construct diagonal array in SVD
sigma = np.diag(sigma)

In [40]:
print("X = U * sigma * Vt")
print(f"{ui_matrix.shape} = {U.shape} * {sigma.shape} * {Vt.shape}")

X = U * sigma * Vt
(671, 1507) = (671, 600) * (600, 600) * (600, 1507)


In [41]:
# Low-rank matrix
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 

# Convert predicted ratings to dataframe
pred_ui_matrix = pd.DataFrame(all_user_predicted_ratings, columns = ui_matrix.columns)
pred_ui_matrix.head()

title,...And God Created Woman,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,13 Tzameti,1408,15 Minutes,16 Blocks,1984,"20,000 Leagues Under the Sea",2001: A Space Odyssey,2010,2046,21 Grams,24 Hour Party People,25th Hour,28 Days Later,28 Weeks Later,3 Ninjas: High Noon at Mega Mountain,3-Iron,300,"4 Months, 3 Weeks and 2 Days",40 Days and 40 Nights,42nd Street,48 Hrs.,50 First Dates,54,8 Mile,8 Women,88 Minutes,8½,A Beautiful Mind,A Bridge Too Far,A Brief History of Time,A Chorus Line,A Christmas Carol,A Christmas Story,A Clockwork Orange,...,We're No Angels,Wee Willie Winkie,Weekend at Bernie's,Westworld,Wet Hot American Summer,Whale Rider,What Lies Beneath,What Women Want,What's Eating Gilbert Grape,What's New Pussycat?,When Harry Met Sally...,While You Were Sleeping,White Lightning,Wilbur Wants to Kill Himself,Wild Orchid,Wild Strawberries,Wild Things,Wild Wild West,Wild at Heart,Willy Wonka & the Chocolate Factory,Wings of Desire,Women on the Verge of a Nervous Breakdown,Working Girl,World Trade Center,X-Men Origins: Wolverine,Y Tu Mamá También,Yankee Doodle Dandy,You Only Live Twice,"You, Me and Dupree",Young Frankenstein,Z,Zardoz,Zatoichi,Zazie dans le métro,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,eXistenZ,xXx,¡Three Amigos!
0,-0.001824,-0.01306,-0.033895,-0.013552,0.004857,-0.012065,-2.2e-05,0.000269,-0.004589,0.004969,-0.003875,0.013477,0.013752,0.011222,0.012818,-0.009012,-0.020127,0.042934,-0.031099,0.012655,0.054123,-0.024107,-0.026365,-0.003498,-0.022653,-0.027189,0.008024,-0.001322,0.013068,-0.013028,0.017011,0.002759,0.014413,0.013653,-0.026469,-0.010153,-0.005941,-0.001596,-0.023442,0.002015,...,0.01594,-0.005563,-0.005438,0.008858,-0.024222,0.014146,0.000161,0.015496,-0.01974,-0.027203,0.030003,-0.023383,-0.000592,0.045767,0.000778,-0.000484,-0.000592,0.004268,-0.004704,-0.007675,-0.02422,0.000549,-0.000714,-0.002366,-0.02521,-0.017727,0.001921,0.037743,-0.002958,0.025668,-0.02309,-0.00719,-0.00986,0.014233,-0.007925,-0.001824,0.007435,0.005727,0.032049,-0.001824
1,0.000129,0.001134,-0.0086,2e-05,-0.000762,-0.000976,-0.000613,0.000226,-0.003114,-0.002803,-0.004658,0.003607,3.000631,-0.00871,-0.003879,0.000968,-0.005763,-0.010756,0.007939,0.004862,-0.010943,0.00486,-0.007617,0.004192,0.002618,-0.003913,5.00245,-0.00475,-0.003433,-0.005846,0.009835,-0.002012,-0.004078,0.000428,0.001261,-0.005249,0.003851,0.000112,-0.00125,3.001976,...,0.005772,0.009581,-0.000734,-0.002152,0.001686,-0.004267,0.000136,0.010982,-0.008346,-0.004466,0.0024,-0.001375,0.000421,-0.005288,-0.002587,0.000511,0.000421,-0.002068,-0.004027,-0.015801,2.983212,-0.00327,-0.004966,-0.002685,-0.000776,-0.003705,0.004641,-0.005497,0.002104,-0.004162,-0.000941,-0.001363,-0.001132,0.008294,0.008796,0.000129,0.000413,0.005353,-0.004449,0.000129
2,-0.001568,0.006566,0.010625,0.003151,0.000703,-0.004763,0.010966,-0.000694,0.010081,0.003523,-0.00282,-0.002801,-0.004074,0.008589,0.001497,0.00033,-0.005924,0.034418,0.009179,0.012273,-0.007749,0.004134,2.995466,-0.010105,-0.014541,0.023826,-0.00337,-0.009107,-0.01925,-0.002713,-0.006548,-0.004856,0.000565,0.001734,0.002098,0.004429,0.002977,-0.001372,-0.00083,-0.006322,...,0.001373,0.007726,-0.003889,0.002632,-0.010117,0.004854,-0.000416,-0.004237,0.005208,-0.006135,0.011015,-0.010989,0.001371,0.003792,-0.003643,0.001093,0.001371,0.000146,-0.010245,-0.006419,0.001102,0.008358,0.00991,-0.00538,-0.003788,-0.000763,0.013707,-0.013706,0.006854,-0.015675,0.000117,-0.002435,0.001743,0.010452,0.001461,-0.001568,-0.011326,-0.006301,0.037546,-0.001568
3,6.9e-05,-0.000763,0.000295,-0.00012,-7e-05,0.000221,0.002347,-0.000402,-0.001041,1.3e-05,0.000809,3.001269,-0.001037,-0.00035,0.000132,0.000484,0.003686,0.001672,0.001112,0.003251,-0.0008,0.001585,0.000797,0.00049,-7.4e-05,0.000591,0.000903,0.000279,0.000714,-0.002858,-0.00414,3.7e-05,-1.4e-05,0.001072,0.000213,-8.6e-05,-0.000377,6e-05,-0.00101,3.000108,...,0.001424,0.000359,0.000977,0.000118,-0.000275,-0.000855,-0.000241,0.000135,0.001576,-0.003887,0.000471,5.00085,5.5e-05,-0.000892,-0.000401,-0.000106,5.5e-05,0.000377,0.0015,-0.001851,-0.001674,0.000856,0.001208,0.000508,5.001748,-0.002642,0.001109,0.000711,0.000274,5.002635,0.001322,-0.000358,-0.001849,0.00243,-0.001812,6.9e-05,-0.000984,-0.001734,-0.000685,6.9e-05
4,0.001009,-0.013957,0.004684,0.012933,-0.001736,-0.006671,0.008866,0.00119,-0.006246,-0.003815,-0.004334,-0.004912,0.00355,0.003713,0.006218,0.003416,0.000768,0.007745,0.000413,-0.011054,0.02218,0.009233,0.011858,0.010177,0.019144,-0.006177,4.006056,-0.008547,0.006804,0.02221,0.003406,0.008908,-0.005508,-0.001427,0.004392,-0.00272,0.000666,0.000883,-0.001619,0.005126,...,0.008611,0.009241,0.002093,-0.001518,0.004747,-0.007432,0.000714,0.003251,0.002103,-0.008819,0.011644,0.012056,0.000178,-0.0025,0.001012,0.000655,0.000178,-0.003458,-0.00562,0.019083,-0.009199,-0.002224,0.004117,-0.004041,0.011687,0.003228,-0.009085,0.027037,0.000892,0.011536,-0.007103,-0.007682,-0.005607,0.010577,0.007031,0.001009,-0.008779,0.016566,-0.026467,0.001009


#### Predict score 

In [42]:
def predict_score(pred_ui_matrix, user_id, movie_name):
    return pred_ui_matrix.loc[user_id-1, movie_name]

In [43]:
user_id = 4
movie_name = "Young Frankenstein"

score_4yf = ui_matrix.loc[user_id, movie_name]
print(f"True real rating of user {user_id} for movie {movie_name} is {score_4yf}")

pred_4yf = predict_score(pred_ui_matrix, user_id, movie_name)
print(f"True predicted rating of {user_id} for movie {movie_name} is {pred_4yf}")

True real rating of user 4 for movie Young Frankenstein is 5.0
True predicted rating of 4 for movie Young Frankenstein is 5.002634544369512


#### Evaluate model

In [44]:
rmse_df = pd.concat([ui_matrix.mean(), pred_ui_matrix.mean()], axis=1)
rmse_df.columns = ['Avg_actual_ratings', 'Avg_predicted_ratings']
rmse_df['item_index'] = np.arange(0, rmse_df.shape[0], 1)
rmse_df.head()

Unnamed: 0_level_0,Avg_actual_ratings,Avg_predicted_ratings,item_index
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
...And God Created Woman,0.005961,0.005959,0
10 Items or Less,0.07079,0.070839,1
10 Things I Hate About You,0.027571,0.027423,2
"10,000 BC",0.017139,0.01742,3
11'09''01 - September 11,0.002981,0.002937,4


In [45]:
RMSE = round((((rmse_df.Avg_actual_ratings - rmse_df.Avg_predicted_ratings) ** 2).mean() ** 0.5), 5)
print(f'RMSE SVD Model = {RMSE}')

RMSE SVD Model = 0.00023


##### Evaluate with different value k

In [46]:
for i in [10, 100, 300, 500, 600]:

    # Singular Value Decomposition
    U, sigma, Vt = svds(ui_matrix, k = i)

    # Construct diagonal array in SVD
    sigma = np.diag(sigma)

    # Low-rank matrix
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 

    # Convert predicted ratings to dataframe
    pred_ui_matrix = pd.DataFrame(all_user_predicted_ratings, columns = ui_matrix.columns)

    rmse_df = pd.concat([ui_matrix.mean(), pred_ui_matrix.mean()], axis=1)
    rmse_df.columns = ['Avg_actual_ratings', 'Avg_predicted_ratings']
    rmse_df['item_index'] = np.arange(0, rmse_df.shape[0], 1)

    RMSE = round((((rmse_df.Avg_actual_ratings - rmse_df.Avg_predicted_ratings) ** 2).mean() ** 0.5), 5)
    print(f'RMSE with value k = {i} : {RMSE}')

RMSE with value k = 10 : 0.01127
RMSE with value k = 100 : 0.00652
RMSE with value k = 300 : 0.00263
RMSE with value k = 500 : 0.00074
RMSE with value k = 600 : 0.00023


#### Recommend movies

In [47]:
# Recommend the items with the highest predicted ratings

def recommend_items(user_id, ui_matrix, pred_ui_matrix, num_recommendations=5):

    # Get and sort the user's ratings
    sorted_user_ratings = ui_matrix.loc[user_id].sort_values(ascending=False)
    #sorted_user_ratings
    sorted_user_predictions = pred_ui_matrix.loc[user_id-1].sort_values(ascending=False)
    #sorted_user_predictions
    temp = pd.concat([sorted_user_ratings, sorted_user_predictions], axis=1)
    temp.index.name = 'Recommended Items'
    temp.columns = ['user_ratings', 'user_predictions']
    temp = temp.loc[temp.user_ratings == 0]   
    temp = temp.sort_values('user_predictions', ascending=False)
    print('\nBelow are the recommended items for user(user_id = {}):\n'.format(user_id))
    print(temp.head(num_recommendations))

In [48]:
recommend_items(4, ui_matrix, pred_ui_matrix, num_recommendations=5)


Below are the recommended items for user(user_id = 4):

                              user_ratings  user_predictions
Recommended Items                                           
Shaun of the Dead                      0.0          0.007939
The Curse of the Were-Rabbit           0.0          0.006702
Little Buddha                          0.0          0.006153
Two Girls and a Guy                    0.0          0.005301
Flashdance                             0.0          0.005178
