In [1]:
# EXECUTE FIRST

# computational imports
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
# for reading files from urls
import urllib.request
# display imports
from IPython.display import display, IFrame
from IPython.core.display import HTML

# import notebook styling for tables and width etc.
# the context manager closes the HTTP connection as soon as the stylesheet has been read
with urllib.request.urlopen('https://raw.githubusercontent.com/DataScienceUWL/DS775v2/master/ds755.css') as response:
    css_text = response.read().decode("utf-8")
HTML(css_text);

<p><font size=18>Lesson 10: Recommender Systems 1</font></p>

# The Data
For our lesson, we're going to be using the movies_metadata.csv file in the data directory of the lesson. This is the same file used in the book. First let's do some simple cleaning. Banik explains most of this in the book. When we deviate from Banik's approach, we'll let you know.

In [2]:
#read in the data (the path is relative to the notebook's working directory)
df = pd.read_csv('data/movies_metadata.csv')

#print the shape of the dataframe as a (rows, columns) tuple
print(f"The shape is {df.shape}")

#get the column info: name, non-null count and dtype of every column
df.info()

#####################
# Helper Functions
#####################
#converts ints & string representations of numbers to floats
def to_float(x):
    """Return ``x`` as a float, or ``np.nan`` when it cannot be converted.

    Parameters
    ----------
    x : object
        Any value; numbers and numeric strings convert, everything else
        (None, lists, non-numeric text, ...) becomes NaN.

    Returns
    -------
    float
        ``float(x)`` on success, otherwise ``np.nan``.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures are mapped to NaN; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan

#Helper function to convert NaT to 0 and all other years to integers.
def convert_int(x):
    """Return ``x`` as an int, or 0 when it cannot be converted.

    Parameters
    ----------
    x : object
        Any value; numbers and integer-looking strings convert, everything
        else (NaN, NaT, None, 'NaT', infinity, ...) becomes 0.

    Returns
    -------
    int
        ``int(x)`` on success, otherwise 0.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures are mapped to 0; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return 0

#we can run both apply and astype in one line by chaining them
df['budget'] = df['budget'].apply(to_float).astype('float')

#Convert release_date into pandas datetime format (dates that can't be parsed become NaT)
df['release_date'] = pd.to_datetime(df['release_date'],errors='coerce')

#Extract year from the datetime and convert to integer. (Again, we're chaining functions)
#.dt.year gives NaN for NaT rows, and convert_int turns those into 0.
#(Comparing with `x != np.nan` can't detect missing values: NaN never compares equal to anything.)
df['year'] = df['release_date'].dt.year.apply(convert_int)

#convert vote_count to integer; missing or non-numeric counts become 0
#(no string splitting here: that would turn a float count such as 5415.0 into 0)
df['vote_count'] = df['vote_count'].apply(convert_int)

#Convert all NaN into stringified empty lists and apply literal eval and convert to list by chaining functions
df['genres'] = df['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

#filter to just the relevant columns (.copy() makes df an independent dataframe rather than a slice of the original)
df = df[['id','title','budget', 'genres', 'overview', 'revenue', 'runtime', 'vote_average', 'vote_count', 'year']].copy()
df.head()

The shape is (5000, 24)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  5000 non-null   bool   
 1   belongs_to_collection  825 non-null    object 
 2   budget                 5000 non-null   int64  
 3   genres                 5000 non-null   object 
 4   homepage               311 non-null    object 
 5   id                     5000 non-null   int64  
 6   imdb_id                5000 non-null   object 
 7   original_language      5000 non-null   object 
 8   original_title         5000 non-null   object 
 9   overview               4979 non-null   object 
 10  popularity             5000 non-null   float64
 11  poster_path            4979 non-null   object 
 12  production_companies   5000 non-null   object 
 13  production_countries   5000 non-null   object 
 14  release_date           4996 non-

Unnamed: 0,id,title,budget,genres,overview,revenue,runtime,vote_average,vote_count,year
0,862,Toy Story,30000000.0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",373554033,81.0,7.7,5415,1995
1,8844,Jumanji,65000000.0,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,262797249,104.0,6.9,2413,1995
2,15602,Grumpier Old Men,0.0,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,0,101.0,6.5,92,1995
3,31357,Waiting to Exhale,16000000.0,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",81452156,127.0,6.1,34,1995
4,11862,Father of the Bride Part II,0.0,[Comedy],Just when George Banks has recovered from his ...,76578911,106.0,5.7,173,1995


If you followed along with the comments while we cleaned data, you noted that we made some minor changes, primarily in chaining some of the functions that Banik uses. While it's not necessary to chain functions, it does simplify our code, so we included this method as another option. It's one of the nifty features of Pandas.

### Avoiding "Explode"
Here's where we're going to make a bigger diversion. Banik would have you "explode" the genres. He does this to make it easier to filter on the genres column, but it has an unfortunate side effect of making duplicate rows per movie. It's easy to avoid that by using a different method of filtering. Let's see how that works.

In [3]:
#let's fetch just the first 5 rows of our dataframe
#.copy() makes snip an independent dataframe, so changing snip later on
#neither touches df nor triggers pandas' SettingWithCopyWarning
snip = df[:5].copy()
display(snip)

#let's create a filter that will be True if "Family" is in the list of genres for each movie
hasFamilyFilter = snip['genres'].apply(lambda x: "Family" in x)
print(f'Family filter values \n {hasFamilyFilter}')

#let's create a filter that will be True if "Drama" is in the list of genres of each movie
hasDramaFilter = snip['genres'].apply(lambda x: "Drama" in x)
print(f'Drama filter values \n{hasDramaFilter}')

#let's filter our dataset to just those movies that have Family OR Drama. Note the placement of the parenthesis
display(snip[(hasFamilyFilter) | (hasDramaFilter)])

#let's filter our dataset to just those movies that have Comedy AND Romance OR have a vote_count > 5000.
#let's use variables for our two genres
selected1 = 'Romance'
selected2 = 'Comedy'

#instead of creating stand-alone filters, we'll filter "on the fly" using the apply right in the filter
#again, pay attention to where the parentheses go
snip[(snip['vote_count'] > 5000) | 
     ((snip['genres'].apply(lambda x: selected1 in x)) & 
      (snip['genres'].apply(lambda x: selected2 in x)))]


Unnamed: 0,id,title,budget,genres,overview,revenue,runtime,vote_average,vote_count,year
0,862,Toy Story,30000000.0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",373554033,81.0,7.7,5415,1995
1,8844,Jumanji,65000000.0,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,262797249,104.0,6.9,2413,1995
2,15602,Grumpier Old Men,0.0,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,0,101.0,6.5,92,1995
3,31357,Waiting to Exhale,16000000.0,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",81452156,127.0,6.1,34,1995
4,11862,Father of the Bride Part II,0.0,[Comedy],Just when George Banks has recovered from his ...,76578911,106.0,5.7,173,1995


Family filter values 
 0     True
1     True
2    False
3    False
4    False
Name: genres, dtype: bool
Drama filter values 
0    False
1    False
2    False
3     True
4    False
Name: genres, dtype: bool


Unnamed: 0,id,title,budget,genres,overview,revenue,runtime,vote_average,vote_count,year
0,862,Toy Story,30000000.0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",373554033,81.0,7.7,5415,1995
1,8844,Jumanji,65000000.0,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,262797249,104.0,6.9,2413,1995
3,31357,Waiting to Exhale,16000000.0,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",81452156,127.0,6.1,34,1995


Unnamed: 0,id,title,budget,genres,overview,revenue,runtime,vote_average,vote_count,year
0,862,Toy Story,30000000.0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",373554033,81.0,7.7,5415,1995
2,15602,Grumpier Old Men,0.0,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,0,101.0,6.5,92,1995
3,31357,Waiting to Exhale,16000000.0,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",81452156,127.0,6.1,34,1995


### Fetching unique values from a column of lists
The one other thing we need to be able to do where the exploding helps is to get the list of unique genres. Let's look at how we could do that. We'll do it slightly differently than Banik. And, we'll first break it down into steps so you can see what's happening.

In [4]:
#in steps
#step 1: spread each movie's genre list across columns - one row per movie, NaN where a movie has fewer genres
step1 = snip.apply(lambda movie: pd.Series(movie['genres']), axis=1)
print(f"Step 1\n{step1}")

#step 2: "stack" the columns into one long series (the NaN padding is dropped along the way)
step2 = step1.stack()
print(f"Step 2\n{step2}")

#step 3: keep each genre once, in order of first appearance
step3 = step2.unique()
print(f"Step 3\n{step3}")
print(f"Step 3 is a \n{type(step3)}")

#step 4: numpy arrays can be joined just like lists, so glue the genres into one comma-delimited string
step4 = ', '.join(step3)
print(f"Step 4\n{step4}")


#the same four steps as a single chained expression
allGenres = ', '.join(
    snip.apply(lambda movie: pd.Series(movie['genres']), axis=1)
        .stack()
        .unique()
)
allGenres

Step 1
           0        1        2
0  Animation   Comedy   Family
1  Adventure  Fantasy   Family
2    Romance   Comedy      NaN
3     Comedy    Drama  Romance
4     Comedy      NaN      NaN
Step 2
0  0    Animation
   1       Comedy
   2       Family
1  0    Adventure
   1      Fantasy
   2       Family
2  0      Romance
   1       Comedy
3  0       Comedy
   1        Drama
   2      Romance
4  0       Comedy
dtype: object
Step 3
['Animation' 'Comedy' 'Family' 'Adventure' 'Fantasy' 'Romance' 'Drama']
Step 3 is a 
<class 'numpy.ndarray'>
Step 4
Animation, Comedy, Family, Adventure, Fantasy, Romance, Drama


'Animation, Comedy, Family, Adventure, Fantasy, Romance, Drama'

# Simple Recommender
Simple recommenders simply return a list of values that are sorted by some kind of score. The most complicated bit of a simple recommender is determining the appropriate score to use. There's no right answer for what the score is or how it should be calculated. You'll need to consider the data that you're working with and decide how to calculate a meaningful score. For example, if you're interested in highly-rated movies, you might need to consider both the average rating for a movie and the number of people that rated the movie. Say you have a rating scale from 1 to 5 stars. Your highest average rating would be 5 - a perfect score. But does a rating of 5 by one user mean the same thing as a rating of 5 by 100 users? Probably not.

Banik solves this problem by using the IMDB weighted rating. 

$$\text{Weighted Rating (WR)} = \left(\frac{v}{v+m} \cdot R\right) + \left(\frac{m}{v+m} \cdot C\right)$$
 
Where:
* v is the number of votes garnered by the movie
* m is the minimum number of votes required for the movie to be in the chart (the prerequisite)
* R is the mean rating of the movie
* C is the mean rating of all the movies in the dataset

Banik chose the 80th percentile for the minimum number of votes to be included in the recommender. 

Note that Banik chooses m (our minimum number of votes) based on the whole dataset, because IMDB sets this as the minimum threshold for being included in the ratings. So m is both a part of the metric AND a filter.

Let's also base C on the whole dataset to start.

Let's fetch C and m and filter to movies whose vote_count is greater than or equal to the 80th percentile (the 0.8 quantile). (This is equivalent to keeping the top 20% of movies by number of votes.)

In [5]:
#fetch C (the mean rating) from the whole dataset
C = df['vote_average'].mean()
print(f"C is {C}")

#fetch m (the 80th percentile of vote_count) from the whole dataset
m = df['vote_count'].quantile(.8)
print(f"m is {m}")

#keep only the movies whose vote_count is at or above m
#.copy() gives us an independent dataframe, so adding score columns to it later
#doesn't trigger pandas' SettingWithCopyWarning
df = df[df['vote_count'] >= m].copy()

#see how many movies are left.
df.shape

C is 6.069160000000003
m is 255.20000000000027


(1000, 10)

Our simple recommender just needs the movie score now. Let's write our function to do scoring. Note that unlike Banik's metric, our version takes in x (the row of data) and m & C. Passing all the variables you need into the function is a best practice that Banik does not follow, but we do.

In [6]:
def weighted_rating(x, m, C):
    """Compute the IMDB weighted rating for one movie.

    Parameters
    ----------
    x : mapping or pandas.Series
        One movie; must provide 'vote_count' and 'vote_average'.
    m : number
        Minimum number of votes required to be in the chart.
    C : number
        Mean rating used as the prior.

    Returns
    -------
    number
        (v / (v + m)) * R + (m / (v + m)) * C, where v is the movie's vote
        count and R its average rating.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    # Blend the movie's own rating with the prior C, weighted by how many votes it has
    own_part = votes / total * rating
    prior_part = m / total * C
    return own_part + prior_part

Let's apply the score to the dataframe.

In [7]:
#score every remaining movie (row by row) with m and C taken from the whole dataset
df['score1'] = df.apply(lambda movie: weighted_rating(movie, m, C), axis=1)
df.head()

Unnamed: 0,id,title,budget,genres,overview,revenue,runtime,vote_average,vote_count,year,score1
0,862,Toy Story,30000000.0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",373554033,81.0,7.7,5415,1995,7.6266
1,8844,Jumanji,65000000.0,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,262797249,104.0,6.9,2413,1995,6.820534
5,949,Heat,60000000.0,"[Action, Crime, Drama, Thriller]","Obsessive master thief, Neil McCauley leads a ...",187436818,170.0,7.7,1886,1995,7.505628
9,710,GoldenEye,58000000.0,"[Adventure, Action, Thriller]",James Bond must unmask the mysterious head of ...,352194034,130.0,6.6,1194,1995,6.506521
12,21032,Balto,0.0,"[Family, Animation, Adventure]",An outcast half-wolf risks his life to prevent...,11348324,78.0,7.1,423,1995,6.712105


What happens if we use C from just the filtered dataframe?

In [8]:
#this time take the mean rating from the already filtered data
C2 = df['vote_average'].mean()
print(f"C is {C2}")

#same m as before, but with C2 as the prior
df['score2'] = df.apply(lambda movie: weighted_rating(movie, m, C2), axis=1)
df.head()

C is 6.805500000000003


Unnamed: 0,id,title,budget,genres,overview,revenue,runtime,vote_average,vote_count,year,score1,score2
0,862,Toy Story,30000000.0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",373554033,81.0,7.7,5415,1995,7.6266,7.659741
1,8844,Jumanji,65000000.0,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,262797249,104.0,6.9,2413,1995,6.820534,6.890962
5,949,Heat,60000000.0,"[Action, Crime, Drama, Thriller]","Obsessive master thief, Neil McCauley leads a ...",187436818,170.0,7.7,1886,1995,7.505628,7.593389
9,710,GoldenEye,58000000.0,"[Adventure, Action, Thriller]",James Bond must unmask the mysterious head of ...,352194034,130.0,6.6,1194,1995,6.506521,6.636188
12,21032,Balto,0.0,"[Family, Animation, Adventure]",An outcast half-wolf risks his life to prevent...,11348324,78.0,7.1,423,1995,6.712105,6.989183


You can see that it does indeed make a difference in the score. But would it make a difference in our recommendations?

## Simple Recommender Self-Assessment
### *Self-Assessment: Load and Display*

Load the data set **ted_main.csv** and display the first 5 rows. This data set can be found in the presentation download for this lesson.  More information about this data set <a href = https://www.kaggle.com/rounakbanik/ted-talks> here </a>.  

In [9]:
# enter your code here

### *Self-Assessment: Pandas*

How many talks are in the TED Talks data frame?

### *Self-Assessment: Prerequisites*

Select TED talks with these prerequisites:

1. talks with duration of at least 5 minutes (i.e. 300 seconds)
2. talks with only 1 speaker
3. talks in the top 90\% of views (exclude the bottom 10\%)

Also inspect the number of talks that made the cut.

In [10]:
# enter your code here

### *Self-Assessment: Compute a Metric, Sort and Print*

In the absence of numerical ratings here, use the ratio of the number of comments per 1000 views as a metric to sort the TED talks and print the 10 with the highest ratios.  

Display only the description, the main speaker, and the number of views.

In [11]:
# enter your code here

## Knowledge-Based Recommender

The knowledge-based recommender is just a simple recommender that takes in some input from the user. Banik describes it as a recommender that:
1. Gets user input on their preferences
2. Extracts all the movies that match the conditions set by the user.
3. Calculates the values of m and C for **ONLY THOSE MOVIES** and uses m and C to calculate scores and return the results.

Note that Banik's approach here is inconsistent with what he'd previously said about how the IMDB weighted metric is calculated. That's okay, we're going to follow his advice, but we'll compare the different approaches to computing the score afterward.

We already have most of the pieces that we need. Let's just look at how to wrap it all up in a function.

Our function will take in a cleaned dataframe and a percentile to use for m. By default, the percentile will be .8. Note that the only changes we're making here from Banik's metric is to adjust how we do the genres filter.

In [12]:
def build_chart(gen_df, percentile=0.8):
    """Interactively build a chart of movies ranked by the IMDB weighted rating.

    Prompts (via ``input``) for a genre, a runtime range and a year range,
    keeps the movies in ``gen_df`` that satisfy all of them, and scores the
    survivors with m and C computed from those matching movies only.

    Parameters
    ----------
    gen_df : pandas.DataFrame
        Cleaned movie data with 'genres' (list of genre names), 'runtime',
        'year', 'vote_average' and 'vote_count' columns. It is not modified.
    percentile : float, default 0.8
        Quantile of 'vote_count' (among the matching movies) used as m, the
        minimum number of votes needed to appear in the chart.

    Returns
    -------
    pandas.DataFrame
        Matching movies with vote_count >= m plus a 'score' column, sorted by
        'score' in descending order. Empty (but still with a 'score' column)
        when no movie matches the request.

    Raises
    ------
    ValueError
        If a duration or year answer is not an integer.
    """
    #Ask for preferred genres (surrounding whitespace is ignored so "Family " still matches)
    print("Input preferred genre")
    genre = input().strip()
    
    #Ask for lower limit of duration
    print("Input shortest duration")
    low_time = int(input())
    
    #Ask for upper limit of duration
    print("Input longest duration")
    high_time = int(input())
    
    #Ask for lower limit of timeline
    print("Input earliest year")
    low_year = int(input())
    
    #Ask for upper limit of timeline
    print("Input latest year")
    high_year = int(input())
    
    #Build one boolean mask for all the conditions (both range limits are inclusive)
    has_genre = gen_df['genres'].apply(lambda x: genre in x).astype(bool) #filtering based on a list.
    in_runtime = gen_df['runtime'].between(low_time, high_time)
    in_years = gen_df['year'].between(low_year, high_year)
    
    #Boolean indexing returns a new dataframe, so gen_df itself is left untouched
    movies = gen_df[has_genre & in_runtime & in_years]
    
    #Nothing matched: return an empty chart instead of failing while scoring
    if movies.empty:
        return movies.assign(score=pd.Series(index=movies.index, dtype='float64'))
    
    #Compute the values of C and m for the filtered movies
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)
    
    #Only consider movies that have at least m votes. Copy so we can add the score column safely
    q_movies = movies[movies['vote_count'] >= m].copy()
    
    #Calculate score using the IMDB formula, on whole columns at once
    v = q_movies['vote_count']
    R = q_movies['vote_average']
    q_movies['score'] = (v/(v+m) * R) + (m/(m+v) * C)

    #Sort movies in descending order of their scores
    q_movies = q_movies.sort_values('score', ascending=False)
    
    return q_movies

We'll test our code by requesting movies that have the genres 'Family' and a runtime between 80 and 120 minutes and a year between 1980 and 2000. We'll return the output to a variable so we can review it in different ways.

In [13]:
#build_chart prompts for a genre, a runtime range and a year range;
#.8 sets m to the 80th percentile of vote_count among the movies that match
out_movies = build_chart(df, .8)

Input preferred genre


 Family

Input shortest duration


 80

 

Input longest duration


 

Input earliest year


 

Input latest year


We have two other versions of scores in the dataset. Let's compare the rankings if we sort in different ways.

In [14]:
#out_movies is still in the order we received it, so numbering the rows 0, 1, 2, ... gives the score rank
out_movies['scoreRank'] = np.arange(len(out_movies))

#for each of the other two scores: sort by that score, then number the rows to get its rank
for _score_col, _rank_col in (('score1', 'score1Rank'), ('score2', 'score2Rank')):
    out_movies = out_movies.sort_values(_score_col, ascending=False)
    out_movies[_rank_col] = np.arange(len(out_movies))

#go back to the ordering by score
out_movies = out_movies.sort_values('score', ascending=False)

#display the final result with just name and scores
out_movies[['title','score1Rank', 'score2Rank', 'scoreRank' ]]

Unnamed: 0,title,score1Rank,score2Rank,scoreRank
1225,Back to the Future,0,0,0
359,The Lion King,1,1,1
0,Toy Story,2,2,2
588,Beauty and the Beast,4,4,3
1902,Back to the Future Part II,5,5,4
581,Aladdin,6,6,5
1798,Mulan,3,3,6
2997,Toy Story 2,7,7,7
1065,E.T. the Extra-Terrestrial,8,8,8
1903,Back to the Future Part III,10,10,9


Did the different methods of computing the score impact the recommendation results in a significant way?

## Knowledge-based Recommender Self Assessment
For this example we will use the TED Talks data set that you have already loaded to build a knowledge-based recommender by soliciting the desired publication year and word rating from the user.

### *Self-Assessment: Dealing with Dates*

Extract the year of the talk from the feature called **published_date** and put it in a new variable called **published_year**.  

First, the film dates need to be converted to datetime objects and then extract the year of the film date.  However, for the TED Talks data, include the argument *unit='s'* in the **to_datetime()** function in order to convert the dates correctly (based on the number of seconds to the unix epoch start).

Then convert **published_year** to an integer data type and be sure that there are no NAT values among them.

In [15]:
# enter your code here

### *Self-Assessment: Stringified Dictionaries*

Since we will be asking the user to enter a descriptive word rating to select a talk and the feature  **ratings** is a stringified dictionary, convert the list of dictionaries into a list of strings. Do not explode like the book does. Follow the directions in this lesson.

In [16]:
# enter your code here

### *Self-Assessment: Create the Knowledge-Based Recommender*

1. Print a list of the descriptive word ratings for the user to choose from. (*Hint: follow the directions in this lesson.*)

2. Ask the user to enter answers to the following questions:

    - Enter a descriptive word for rating.
    - Enter the earliest year published for the talk (between 2006 and 2017).
    - Enter the latest year published for the talk (between 2006 and 2017).

3. Consider only talks with the top 90% of views (after filtering based on user preferences).

4. Display the top 5 recommended talks according to the "comments per 1000 views" ratio (calculated AFTER doing steps 2 & 3).

5. Display only the main speaker, the name of the talk, the year published, and the comments per thousand views ratio.

6.  Show the results for the word rating "obnoxious" and published years between 2009 and 2014.

In [17]:
# enter your code here

## Content-Based Recommender

There are a number of different approaches to doing content-based recommenders. What they all share is that they use some method to determine a similarity metric between any two items that you could recommend. Then, given one "seed" item, we can recommend items that are similar to that item.

Banik uses Term Frequency-Inverse Document Frequency and Cosine Similarity. Be sure to read the book to fully understand these concepts. But we'll step through it here for you with the "snip" dataset we created above.

Note also that you can <a href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html">read the documentation</a> for the TfidfVectorizer. By default, it will convert everything to lowercase. There are many other parameters that you can pass in to change how it preprocesses the words. We're passing in one - stop_words.

### Computing TF-IDF

In [18]:
#a reminder of what snip holds
display(snip)

#TfidfVectorizer was already imported in the first cell of the notebook

#Create the vectorizer and ask it to drop english stopwords from whatever text we pass to it
tfidf = TfidfVectorizer(stop_words='english')

#The vectorizer needs strings, so any missing overview (NaN) is replaced with an empty string.
#snip has no missing overviews, but we include the step as a good practice
snip['overview'] = snip['overview'].fillna('')

#Learn the vocabulary from the overviews and build the TF-IDF matrix in one call
tfidf_matrix = tfidf.fit_transform(snip['overview'])
#Show the shape of tfidf_matrix
tfidf_matrix.shape

Unnamed: 0,id,title,budget,genres,overview,revenue,runtime,vote_average,vote_count,year
0,862,Toy Story,30000000.0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",373554033,81.0,7.7,5415,1995
1,8844,Jumanji,65000000.0,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,262797249,104.0,6.9,2413,1995
2,15602,Grumpier Old Men,0.0,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,0,101.0,6.5,92,1995
3,31357,Waiting to Exhale,16000000.0,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",81452156,127.0,6.1,34,1995
4,11862,Father of the Bride Part II,0.0,[Comedy],Just when George Banks has recovered from his ...,76578911,106.0,5.7,173,1995


(5, 131)

Because we have such a small matrix, we can actually print it and take a peek at what's going on in there. In order to print a tfidf matrix, we need to convert it to something that can be displayed.

*Note: You won't need to do this bit in your homework or self-assessment, but it's helpful in understanding what we have.*

In [19]:
#this extracts all the words (features) in the matrix - we'll use this for our columns
#get_feature_names() was removed in scikit-learn 1.2 in favor of get_feature_names_out() (added in 1.0),
#so use the new method when it exists and fall back to the old one on older installs
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
#this extracts the IDs of the movies - we'll use this for our rows
corpus_index = snip['id']
#this puts both into a dataframe. 
#The tfidf_matrix is a sparse matrix, meaning only the non-zero row/col combinations are stored.
#Using toarray() gives a regular numpy array with a zero in every empty row/col slot

pd.DataFrame(tfidf_matrix.toarray(), index=corpus_index, columns=feature_names)


Unnamed: 0_level_0,26,adult,afraid,alan,alarming,ancient,andy,arrival,aside,away,...,vannah,waiting,way,wedding,wife,women,woody,world,worry,years
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862,0.0,0.0,0.143369,0.0,0.0,0.0,0.430106,0.0,0.143369,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.430106,0.0,0.0,0.0
8844,0.152582,0.152582,0.0,0.305165,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152582,0.0,0.152582
15602,0.0,0.0,0.0,0.0,0.174202,0.174202,0.0,0.0,0.0,0.174202,...,0.0,0.0,0.0,0.140545,0.0,0.0,0.0,0.0,0.174202,0.0
31357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.2,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0
11862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186823,0.0,0.0,...,0.0,0.0,0.0,0.150728,0.186823,0.0,0.0,0.0,0.0,0.0


When we print out this matrix, we can see that "afraid" is an important word in the description of Toy Story, but not in any of our other movies. But "wedding" is important in both Grumpier Old Men and Father of the Bride.

### Computing Cosine Similarity
Now that we have a tfidf matrix, we can compute the cosine similarity of the matrix with itself. What this is doing is determining the overall similarity score between each pair of movies. In general, a cosine similarity score is a number between -1 and 1; because TF-IDF values are never negative, the scores here fall between 0 and 1. The higher the number, the more similar two items are.

In [20]:
# linear_kernel (already imported in the first cell) computes the pairwise dot products.
# TfidfVectorizer scales every row to unit length by default, so these dot products are the cosine similarities.

# Compute the cosine similarity matrix: every movie against every movie
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

#label the rows and the columns with the movie titles so we can read the scores
pd.DataFrame(data=cosine_sim, index=snip['title'], columns=snip['title'])

title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Toy Story,1.0,0.014239,0.0,0.0,0.0
Jumanji,0.014239,1.0,0.034603,0.0,0.0
Grumpier Old Men,0.0,0.034603,1.0,0.0,0.021184
Waiting to Exhale,0.0,0.0,0.0,1.0,0.0
Father of the Bride Part II,0.0,0.0,0.021184,0.0,1.0


Note that what we get is a matrix (which we've converted to a dataframe for display) that scores each movie in relation to another movie. A movie scored with itself will always be a one. Note that in our tiny dataset here, there's nothing very similar to Toy Story. 

Which two movies are most similar?

Banik uses a reverse mapping of indexes and titles to fetch data from the cosine similarity matrix. Let's take a look at what that is doing.

In [21]:
#create the reverse mapping: movie title -> row position in snip
#(row positions are what cosine_sim and iloc use, even when the dataframe's index labels are not 0, 1, 2, ...)
indices = pd.Series(range(len(snip)), index=snip['title'])
#if a title appears more than once, keep only its first occurrence so a lookup always returns a single position
indices = indices[~indices.index.duplicated(keep='first')]
#print it 
print(f'The index series looks like this: \n{indices}')

#if I wanted to get the index from the title I would do this:
print(f'The index for Waiting to Exhale is: {indices["Waiting to Exhale"]}')
      


The index series looks like this: 
title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64
The index for Waiting to Exhale is: 3


Let's also break down what's going on with converting our cosine similarity to a list of tuples. 

The first thing we're doing is getting the row from the matrix that corresponds to the movie we want to review. Let's say we want to review Grumpier Old Men. Grumpier Old Men has an index of 2. Let's fetch that row.

In [22]:
#row 2 of the similarity matrix: Grumpier Old Men scored against each of the 5 movies
cosine_sim[2]

array([0.       , 0.034603 , 1.       , 0.       , 0.0211841])

The <a href="https://book.pythontips.com/en/latest/enumerate.html">enumerate function</a> loops over some iterable object and returns a counter and the value for each item in the iterable. We can see that what cosine_sim[2] returns is an array, which is an iterable object. We can't directly print the results from enumerate, so we have to wrap it in a list function.

What this results in is a list of tuples that correspond to the column number and the cosine similarity score for each movie that we compared to Grumpier Old Men. Which column number would be Grumpier Old Men compared with itself?

In [23]:
#pair every score in row 2 with its column number: [(column, score), ...]
sim_scores = list(enumerate(cosine_sim[2]))
sim_scores

[(0, 0.0),
 (1, 0.03460299560436649),
 (2, 0.9999999999999992),
 (3, 0.0),
 (4, 0.02118409541454349)]

We can see that the most similar movie to Grumpier Old Men is... Grumpier Old Men. This makes sense - it's the same movie! We don't want that movie in our results, though. Since we know this is a balanced matrix (the indexes are the same for the columns and for the rows), we can just delete the item with our Grumpier Old Men index. Remember, that's 2. Let's see how that works.

In [24]:
#the list is still in column order, so list position 2 holds the tuple for column 2 (the movie itself)
del sim_scores[2]
sim_scores

[(0, 0.0), (1, 0.03460299560436649), (3, 0.0), (4, 0.02118409541454349)]

Great. That got rid of the tuple that corresponded to the column Grumpier Old Men.

The next thing we do is to sort this list by the score (the second bit of the tuple). We're using a lambda function to do that. Let's see what we get when we sort.

In [25]:
#sort the (column, score) tuples by the score (x[1]), highest score first
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores

[(1, 0.03460299560436649), (4, 0.02118409541454349), (0, 0.0), (3, 0.0)]

We can see that column 1 and column 4 of our matrix contain our 2 most similar movies. But, what movies are those? We need to go back to our dataframe to figure that out. Let's extract just the indices for our top 2 movies. Finally, we'll use <a href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html">iloc</a> to find the corresponding movie titles.

In [26]:
#keep only the column number from each of the first two (column, score) tuples
top_two = [column for column, _ in sim_scores[0:2]]
print(f'The top two indices are: {top_two}')

#look the movies up by row position
snip.iloc[top_two]

The top two indices are: [1, 4]


Unnamed: 0,id,title,budget,genres,overview,revenue,runtime,vote_average,vote_count,year
1,8844,Jumanji,65000000.0,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,262797249,104.0,6.9,2413,1995
4,11862,Father of the Bride Part II,0.0,[Comedy],Just when George Banks has recovered from his ...,76578911,106.0,5.7,173,1995


### Recommender Function

Let's wrap this up in a function. We're going to do this slightly differently than Banik did. 
* We'll avoid giving it any variable defaults (which is a good practice unless you're hard-coding the defaults). 
* We'll also pass in the number of results to return. This is a hard-coded number, so we will set a default for that.
* We'll delete the passed-in movie explicitly, instead of assuming it's the first after sorting
* We'll return the whole dataframe, not just the titles

In [27]:
def content_recommender(df, title, cosine_sim, indices, topN=2):
    """Return the rows of ``df`` for the ``topN`` movies most similar to ``title``.

    Parameters
    ----------
    df : DataFrame whose row positions line up with the rows/columns of ``cosine_sim``.
    title : title of the movie to find recommendations for.
    cosine_sim : square matrix of pairwise similarity scores, indexable as ``cosine_sim[idx]``.
    indices : lookup from title to row position (e.g. a Series indexed by title).
    topN : number of recommendations to return (default 2).

    Returns
    -------
    The ``topN`` most similar rows of ``df``, most similar first. The movie that
    was passed in is never part of the result.

    Raises
    ------
    Whatever ``indices[title]`` raises when the title is unknown (``KeyError`` for
    a pandas Series or a dict).
    """
    # Obtain the index of the movie that matches the title
    idx = indices[title]
    # A title that appears more than once in a Series lookup comes back as a Series
    # of positions instead of a single number, which cannot be used to index the
    # list below. Fall back to the first matching movie in that case.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all movies with that movie and convert to tuples
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Delete the movie that was passed in (explicitly, rather than assuming it sorts first)
    del sim_scores[idx]

    # Sort the movies based on the cosine similarity scores, most similar first
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the row positions of the top-N most similar movies
    movie_indices = [i for i, _ in sim_scores[:topN]]

    # Return the top-N most similar movies
    return df.iloc[movie_indices]

Let's test our recommender with Grumpier Old Men and our snipped dataset again.

In [28]:
content_recommender(snip, 'Grumpier Old Men', cosine_sim, indices, topN=2)

Unnamed: 0,id,title,budget,genres,overview,revenue,runtime,vote_average,vote_count,year
1,8844,Jumanji,65000000.0,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,262797249,104.0,6.9,2413,1995
4,11862,Father of the Bride Part II,0.0,[Comedy],Just when George Banks has recovered from his ...,76578911,106.0,5.7,173,1995


## Content-Based Recommender Self Assessment

For this example we will use the TED Talks data set that you have already loaded to build a content-based recommender based on the descriptions of the talks.  This will correspond to the **plot description-based recommender**.

### *Self-Assessment: TF-IDF Vectors*

From the original TED Talks data frame that we use in this lesson, create the TF-IDF (term frequency - inverse document frequency) matrix from the descriptions of the talks.  The TF-IDF is high where a rare term is present or frequent in a document and TF-IDF is near zero where a term is absent from a document, or abundant across all documents.

The feature name in the data frame is **description**.

Output the shape of the TF-IDF matrix you create. The number of rows corresponds to the number of TED talks in the data frame and the number of columns represents the number of unique terms. 

In [29]:
# enter your code here

### *Self-Assessment: Create the Content-Based Recommender Based on Cosine Similarity*

Compute the cosine similarity score for all of the TED talks in the data frame. Next, build a recommender that takes the name of a TED talk in the data frame and returns the top 5 recommended talks, based on how similar their descriptions are to the description of the talk supplied.

Show that it works by getting the top 5 recommended talks that are similar to the talk named "Tyler Cowen: Be suspicious of simple stories" (from the `name` column of the data frame).

In [30]:
# enter your code here

# Metadata-Based Recommender

A metadata-based recommender is nothing more than a content-based recommender that has more words, and sometimes more specific words. Banik demonstrates this with keywords and credits and we have that code for you in the Content Based Recommenders file in this directory. 

We can also demonstrate the basic principles with our snip dataset. We already have our genres in a list, none of our snip movies has more than 3 genres, and each of our genres is only one word. So we don't have to generate lists or sanitize anything. We simply need to create a soup of our overview and our genres.

Note: genres is a list, so we'll need to use ' '.join() to turn it into a string. Overview is a string, so we just need to add that string onto the end of the string created after ' '.join()ing the genres. Be sure to add a space in between.

In [31]:
#Reminder of what snip currently holds, shown before the soup column is added to it
display(snip)

#Build the metadata "soup" for a single movie row
def create_soup(x):
    """Return the row's genres joined by spaces, then a space, then its overview."""
    genre_words = ' '.join(x['genres'])
    return genre_words + ' ' + x['overview']

#Build the soup for every row (axis='columns' hands create_soup one row at a time)
#and store the result as a new column
snip['soup'] = snip.apply(create_soup, axis='columns')

#Label 0 is Toy Story
print(f'The soup for Toy Story is: \n{snip["soup"][0]}')

Unnamed: 0,id,title,budget,genres,overview,revenue,runtime,vote_average,vote_count,year
0,862,Toy Story,30000000.0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",373554033,81.0,7.7,5415,1995
1,8844,Jumanji,65000000.0,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,262797249,104.0,6.9,2413,1995
2,15602,Grumpier Old Men,0.0,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,0,101.0,6.5,92,1995
3,31357,Waiting to Exhale,16000000.0,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",81452156,127.0,6.1,34,1995
4,11862,Father of the Bride Part II,0.0,[Comedy],Just when George Banks has recovered from his ...,76578911,106.0,5.7,173,1995


The soup for Toy Story is: 
Animation Comedy Family Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.


Banik used a count vectorizer instead of a tf-idf vectorizer for his metadata recommender. He does this because using a tf-idf would downweight actors that appear in more than one movie. The same thing would happen with genres. So we'll follow suit and use the count vectorizer here.

You can read <a href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">the documentation for CountVectorizer</a> to learn more about it.

In [32]:
#CountVectorizer and cosine_similarity were already imported in the first cell of the notebook

#Turn each soup into a vector of raw word counts (English stop words removed)
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(snip['soup'])

#Pairwise cosine similarity between every pair of soups
#(with a single argument, the matrix is compared against itself)
cosine_sim2 = cosine_similarity(count_matrix)

#Title -> row lookup (these haven't actually changed)
indices2 = pd.Series(data=snip.index, index=snip['title'])

#Same recommender, same movie - only the similarity matrix and lookup are new
content_recommender(snip, 'Grumpier Old Men', cosine_sim2, indices2, topN=2)

Unnamed: 0,id,title,budget,genres,overview,revenue,runtime,vote_average,vote_count,year,soup
1,8844,Jumanji,65000000.0,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,262797249,104.0,6.9,2413,1995,Adventure Fantasy Family When siblings Judy an...
3,31357,Waiting to Exhale,16000000.0,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",81452156,127.0,6.1,34,1995,"Comedy Drama Romance Cheated on, mistreated an..."


Even with this tiny dataset, switching from TF-IDF with the linear kernel to CountVectorizer with cosine similarity changed our top two results.

It's worth trying different approaches with your data to determine the right fit.