# 2. Content-Based Recommendations

In [2]:
import numpy as np
import pandas as pd

### Rating data

In [3]:
df_rating = pd.read_csv('data/user_ratings.csv')
print('Num de ratings:', df_rating.shape[0])
df_rating.head(3)

Num de ratings: 100836


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


### Movies data

In [7]:
df_movies = pd.read_csv('data/movies.csv')
print('Num. de películas:', df_movies.shape[0])
df_movies.head(3)

Num. de películas: 9742


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [10]:
#df_movies['genres'].apply(lambda x: pd.Series(str(x).split('|')))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Adventure,Animation,Children,Comedy,Fantasy,,,,,
1,Adventure,Children,Fantasy,,,,,,,
2,Comedy,Romance,,,,,,,,
3,Comedy,Drama,Romance,,,,,,,
4,Comedy,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
9737,Action,Animation,Comedy,Fantasy,,,,,,
9738,Animation,Comedy,Fantasy,,,,,,,
9739,Drama,,,,,,,,,
9740,Action,Animation,,,,,,,,


In [45]:
df_movies['generos'] = df_movies['genres'].apply(lambda x: str(x).split('|'))
df_movies.head()

Unnamed: 0,movieId,title,genres,generos
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),Comedy,[Comedy]


In [21]:
df_movies['generos'].values

array([list(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']),
       list(['Adventure', 'Children', 'Fantasy']),
       list(['Comedy', 'Romance']), ..., list(['Drama']),
       list(['Action', 'Animation']), list(['Comedy'])], dtype=object)

In [8]:
# extract year of release of each movie from the title column
# convert the data type of the movie_year column to numeric (from str)

import re 
#movies['rel_year'] = movies.title.str[-5:-1]
#movies['rel_year'] = movies.title.apply(lambda x: x[-5:-1])
df_movies['movie_year'] = df_movies['title']
#movies['movie_year'] = movies['movie_year'].apply(lambda x: re.findall('\((.*?)\)',x))
df_movies['movie_year'] = df_movies['movie_year'].str.extract(r"\(([0-9]+)\)", expand=False)


# creating a new column with just the movie titles
df_movies['name'] = df_movies['title']
df_movies['name'] = df_movies['name'].str.extract('(.*?)\s*\(', expand=False)

In [None]:
df_movies

#### Other way to do it:

## 2. 1 Creating content-based data
We first need to get our data in a usable format. We will explore our base data and work through how to format that data to be used for content-based recommendations.

The desired outcome is a row per movie with each column indicating whether a genre applies to the movie, i.e., a dataFrame containing a row per movie, and each of its attributes as columns.

In [12]:
movie_att = df_movies.join(df_movies.pop('genres').str.get_dummies('|'))

In [13]:
movie_att

Unnamed: 0,movieId,title,movie_year,name,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),1995,Toy Story,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),1995,Jumanji,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),1995,Grumpier Old Men,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),1995,Waiting to Exhale,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),1995,Father of the Bride Part II,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),2017,Black Butler: Book of the Atlantic,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
9738,193583,No Game No Life: Zero (2017),2017,No Game No Life: Zero,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
9739,193585,Flint (2017),2017,Flint,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),2018,Bungo Stray Dogs: Dead Apple,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
movie_att.columns

Index(['movieId', 'title', 'movie_year', 'name', '(no genres listed)',
       'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [18]:
movie_gen = movie_att[['name', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western']]
movie_gen.set_index('name', inplace = True)

In [19]:
movie_gen

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Toy Story,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Jumanji,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Grumpier Old Men,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
Waiting to Exhale,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
Father of the Bride Part II,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Black Butler: Book of the Atlantic,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
No Game No Life: Zero,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Flint,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
Bungo Stray Dogs: Dead Apple,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Another way to do it:

Inspect the rows corresponding to 'Toy Story' and 'Yogi Bear' in movie_gen. How many genres do they have in common?

In [20]:
movie_gen.loc['Toy Story', :]

Action         0
Adventure      1
Animation      1
Children       1
Comedy         1
Crime          0
Documentary    0
Drama          0
Fantasy        1
Film-Noir      0
Horror         0
IMAX           0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
Name: Toy Story, dtype: int64

In [21]:
movie_gen.loc['Yogi Bear', :]

Action         0
Adventure      0
Animation      0
Children       1
Comedy         1
Crime          0
Documentary    0
Drama          0
Fantasy        0
Film-Noir      0
Horror         0
IMAX           0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
Name: Yogi Bear, dtype: int64

Yogi Bear and Toy Story both have the 'Children' and 'Comedy' attributes. The more genres that two movies have in common, the more likely it is that someone who liked one will like the other, so now we're going to apply this at a larger scale instead of just one pair of movies.

## 2. 2 Comparing individual movies with Jaccard similarity
We've just built a DataFrame of movies, where each column represents a different genre. Now, we can now use this DataFrame to compare movies by measuring the **Jaccard similarity between rows**. 

    `The higher the Jaccard similarity score, the more similar the two items are.`

We will compare the movie GoldenEye with the movie Toy Story, and GoldenEye with SkyFall and compare the results.

We'll use the movie_cross_table `movie_gen` containing all the movies as rows and the genres as Boolean columns.

In [22]:
# Import numpy and the distance metric
from sklearn.metrics import jaccard_score

# Extract just the rows containing GoldenEye and Toy Story
goldeneye_values = movie_gen.loc['GoldenEye'].values
toy_story_values = movie_gen.loc['Toy Story'].values

In [23]:
goldeneye_values

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

In [24]:
toy_story_values

array([0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [27]:
# Find the similarity between GoldenEye and Toy Story
print('Similarity between GoldenEye and Toy Story:',jaccard_score(goldeneye_values, toy_story_values))

# Repeat for GoldenEye and Skyfall
skyfall_values = movie_gen.loc['Skyfall'].values
print('Similarity between GoldenEye and Skyfall:',jaccard_score(goldeneye_values, skyfall_values))

Similarity between GoldenEye and Toy Story: 0.14285714285714285
Similarity between GoldenEye and Skyfall: 0.75


Based on Jaccard similarity, GoldenEye and Skyfall (both James Bond movies) are more similar than GoldenEye and Toy Story (a spy movie and an animated kids movie).

## Comparing all your movies at once
While finding the Jaccard similarity between any two individual movies in our dataset is great for small-scale analyses, it can prove slow on larger datasets to make recommendations.

Now we will find the similarities between all movies and store them in a DataFrame for quick and easy lookup.

When finding the similarities between the rows in a DataFrame, you could run through all pairs and calculate them individually, but it's more efficient to use the **`pdist()` (pairwise distance) function from scipy.**

This can be reshaped into the desired rectangular shape using **`squareform()`** from the same library. 

Since we want **similarity values** as opposed to distances, you should `subtract the values from 1`.

In [34]:
# Import functions from scipy
from scipy.spatial.distance import pdist, squareform

# Calculate all pairwise distances
jaccard_distances = pdist(movie_gen.values, metric='jaccard')

# Convert the distances to a square matrix
jaccard_similarity_array = 1 - squareform(jaccard_distances)

# Wrap the array in a pandas DataFrame
jaccard_similarity_df = pd.DataFrame(jaccard_similarity_array, index = movie_gen.index, columns = movie_gen.index)

# Print the top 5 rows of the DataFrame
jaccard_similarity_df.head()

name,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,...,Gintama: The Movie,anohana: The Flower We Saw That Day - The Movie,Silver Spoon,Love Live! The School Idol Movie,Jon Stewart Has Left the Building,Black Butler: Book of the Atlantic,No Game No Life: Zero,Flint,Bungo Stray Dogs: Dead Apple,Andrew Dice Clay: Dice Rules
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,1.0,0.6,0.166667,0.142857,0.2,0.0,0.166667,0.4,0.0,0.142857,...,0.285714,0.166667,0.166667,0.2,0.0,0.5,0.6,0.0,0.166667,0.2
Jumanji,0.6,1.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.166667,0.2,0.0,0.0,0.0
Grumpier Old Men,0.166667,0.0,1.0,0.666667,0.5,0.0,1.0,0.0,0.0,0.0,...,0.2,0.0,0.333333,0.0,0.0,0.2,0.25,0.0,0.0,0.5
Waiting to Exhale,0.142857,0.0,0.666667,1.0,0.333333,0.0,0.666667,0.0,0.0,0.0,...,0.166667,0.25,0.666667,0.0,0.0,0.166667,0.2,0.333333,0.0,0.333333
Father of the Bride Part II,0.2,0.0,0.5,0.333333,1.0,0.0,0.5,0.0,0.0,0.0,...,0.25,0.0,0.5,0.0,0.0,0.25,0.333333,0.0,0.0,1.0


In [32]:
len(jaccard_distances)

47448411

In [33]:
squareform(jaccard_distances).shape

(9742, 9742)

The table has the movies as rows and columns, allowing you to quickly look up any distance of any movie pairing

## Making recommendations based on movie genres
Now that we have our data in a usable format and know how to compare two movies, the next step is to use this to generate recommendations. 

We will generate recommendations for any movie in our dataset.

In [40]:
# Find the values for the movie Thor
jaccard_similarity_series = jaccard_similarity_df.loc['Thor']

# Sort these values from highest to lowest
ordered_similarities = jaccard_similarity_series.sort_values(ascending = False)

# Print the results
print(ordered_similarities)

name
Thor                                            1.000000
Harry Potter and the Deathly Hallows: Part 2    0.833333
Beowulf & Grendel                               0.800000
The Huntsman Winter's War                       0.800000
In the Name of the King III                     0.800000
                                                  ...   
Daria: Is It Fall Yet?                          0.000000
Late Night Shopping                             0.000000
Darkness                                        0.000000
American Psycho II: All American Girl           0.000000
Andrew Dice Clay: Dice Rules                    0.000000
Name: Thor, Length: 9742, dtype: float64


"Harry Potter and the Deathly Hallows: Part 2" has the highest similarity value to Thor. This means that viewers that liked Thor are likely to enjoy "Harry Potter and the Deathly Hallows: Part 2" also.

## Text-based similarities

### Instantiate the TF-IDF model
**TF-IDF (Term Frequency Inverse Document Frequency)** by default generates a column for every word in all of your documents (movie summaries in our case). This creates a huge and unintuitive dataset as it will contain both very common words that appear in every document, and words that appear so rarely they provide no value in finding similarities between items.

We will work with the df_plots DataFrame. It contains movies' names in the Title column and their plots in the Plot column.

Using this DataFrame, we will generate the default TF-IDF scores and see if non-valuable columns are present.

W will go on to rerun the TF-IDF calculations, this time limiting the number of columns using the `min_df` and `max_df` arguments and hopefully see the improvement.

In [69]:
wikiplots = pd.read_csv('data/wiki_movie_plots.csv')

In [70]:
wikiplots.head(2)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."


In [71]:
df_plots = wikiplots[['Title', 'Plot']]
df_plots

Unnamed: 0,Title,Plot
0,Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr..."
1,Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov..."
2,The Martyred Presidents,"The film, just over a minute long, is composed..."
3,"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...
4,Jack and the Beanstalk,The earliest known adaptation of the classic f...
...,...,...
34881,The Water Diviner,"The film begins in 1919, just after World War ..."
34882,Çalgı Çengi İkimiz,"Two musicians, Salih and Gürkan, described the..."
34883,Olanlar Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,Non-Transferable,The film centres around a young woman named Am...


In [72]:
df_plots.isna().sum()

Title    0
Plot     0
dtype: int64

**In order to have a reasonable sample size for not having memory problems, I am going to take a random sample from the dataframe**   
(Try doing this with the entire dataset in Google Colab)

In [85]:
#df = df.sample(n=3)
df_plots = df_plots.sample(frac=0.10, random_state=42)

In [86]:
df_plots.shape

(349, 2)

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the vectorizer object to the vectorizer variable
vectorizer = TfidfVectorizer(min_df = 2, max_df = 0.7)

# Fit and transform the plot column
vectorized_data = vectorizer.fit_transform(df_plots['Plot'])

# Look at the features generated
print(vectorizer.get_feature_names())



We now have a way of trainsforming free bodies of text into structured arrays, with each relevant word being stored as a feature. This can be used to to measure similarities between items and make recommendations, even for items that you have no structured attribute data for.

## Creating the TF-IDF DataFrame
Now that we have generated our TF-IDF features, we will need to get them in a format that we can use to make recommendations. 

We will once again leverage pandas for this and wrap the array in a DataFrame. As we will be using the movie titles to do our filtering of the data, we can assign the titles to the DataFrame's index.

In [88]:
vectorized_data.shape

(349, 6271)

In [90]:
# Create Dataframe from TF-IDFarray. The tfidf_df DataFrame contains the movies and their TF-IDF features.
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns = vectorizer.get_feature_names())

# Assign the movie titles to the index and inspect
tfidf_df.index = df_plots['Title']
tfidf_df.head()

Unnamed: 0_level_0,000,10,100,105,12,14,140,15,16,18,...,young,younger,youngest,youngsters,your,youth,yun,yvette,zhao,zombie
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Resident Evil: Apocalypse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.283391
The Black Swan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Suffering Man's Charity,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.108572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Koodal Nagar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.095971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Crossed Swords,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The tfidf_df DataFrame contains the movies and their TF-IDF features.

We now are able to manipulate text data into DataFrames with each row representing an item, and each column represeting a word extracted from the texts. You will be able to use this in a similar way to the attribute DataFrames you generated previously to to measure similarities between items and make recommendations.

## Comparing all your movies with TF-IDF
Now that we have put in the hard work of getting our TF-IDF data into a usable format, it's time to put it to work generating finding similarities and generating recommendations.

This time as we are using TF-IDF scores (which are floats as opposed to Booleans) we will use the **cosine similarity metric** to find the similarities between items. We will generate a matrix of all of the movie cosine similarities and store them in a DataFrame for ease of lookup. This will allow us to compare movies and find recommendations quickly and easily.

In [96]:
# Import cosine_similarity measure
from sklearn.metrics.pairwise import cosine_similarity

# Create the array of cosine similarity values
cosine_similarity_array = cosine_similarity(tfidf_df)

# Wrap the array in a pandas DataFrame
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=tfidf_df.index, columns=tfidf_df.index)

# Print the top 5 rows of the DataFrame
cosine_similarity_df.head()

Title,Resident Evil: Apocalypse,The Black Swan,Suffering Man's Charity,Koodal Nagar,Crossed Swords,Pork Chop Hill,Rock 'n' Roll High School Forever,Por Baazar,All Mine to Give,American Gothic,...,Jude,Railroad Tigers,Lady with Red Hair,En Sakhiye,Fatal Attraction,Afghan Luke,Let's Go to Prison,Yoga,Don't Be a Menace to South Central While Drinking Your Juice in the Hood,Saka - Nankana Sahib De Shaheed
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Resident Evil: Apocalypse,1.0,0.02717,0.01433,0.026994,0.052774,0.036471,0.071161,0.011259,0.032736,0.047479,...,0.053815,0.050334,0.041521,0.043156,0.048659,0.056386,0.022188,0.044728,0.02928,0.019637
The Black Swan,0.02717,1.0,0.015093,0.019469,0.043647,0.032838,0.068747,0.00266,0.02799,0.029402,...,0.042907,0.011851,0.051017,0.034134,0.036204,0.03263,0.021406,0.039936,0.019115,0.003733
Suffering Man's Charity,0.01433,0.015093,1.0,0.01194,0.024692,0.005401,0.029164,0.0,0.02448,0.021353,...,0.018328,0.019509,0.034111,0.033582,0.031577,0.023987,0.092826,0.016407,0.020631,0.0
Koodal Nagar,0.026994,0.019469,0.01194,1.0,0.053017,0.031732,0.041397,0.011694,0.019272,0.023194,...,0.028821,0.024718,0.017626,0.054644,0.024932,0.023805,0.010924,0.012794,0.018482,0.01356
Crossed Swords,0.052774,0.043647,0.024692,0.053017,1.0,0.033743,0.068657,0.008876,0.03501,0.03835,...,0.061834,0.022102,0.053161,0.044603,0.044421,0.03604,0.025465,0.036325,0.036353,0.014866


The cosine_similarity_array and cosine_similarity_df contain a matrix of the similarity values between all movies.

As we can see in the table, each movie has its own row and its own column, so for example, the value in the cell where the 'Resident Evil: Apocalypse' row meets the 'The Black Swan' column represents the cosine distance between them. This allows us to look up any distance of any movie pairing by filtering on the two axes.

## Making recommendations with TF-IDF
We pre-calculated the similarity ratings between all movies in the dataset based on their plots transformed by TF-IDF.   
Now we will put these similarity ratings in a DataFrame for ease of use. Then we will use this new DataFrame to suggest a movie recommendation.

In [103]:
# Find the values for the movie Rio
cosine_similarity_series = cosine_similarity_df.loc['Resident Evil: Apocalypse']

# Sort these values highest to lowest
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

# Print the results
print(ordered_similarities)

Title
Resident Evil: Apocalypse    1.000000
Killing Me Softly            0.301498
Borrowed Wives               0.224706
Constantine                  0.167389
Titanic                      0.135106
                               ...   
Saadey CM Saab               0.005404
Batman and Robin             0.004234
Chhalia                      0.000000
Blazing Continent            0.000000
Pyar Ka Bandhan              0.000000
Name: Resident Evil: Apocalypse, Length: 349, dtype: float64


'Killing Me Softly' has the highest similarity value to 'Resident Evil: Apocalypse'. This means that viewers that liked 'Resident Evil: Apocalypse' are likely to enjoy 'Killing Me Softly' also. ???

## Build the user profiles
We are now able to generate suggestions for similar items based on their labeled features or based on their descriptions. But sometimes finding similar items might not be enough.   
Now, we will work through how one could create recommendations based on a user and all the items they liked as opposed to a singular item. We will first generate a profile for a user by aggregating all of the movies they have previously enjoyed.

In [None]:
list_of_movies_enjoyed = ['Captain America: The First Avenger', 'Green Lantern', 'The Avengers']

# Create a subset of only the movies the user has enjoyed
movies_enjoyed_df = tfidf_summary_df.reindex(list_of_movies_enjoyed)

# Inspect the DataFrame
print(movies_enjoyed_df)

In [None]:
list_of_movies_enjoyed = ['Captain America: The First Avenger', 'Green Lantern', 'The Avengers']

# Create a subset of only the movies the user has enjoyed
movies_enjoyed_df = tfidf_summary_df.reindex(list_of_movies_enjoyed)

# Generate the user profile by finding the average scores of movies they enjoyed
user_prof = movies_enjoyed_df.mean()

# Inspect the results
print(user_prof)

By aggregating the scores of the movies the user enjoyed, we have been able to create a summary of a user's tastes that we will be able to use to find new movies similar to what they usually enjoy.

## User profile based recommendations
Now that we have built the user profile based on the aggregate of the individual movies they enjoyed, we can compare it to the larger tfidf_summary_df DataFrame that we have been working with to generate suggestions. As we would not want to suggest movies that the user has already watched, we will first find a subset of the tfidf_summary_df DataFrame that does not contain any of the previously watched movies.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Find subset of tfidf_df that does not include movies in list_of_movies_enjoyed
tfidf_subset_df = tfidf_df.drop(list_of_movies_enjoyed, axis=0)

In [None]:
# Calculate the cosine_similarity and wrap it in a DataFrame
similarity_array = cosine_similarity(user_prof.values.reshape(1, -1), tfidf_subset_df)
similarity_df = pd.DataFrame(similarity_array.T, index=tfidf_subset_df.index, columns=["similarity_score"])

In [None]:
# Sort the values from high to low by the values in the similarity_score
sorted_similarity_df = similarity_df.sort_values(by="similarity_score", ascending=False)

# Inspect the most similar to the user preferences
print(sorted_similarity_df.head())

As you can see, the top recommendations are all action-packed blockbusters, similar to those previously enjoyed by the user.