### Step by step code - Genre based recommendation engine 

In [243]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [244]:
# using only two columns from the dataset

df = pd.read_csv('movies.csv', sep=',', encoding='latin-1', usecols=['title', 'genres'])

In [245]:
# considering only 3 rows from the dataset

movies = df.iloc[:3,:]

In [246]:
movies

Unnamed: 0,title,genres
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji,Adventure|Children|Fantasy
2,Grumpier Old Men,Comedy|Romance


In [247]:
# genres column data

movies["genres"][0]

'Adventure|Animation|Children|Comedy|Fantasy'

In [248]:
movies['genres'] = movies['genres'].str.split('|')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['genres'] = movies['genres'].str.split('|')


In [249]:
movies["genres"][0]

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']

In [250]:
movies['genres'] = movies['genres'].fillna("").astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['genres'] = movies['genres'].fillna("").astype('str')


In [251]:
movies["genres"][0]

"['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']"

In [252]:
# initializing the tfidfvectorizer
# ngram_range=(1, 2) => means unigrams and bigrams

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),
                     min_df=0, stop_words='english')

In [253]:
# min_df => When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.

tfidf_matrix = tf.fit_transform(movies['genres'])

In [254]:
# 3 documents or movies are there
# 13 unique words are there

tfidf_matrix.shape

(3, 13)

In [255]:
# toarray() => looking at the sparse matrix by converting into array

tfidf_matrix.toarray()

array([[0.28122142, 0.36977238, 0.        , 0.36977238, 0.36977238,
        0.28122142, 0.36977238, 0.        , 0.28122142, 0.36977238,
        0.        , 0.28122142, 0.        ],
       [0.3935112 , 0.        , 0.51741994, 0.        , 0.        ,
        0.3935112 , 0.        , 0.51741994, 0.        , 0.        ,
        0.        , 0.3935112 , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.4736296 , 0.        ,
        0.62276601, 0.        , 0.62276601]])

In [256]:
# todense() method also does the same

tfidf_matrix.todense()

matrix([[0.28122142, 0.36977238, 0.        , 0.36977238, 0.36977238,
         0.28122142, 0.36977238, 0.        , 0.28122142, 0.36977238,
         0.        , 0.28122142, 0.        ],
        [0.3935112 , 0.        , 0.51741994, 0.        , 0.        ,
         0.3935112 , 0.        , 0.51741994, 0.        , 0.        ,
         0.        , 0.3935112 , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.4736296 , 0.        ,
         0.62276601, 0.        , 0.62276601]])

In [257]:
# looking at only the first document or row

tfidf_matrix.toarray()[0,:]

array([0.28122142, 0.36977238, 0.        , 0.36977238, 0.36977238,
       0.28122142, 0.36977238, 0.        , 0.28122142, 0.36977238,
       0.        , 0.28122142, 0.        ])

In [258]:
# all the column names or feature names extracted from document corpus

# document wise words
# doc1 - Adventure|Animation|Children|Comedy|Fantasy
# doc2 - Adventure|Children|Fantasy
# doc3 - Comedy|Romance

tf.get_feature_names_out()

array(['adventure', 'adventure animation', 'adventure children',
       'animation', 'animation children', 'children', 'children comedy',
       'children fantasy', 'comedy', 'comedy fantasy', 'comedy romance',
       'fantasy', 'romance'], dtype=object)

In [259]:
# 13 unique word combinations are there in the corpus

tf.get_feature_names_out().shape

(13,)

In [260]:
# tfidf score for the first document

pd.DataFrame(tfidf_matrix[0].T.todense(), index=tf.get_feature_names_out(), columns=["tfidf"])

Unnamed: 0,tfidf
adventure,0.281221
adventure animation,0.369772
adventure children,0.0
animation,0.369772
animation children,0.369772
children,0.281221
children comedy,0.369772
children fantasy,0.0
comedy,0.281221
comedy fantasy,0.369772


In [261]:
# tfidf for all the documents - Toy Story, Jumanji, Grumpier Old Men

pd.DataFrame(data = tfidf_matrix.toarray(),index = movies["title"],columns = tf.get_feature_names_out())

Unnamed: 0_level_0,adventure,adventure animation,adventure children,animation,animation children,children,children comedy,children fantasy,comedy,comedy fantasy,comedy romance,fantasy,romance
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Toy Story,0.281221,0.369772,0.0,0.369772,0.369772,0.281221,0.369772,0.0,0.281221,0.369772,0.0,0.281221,0.0
Jumanji,0.393511,0.0,0.51742,0.0,0.0,0.393511,0.0,0.51742,0.0,0.0,0.0,0.393511,0.0
Grumpier Old Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47363,0.0,0.622766,0.0,0.622766


In [262]:
# cosine similarity
# the first column represents the cosine similarity of the 'Toy Story' document with other documnets 'Toy Story', 'Jumanji', 'Grumpier Old Men'.
# The r1c1 is 'Toy Story' similarity with 'Toy Story', so its 1.
# then r2c1 => 'Toy Story' similarity with 'Jumanji'
# then r3c1 => 'Toy Story' similarity with 'Grumpier Old Men'

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.33199134, 0.13319479],
       [0.33199134, 1.        , 0.        ],
       [0.13319479, 0.        , 1.        ]])

In [263]:
# We can reset the index and column names with the movie names so that we can visusalize the similarities

pd.DataFrame(cosine_sim, index=movies["title"].values, columns=movies["title"].values)

Unnamed: 0,Toy Story,Jumanji,Grumpier Old Men
Toy Story,1.0,0.331991,0.133195
Jumanji,0.331991,1.0,0.0
Grumpier Old Men,0.133195,0.0,1.0


In [264]:
# this will be useful to implement the recommendation function

titles_genres = movies[["title","genres"]]
titles_genres

Unnamed: 0,title,genres
0,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,Jumanji,"['Adventure', 'Children', 'Fantasy']"
2,Grumpier Old Men,"['Comedy', 'Romance']"


In [265]:
# Building a pandas series with movie titles as index and values as movie index 
# Just opposite of the `titles` variable

indices = pd.Series(movies.index, index=movies['title'])
indices

title
Toy Story            0
Jumanji              1
Grumpier Old Men     2
dtype: int64

In [266]:
# Function that get movie recommendations based on the cosine similarity score of movie genres

def genre_based_recommendations(title):
    # getting index against against movie title
    idx = indices[title]

    # getting the similarity scores for that particular movie with other movies 
    sim_scores = list(enumerate(cosine_sim[idx]))
    # [(0, 0.33199134146764564), (1, 1.0000000000000004), (2, 0.0)]
    # print(sim_scores)

    # arranging the scores in descending order
    # we are using `key=lambda x: x[1]` becasue we have enumerated the scores and noew we have a tuple with an index and then the score
    # the score is present in the second position of the tuple and based on those scores we want to perform the sorting
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # [(1, 1.0000000000000004), (0, 0.33199134146764564), (2, 0.0)]
    # print(sim_scores)

    # taking top 5 scores or movies
    sim_scores = sim_scores[1:5]
    # [(0, 0.33199134146764564), (2, 0.0)]
    # print(sim_scores)

    # getting the indices only for the top 5 movies
    # we are getting only those movies which has similarity score greater than 0
    movie_indices = [i[0] for i in sim_scores if i[1] > 0]
    # [0]
    # print(movie_indices)

    # returning the list of movies using the indices
    return movies.iloc[movie_indices]

In [267]:
genre_based_recommendations("Jumanji ")

Unnamed: 0,title,genres
0,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."


### Combined Code - Genre based recommendation engine 

In [268]:
movies = pd.read_csv('movies.csv', sep=',', encoding='latin-1', usecols=['title', 'genres'])

In [269]:
movies['genres'] = movies['genres'].str.split('|')
movies['genres'] = movies['genres'].fillna("").astype('str')

In [270]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),
                     min_df=0, stop_words='english')

tfidf_matrix_genre = tf.fit_transform(movies['genres'])

cosine_sim_genre = cosine_similarity(tfidf_matrix_genre, tfidf_matrix_genre)

In [271]:
indices = pd.Series(movies.index, index=movies['title'])

def genre_based_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_genre[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores if i[1] > 0]
    return movies.iloc[movie_indices]

In [272]:
genre_based_recommendations('Jersey Girl ')

Unnamed: 0,title,genres
10,"American President, The","['Comedy', 'Drama', 'Romance']"
47,Mighty Aphrodite,"['Comedy', 'Drama', 'Romance']"
52,"Postman, The (Postino, Il)","['Comedy', 'Drama', 'Romance']"
83,Beautiful Girls,"['Comedy', 'Drama', 'Romance']"
165,Something to Talk About,"['Comedy', 'Drama', 'Romance']"
191,Don Juan DeMarco,"['Comedy', 'Drama', 'Romance']"
198,Eat Drink Man Woman (Yin shi nan nu),"['Comedy', 'Drama', 'Romance']"
243,Nobody's Fool,"['Comedy', 'Drama', 'Romance']"
309,"Corrina, Corrina","['Comedy', 'Drama', 'Romance']"
317,I Like It Like That,"['Comedy', 'Drama', 'Romance']"


In [273]:
movies.sample(10)

Unnamed: 0,title,genres
6376,"Good German, The","['Drama', 'Mystery', 'Thriller']"
4973,Oklahoma!,"['Musical', 'Romance', 'Western']"
2483,"Flamingo Kid, The","['Comedy', 'Drama']"
2375,"Last Picture Show, The",['Drama']
6231,I Am a Sex Addict,"['Comedy', 'Documentary', 'Romance']"
2121,Iron Eagle IV,"['Action', 'War']"
9052,Steve Jobs: The Man in the Machine,['Documentary']
7136,Attack of the 50 Foot Woman,"['Comedy', 'Sci-Fi']"
897,Cheech and Chong's Up in Smoke,['Comedy']
8779,Johnny Express,"['Animation', 'Comedy', 'Sci-Fi']"


### Title based recommendation engine

In [274]:
movies = pd.read_csv('movies.csv', sep=',', encoding='latin-1', usecols=['title', 'genres'])

movies['genres'] = movies['genres'].str.split('|')
movies['genres'] = movies['genres'].fillna("").astype('str')

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),
                     min_df=0, stop_words='english')

tfidf_matrix_title = tf.fit_transform(movies['title'])

cosine_sim_title = cosine_similarity(tfidf_matrix_title, tfidf_matrix_title)
indices = pd.Series(movies.index, index=movies['title'])

def title_based_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_title[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores if i[1] > 0]
    return movies.iloc[movie_indices]

In [275]:
title_based_recommendations('Jersey Girl ')

Unnamed: 0,title,genres
542,Girl 6,"['Comedy', 'Drama']"
2531,Who's That Girl?,['Comedy']
4149,My Girl 2,"['Comedy', 'Drama', 'Romance']"
4150,My Girl,"['Comedy', 'Drama', 'Romance']"
8455,Jersey Boys,"['Drama', 'Musical']"
245,New Jersey Drive,"['Crime', 'Drama']"
3918,"Good Girl, The","['Comedy', 'Drama']"
158,Party Girl,['Comedy']
4901,"Girl Next Door, The","['Comedy', 'Romance']"
6753,"Girl Next Door, The","['Crime', 'Drama', 'Horror', 'Thriller']"


### Comparing the results

In [279]:
movies.sample(10)

Unnamed: 0,title,genres
6963,Outlander,"['Action', 'Adventure', 'Sci-Fi']"
4224,Venom,"['Horror', 'Thriller']"
8769,Crimson Peak,['Horror']
5302,Garden State,"['Comedy', 'Drama', 'Romance']"
4108,Brother (Brat),"['Crime', 'Drama']"
1225,Fire Down Below,"['Action', 'Drama', 'Thriller']"
1333,Wild Things,"['Crime', 'Drama', 'Mystery', 'Thriller']"
4952,Watch on the Rhine,['Drama']
4903,Spartan,['Thriller']
7730,Headhunters (Hodejegerne),"['Action', 'Crime', 'Thriller']"


In [282]:
genre_based_recommendations('Garden State ')

Unnamed: 0,title,genres
10,"American President, The","['Comedy', 'Drama', 'Romance']"
47,Mighty Aphrodite,"['Comedy', 'Drama', 'Romance']"
52,"Postman, The (Postino, Il)","['Comedy', 'Drama', 'Romance']"
83,Beautiful Girls,"['Comedy', 'Drama', 'Romance']"
165,Something to Talk About,"['Comedy', 'Drama', 'Romance']"
191,Don Juan DeMarco,"['Comedy', 'Drama', 'Romance']"
198,Eat Drink Man Woman (Yin shi nan nu),"['Comedy', 'Drama', 'Romance']"
243,Nobody's Fool,"['Comedy', 'Drama', 'Romance']"
309,"Corrina, Corrina","['Comedy', 'Drama', 'Romance']"
317,I Like It Like That,"['Comedy', 'Drama', 'Romance']"


In [283]:
title_based_recommendations('Garden State ')

Unnamed: 0,title,genres
464,"Secret Garden, The","['Children', 'Drama']"
9387,Over the Garden Wall,"['Adventure', 'Animation', 'Drama']"
7683,Red State,"['Action', 'Crime', 'Horror', 'Thriller']"
1104,"Cement Garden, The",['Drama']
7011,State of Play,"['Crime', 'Drama', 'Thriller']"
1755,Enemy of the State,"['Action', 'Thriller']"
4287,Head of State,['Comedy']
5557,State of Grace,"['Crime', 'Drama', 'Thriller']"
3875,Sunshine State,['Drama']
3011,State and Main,"['Comedy', 'Drama']"
