In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 


### Term frequency–inverse document frequency (TF-IDF) is a numerical statistic intended to reflect how important a word is to a document in a collection or corpus.

In [2]:
# Load the table of titles with their categories and review scores.
entertainment = pd.read_csv("Entertainment.csv")
# Preview the first rows only instead of rendering the whole frame.
entertainment.head(10)

Unnamed: 0,Id,Titles,Category,Reviews
0,6973,Toy Story (1995),"Drama, Romance, School, Supernatural",-8.98
1,6778,Jumanji (1995),"Action, Adventure, Drama, Fantasy, Magic, Mili...",8.88
2,9702,Grumpier Old Men (1995),"Action, Comedy, Historical, Parody, Samurai, S...",99.0
3,6769,Waiting to Exhale (1995),"Sci-Fi, Thriller",99.0
4,1123,Father of the Bride Part II (1995),"Action, Comedy, Historical, Parody, Samurai, S...",-0.44
5,9860,Heat (1995),"Comedy, Drama, School, Shounen, Sports",-6.65
6,1803,Sabrina (1995),"Action, Adventure, Shounen, Super Power",99.0
7,9721,Tom and Huck (1995),"Drama, Military, Sci-Fi, Space",-5.19
8,6563,Sudden Death (1995),"Action, Comedy, Historical, Parody, Samurai, S...",-7.86
9,1323,GoldenEye (1995),"Action, Comedy, Historical, Parody, Samurai, S...",3.01


In [3]:
# Count the missing entries in each column.
entertainment.isnull().sum()

Id          0
Titles      0
Category    0
Reviews     0
dtype: int64

## No missing values

In [4]:
# Create a TF-IDF vectorizer that drops English stop words while tokenizing.

tfidf = TfidfVectorizer(stop_words = "english")    # "english" selects scikit-learn's built-in stop-word list

# The vectorizer is only configured here; it is fitted on the Category column in the next cell.

In [5]:
# Learn the vocabulary and IDF weights from the Category strings and encode
# every row as a TF-IDF vector in one step.
tfidf_matrix = tfidf.fit_transform(entertainment["Category"])

# One row per title, one column per term kept in the vocabulary.
tfidf_matrix.shape


(51, 34)

With the above matrix, we need to compute a similarity score between titles.

There are several metrics for this, such as the Euclidean distance,

the Pearson correlation and the cosine similarity.

For now we will use the cosine similarity.

It is a numeric quantity that represents the similarity between two movies.

Cosine similarity is independent of vector magnitude and easy to calculate.

In [6]:
from sklearn.metrics.pairwise import linear_kernel

In [7]:
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# returned by linear_kernel is already the cosine similarity between titles.
# Called with a single argument, linear_kernel compares the matrix with itself.
cosine_sim_matrix = linear_kernel(tfidf_matrix)
cosine_sim_matrix

array([[1.        , 0.09421367, 0.        , ..., 0.12767481, 0.16772551,
        0.31295101],
       [0.09421367, 1.        , 0.16662513, ..., 0.22332745, 0.        ,
        0.        ],
       [0.        , 0.16662513, 1.        , ..., 0.13383076, 0.        ,
        0.        ],
       ...,
       [0.12767481, 0.22332745, 0.13383076, ..., 1.        , 0.47083158,
        0.17020003],
       [0.16772551, 0.        , 0.        , ..., 0.47083158, 1.        ,
        0.64107498],
       [0.31295101, 0.        , 0.        , ..., 0.17020003, 0.64107498,
        1.        ]])

In [8]:
# Map each title to its row number in `entertainment`.
# Series.drop_duplicates() compares the values (the row numbers, which are
# already unique), so it never removes a repeated title. Deduplicate on the
# index instead, keeping the first row of each title, so that a lookup by
# title always yields a single row number.

entertainment_index = pd.Series(entertainment.index, index = entertainment['Titles'])
entertainment_index = entertainment_index[~entertainment_index.index.duplicated(keep = "first")]
entertainment_index

Titles
Toy Story (1995)                                         0
Jumanji (1995)                                           1
Grumpier Old Men (1995)                                  2
Waiting to Exhale (1995)                                 3
Father of the Bride Part II (1995)                       4
Heat (1995)                                              5
Sabrina (1995)                                           6
Tom and Huck (1995)                                      7
Sudden Death (1995)                                      8
GoldenEye (1995)                                         9
American President, The (1995)                          10
Dracula: Dead and Loving It (1995)                      11
Balto (1995)                                            12
Nixon (1995)                                            13
Cutthroat Island (1995)                                 14
Casino (1995)                                           15
Sense and Sensibility (1995)                     

In [9]:
def get_recommendations(Titles, topN):
    """Print and return the titles whose categories are most similar to ``Titles``.

    Relies on the notebook-level objects ``entertainment`` (the loaded frame),
    ``entertainment_index`` (title -> row number) and ``cosine_sim_matrix``
    (pairwise similarity between the rows of the frame).

    Parameters
    ----------
    Titles : str
        Exact title to look up, e.g. ``"Clueless (1995)"``.
    topN : int
        Number of rows to report after the best match. ``topN + 1`` rows are
        taken because the requested title normally matches itself with the
        highest score and so occupies the first row.

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` (row number), ``Score`` and ``Titles``, ordered from
        most to least similar. The same table is printed.

    Raises
    ------
    KeyError
        If ``Titles`` is not present in ``entertainment_index``.
    ValueError
        If ``topN`` is negative.
    """
    if topN < 0:
        raise ValueError(f"topN must be non-negative, got {topN}")

    # Getting the row number of the requested title
    try:
        entertainment_id = entertainment_index[Titles]
    except KeyError:
        raise KeyError(f"Title not found: {Titles!r}") from None

    # A title that occurs more than once yields a Series of row numbers; using
    # it directly would select several matrix rows at once, so keep the first.
    if isinstance(entertainment_id, pd.Series):
        entertainment_id = entertainment_id.iloc[0]

    # Pair every row number with its similarity to the requested title and
    # order the pairs from most to least similar. sorted() is stable, so rows
    # with equal scores keep their original order.
    cosine_scores = sorted(
        enumerate(cosine_sim_matrix[entertainment_id]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep the best match plus the topN rows that follow it
    cosine_scores_N = cosine_scores[:topN + 1]

    entertainment_idx = [row for row, _ in cosine_scores_N]
    entertainment_scores = [score for _, score in cosine_scores_N]

    # Row numbers come from the similarity matrix, so they are positions:
    # look the titles up positionally rather than by index label.
    entertainment_similar_show = pd.DataFrame({
        "index": entertainment_idx,
        "Score": entertainment_scores,
        "Titles": entertainment["Titles"].iloc[entertainment_idx].to_numpy(),
    })

    print(entertainment_similar_show)
    return entertainment_similar_show
    
   

In [10]:
# Enter a title and the number of similar titles to list
get_recommendations("Clueless (1995)", topN = 10)
# Row number that the title-to-row mapping holds for the same title
entertainment_index["Clueless (1995)"]

    index     Score                          Titles
0      35  1.000000                 Clueless (1995)
1      24  0.620614        Leaving Las Vegas (1995)
2      10  0.585207  American President, The (1995)
3      15  0.546160                   Casino (1995)
4      39  0.510425              Restoration (1995)
5       0  0.474735                Toy Story (1995)
6       1  0.426853                  Jumanji (1995)
7      45  0.370008    When Night Is Falling (1995)
8      17  0.354725               Four Rooms (1995)
9      27  0.354725               Persuasion (1995)
10     28  0.354725    City of Lost Children (1995)


35