In [7]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install emoji --upgrade

In [9]:
import pandas as pd
import numpy as np

In [10]:
# Importing Dataset
url = 'https://raw.githubusercontent.com/AnjaniSriya/RecommendationSystem/master/disney_plus_shows.csv'
shows = pd.read_csv(url)

In [11]:
shows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   imdb_id      894 non-null    object 
 1   title        894 non-null    object 
 2   plot         866 non-null    object 
 3   type         894 non-null    object 
 4   rated        742 non-null    object 
 5   year         894 non-null    object 
 6   released_at  874 non-null    object 
 7   added_at     992 non-null    object 
 8   runtime      838 non-null    object 
 9   genre        885 non-null    object 
 10  director     689 non-null    object 
 11  writer       743 non-null    object 
 12  actors       870 non-null    object 
 13  language     865 non-null    object 
 14  country      869 non-null    object 
 15  awards       556 non-null    object 
 16  metascore    292 non-null    float64
 17  imdb_rating  879 non-null    float64
 18  imdb_votes   879 non-null    object 
dtypes: float

In [12]:
#Obtaining the necessary columns from the entire dataset
shows = shows[['imdb_id','title','type','plot','genre','director','actors','language','imdb_rating','imdb_votes']]
shows.head(5)

Unnamed: 0,imdb_id,title,type,plot,genre,director,actors,language,imdb_rating,imdb_votes
0,tt0147800,10 Things I Hate About You,movie,"A pretty, popular teenager can't go out on a d...","Comedy, Drama, Romance",Gil Junger,"Heath Ledger, Julia Stiles, Joseph Gordon-Levi...","English, French",7.3,283945
1,tt7019028,101 Dalmatian Street,series,This series follows the lives of Delilah and D...,"Animation, Comedy, Family",,"Josh Brener, Michaela Dietz, Bert Davis, Abiga...",English,6.2,124
2,tt0115433,101 Dalmatians,movie,An evil high-fashion designer plots to steal D...,"Adventure, Comedy, Crime, Family",Stephen Herek,"Glenn Close, Jeff Daniels, Joely Richardson, J...","English, Spanish",5.7,97785
3,tt0324941,101 Dalmatians 2: Patch's London Adventure,movie,"Being one of 101 takes its toll on Patch, who ...","Animation, Adventure, Comedy, Family, Musical","Jim Kammerud, Brian Smith","Barry Bostwick, Jason Alexander, Martin Short,...",English,5.8,7434
4,tt0211181,102 Dalmatians,movie,Cruella DeVil gets out of prison and goes afte...,"Adventure, Comedy, Family",Kevin Lima,"Glenn Close, Gérard Depardieu, Ioan Gruffudd, ...",English,4.9,33444


In [13]:
shows.isnull().sum()
#The number of missing values in the data set is returned by the function dataset.isnull().sum()

imdb_id         98
title           98
type            98
plot           126
genre          107
director       303
actors         122
language       127
imdb_rating    113
imdb_votes     113
dtype: int64

In [14]:
shows.dropna(inplace=True)
shows.isnull().sum()
#The dropna() method deletes rows with NULL values.

imdb_id        0
title          0
type           0
plot           0
genre          0
director       0
actors         0
language       0
imdb_rating    0
imdb_votes     0
dtype: int64

In [15]:
shows.duplicated().sum()
#analyzing duplicate values

0

# Weighted_rating
When producing its top 'n' movies, IMDB, for example, does not utilise simple movie vote averages as the score. Instead, it utilizes a formula that considers the overall number of votes received.

A formula for identifying the most popular films:


```
# ((v ÷ v + m) * R ) + ((m ÷ v + m) * C )
where:
  R = Rating average of the movie
  v = Number of votes for the movie
  m = minimum votes required
  C = the mean votes

```




In [16]:
def weighted_rating(dataframe):
  votes = dataframe['imdb_rating']
  mean = dataframe['imdb_rating'].mean()
  rating = dataframe['imdb_votes']
  min = dataframe['imdb_votes'].quantile(0.9)
  dataframe['wr'] = (votes/(votes+min) * rating) + (min/(min+votes) * mean)

In [17]:
shows['imdb_votes'] = shows['imdb_votes'].str.replace(',', '')
shows['imdb_votes'] = shows['imdb_votes'].astype(int)

In [18]:
weighted_rating(shows)
top_shows = shows.sort_values('wr', ascending=False)
top_shows.head(5)
#The top 5 shows available on Disney Plus are displayed.


Unnamed: 0,imdb_id,title,type,plot,genre,director,actors,language,imdb_rating,imdb_votes,wr
712,tt0076759,Star Wars: Episode IV - A New Hope,movie,Luke Skywalker joins forces with a Jedi Knight...,"Action, Adventure, Fantasy, Sci-Fi",George Lucas,"Mark Hamill, Harrison Ford, Carrie Fisher, Pet...",English,8.6,1188658,48.385802
498,tt0848228,The Avengers,movie,Earth's mightiest heroes must come together an...,"Action, Adventure, Sci-Fi",Joss Whedon,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ...","English, Russian, Hindi",8.0,1225316,46.668074
719,tt0080684,Star Wars: Episode V - The Empire Strikes Back,movie,After the Rebels are brutally overpowered by t...,"Action, Adventure, Fantasy, Sci-Fi",Irvin Kershner,"Mark Hamill, Harrison Ford, Carrie Fisher, Bil...",English,8.7,1109656,46.060375
40,tt0499549,Avatar,movie,A paraplegic Marine dispatched to the moon Pan...,"Action, Adventure, Fantasy, Sci-Fi",James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","English, Spanish",7.8,1086714,41.243075
493,tt2015381,Guardians of the Galaxy,movie,A group of intergalactic criminals must pull t...,"Action, Adventure, Comedy, Sci-Fi",James Gunn,"Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...",English,8.0,1007917,39.553411


In [19]:
movies = shows[shows.type=='movie']
weighted_rating(movies)
top_movies = movies.sort_values('wr', ascending=False)
top_movies.head(5)
#The top 5 movies available on Disney Plus are displayed

Unnamed: 0,imdb_id,title,type,plot,genre,director,actors,language,imdb_rating,imdb_votes,wr
712,tt0076759,Star Wars: Episode IV - A New Hope,movie,Luke Skywalker joins forces with a Jedi Knight...,"Action, Adventure, Fantasy, Sci-Fi",George Lucas,"Mark Hamill, Harrison Ford, Carrie Fisher, Pet...",English,8.6,1188658,47.50432
498,tt0848228,The Avengers,movie,Earth's mightiest heroes must come together an...,"Action, Adventure, Sci-Fi",Joss Whedon,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ...","English, Russian, Hindi",8.0,1225316,45.822523
719,tt0080684,Star Wars: Episode V - The Empire Strikes Back,movie,After the Rebels are brutally overpowered by t...,"Action, Adventure, Fantasy, Sci-Fi",Irvin Kershner,"Mark Hamill, Harrison Ford, Carrie Fisher, Bil...",English,8.7,1109656,45.227539
40,tt0499549,Avatar,movie,A paraplegic Marine dispatched to the moon Pan...,"Action, Adventure, Fantasy, Sci-Fi",James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","English, Spanish",7.8,1086714,40.511011
493,tt2015381,Guardians of the Galaxy,movie,A group of intergalactic criminals must pull t...,"Action, Adventure, Comedy, Sci-Fi",James Gunn,"Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...",English,8.0,1007917,38.856694


In [20]:
episodes = shows[shows.type=='episode']
weighted_rating(episodes)
top_episodes = episodes.sort_values('wr', ascending=False)
top_episodes.head(5)
#The top 5 movies available on Disney Plus are displayed

Unnamed: 0,imdb_id,title,type,plot,genre,director,actors,language,imdb_rating,imdb_votes,wr
552,tt0091566,Mr. Boogedy,episode,A novelty-salesman moves his family into a new...,"Adventure, Drama, Family",Oz Scott,"Richard Masur, Mimi Kennedy, Benji Gregory, Da...",English,7.3,1173,20.459764
185,tt3067144,Phineas and Ferb: Star Wars,episode,Phineas and Ferb travel to a galaxy far far aw...,"Animation, Action, Comedy, Family, Musical, Sc...","Robert Hughes, Sue Perrotto","Vincent Martella, Ella Kennedy, Ashley Tisdale...","English, Brazilian Sign Language",8.2,492,13.315022
604,tt2283584,Phineas and Ferb: Mission Marvel,episode,When Dr. Doofenshmirtz's latest invention caus...,"Animation, Action, Comedy, Family, Musical, Sc...","Robert Hughes, Sue Perrotto","Vincent Martella, Ashley Tisdale, Thomas Brodi...",Brazilian Sign Language,8.3,419,12.438303
102,tt1378121,El Materdor,episode,Mater tells Lightning McQueen about his former...,"Animation, Adventure, Comedy, Family","John Lasseter, Rob Gibbs(co-director), Victor ...","Keith Ferguson, Larry the Cable Guy",English,6.4,283,9.84948
109,tt1378123,Rescue Squad Mater,episode,Mater tells Lightning McQueen about his previo...,"Animation, Adventure, Comedy","John Lasseter, Rob Gibbs(co-director), Victor ...","Keith Ferguson, Larry the Cable Guy",English,6.3,272,9.69721


# Rapid Automation Keyword Extraction
To extract keywords from the plot(description) of the shows, we used the Rake keyword extraction algorithm.
For more information about rake_nlkt, check out https://pypi.org/project/rake-nltk/

This algorithm is from NLP


In [21]:
#installing
!pip install rake_nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rake_nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Collecting nltk<4.0.0,>=3.6.2
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.0 MB/s 
Collecting regex>=2021.8.3
  Downloading regex-2022.4.24-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 43.8 MB/s 
[?25hInstalling collected packages: regex, nltk, rake-nltk
  Attempting uninstall: regex
    Found existing installation: regex 2019.12.20
    Uninstalling regex-2019.12.20:
      Successfully uninstalled regex-2019.12.20
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.7 rake-nltk-1.0.6 regex-2022.4.24


In [22]:
from rake_nltk import Rake   
import nltk

In [23]:
import nltk;
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloadin

True

In [24]:
r = Rake()   # using rake to remove stop words from the plot
keys = []
for i in range(len(shows['plot'])):
  temp = str(shows.iat[i,3])
  r.extract_keywords_from_text(temp)  # to extract key words
  temp2 = r.get_ranked_phrases()   # to get keyword phrases ranked highest to lowest.
  keys.append(temp2)


#creating new column "keywords" and adding keywords to it
shows['keywords']=keys 

In [25]:
shows['keywords']
#keywords extracted from the plot

0      [tempered older sister, popular teenager, pret...
2      [steal dalmatian puppies, fashion designer plo...
3      [accidentally left behind, publicity campaign,...
4            [cruella devil gets, puppies, prison, goes]
5      [horrible blind date, young woman, follows kat...
                             ...                        
987    [new life, mom remarries, hard adjusting, zoey...
988    [suspected dodgy dealings, space station home,...
989    [space station helps, old girl living, mischie...
990    [important competition, heroine sets, win, suc...
991    [cynical con artist fox must work together, ro...
Name: keywords, Length: 655, dtype: object

**G**enre + **D**irector + **K**eywords + **A**ctors = **gdka**

In [26]:
gdka = shows[['imdb_id', 'title', 'genre', 'director', 'keywords', 'actors']]

In [27]:
# Extracting all genre into the list
gdka['genre'] = gdka['genre'].map(lambda x: x.split(','))
# Since there are so many actors on screen, we only add up the first three into the list
gdka['actors'] = gdka['actors'].map(lambda x: x.split(',')[:3])
# Extracting all genre into the list
gdka['director'] = gdka['director'].map(lambda x: x.split(','))

# By combining the first and last names into one word and converting it to lowercase
# we can establish unique identification names.
for index, row in gdka.iterrows():
    row['genre'] = [x.lower().replace(' ','') for x in row['genre']]
    row['actors'] = [x.lower().replace(' ','') for x in row['actors']]
    row['director'] = [x.lower().replace(' ','') for x in row['director']]
gdka

Unnamed: 0,imdb_id,title,genre,director,keywords,actors
0,tt0147800,10 Things I Hate About You,"[comedy, drama, romance]",[giljunger],"[tempered older sister, popular teenager, pret...","[heathledger, juliastiles, josephgordon-levitt]"
2,tt0115433,101 Dalmatians,"[adventure, comedy, crime, family]",[stephenherek],"[steal dalmatian puppies, fashion designer plo...","[glennclose, jeffdaniels, joelyrichardson]"
3,tt0324941,101 Dalmatians 2: Patch's London Adventure,"[animation, adventure, comedy, family, musical]","[jimkammerud, briansmith]","[accidentally left behind, publicity campaign,...","[barrybostwick, jasonalexander, martinshort]"
4,tt0211181,102 Dalmatians,"[adventure, comedy, family]",[kevinlima],"[cruella devil gets, puppies, prison, goes]","[glennclose, gérarddepardieu, ioangruffudd]"
5,tt1846442,12 Dates of Christmas,"[comedy, fantasy, romance]",[jameshayman],"[horrible blind date, young woman, follows kat...","[lauramiyata, vijaymehta, amysmart]"
...,...,...,...,...,...,...
987,tt3544734,Zapped,"[comedy, family, fantasy]",[peterdeluise],"[new life, mom remarries, hard adjusting, zoey...","[zendaya, chanellepeloso, spencerboldman]"
988,tt0186726,Zenon: Girl of the 21st Century,"[adventure, comedy, family, sci-fi]",[kennethjohnson],"[suspected dodgy dealings, space station home,...","[kirstenstorms, raven-symoné, stuartpankin]"
989,tt0271271,Zenon: The Zequel,"[comedy, family, adventure, sci-fi]",[mannycoto],"[space station helps, old girl living, mischie...","[kirstenstorms, shadiasimmons, laurenmaltby]"
990,tt0379060,Zenon: Z3,"[adventure, comedy, family, sci-fi]",[steverash],"[important competition, heroine sets, win, suc...","[kirstenstorms, laurenmaltby, alysonmorgan]"




**I 💗 clean code** 


Let's clean our data



In [28]:
#function 
def clean_code(x):
    if isinstance(x, list):
        #Strip spaces and convert to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if it exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [29]:
features = ['keywords', 'actors', 'director', 'genre']
for feature in features:
  gdka[feature] = gdka[feature].apply(clean_code)

In [30]:
gdka['keywords'] = gdka['keywords'].agg(lambda x: ','.join(map(str, x)))
gdka['keywords'] = gdka['keywords'].str.replace(',','')

In [31]:
 # Filling all the Null value to empty string
for f in features:
    gdka[f] = gdka[f].fillna('')

In [32]:
#This function will collect all -> genre + director + keywords + actors
def bunchUp(row):
    return str(row['genre']) + " " + str(row['director']) + " " + str(row['keywords']) + " " + str(row['actors']) 
gdka['bunchUp'] = gdka.apply(bunchUp, axis = 1)

In [33]:
gdka['bunchUp'] = gdka['bunchUp'].str.replace(',','')

In [34]:
gdka['bunchUp']

0      ['comedy' 'drama' 'romance'] ['giljunger'] tem...
2      ['adventure' 'comedy' 'crime' 'family'] ['step...
3      ['animation' 'adventure' 'comedy' 'family' 'mu...
4      ['adventure' 'comedy' 'family'] ['kevinlima'] ...
5      ['comedy' 'fantasy' 'romance'] ['jameshayman']...
                             ...                        
987    ['comedy' 'family' 'fantasy'] ['peterdeluise']...
988    ['adventure' 'comedy' 'family' 'sci-fi'] ['ken...
989    ['comedy' 'family' 'adventure' 'sci-fi'] ['man...
990    ['adventure' 'comedy' 'family' 'sci-fi'] ['ste...
991    ['animation' 'adventure' 'comedy' 'crime' 'fam...
Name: bunchUp, Length: 655, dtype: object

We then apply two Scikit-learn concepts.
1.   Count Vectorizer
2.   Cosine Similarity

Scikit-learn is a free software machine learning library for the Python programming language. For more information about Scikit-learn, check out https://scikit-learn.org/stable/

For each movie, we'll calculate pairwise similarity scores. The cosine similarity will be used to create a numerical value that represents the similarity between two shows.
\begin{equation}
\cos ({\bf t},{\bf e})= {{\bf t} {\bf e} \over \|{\bf t}\| \|{\bf e}\|} = \frac{ \sum_{i=1}^{n}{{\bf t}_i{\bf e}_i} }{ \sqrt{\sum_{i=1}^{n}{({\bf t}_i)^2}} \sqrt{\sum_{i=1}^{n}{({\bf e}_i)^2}} }
\end{equation}

CountVectorizer is used to convert a given text into a vector based on the frequency (count) of each word that appears in the full text.

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(gdka['bunchUp'])
cosine_sim = cosine_similarity(count_matrix)

In [37]:
cosine_sim

array([[1.        , 0.11111111, 0.10050378, ..., 0.10540926, 0.10540926,
        0.0860663 ],
       [0.11111111, 1.        , 0.30151134, ..., 0.31622777, 0.31622777,
        0.34426519],
       [0.10050378, 0.30151134, 1.        , ..., 0.28603878, 0.28603878,
        0.31139958],
       ...,
       [0.10540926, 0.31622777, 0.28603878, ..., 1.        , 0.7       ,
        0.24494897],
       [0.10540926, 0.31622777, 0.28603878, ..., 0.7       , 1.        ,
        0.24494897],
       [0.0860663 , 0.34426519, 0.31139958, ..., 0.24494897, 0.24494897,
        1.        ]])

In [38]:
# to generate the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(gdka['bunchUp'])
count_matrix

<655x2553 sparse matrix of type '<class 'numpy.int64'>'
	with 6231 stored elements in Compressed Sparse Row format>

Fun Facts: 

👉 Because each show is identical to itself, all the numbers on the diagonal are 1.

👉 Because the similarity between A and B is the same as the similarity between B and A, the matrix is symmetrical.


In [39]:
#generating the cosine similarity matrix 
cosine_sim = cosine_similarity(count_matrix, count_matrix)
print(cosine_sim)

[[1.         0.11111111 0.10050378 ... 0.10540926 0.10540926 0.0860663 ]
 [0.11111111 1.         0.30151134 ... 0.31622777 0.31622777 0.34426519]
 [0.10050378 0.30151134 1.         ... 0.28603878 0.28603878 0.31139958]
 ...
 [0.10540926 0.31622777 0.28603878 ... 1.         0.7        0.24494897]
 [0.10540926 0.31622777 0.28603878 ... 0.7        1.         0.24494897]
 [0.0860663  0.34426519 0.31139958 ... 0.24494897 0.24494897 1.        ]]


In [40]:
gdka.reset_index(inplace=True)

In [41]:
# creating a Series for show titles which can be used as indices (each index is mapped to a show title)
indices = pd.Series(gdka['title'])
indices[:5]

0                    10 Things I Hate About You
1                                101 Dalmatians
2    101 Dalmatians 2: Patch's London Adventure
3                                102 Dalmatians
4                         12 Dates of Christmas
Name: title, dtype: object

In [42]:
# this function takes in a movie title as input and returns the top 15 recommended (similar) shows

def recommend(title, cosine_sim = cosine_sim):
    recommended_movies = []
    recommended_posters = []
    idx = indices[indices == title].index[0]   # to get the index of the show title matching the input movie
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)   # similarity scores in descending order
    top_15_indices = list(score_series.iloc[1:16].index)   
    # [1:16] to exclude 0 (index 0 is the input movie itself)
    
    for i in top_15_indices:   # to append the titles of top 15 similar shows 
        recommended_movies.append(list(gdka['title'])[i])
        
    return recommended_movies


In [43]:
recommend("A Bug's Life")

['El Materdor',
 'Finding Nemo',
 'Toy Story 2',
 'Cars 2',
 'Finding Dory',
 'Rescue Squad Mater',
 'Tokyo Mater',
 'Onward',
 'Monsters, Inc.',
 'Cars',
 'Up',
 'Moana',
 'Ice Age',
 'Mater and the Ghostlight',
 'Brave']

In [44]:
recommend("Zootopia")

['El Materdor',
 'Toy Story 2',
 'Monsters, Inc.',
 'Up',
 'Finding Nemo',
 'Tokyo Mater',
 'Rescue Squad Mater',
 'Moana',
 'Ice Age',
 'Finding Dory',
 "A Bug's Life",
 'Who Framed Roger Rabbit',
 'Cars 2',
 'Inside Out',
 'Ratatouille']

# ***Bidirectional Encoder Representations from Transformers (BERT) model:***

The weights of the pre-trained BERT model are used via feature extraction in the transfer learning technique, and the model is further trained on our dataset.
The hugging face transformers package is used to load the BERT model. We use **BERT-Distilled BERT**, which is lighter and faster.

In [45]:
bert = shows[['imdb_id', 'title', 'genre', 'director', 'keywords', 'actors', 'plot']]

In [46]:
#Installing HuggingFace Transformers framework 
!pip install transformers 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 40.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unins

In [47]:
import torch

In [48]:
import transformers

In [49]:
#using the DistilBERT model from transformers
bert_model=transformers.DistilBertModel

#End-to-end tokenization for punctuation and word piece, similar to BERT Tokenizer.
berttokenizer=transformers.DistilBertTokenizer

#This model is uncased, which means it does not differentiate between english and English.
weights_type="distilbert-base-uncased"

In [50]:
from transformers import logging
logging.set_verbosity_error()

In [51]:
# Loading pretrained model/tokenizer
tokenizer=berttokenizer.from_pretrained(weights_type)
model=bert_model.from_pretrained(weights_type)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

In [52]:
#Converting string values to a sequence of ids (integer), using the tokenizer and vocabulary.
inputs=bert['plot'].apply((lambda plot: tokenizer.encode(plot, add_special_tokens=True,max_length=100,truncation=True)))

In [53]:
inputs[0]

[101,
 1037,
 3492,
 1010,
 2759,
 10563,
 2064,
 1005,
 1056,
 2175,
 2041,
 2006,
 1037,
 3058,
 2127,
 2014,
 5665,
 1011,
 22148,
 3080,
 2905,
 2515,
 1012,
 102]

In [54]:
#representing the input as one 2-d array, rather than a list of lists (of different lengths).
def padding(list_of_sent):
  output=[]
  max_len=100
  for sent in list_of_sent.values:
    padded_sent=sent+[0]*(max_len-len(sent))
    output.append(padded_sent)
  output=np.array(output)
  return output

In [55]:
inputs=padding(inputs)

In [56]:
inputs[11]

array([  101,  1037,  4562, 21987,  1998,  1037, 10958, 21408,  2239,
        2468,  3435,  2814,  2043,  2027,  1005,  2128,  7260,  2185,
        2091,  1037,  2314,  1010,  2185,  2013,  2037,  2945,  1012,
         102,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0])

If we directly send padded to BERT, that would slightly confuse it. We need to create another variable to tell it to ignore (mask) the padding we've added when it's processing its input. That's what attention_mask is:

In [57]:
mask=np.where(inputs!=0,1,0)

In [58]:
embedded_inputs=torch.tensor(inputs)
attention_mask=torch.tensor(mask)

In [59]:
#The results of the processing will be returned into final_states.
with torch.no_grad():
  final_states = model(embedded_inputs, attention_mask=attention_mask)

In [60]:
#Extracted features
extracted_features=final_states[0][:,0,:].numpy()
extracted_features

array([[-0.08858871, -0.16396962, -0.02802457, ..., -0.12459738,
         0.355413  ,  0.26825094],
       [-0.25794283, -0.25276673, -0.18717615, ..., -0.05740415,
         0.37812752,  0.48238036],
       [-0.44859454, -0.35476866, -0.360466  , ...,  0.31453744,
         0.37856892,  0.24996652],
       ...,
       [-0.43022212, -0.40504026, -0.29433456, ..., -0.11832004,
         0.43650442,  0.41707504],
       [-0.41191915, -0.37585133, -0.02398869, ..., -0.07679958,
         0.23399453,  0.42657483],
       [-0.37286854, -0.2835495 , -0.14210361, ..., -0.06741582,
         0.42930105,  0.41614097]], dtype=float32)

In [61]:
#generating the cosine similarity matrix
cos_sim=cosine_similarity(extracted_features,extracted_features)
cos_sim

array([[0.9999999 , 0.88371813, 0.8533407 , ..., 0.9081422 , 0.93015504,
        0.9044118 ],
       [0.88371813, 1.0000001 , 0.89891803, ..., 0.95299524, 0.9349104 ,
        0.95558864],
       [0.8533407 , 0.89891803, 1.0000002 , ..., 0.8979055 , 0.883198  ,
        0.8919624 ],
       ...,
       [0.9081422 , 0.95299524, 0.8979055 , ..., 0.9999999 , 0.9392922 ,
        0.9577177 ],
       [0.93015504, 0.9349104 , 0.883198  , ..., 0.9392922 , 0.9999998 ,
        0.94566405],
       [0.9044118 , 0.95558864, 0.8919624 , ..., 0.9577177 , 0.94566405,
        1.0000001 ]], dtype=float32)

In [62]:
bert_new = bert[['imdb_id', 'title', 'genre', 'director', 'keywords', 'actors', 'plot']]

In [63]:
#We reset the index since the index values are incorrect.
bert_new.reset_index(inplace=True)

In [64]:
# creating a series for shows titles which can be used as indices (each index is mapped to a title)
indices = pd.Series(bert_new['title'])
indices[:5]

0                    10 Things I Hate About You
1                                101 Dalmatians
2    101 Dalmatians 2: Patch's London Adventure
3                                102 Dalmatians
4                         12 Dates of Christmas
Name: title, dtype: object

In [65]:
# this function takes in a movie title as input and returns the top 15 recommended (similar) shows

def recommenddd(title, cosine_sim = cos_sim):
    recommended = []
    idx = indices[indices == title].index[0]   # to get the index of the shows title matching the input movie
    similarity_scores = pd.Series(cos_sim[idx]).sort_values(ascending = False)   # similarity scores in descending order
    top_15_indices = list(similarity_scores.iloc[1:16].index)   # to get the indices of top 15 most similar movies
    # [1:16] to exclude 0 (index 0 is the input movie itself)
    
    for i in top_15_indices:   # to append the titles of top 15 similar shows 
        recommended.append(list(bert_new['title'])[i])
        
    return recommended


In [66]:
recommenddd("A Bug's Life")

['The Million Dollar Duck',
 'Chicken Little',
 'Dumbo',
 'Frankenweenie',
 'Flubber',
 "Don't Look Under the Bed",
 'G-Force',
 'The Muppet Movie',
 'Dinosaur',
 'The BFG',
 'James and the Giant Peach',
 'Zootopia',
 'Atlantis: The Lost Empire',
 '101 Dalmatians',
 'John Carter']

In [67]:
recommenddd("Star Wars: Episode IV - A New Hope")

['Star Wars: Episode V - The Empire Strikes Back',
 'Star Wars: Episode VI - Return of the Jedi',
 'Star Wars: Episode VII - The Force Awakens',
 'Star Wars: Episode II - Attack of the Clones',
 'Star Wars: Episode III - Revenge of the Sith',
 'Star Wars: Episode VIII - The Last Jedi',
 'Ant-Man',
 'Leroy & Stitch',
 'The New Yoda Chronicles: Raid on Coruscant',
 'Stitch! The Movie',
 'Captain America: The Winter Soldier',
 "Pirates of the Caribbean: Dead Man's Chest",
 "Atlantis: Milo's Return",
 'Marvel Rising: Heart of Iron',
 'Thor: Ragnarok']

# The **frontend** section

In Python, any object can be pickled and stored to disc. Pickling is a method for converting a Python object (list, dictionary, etc.) into a character stream. The concept is that this character stream provides all of the information required to recreate the object in a another Python script.

In [68]:
import pickle

In [69]:
pickle.dump(shows,open('disney.pkl','wb'))

In [70]:
pickle.dump(shows.to_dict(),open('disney_dict.pkl','wb'))

In [71]:
pickle.dump(gdka.to_dict(),open('gdka-dataset.pkl','wb'))

In [72]:
pickle.dump(cosine_sim,open('gdka.pkl','wb'))

In [73]:
pickle.dump(cos_sim,open('bert.pkl','wb'))

In [74]:
pickle.dump(bert_new.to_dict(),open('bert_dataset.pkl','wb'))

In [75]:
pickle.dump(top_movies.to_dict(),open('top_movies.pkl','wb'))
pickle.dump(top_episodes.to_dict(),open('top_episodes.pkl','wb'))
pickle.dump(top_shows.to_dict(),open('top_shows.pkl','wb'))