In [1]:
import pandas as pd
import numpy as np

import warnings

# Suppress specific warning
warnings.filterwarnings("ignore", message="Some weights of the model checkpoint at bert-large-uncased were not used")

# Reset the warning filters if needed
warnings.resetwarnings()

In [2]:
movies = pd.read_csv("top10K-TMDB-movies.csv")

In [3]:
movies.head(10)

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811
5,667257,Impossible Things,"Family,Drama",es,"Matilde is a woman who, after the death of her...",14.358,2021-06-17,8.6,255
6,129,Spirited Away,"Animation,Family,Fantasy",ja,"A young girl, Chihiro, becomes trapped in a st...",92.056,2001-07-20,8.5,13093
7,730154,Your Eyes Tell,"Romance,Drama",ja,"A tragic accident lead to Kaori's blindness, b...",51.345,2020-10-23,8.5,339
8,372754,Dou kyu sei – Classmates,"Romance,Animation",ja,"Rihito Sajo, an honor student with a perfect s...",14.285,2016-02-20,8.5,239
9,372058,Your Name.,"Romance,Animation,Drama",ja,High schoolers Mitsuha and Taki are complete s...,158.27,2016-08-26,8.5,8895


In [4]:
movies.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,10000.0,10000.0,10000.0,10000.0
mean,161243.505,34.697267,6.62115,1547.3094
std,211422.046043,211.684175,0.766231,2648.295789
min,5.0,0.6,4.6,200.0
25%,10127.75,9.15475,6.1,315.0
50%,30002.5,13.6375,6.6,583.5
75%,310133.5,25.65125,7.2,1460.0
max,934761.0,10436.917,8.7,31917.0


In [5]:
movies.isna().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.3+ KB


# Feature Selection

In [7]:
#feature selection
#id needed to get info

#only relevant features
movies = movies[['id', 'title', 'genre', 'overview', 'original_language']]

In [8]:
movies['tags'] = movies['overview'] + ' Genre: ' + movies['genre'] + '. Language: ' + movies['original_language']

In [9]:
movies_updated  = movies.drop(['overview', 'genre', 'original_language'], axis = 1)

In [10]:
movies_updated

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...
...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
import spacy

# SpaCy English model
nlp = spacy.load("en_core_web_sm")

  import sre_constants


In [13]:
# Custom tokenizer function using SpaCy
def spacy_tokenizer(text):
    # Use SpaCy to tokenize the input text (lemmatization)
    tokens = [token.lemma_ for token in nlp(text)]
    return tokens

In [14]:
cv = CountVectorizer(tokenizer=spacy_tokenizer, stop_words='english', analyzer='word', 
                     ngram_range=(1,2), max_features=20000)

In [15]:
#astype('U') = ensure that the data type of the array is Unicode ('U'). 
#This is relevant when working with text data, as 'U' represents Unicode strings.

#toarray(converts sparse matrix to an array)
vector=cv.fit_transform(movies_updated['tags'].values.astype('U')).toarray()



In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
similarity = cosine_similarity(vector)

In [18]:
similarity.shape

(10000, 10000)

In [19]:
movies_updated[movies_updated['title']=="The Godfather: Part II"].index[0]#get the row

4

In [20]:
sorted(list(enumerate(similarity[4])), reverse = True, key = lambda x: x[1])

[(4, 0.9999999999999996),
 (2, 0.6944826067288968),
 (1611, 0.6432675209026769),
 (3848, 0.6126357341930255),
 (4569, 0.6072132141477101),
 (9520, 0.6066047713783691),
 (5280, 0.6026007357305201),
 (1307, 0.5980200700475929),
 (1349, 0.5980200700475929),
 (1298, 0.5908789478687515),
 (3481, 0.5902607587595918),
 (3795, 0.5902607587595918),
 (2647, 0.5890639296071121),
 (1456, 0.5885842636743428),
 (8827, 0.5867355143396495),
 (5566, 0.5860738128103984),
 (2556, 0.5856728644680651),
 (7866, 0.5851084523168254),
 (994, 0.5829810534236405),
 (6188, 0.5820039474833743),
 (5100, 0.5812085000771592),
 (8771, 0.5808023194967747),
 (6104, 0.5802979318032874),
 (6402, 0.5801356143959375),
 (6070, 0.5798335076138015),
 (8495, 0.5789860038610238),
 (1133, 0.5774391401620893),
 (1451, 0.5773502691896261),
 (6297, 0.576630231159494),
 (7693, 0.5763539780776887),
 (4712, 0.5762604874753149),
 (1209, 0.5761545131199665),
 (2753, 0.5754510855264823),
 (2879, 0.5753559617824608),
 (4567, 0.575355961782

In [21]:
movies_updated.iloc[4].title

'The Godfather: Part II'

In [22]:
def recommend_movie(movie):
    index = movies_updated[movies_updated['title']==movie].index[0]
    print("Given Movie: ", movies_updated.iloc[index].title)
    print("Genre: ", movies.iloc[index].genre)
    print(" ")
    distance = sorted(list(enumerate(similarity[index])), reverse = True, key = lambda x: x[1])
    for i, closeness in distance[1: 6]:
        print("Movie: ", movies_updated.iloc[i].title)
        print("Genre: ", movies.iloc[i].genre)
        print(" ")

In [23]:
recommend_movie("Your Eyes Tell")

Given Movie:  Your Eyes Tell
Genre:  Romance,Drama
 
Movie:  Sliding Doors
Genre:  Comedy,Drama,Fantasy,Romance
 
Movie:  Your Name.
Genre:  Romance,Animation,Drama
 
Movie:  Barfi!
Genre:  Drama,Romance,Comedy
 
Movie:  40 Days and 40 Nights
Genre:  Comedy,Romance
 
Movie:  Twilight Zone: The Movie
Genre:  Fantasy,Horror,Science Fiction,Thriller
 


In [24]:
from summarizer import Summarizer

In [25]:
text = movies["tags"][7]
ind = text.lower().index("genre")
text = text[0:ind]
model = Summarizer()
summary = model(text)    
print(summary)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


A tragic accident lead to Kaori's blindness, but she clings to life and the smaller pleasures it can still afford her. Rui was once a promising kickboxer, but something happened in his past.


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
vectorizer = TfidfVectorizer()
vector_tfIDF = vectorizer.fit_transform(movies_updated['tags'].values.astype('U')).toarray()

In [28]:
vector_tfIDF.shape

(10000, 28274)

In [29]:
cosine_sim = cosine_similarity(vector_tfIDF)

In [30]:
cosine_sim.shape

(10000, 10000)

In [31]:
def tfidf_recommend_movie(movie):
    index = movies_updated[movies_updated['title'] == movie].index[0]
    print("Given Movie: ", movies_updated.iloc[index]["title"])
    print("Genre: ", movies.iloc[index]["genre"])
    print("  ")
    top5 = sorted(list(enumerate(cosine_sim[index])), reverse = True, key = lambda x: x[1])[1:6]
    for i,value in top5:
        print("Movie: ", movies_updated.iloc[i]["title"])
        print("Genre: ", movies.iloc[i]["genre"])
        print("  ")

In [32]:
tfidf_recommend_movie('Your Eyes Tell')

Given Movie:  Your Eyes Tell
Genre:  Romance,Drama
  
Movie:  Ballsy Girl
Genre:  Comedy
  
Movie:  The Butterfly Effect 2
Genre:  Drama,Science Fiction,Thriller
  
Movie:  About Time
Genre:  Drama,Romance,Fantasy
  
Movie:  Escort in Love
Genre:  Comedy
  
Movie:  The Back-Up Plan
Genre:  Comedy,Romance
  


In [33]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [34]:
embeddings = model.encode(movies_updated['tags'].tolist())

In [35]:
sent_transformer_similarity = cosine_similarity(embeddings)

In [37]:
sent_transformer_similarity[4]

array([0.25208506, 0.1450231 , 0.809552  , ..., 0.09299377, 0.32817382,
       0.3495331 ], dtype=float32)

In [36]:
sent_transformer_similarity.shape

(10000, 10000)

In [121]:
"""bert_cosine_similarities = []
for movie in movies_updated['tags']:
    model1 = SentenceTransformer('all-MiniLM-L6-v2')
    movie_emb = model1.encode(movie)
    bert_cosine_similarities.append(cosine_similarity(movie_emb.reshape(1, -1), embeddings))""";

In [42]:
"""def bert_recommend_movie(movie):
    index = movies_updated[movies_updated['title'] == movie].index[0]
    
    model1 = SentenceTransformer('all-MiniLM-L6-v2')
    model2 = Summarizer()
    
    movie_emb = model1.encode(movies_updated['tags'][index])
    top5 = np.argsort(cosine_similarity(movie_emb.reshape(1, -1), embeddings)[0])[-6:-1][::-1]

    print("Given Movie: ", movies_updated.iloc[index]["title"])
    print("Genre: ", movies.iloc[index]["genre"])
    print("Summary: ",model2(movies_updated['tags'][index]))
    print("  ")
    
    for i in top5:
        print("Movie: ", movies_updated.iloc[i]["title"])
        print("Genre: ", movies.iloc[i]["genre"])
        print("Summary: ",model2(movies_updated['tags'][i]))
        print("  ")
    
    index = movies_updated[movies_updated['title'] == movie].index[0]
    print("Given Movie: ", movies_updated.iloc[index]["title"])
    print("Genre: ", movies.iloc[index]["genre"])
    print("  ")
    top5 = np.argsort(bert_cosine_similarities[index][0])[-6:-1][::-1]
    for i,value in top5:
        print("Movie: ", movies_updated.iloc[i]["title"])
        print("Genre: ", movies.iloc[i]["genre"])
        print("  ")
    """;

def bert_recommend_movie(movie):
    index = movies_updated[movies_updated['title'] == movie].index[0]
    print("Given Movie: ", movies_updated.iloc[index]["title"])
    print("Genre: ", movies.iloc[index]["genre"])
    print("  ")
    top5 = sorted(list(enumerate(sent_transformer_similarity[index])), reverse = True, key = lambda x: x[1])[1:6]
    for i,value in top5:
        print("Movie: ", movies_updated.iloc[i]["title"])
        print("Genre: ", movies.iloc[i]["genre"])
        model = Summarizer()
        print("Summary: ",model(movies_updated['tags'][i]))
        print("  ")

In [43]:
bert_recommend_movie('Your Name.')

Given Movie:  Your Name.
Genre:  Romance,Animation,Drama
  
Movie:  5 Centimeters per Second
Genre:  Animation,Drama,Romance


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Summary:  Three moments in Takaki's life: his relationship with Akari and their forced separation; his friendship with Kanae, who is secretly in love with him; the demands and disappointments of adulthood, an unhappy life in a cold city.
  
Movie:  The Garden of Words
Genre:  Animation,Drama,Romance


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Summary:  Takao, who is training to become a shoemaker, skipped school and is sketching shoes in a Japanese-style garden. Then, without arranging the times, the two start to see each other again and again, but only on rainy days.
  
Movie:  Ocean Waves
Genre:  Romance,Animation,Drama


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Summary:  At Kichijōji Station, Tokyo, Taku Morisaki glimpses a familiar woman on the platform opposite boarding a train. As the aeroplane takes off, he narrates the events that brought her into his life... Genre: Romance,Animation,Drama.
  
Movie:  Your Eyes Tell
Genre:  Romance,Drama


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Summary:  A tragic accident lead to Kaori's blindness, but she clings to life and the smaller pleasures it can still afford her. Rui was once a promising kickboxer, but something happened in his past.
  
Movie:  As the Gods Will
Genre:  Thriller,Horror,Comedy


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Summary:  High school student Shun Takahata is bored. Bored with the day-to-day monotony of school and life, he prays for change, for something exciting.
  




In [45]:
import pickle

In [55]:
pickle.dump(movies_updated, open('movies_updated.pkl', 'wb'))
pickle.dump(movies, open('movies.pkl', 'wb'))

  pickle.dump(movies_updated, open('movies_updated.pkl', 'wb'))
  pickle.dump(movies, open('movies.pkl', 'wb'))


In [47]:
pickle.dump(similarity, open('count_vectorizer_similarity.pkl', 'wb'))
pickle.dump(cosine_sim, open('tfidf_similarity.pkl', 'wb'))
pickle.dump(sent_transformer_similarity, open('sent_transformer_similarity.pkl', 'wb'))

  pickle.dump(similarity, open('count_vectorizer_similarity.pkl', 'wb'))
  pickle.dump(cosine_sim, open('tfidf_similarity.pkl', 'wb'))
  pickle.dump(sent_transformer_similarity, open('sent_transformer_similarity.pkl', 'wb'))


In [52]:
pickle.load(open('movies_updated.pkl', 'rb'))

  pickle.load(open('movies_updated.pkl', 'rb'))


Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...
...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...


In [56]:
pickle.load(open('movies.pkl', 'rb'))

  pickle.load(open('movies.pkl', 'rb'))


Unnamed: 0,id,title,genre,overview,original_language,tags
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...,en,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second...",hi,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o...",en,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...,en,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...,en,In the continuing saga of the Corleone crime f...
...,...,...,...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy","The story follows the adventures of Aang, a yo...",en,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",The sharks take bite out of the East Coast whe...,en,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"Action,Science Fiction,War","During World War II, a brave, patriotic Americ...",en,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",A man named Farmer sets out to rescue his kidn...,en,A man named Farmer sets out to rescue his kidn...
