## Baseline Movie Recomender System

In [8]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
import torch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("Data/IMDB top 1000.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Certificate,Duration,Genre,Rate,Metascore,Description,Cast,Info
0,0,1. The Shawshank Redemption (1994),R,142 min,Drama,9.3,80.0,Two imprisoned men bond over a number of years...,"Director: Frank Darabont | Stars: Tim Robbins,...","Votes: 2,295,987 | Gross: $28.34M"
1,1,2. The Godfather (1972),R,175 min,"Crime, Drama",9.2,100.0,The aging patriarch of an organized crime dyna...,Director: Francis Ford Coppola | Stars: Marlon...,"Votes: 1,584,782 | Gross: $134.97M"
2,2,3. The Dark Knight (2008),PG-13,152 min,"Action, Crime, Drama",9.0,84.0,When the menace known as the Joker wreaks havo...,Director: Christopher Nolan | Stars: Christian...,"Votes: 2,260,649 | Gross: $534.86M"
3,3,4. The Godfather: Part II (1974),R,202 min,"Crime, Drama",9.0,90.0,The early life and career of Vito Corleone in ...,Director: Francis Ford Coppola | Stars: Al Pac...,"Votes: 1,107,253 | Gross: $57.30M"
4,4,5. The Lord of the Rings: The Return of the Ki...,PG-13,201 min,"Action, Adventure, Drama",8.9,94.0,Gandalf and Aragorn lead the World of Men agai...,"Director: Peter Jackson | Stars: Elijah Wood, ...","Votes: 1,614,369 | Gross: $377.85M"


In [4]:
df.shape

(1000, 10)

In [20]:
plots = df['Description'].values.tolist()
for i in range(len(plots)):
    plots[i] = plots[i] + " " +df.loc[i,'Genre'] + " " + df.loc[i,'Cast']


## TF IDF recomender

In [34]:
df[df['Cast'].str.contains('Leonardo DiCaprio') ]

Unnamed: 0.1,Unnamed: 0,Title,Certificate,Duration,Genre,Rate,Metascore,Description,Cast,Info
8,8,9. Inception (2010),PG-13,148 min,"Action, Adventure, Sci-Fi",8.8,74.0,A thief who steals corporate secrets through t...,Director: Christopher Nolan | Stars: Leonardo ...,"Votes: 2,022,655 | Gross: $292.58M"
36,36,37. The Departed (2006),R,151 min,"Crime, Drama, Thriller",8.5,85.0,An undercover cop and a mole in the police att...,Director: Martin Scorsese | Stars: Leonardo Di...,"Votes: 1,167,751 | Gross: $132.38M"
61,61,62. Django Unchained (2012),R,165 min,"Drama, Western",8.4,81.0,"With the help of a German bounty hunter, a fre...",Director: Quentin Tarantino | Stars: Jamie Fox...,"Votes: 1,328,656 | Gross: $162.81M"
144,144,145. Shutter Island (2010),R,138 min,"Mystery, Thriller",8.2,63.0,"In 1954, a U.S. Marshal investigates the disap...",Director: Martin Scorsese | Stars: Leonardo Di...,"Votes: 1,105,456 | Gross: $128.01M"
146,146,147. The Wolf of Wall Street (2013),R,180 min,"Biography, Crime, Drama",8.2,75.0,"Based on the true story of Jordan Belfort, fro...",Director: Martin Scorsese | Stars: Leonardo Di...,"Votes: 1,152,702 | Gross: $116.90M"
239,239,242. Catch Me If You Can (2002),PG-13,141 min,"Biography, Crime, Drama",8.1,75.0,A seasoned FBI agent pursues Frank Abagnale Jr...,Director: Steven Spielberg | Stars: Leonardo D...,"Votes: 808,593 | Gross: $164.62M"
339,339,342. The Revenant (2015),R,156 min,"Action, Adventure, Drama",8.0,76.0,A frontiersman on a fur trading expedition in ...,Director: Alejandro G. Iñárritu | Stars: Leona...,"Votes: 690,094 | Gross: $183.64M"
356,356,359. Blood Diamond (2006),R,143 min,"Adventure, Drama, Thriller",8.0,64.0,"A fisherman, a smuggler, and a syndicate of bu...",Director: Edward Zwick | Stars: Leonardo DiCap...,"Votes: 492,148 | Gross: $57.37M"
439,439,342. The Revenant (2015),R,156 min,"Action, Adventure, Drama",8.0,76.0,A frontiersman on a fur trading expedition in ...,Director: Alejandro G. Iñárritu | Stars: Leona...,"Votes: 690,094 | Gross: $183.64M"
456,456,359. Blood Diamond (2006),R,143 min,"Adventure, Drama, Thriller",8.0,64.0,"A fisherman, a smuggler, and a syndicate of bu...",Director: Edward Zwick | Stars: Leonardo DiCap...,"Votes: 492,148 | Gross: $57.37M"


In [37]:
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(use_idf=False)

# Fit the vectorizer to the documents and transform them into TF-IDF vectors
tfidf = tfidf_vectorizer.fit_transform(plots)

In [41]:
query = ["Action with Leonardo DiCaprio"]

tfidf_query = tfidf_vectorizer.transform(query)

cos_similarities = cosine_similarity(tfidf_query,tfidf)
idx = np.argmax(cos_similarities)
df.loc[idx]['Title'],plots[idx],cos_similarities[:,idx]

('62. Django Unchained (2012)',
 'With the help of a German bounty hunter, a freed slave sets out to rescue his wife from a brutal Mississippi plantation owner. Drama, Western Director: Quentin Tarantino | Stars: Jamie Foxx, Christoph Waltz, Leonardo DiCaprio, Kerry Washington',
 array([0.25724788]))

In [34]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [35]:
tokens = tokenizer(plots, padding=True, truncation=True, return_tensors='pt')

In [36]:
with torch.no_grad():
    outputs = model(**tokens)

embeddings = outputs.pooler_output

In [39]:

query = ["Action movie with horror "]
query_token = tokenizer(query, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    outputs = model(**query_token)

embedding_query = outputs.pooler_output
# Calculate cosine similarity between query and all descriptions
cos_similarities = cosine_similarity(embedding_query,embeddings)



idx = np.argmax(cos_similarities)

In [40]:
df.loc[idx]['Title'],plots[idx]

('297. The 400 Blows (1959)',
 'A young boy, left without attention, delves into a life of petty crime. Crime, Drama')