<a href="https://colab.research.google.com/github/samya-ravenXI/Movie-Recommendation-System/blob/main/Movie%20Recommendation%20System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Recommendation System
---

<br>

**MovieLens 25M Dataset-**
> MovieLens 25M movie ratings. Stable benchmark dataset. 25 million ratings and one million tag applications applied to 62,000 movies by 162,000 users. Includes tag genome data with 15 million relevance scores across 1,129 tags.

Dataset~ [MovieLens 25M](https://grouplens.org/datasets/movielens/25m/)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing Dataset and Dependencies

In [None]:
%%capture
!pip install plotly
!pip install tmdbv3api
!pip install python-igraph
!pip install networkx == 2.6.3
!pip install --user scipy == 1.8.1
!pip install surprise

In [None]:
import re
import ast
import numpy as np
import pandas as pd
import igraph as ig
import networkx as nx
from tmdbv3api import TMDb
from tmdbv3api import Movie
import plotly.express as px
import plotly.graph_objects as go
from collections import defaultdict
from surprise import accuracy, Dataset, Reader, dump
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

> Importing API Key

In [None]:
# Make sure to put the apiPass.txt in your drive
file1 = open('/content/drive/MyDrive/Movie Recommendation System/webscraped/apiPass.txt', 'r')
  
# Writing a string to file
API_KEY = file1.read()
  
# Closing file
file1.close()

> Reading all the available datasets

In [None]:
tags = pd.read_csv('/content/drive/MyDrive/Movie Recommendation System/ml-25m/tags.csv')
links = pd.read_csv('/content/drive/MyDrive/Movie Recommendation System/ml-25m/links.csv')
movies = pd.read_csv('/content/drive/MyDrive/Movie Recommendation System/ml-25m/movies.csv')
ratings = pd.read_csv('/content/drive/MyDrive/Movie Recommendation System/ml-25m/ratings.csv')

# Importing Web-Scraped data

posters = pd.read_csv('/content/drive/MyDrive/Movie Recommendation System/webscraped/posters.csv', lineterminator='\n')
trailers = pd.read_csv('/content/drive/MyDrive/Movie Recommendation System/webscraped/trailers.csv', lineterminator='\n')
desc_movies = pd.read_csv('/content/drive/MyDrive/Movie Recommendation System/webscraped/desc_movies.csv', lineterminator='\n')
desc2_movies = pd.read_csv('/content/drive/MyDrive/Movie Recommendation System/webscraped/desc2_movies.csv', lineterminator='\n')

## Initial Data Exploration
---

### Ratings Data Grouped by The Users

In [None]:
db = ratings.sort_values(['userId', 'movieId']).groupby('userId')
db.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
24999913,162541,29,5.0,1240953540
24999914,162541,32,5.0,1240949283
24999915,162541,47,4.5,1240953606
24999916,162541,50,5.0,1240953428


### Genre Seggregation of Movies

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
count = 0
genres = {}

for i in movies.values:
  for j in i[2].split('|'):
    if j not in genres:
      genres[j] = count
      count += 1
print(genres)

{'Adventure': 0, 'Animation': 1, 'Children': 2, 'Comedy': 3, 'Fantasy': 4, 'Romance': 5, 'Drama': 6, 'Action': 7, 'Crime': 8, 'Thriller': 9, 'Horror': 10, 'Mystery': 11, 'Sci-Fi': 12, 'IMAX': 13, 'Documentary': 14, 'War': 15, 'Musical': 16, 'Western': 17, 'Film-Noir': 18, '(no genres listed)': 19}


In [None]:
data = []
gen = list(genres.keys())

for i in movies.values:
  d = [i[0]] + [i[1]] + [0] * len(gen)
  for j in i[2].split('|'):
    d[genres[j] + 2] = 1
  data.append(d)

labels = ['movieId', 'title'] + gen
df = pd.DataFrame(columns=labels, data=data)
df.head()

Unnamed: 0,movieId,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
dist = [0] * len(gen)

for i in gen:
  dist[genres[i]] = df[df[i] == 1].size

db = pd.DataFrame({'genre': gen, 'value': dist})
db.head()

Unnamed: 0,genre,value
0,Adventure,91190
1,Animation,64438
2,Children,64570
3,Comedy,371140
4,Fantasy,60082


In [None]:
fig = px.bar(db, x='genre', y='value', color='value', color_continuous_scale=px.colors.sequential.RdBu)
fig.show()

### Top 100 Highest Rated Movies

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
db = ratings.iloc[:, 1:3].sort_values('rating', ascending=False)
db = db[db['rating'] == 5].copy()
db.drop_duplicates(keep='first', inplace=True)
db = db.sort_values('movieId')
db.head()

Unnamed: 0,movieId,rating
10470453,1,5.0
10470454,2,5.0
10470455,3,5.0
10409984,4,5.0
10523352,5,5.0


In [None]:
highRatedMovies = pd.merge(db['movieId'], movies, on='movieId', how='inner')
highRatedMovies.drop_duplicates(keep='first', inplace=True)
highRatedMovies = highRatedMovies.sort_values('movieId')
highRatedMovies = highRatedMovies.head(100)

In [None]:
highRatedMovies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Highest Rated Genres

In [None]:
highRatedGenres = {}

for i in highRatedMovies.values:
  for j in i[2].split('|'):
    if j not in highRatedGenres:
      highRatedGenres[j] = 1
    else:
      highRatedGenres[j] += 1

highRatedGenres = {k: v for k, v in sorted(highRatedGenres.items(), key=lambda item: item[1])}
db = pd.DataFrame({'genres': highRatedGenres.keys(), 'no. of ratings': highRatedGenres.values()})
db.head()

Unnamed: 0,genres,no. of ratings
0,Musical,1
1,IMAX,2
2,War,2
3,Animation,3
4,Documentary,3


In [None]:
fig = px.bar(db, x='genres', y='no. of ratings', color='no. of ratings', color_continuous_scale=px.colors.sequential.Magenta)
fig.show()

### Top 100 Most Rated Movies

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
mostRated = ratings.groupby('movieId')['movieId'].count().reset_index(name='count')
mostRated = mostRated.sort_values('count', ascending=False)
mostRated = mostRated.head(100)

In [None]:
mostRatedMovies = pd.merge(mostRated['movieId'], movies, on='movieId', how='inner').sort_values('movieId')
mostRatedMovies.head()

Unnamed: 0,movieId,title,genres
12,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
27,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
81,34,Babe (1995),Children|Drama
21,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
14,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


### Most Rated Genres



In [None]:
mostRatedGenres = {}

for i in mostRatedMovies.values:
  for j in i[2].split('|'):
    if j not in mostRatedGenres:
      mostRatedGenres[j] = 1
    else:
      mostRatedGenres[j] += 1

mostRatedGenres = {k: v for k, v in sorted(highRatedGenres.items(), key=lambda item: item[1])}
db = pd.DataFrame({'genres': mostRatedGenres.keys(), 'no. of ratings': mostRatedGenres.values()})
db.head()

Unnamed: 0,genres,no. of ratings
0,Musical,1
1,IMAX,2
2,War,2
3,Animation,3
4,Documentary,3


In [None]:
fig = px.bar(db, x='genres', y='no. of ratings', color='no. of ratings', color_continuous_scale=px.colors.sequential.Purpor)
fig.show()

### Users With The Most Ratings

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [None]:
mostUsers = ratings.iloc[:, :1].groupby('userId')['userId'].count().reset_index(name='count')
mostUsers = mostUsers.sort_values('count')
mostUsers.head()

Unnamed: 0,userId,count
56218,56219,20
87800,87801,20
129588,129589,20
57942,57943,20
129585,129586,20


In [None]:
mostUsers.shape

(162541, 2)

## Web Scraping from TMDB API for Movie Overview, Populariy and Cast Information

In [None]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [None]:
tmdb = TMDb()
tmdb.api_key = API_KEY
tmdb.language = 'en'
tmdb.debug = True

In [None]:
movie = Movie()

ids = []
casts = []
titles = []
keywords = []
trailers = []
overviews = []
vote_counts = []
poster_links = []
popularities = []
vote_averages = []

for id in links.values:
  try:
    m = movie.details(id[2])
    titles.append(m.title)
    overviews.append(m.overview)
    vote_counts.append(m.vote_count)
    popularities.append(m.popularity)
    vote_averages.append(m.vote_average)
    trailers.append(m.trailers.youtube)
    keywords.append([i.name for i in m.keywords.keywords])
    cast = []
    for i in m.casts.cast:
      if i.popularity >= 25:
        cast.append(i.name)
    casts.append(cast[:5])
    poster_links.append('https://image.tmdb.org/t/p/original' + m.poster_path)
    ids.append(m.id)
  except:
    continue

In [None]:
# Formatting the trailer information

trailerContainer = []
for i in trailers:
    listTrailers = {}
    for j in i:
        listTrailers[j.name] = j.source
    trailerContainer.append(listTrailers)

In [None]:
desc_movies = pd.DataFrame({'movieId': ids, 'title': titles,'popularity': popularities, 'overview': overviews, 'casts': casts})
desc_movies.head()

Unnamed: 0,movieId,title,popularity,overview,casts
0,862,Toy Story,125.842,"Led by Woody, Andy's toys live happily in his ...","['Tom Hanks', 'Tim Allen', 'Wallace Shawn', 'A..."
1,8844,Jumanji,19.543,When siblings Judy and Peter discover an encha...,['Kirsten Dunst']
2,15602,Grumpier Old Men,11.176,A family wedding reignites the ancient feud be...,"['Sophia Loren', 'Daryl Hannah']"
3,31357,Waiting to Exhale,11.559,"Cheated on, mistreated and stepped on, the wom...","['Angela Bassett', 'Lamont Johnson', 'Wesley S..."
4,11862,Father of the Bride Part II,10.681,Just when George Banks has recovered from his ...,['BD Wong']


In [None]:
desc2_movies = pd.DataFrame({'movieId': ids, 'keywords': keywords, 'vote_count': vote_counts, 'vote_average': vote_averages})
desc2_movies.head()

Unnamed: 0,movieId,keywords,vote_count,vote_average
0,862,"['martial arts', 'jealousy', 'friendship', 'bu...",16335,7.966
1,8844,"['giant insect', 'board game', 'jungle', 'disa...",9429,7.237
2,15602,"['fishing', 'halloween', 'sequel', 'old man', ...",320,6.463
3,31357,"['based on novel or book', 'interracial relati...",128,6.3
4,11862,"['parent child relationship', 'baby', 'midlife...",623,6.226


In [None]:
trailers = pd.DataFrame({'movieId': ids, 'trailers': trailerContainer})
trailers.head()

Unnamed: 0,movieId,trailers
0,862,"{""Sid's House"": 'u9y2tXnjP_E', 'Teaser Trailer..."
1,8844,"{'International Teaser': '3aFLqO5I73E', 'Gag R..."
2,15602,{'Grumpier Old Men - Trailer': 'rEnOoWs3FuA'}
3,31357,{'Waiting To Exhale': 'j9xml1CxgXI'}
4,11862,{'Father of the Bride Part II (1995) 35mm film...


In [None]:
posters = pd.DataFrame({'movieId': ids[:len(poster_links)], 'posters': poster_links})
posters.head()

Unnamed: 0,movieId,posters
0,862,https://image.tmdb.org/t/p/original/uXDfjJbdP4...
1,8844,https://image.tmdb.org/t/p/original/vgpXmVaVyU...
2,15602,https://image.tmdb.org/t/p/original/1FSXpj5e8l...
3,31357,https://image.tmdb.org/t/p/original/4uw6HKq4vl...
4,11862,https://image.tmdb.org/t/p/original/rj4LBtwQ0u...


In [None]:
posters.to_csv('/content/drive/MyDrive/Movie Recommendation System/ml-25m/posters.csv', index=False)
trailers.to_csv('/content/drive/MyDrive/Movie Recommendation System/ml-25m/trailers.csv', index=False)
desc_movies.to_csv('/content/drive/MyDrive/Movie Recommendation System/ml-25m/desc_movies.csv', index=False)
desc2_movies.to_csv('/content/drive/MyDrive/Movie Recommendation System/ml-25m/desc2_movies.csv', index=False)

## Web Scraped Data Exploration

### Top 50 Most Popular Movies

In [None]:
mostPopular = desc_movies.iloc[:, :3].sort_values('popularity', ascending=False)
mostPopular.head()

Unnamed: 0,movieId,title,popularity
14025,19995,Avatar,1355.52
10407,411,"The Chronicles of Narnia: The Lion, the Witch ...",748.264
28537,297270,Tinker Bell and the Legend of the NeverBeast,649.822
46037,111332,Avatar: Creating the World of Pandora,393.403
57432,335787,Uncharted,282.61


In [None]:
fig = px.bar(mostPopular[:50], y='title', x='popularity', color='popularity', color_continuous_scale=px.colors.sequential.thermal)
fig.show()

### Top 50 Most Voted Movies

In [None]:
db = pd.merge(desc_movies, desc2_movies, on='movieId', how='inner')
mostVoted = db.loc[:, db.columns.intersection(['movieId', 'title', 'vote_count'])].sort_values('vote_count', ascending=False)
mostVoted.head()

Unnamed: 0,movieId,title,vote_count
14918,27205,Inception,33120
21072,157336,Interstellar,30451
12216,155,The Dark Knight,29127
14098,19995,Avatar,28203
17035,24428,The Avengers,28145


In [None]:
fig = px.bar(mostVoted[:50], y='title', x='vote_count', color='vote_count', color_continuous_scale=px.colors.sequential.deep)
fig.show()

### Average Vote Distribution

In [None]:
db = pd.merge(desc_movies, desc2_movies, on='movieId', how='inner')
db = db.loc[:, db.columns.intersection(['movieId', 'title', 'vote_average'])]
voteDist = db.groupby('vote_average')['vote_average'].count().reset_index(name='vote_dist').sort_values('vote_dist', ascending=False)
voteDist.head()

Unnamed: 0,vote_average,vote_dist
1800,6.0,2393
1997,6.2,1797
2296,6.5,1711
2197,6.4,1642
2795,7.0,1639


In [None]:
fig = px.scatter(voteDist, x='vote_average', y='vote_dist', size='vote_dist', color='vote_dist', color_continuous_scale=px.colors.sequential.Burg, marginal_x='histogram', marginal_y='rug')
fig.show()

### Cross Appearance of Popularity and Count

In [None]:
db = pd.merge(desc_movies, desc2_movies, on='movieId', how='inner')[['popularity', 'vote_count']]
db.shape

(61656, 2)

In [None]:
fig = px.scatter(db.sample(n=10000), x='popularity', y='vote_count', size='vote_count', color='vote_count', color_continuous_scale=px.colors.sequential.Darkmint, marginal_x='rug', marginal_y='rug')
fig.show()

### Keyword Seggregation of Movies

In [None]:
db = pd.merge(desc_movies, desc2_movies, on='movieId', how='inner')
db = db.loc[:, db.columns.intersection(['movieId', 'title', 'keywords'])]
db.head()

Unnamed: 0,movieId,title,keywords
0,862,Toy Story,"['martial arts', 'jealousy', 'friendship', 'bu..."
1,8844,Jumanji,"['giant insect', 'board game', 'jungle', 'disa..."
2,15602,Grumpier Old Men,"['fishing', 'halloween', 'sequel', 'old man', ..."
3,31357,Waiting to Exhale,"['based on novel or book', 'interracial relati..."
4,11862,Father of the Bride Part II,"['parent child relationship', 'baby', 'midlife..."


In [None]:
keywords = {}
for i in db['keywords'].values:
  for j in i.strip("']['").split("', '"):
    if j in keywords:
      keywords[j] += 1
    else:
      keywords[j] = 1
keywords = dict(sorted(keywords.items(), key=lambda item:item[1], reverse=True))

In [None]:
# Top 100 Most Common Keywords

keywords = dict(sorted(keywords.items(), key=lambda item:item[1], reverse=True)[1:101])     # First value is an empty string
keys = list(keywords.keys())
counts = list(keywords.values())

fig = px.bar(x=keys, y=counts, color=counts, color_continuous_scale=px.colors.sequential.Teal)
fig.show()

### Coapperance Network of Cast Members (Actors)

In [None]:
desc_movies.head()

Unnamed: 0,movieId,title,popularity,overview,casts
0,862,Toy Story,125.842,"Led by Woody, Andy's toys live happily in his ...","['Tom Hanks', 'Tim Allen', 'Wallace Shawn', 'A..."
1,8844,Jumanji,19.543,When siblings Judy and Peter discover an encha...,['Kirsten Dunst']
2,15602,Grumpier Old Men,11.176,A family wedding reignites the ancient feud be...,"['Sophia Loren', 'Daryl Hannah']"
3,31357,Waiting to Exhale,11.559,"Cheated on, mistreated and stepped on, the wom...","['Angela Bassett', 'Lamont Johnson', 'Wesley S..."
4,11862,Father of the Bride Part II,10.681,Just when George Banks has recovered from his ...,['BD Wong']


In [None]:
def casts(x):
  return ', '.join(ast.literal_eval(x.casts))

tempdb = pd.concat([desc_movies.iloc[:, :-3], desc_movies.apply(casts, axis=1)], axis=1)
tempdb = tempdb.rename(columns={0: 'casts'})
tempdb = tempdb[tempdb['casts'] != '']
tempdb.head()

Unnamed: 0,movieId,title,casts
0,862,Toy Story,"Tom Hanks, Tim Allen, Wallace Shawn, Annie Potts"
1,8844,Jumanji,Kirsten Dunst
2,15602,Grumpier Old Men,"Sophia Loren, Daryl Hannah"
3,31357,Waiting to Exhale,"Angela Bassett, Lamont Johnson, Wesley Snipes,..."
4,11862,Father of the Bride Part II,BD Wong


In [None]:
tempdb = tempdb[tempdb['casts'].str.contains(u', ')]

In [None]:
from pandas.io.parsers.python_parser import count_empty_vals

actors = {}
casts = list(tempdb.casts)
for i in casts:
  j = i.split(', ')
  if j[0] not in actors:
    actors[j[0]] = j[1:]
  if j[0] in actors:
    actors[j[0]] += j[1:]

graphActors = {key: val for key, val in actors.items() if val != []}

count = 0
actors = {}
for i in casts:
  for j in i.split(', '):
    if j not in actors:
      actors[j] = count
      count += 1

source = []
target = []
weight = []

for key, value in graphActors.items():
  worked = {}

  source += [key] * len(set(value))

  for i in value:
    if i not in worked:
      worked[i] = 1
    else:
      worked[i] += 1
  
  target += worked.keys()
  weight += worked.values()

graphWeights = pd.DataFrame({'source': source, 'target': target, 'weight': weight})
graphWeights = graphWeights[graphWeights['source'] != graphWeights['target']]
graphWeights

Unnamed: 0,source,target,weight
0,Tom Hanks,Tim Allen,6
1,Tom Hanks,Wallace Shawn,9
2,Tom Hanks,Annie Potts,4
3,Tom Hanks,Kevin Bacon,1
4,Tom Hanks,Ed Harris,1
...,...,...,...
10379,Jessie Buckley,Rufus Sewell,2
10380,Jessie Buckley,Michael Gambon,2
10381,Jessie Buckley,Bella Ramsey,2
10382,Judith Light,Mike Colter,2


In [None]:
group, count = 1, 1
nodes = []

for key, val in actors.items():
  node = {}
  node['name'] = key
  node['group'] = group
  if count == 2:
    count = 1
    group += 1
  nodes.append(node)
  count += 1

links = []
for i in graphWeights.values:
  link = {}
  link['source'] = actors[i[0]]
  link['target'] = actors[i[1]]
  link['value'] = i[2]
  links.append(link)

N=len(nodes)
L=len(links)
Edges=[(links[k]['source'], links[k]['target']) for k in range(L)]

G=ig.Graph(Edges, directed=False)

labels=[]
group=[]
for node in nodes:
    labels.append(node['name'])
    group.append(node['group'])

layt=G.layout('kk', dim=3)

In [None]:
Xn=[layt[k][0] for k in range(N)]# x-coordinates of nodes
Yn=[layt[k][1] for k in range(N)]# y-coordinates
Zn=[layt[k][2] for k in range(N)]# z-coordinates
Xe=[]
Ye=[]
Ze=[]
for e in Edges:
    Xe+=[layt[e[0]][0],layt[e[1]][0], None]# x-coordinates of edge ends
    Ye+=[layt[e[0]][1],layt[e[1]][1], None]
    Ze+=[layt[e[0]][2],layt[e[1]][2], None]

In [None]:
Gnx = nx.Graph(Edges) 
node_adjacencies = []
node_text = []
for node, adjacencies in enumerate(Gnx.adjacency()):
    node_adjacencies.append(len(adjacencies[1]))
    node_text.append('# of connections: '+str(len(adjacencies[1])))

In [None]:
import plotly.graph_objs as go

trace1=go.Scatter3d(x=Xe,
               y=Ye,
               z=Ze,
               mode='lines',
               line=dict(color='rgb(0,0,0)', width=0.5),
               hoverinfo='text'
               )

trace2=go.Scatter3d(x=Xn,
               y=Yn,
               z=Zn,
               mode='markers',
               name='actors',
               marker=dict(symbol='circle',
                             size=10,
                             color=node_adjacencies,
                            #  reversescale=True,
                             colorscale='Blackbody',
                             colorbar=dict(title='Number of Connections', title_side='right'),
                             line=dict(color='rgb(50,50,50)', width=0.5)
                             ),
               text=labels,
               hoverinfo='text'
               )

axis=dict(showbackground=False,
          showline=False,
          zeroline=False,
          showgrid=False,
          showticklabels=False,
          title=''
          )

layout = go.Layout(
         title="Network of Coappearances of Casts in the Movies",
         showlegend=False,
         scene=dict(
             xaxis=dict(axis),
             yaxis=dict(axis),
             zaxis=dict(axis),
        ),
        margin=dict(
            t=100
        ),
        hovermode='closest',
        annotations=[
              dict(
              showarrow=False,
                text="",
                xref='paper',
                yref='paper',
                x=0,
                y=0.1,
                xanchor='left',
                yanchor='bottom',
                font=dict(
                size=14
                )
                )
            ])

data=[trace1, trace2]
fig=go.Figure(data=data, layout=layout)
fig.show()

## Recommendation Systems

### Popularity Based System
---

#### Based on The Popularity Measure Web Scraped Data from TMDB

In [None]:
popularity = desc_movies.sort_values('popularity', ascending=False)
popularity.head()

Unnamed: 0,movieId,title,popularity,overview,casts
14025,19995,Avatar,1355.52,"In the 22nd century, a paraplegic Marine is di...","['Sam Worthington', 'Zoe Saldaña', 'Sigourney ..."
10407,411,"The Chronicles of Narnia: The Lion, the Witch ...",748.264,"Siblings Lucy, Edmund, Susan and Peter step th...","['Georgie Henley', 'Anna Popplewell', 'Tilda S..."
28537,297270,Tinker Bell and the Legend of the NeverBeast,649.822,An ancient myth of a massive creature sparks t...,"['Rosario Dawson', 'Lucy Liu', 'Danai Gurira']"
46037,111332,Avatar: Creating the World of Pandora,393.403,The Making-of James Cameron's Avatar. It shows...,"['Stephen Lang', 'Sigourney Weaver']"
57432,335787,Uncharted,282.61,"A young street-smart, Nathan Drake and his wis...","['Tom Holland', 'Mark Wahlberg', 'Tati Gabriel..."


In [None]:
def popularMeasureTMDB(num):

  '''
      num - number of recommendations
      output - list of dictionaries for each movie recommendation
  '''

  suggestions = []
  res = pd.merge(popularity[:num], trailers, on='movieId', how='inner')
  for i in res.values:
    result = {}
    result['Title'] = i[1]
    result['Overview'] = i[3]
    result['Cast'] = ', '.join(ast.literal_eval(i[4]))
    for j in ast.literal_eval(i[5]).items():
      if "trailer" in j[0].lower():
        result[j[0]] = 'https://www.youtube.com/watch?v=' + str(j[1])
    suggestions.append(result)
  return suggestions

popularMeasureTMDB(3)

[{'Title': 'Avatar',
  'Overview': 'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.',
  'Cast': 'Sam Worthington, Zoe Saldaña, Sigourney Weaver, Stephen Lang',
  'Official Trailer': 'https://www.youtube.com/watch?v=5PSNL1qE6VY'},
 {'Title': 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe',
  'Overview': "Siblings Lucy, Edmund, Susan and Peter step through a magical wardrobe and find the land of Narnia. There, they discover a charming, once peaceful kingdom that has been plunged into eternal winter by the evil White Witch, Jadis. Aided by the wise and magnificent lion, Aslan, the children lead Narnia into a spectacular, climactic battle to be free of the Witch's glacial powers forever.",
  'Cast': 'Georgie Henley, Anna Popplewell, Tilda Swinton, James McAvoy, Liam Neeson',
  'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe (2005) 

#### Based on IMDB's Weighted Rating Formula 

---

$W = \frac{Rv + Cm}{v + m}$

where,

W = Weighted rating for the movie

R = Average rating for the movie

v = Number of votes for the movie

m = Minimum votes required to be listed

C = The mean vote of whole dataset

##### Using Movie-Lens Dataset

> Inference: The dataset is not evenly distributing i.e. the number of ratings and the rating itself is very skewed as a certain movie with 

In [None]:
tempdb = ratings.loc[:, ratings.columns.intersection(['movieId', 'rating'])]

vote_average = tempdb.groupby('movieId')['rating'].mean().reset_index(name='vote_average')    # R
vote_count = tempdb.groupby('movieId')['movieId'].count().reset_index(name='vote_count')      # v
min_votes = vote_count['vote_count'].quantile(0.90)                                           # m
mean_vote = tempdb['rating'].mean()                                                           # C

popularity = pd.merge(vote_average, vote_count, on='movieId', how='inner')
popularity = popularity[popularity['vote_count'] >= min_votes]                                # Eliminating entries that are below the minimum vote count
popularity = popularity.sort_values(['vote_count', 'vote_average'], ascending=False)
popularity.head()

Unnamed: 0,movieId,vote_average,vote_count
351,356,4.048011,81491
314,318,4.413576,81482
292,296,4.188912,79672
585,593,4.151342,74127
2480,2571,4.154099,72674


In [None]:
def weighted_rating(x, m = min_votes, C = mean_vote):
  v = x['vote_count']
  R = x['vote_average']
  return (v/(v+m)*R) + (m/(v+m)*C)

In [None]:
popularity['weighted_rating'] = popularity.apply(weighted_rating, axis=1)
popularity = popularity.sort_values('weighted_rating', ascending=False)
popularity = pd.merge(popularity, desc_movies, on='movieId', how='inner')
popularity = popularity.loc[:, popularity.columns.intersection(['movieId', 'title', 'overview', 'casts'])]
popularity.head()

Unnamed: 0,movieId,title,overview,casts
0,318,The Million Dollar Hotel,The Million Dollar Hotel starts with a jump fr...,"['Mel Gibson', 'Milla Jovovich', 'Julian Sands']"
1,858,Sleepless in Seattle,"When Sam Baldwin's wife dies, he is left to br...","['Tom Hanks', 'Bill Pullman']"
2,527,Once Were Warriors,A drama about a Maori family living in Aucklan...,['Cliff Curtis']
3,2019,Hard Target,"When a woman's father goes missing, she enlist...",['Jean-Claude Van Damme']
4,2959,License to Wed,"Newly engaged, Ben and Sadie can't wait to sta...",['John Krasinski']


In [None]:
def popularMeasureMovieLens(num):

  '''
      num - number of recommendations
      output - list of dictionaries for each movie recommendation
  '''

  suggestions = []
  res = pd.merge(popularity[:num], trailers, on='movieId', how='inner')
  for i in res.values:
    result = {}
    result['Title'] = i[1]
    result['Overview'] = i[2]
    result['Cast'] = ', '.join(ast.literal_eval(i[3]))
    for j in ast.literal_eval(i[4]).items():
      if "trailer" in j[0].lower():
        result[j[0]] = 'https://www.youtube.com/watch?v=' + str(j[1])
    suggestions.append(result)
  return suggestions

popularMeasureMovieLens(3)

[{'Title': 'The Million Dollar Hotel',
  'Overview': 'The Million Dollar Hotel starts with a jump from a roof top that clears up a death in a hotel that was burning to the ground where a lot of strange people had been living.',
  'Cast': 'Mel Gibson, Milla Jovovich, Julian Sands',
  'Million Dollar Hotel / Official Trailer (1999)': 'https://www.youtube.com/watch?v=S4Ft6C8LTKU'},
 {'Title': 'Sleepless in Seattle',
  'Overview': "When Sam Baldwin's wife dies, he is left to bring up his eight-year-old son Jonah alone, and decides to move to Seattle to make a new start. On Christmas Eve, Jonah rings a radio phone-in with his Christmas wish to find a new wife for his dad. Meanwhile in Baltimore, journalist Annie Reed, who is having doubts about her own relationship, is listening in.",
  'Cast': 'Tom Hanks, Bill Pullman',
  'Sleepless In Seattle - Trailer': 'https://www.youtube.com/watch?v=-Lj2U-cmyek'},
 {'Title': 'Once Were Warriors',
  'Overview': 'A drama about a Maori family living in A

##### Using Web Scraped Data from TMDB

In [None]:
tempdb = desc2_movies.loc[:, desc2_movies.columns.intersection(['movieId', 'vote_count', 'vote_average'])]

vote_average = tempdb.copy()['vote_average']                                    # R
vote_count = tempdb.copy()['vote_count']                                        # v
min_votes = vote_count.quantile(0.90)                                           # m
mean_vote = vote_average.mean()                                                 # C

popularity = tempdb[tempdb['vote_count'] >= min_votes]                          # Eliminating entries that are below the minimum vote count
popularity = popularity.sort_values(['vote_count', 'vote_average'], ascending=False)
popularity.head()

Unnamed: 0,movieId,vote_count,vote_average
14846,27205,33120,8.362
20988,157336,30451,8.389
12165,155,29127,8.506
14025,19995,28203,7.567
16955,24428,28145,7.707


In [None]:
def weighted_rating(x, m = min_votes, C = mean_vote):
  v = x['vote_count']
  R = x['vote_average']
  return (v/(v+m)*R) + (m/(v+m)*C)

In [None]:
popularity['weighted_rating'] = popularity.apply(weighted_rating, axis=1)
popularity = popularity.sort_values('weighted_rating', ascending=False)
popularity = pd.merge(popularity, desc_movies, on='movieId', how='inner')
popularity = popularity.loc[:, popularity.columns.intersection(['movieId', 'title', 'overview', 'casts'])]
popularity.head()

Unnamed: 0,movieId,title,overview,casts
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"['Morgan Freeman', 'Clancy Brown']"
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","['Al Pacino', 'Robert Duvall']"
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,"['Al Pacino', 'Robert Duvall', 'Robert De Niro..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"['Liam Neeson', 'Ralph Fiennes', 'Steven Spiel..."
4,155,The Dark Knight,Batman raises the stakes in his war on crime. ...,"['Christian Bale', 'Heath Ledger', 'Michael Ca..."


In [None]:
def popularMeasureTMDB(num):

  '''
      num - number of recommendations
      output - list of dictionaries for each movie recommendation
  '''

  suggestions = []
  res = pd.merge(popularity[:num], trailers, on='movieId', how='inner')
  for i in res.values:
    result = {}
    result['Title'] = i[1]
    result['Overview'] = i[2]
    result['Cast'] = ', '.join(ast.literal_eval(i[3]))
    for j in ast.literal_eval(i[4]).items():
      if "trailer" in j[0].lower():
        result[j[0]] = 'https://www.youtube.com/watch?v=' + str(j[1])
    suggestions.append(result)
  return suggestions

popularMeasureTMDB(3)

[{'Title': 'The Shawshank Redemption',
  'Overview': 'Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.',
  'Cast': 'Morgan Freeman, Clancy Brown',
  'Trailer': 'https://www.youtube.com/watch?v=PLl99DlL6b4',
  'The Shawshank Redemption (1994) original theatrical trailer': 'https://www.youtube.com/watch?v=vYsQ5mu5Xow',
  'The Shawshank Redemption (1994) OFFICIAL TRAILER [HD 1080p]': 'https://www.youtube.com/watch?v=P9mwtI82k6E'},
 {'Title': 'The Godfather',
  'Overview': 'Spanning the years 1945 to 1955, a chronicle of the fictional Italian-American Corleone crime family. When organized crime family patriarch, Vito Corleone barely survives an attempt on 

### Context Based System

##### Based on Web Scraped Title, Overview, Cast, Genre, Keywords etc

In [None]:
tempdb = pd.merge(movies, links, on='movieId', how='inner')
tempdb = tempdb.loc[:, tempdb.columns.intersection(['tmdbId', 'title', 'genres'])]
tempdb = tempdb[['tmdbId', 'title', 'genres']]
tempdb.rename(columns={'tmdbId': 'movieId'}, inplace=True)
tempdb.head()

Unnamed: 0,movieId,title,genres
0,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,8844.0,Jumanji (1995),Adventure|Children|Fantasy
2,15602.0,Grumpier Old Men (1995),Comedy|Romance
3,31357.0,Waiting to Exhale (1995),Comedy|Drama|Romance
4,11862.0,Father of the Bride Part II (1995),Comedy


In [None]:
tempdb = pd.merge(tempdb, desc_movies.loc[:, desc_movies.columns.intersection(['movieId', 'overview', 'casts'])], on='movieId', how='inner')
tempdb = pd.merge(tempdb, desc2_movies.loc[:, desc2_movies.columns.intersection(['movieId', 'keywords'])], on='movieId', how='inner')
db = pd.merge(trailers, posters, on='movieId', how='inner')
tempdb = pd.merge(tempdb, db, on='movieId', how='inner')
tempdb.head()

Unnamed: 0,movieId,title,genres,overview,casts,keywords,trailers,posters
0,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"Led by Woody, Andy's toys live happily in his ...","['Tom Hanks', 'Tim Allen', 'Wallace Shawn', 'A...","['martial arts', 'jealousy', 'friendship', 'bu...","{""Sid's House"": 'u9y2tXnjP_E', 'Teaser Trailer...",https://image.tmdb.org/t/p/original/uXDfjJbdP4...
1,8844.0,Jumanji (1995),Adventure|Children|Fantasy,When siblings Judy and Peter discover an encha...,['Kirsten Dunst'],"['giant insect', 'board game', 'jungle', 'disa...","{'International Teaser': '3aFLqO5I73E', 'Gag R...",https://image.tmdb.org/t/p/original/vgpXmVaVyU...
2,15602.0,Grumpier Old Men (1995),Comedy|Romance,A family wedding reignites the ancient feud be...,"['Sophia Loren', 'Daryl Hannah']","['fishing', 'halloween', 'sequel', 'old man', ...",{'Grumpier Old Men - Trailer': 'rEnOoWs3FuA'},https://image.tmdb.org/t/p/original/1FSXpj5e8l...
3,31357.0,Waiting to Exhale (1995),Comedy|Drama|Romance,"Cheated on, mistreated and stepped on, the wom...","['Angela Bassett', 'Lamont Johnson', 'Wesley S...","['based on novel or book', 'interracial relati...",{'Waiting To Exhale': 'j9xml1CxgXI'},https://image.tmdb.org/t/p/original/4uw6HKq4vl...
4,11862.0,Father of the Bride Part II (1995),Comedy,Just when George Banks has recovered from his ...,['BD Wong'],"['parent child relationship', 'baby', 'midlife...",{'Father of the Bride Part II (1995) 35mm film...,https://image.tmdb.org/t/p/original/rj4LBtwQ0u...


In [None]:
def desc(x):

  '''
      Takes each instance from the dataframe and concatenates the title, overview, genres, casts and keywords together, after removing spaces from proper/important nouns
  '''

  title = x.title[:-7].lower()
  overview = str(x.overview).lower()
  genres = ' '.join(x.genres.lower().split('|'))
  casts = ' '.join(i.replace(' ', '').lower() for i in ast.literal_eval(x.casts))
  keywords = ' '.join(i.replace(' ', '').lower() for i in ast.literal_eval(x.keywords))
  result = re.sub("[^a-zA-Z0-9 ]", "", title + ' ' + overview + ' ' +  casts + ' ' +  genres + ' ' +  keywords)
  return result

In [None]:
# Stop word removal for english words
count = CountVectorizer(stop_words='english', ngram_range=(1,3))

# Imputing missing values
tempdb['description'] = tempdb.apply(desc, axis=1)
tempdb.head()

Unnamed: 0,movieId,title,genres,overview,casts,keywords,trailers,posters,description
0,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"Led by Woody, Andy's toys live happily in his ...","['Tom Hanks', 'Tim Allen', 'Wallace Shawn', 'A...","['martial arts', 'jealousy', 'friendship', 'bu...","{""Sid's House"": 'u9y2tXnjP_E', 'Teaser Trailer...",https://image.tmdb.org/t/p/original/uXDfjJbdP4...,toy story led by woody andys toys live happily...
1,8844.0,Jumanji (1995),Adventure|Children|Fantasy,When siblings Judy and Peter discover an encha...,['Kirsten Dunst'],"['giant insect', 'board game', 'jungle', 'disa...","{'International Teaser': '3aFLqO5I73E', 'Gag R...",https://image.tmdb.org/t/p/original/vgpXmVaVyU...,jumanji when siblings judy and peter discover ...
2,15602.0,Grumpier Old Men (1995),Comedy|Romance,A family wedding reignites the ancient feud be...,"['Sophia Loren', 'Daryl Hannah']","['fishing', 'halloween', 'sequel', 'old man', ...",{'Grumpier Old Men - Trailer': 'rEnOoWs3FuA'},https://image.tmdb.org/t/p/original/1FSXpj5e8l...,grumpier old men a family wedding reignites th...
3,31357.0,Waiting to Exhale (1995),Comedy|Drama|Romance,"Cheated on, mistreated and stepped on, the wom...","['Angela Bassett', 'Lamont Johnson', 'Wesley S...","['based on novel or book', 'interracial relati...",{'Waiting To Exhale': 'j9xml1CxgXI'},https://image.tmdb.org/t/p/original/4uw6HKq4vl...,waiting to exhale cheated on mistreated and st...
4,11862.0,Father of the Bride Part II (1995),Comedy,Just when George Banks has recovered from his ...,['BD Wong'],"['parent child relationship', 'baby', 'midlife...",{'Father of the Bride Part II (1995) 35mm film...,https://image.tmdb.org/t/p/original/rj4LBtwQ0u...,father of the bride part ii just when george b...


In [None]:
count_matrix = count.fit_transform(tempdb['description'])
count_matrix.shape

(61050, 3488779)

###### Based on TMDB API Response

In [None]:
def overTitle(title):

  '''
      Concatenates the title and overview together, after removing spaces from proper/important nouns, post web scraping
  '''

  tmdb = TMDb()
  movie = Movie()
  tmdb.api_key = API_KEY
  tmdb.language = 'en'
  tmdb.debug = True

  mov = movie.search(title)[0]
  title = mov.title.lower()
  overview = mov.overview.lower()
  result = re.sub("[^a-zA-Z0-9 ]", "", title + ' ' + overview)
  return result

In [None]:
def descTitle(title):

  '''
      Concatenates the title, overview, genres, casts and keywords together, after removing spaces from proper/important nouns, post web scraping
  '''

  tmdb = TMDb()
  movie = Movie()
  tmdb.api_key = API_KEY
  tmdb.language = 'en'
  tmdb.debug = True

  mov = movie.details(movie.search(title)[0].id)
  title = mov.title.lower()
  overview = mov.overview.lower()
  genres = ' '.join(i.name.lower() for i in mov.genres)
  casts = ' '.join(i.name.replace(' ', '').lower() for i in mov.casts.cast[:5])
  keywords = ' '.join(i.name.replace(' ', '').lower() for i in mov.keywords.keywords)
  result = re.sub("[^a-zA-Z0-9 ]", "", title + ' ' + overview + ' ' +  casts + ' ' +  genres + ' ' +  keywords)
  return result

In [None]:
def contextBasedRecommendations(title, num):
  
  try:
    desc = descTitle(title)
  except:
    desc = overTitle(title)
  else:
    desc = popularMeasureTMDB(num)

  query_vec = count.transform([desc])                                           # Transforming the modified title into a query vec using the count vectorizer
  similarity = cosine_similarity(query_vec, count_matrix).flatten()             # Computing cosine similarity measure

  inx = np.argsort(similarity)[::-1][1:num+1]                                   # Getting the most relevant recommendations
  res = tempdb.iloc[inx]

  suggestions = []
  for i in res.values:
    result = {}
    result['Title'] = i[1]
    result['Overview'] = i[3]
    result['Cast'] = ', '.join(ast.literal_eval(i[4]))
    for j in ast.literal_eval(i[6]).items():
      if "trailer" in j[0].lower() or i[1][:-6].lower() in j[0].lower():
        result[j[0]] = 'https://www.youtube.com/watch?v=' + str(j[1])
    suggestions.append(result)
  return suggestions

contextBasedRecommendations('Toy Story', 3)

[{'Title': 'Toy Story 2 (1999)',
  'Overview': "Andy heads off to Cowboy Camp, leaving his toys to their own devices. Things shift into high gear when an obsessive toy collector named Al McWhiggen, owner of Al's Toy Barn kidnaps Woody. Andy's toys mount a daring rescue mission, Buzz Lightyear meets his match and Woody has to decide where he and his heart truly belong.",
  'Cast': 'Tom Hanks, Tim Allen, Wallace Shawn, Annie Potts, Frank Welker',
  'Toy Story 2 (1999) Trailer #1 | Movieclips Classic Trailers': 'https://www.youtube.com/watch?v=xNWSGRD5CzU'},
 {'Title': 'Buzz Lightyear of Star Command: The Adventure Begins (2000)',
  'Overview': 'Buzz Lightyear must battle Emperor Zurg with the help of three hopefuls who insist on being his partners.',
  'Cast': 'Patrick Warburton, Frank Welker, Wallace Shawn',
  'Buzz Lightyear of Star Command: The Adventure Begins Commercial': 'https://www.youtube.com/watch?v=qGQ4TucPSys'},
 {'Title': 'Toy Story 3 (2010)',
  'Overview': "Woody, Buzz, and

###### Based on Pre-recorded Entries

In [None]:
import ipywidgets as widgets
movieIds = list(i for i in tempdb.movieId)
movieNames = list(i[:-7] for i in tempdb.title)
movieDict = dict(zip(movieNames, movieIds))

moviePicker = widgets.Dropdown(options=movieNames, value='Toy Story')
moviePicker

Dropdown(options=('Toy Story', 'Jumanji', 'Grumpier Old Men', 'Waiting to Exhale', 'Father of the Bride Part I…

In [None]:
def contextBasedRecommendations(num, title=tempdb[tempdb['movieId'] == movieDict[moviePicker.value]].description):
  
  query_vec = tfidf.transform(title)
  cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()

  inx = np.argsort(cosine_sim)[::-1][1:num+1]
  res = tempdb.loc[inx]

  suggestions = []
  for i in res.values:
    result = {}
    result['Title'] = i[1]
    result['Overview'] = i[3]
    result['Cast'] = ', '.join(ast.literal_eval(i[4]))
    for j in ast.literal_eval(i[6]).items():
      if "trailer" in j[0].lower() or i[1][:-6].lower() in j[0].lower():
        result[j[0]] = 'https://www.youtube.com/watch?v=' + str(j[1])
    suggestions.append(result)
  return suggestions

contextBasedRecommendations(3)

[{'Title': 'Game Box 1.0 (2004)',
  'Overview': "Miserable after the shooting death of his girlfriend, video game tester Charlie immerses himself in his work. When a new 3D game comes in the mail, he becomes caught up in its bizarre fantasy world -- literally. Trapped inside the game, Charlie's only way out is to win a brutal fight-to-the-death battle.",
  'Cast': ''},
 {'Title': 'Jumanji: Welcome to the Jungle (2017)',
  'Overview': 'Four teenagers in detention discover an old video game console with a game they’ve never heard of. When they decide to play, they are immediately sucked into the jungle world of Jumanji in the bodies of their avatars. They’ll have to complete the adventure of their lives filled with fun, thrills and danger or be stuck in the game forever!',
  'Cast': 'Dwayne Johnson, Kevin Hart, Jack Black, Karen Gillan, Alex Wolff',
  'JUMANJI: WELCOME TO THE JUNGLE - Official Trailer (HD)': 'https://www.youtube.com/watch?v=2QKg5SZ_35I',
  'JUMANJI: WELCOME TO THE JUNGLE

##### Based on User Profiles on Genre Segregation
----

Reference: [Based on Binary Representation](https://www.analyticsvidhya.com/blog/2015/08/beginners-guide-learn-content-based-recommender-systems/)

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
count = 0
genres = {}

for i in movies.values:
  for j in i[2].split('|'):
    if j not in genres:
      genres[j] = count
      count += 1
print(genres)

{'Adventure': 0, 'Animation': 1, 'Children': 2, 'Comedy': 3, 'Fantasy': 4, 'Romance': 5, 'Drama': 6, 'Action': 7, 'Crime': 8, 'Thriller': 9, 'Horror': 10, 'Mystery': 11, 'Sci-Fi': 12, 'IMAX': 13, 'Documentary': 14, 'War': 15, 'Musical': 16, 'Western': 17, 'Film-Noir': 18, '(no genres listed)': 19}


In [None]:
data = []
gen = list(genres.keys())

for i in movies.values:
  d = [i[0]] + [i[1]] + [0] * len(gen)
  for j in i[2].split('|'):
    d[genres[j] + 2] = 1
  data.append(d)

labels = ['movieId', 'title'] + gen
df = pd.DataFrame(columns=labels, data=data)
df['Id'] = df.index
df.head()

Unnamed: 0,movieId,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),Id
0,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,Grumpier Old Men (1995),0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2
3,4,Waiting to Exhale (1995),0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,3
4,5,Father of the Bride Part II (1995),0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [None]:
genreSeg = np.array(df)

TF = np.sum(genreSeg[:, 2:], axis=0)                                            # Calculating the term frequency
genreSeg[:, 2:] = genreSeg[:, 2:]/np.sum(genreSeg[:, 2:], axis=1)[:, None]      # Normalising the genre segmentation

corpus = movies.shape[0]                                                        # Entire document corpus

IDF = np.log10(corpus/TF.astype(float))                                         # Calculating inverse document frequency

In [None]:
# Enter user ID
userId = 3

# Computing user liking based on given ratings
userLikings = [0 for i in range(corpus)]
userData = ratings[ratings['userId'] == userId].sort_values('movieId').iloc[:, :-1]

for i in userData.values:
  if i[2] <= 2 and i[2] > 0:
    userLikings[int(df[df['movieId'] == i[1]].Id)] = -1
  elif i[2] > 2:
    userLikings[int(df[df['movieId'] == i[1]].Id)] = 1
  else:
    userLikings[int(df[df['movieId'] == i[1]].Id)] = 0

userLikings = np.array(userLikings)

# Computing user prefered movies by performing dot product with user likings
userMovProfile = np.concatenate((genreSeg[:, :2], genreSeg[:, 2:] * userLikings[:, np.newaxis]), 1)

# Computing user profile by summing all values in each genre
userProfile = np.sum(userMovProfile[:, 2:], axis=0)

# Calculating user recommendations post dot product of user movie weights, user profile and idf
userRecommendations = np.sum(userMovProfile[:, 2:] * userProfile * IDF, axis=1)
userRecommendations = np.concatenate((genreSeg[:, :2], userRecommendations[:, np.newaxis]), 1)

In [None]:
genres = {}
for i in pd.merge(userData, movies, on='movieId', how='inner').genres:
  for j in i.split('|'):
    if j in genres:
      genres[j] += 1
    else:
      genres[j] = 1
genres = sorted(genres.items(), key=lambda item: item[1], reverse=True)
print(genres)

[('Action', 334), ('Thriller', 239), ('Drama', 232), ('Sci-Fi', 224), ('Adventure', 198), ('Comedy', 176), ('Crime', 132), ('IMAX', 81), ('Fantasy', 78), ('Mystery', 60), ('Romance', 60), ('Animation', 50), ('Children', 48), ('Horror', 45), ('War', 26), ('Western', 8), ('Musical', 6), ('Film-Noir', 5), ('Documentary', 3), ('(no genres listed)', 1)]


> Inference: User prefers to watch 'Action', 'Thriller', 'Drama', 'Sci-Fi' etc.

In [None]:
df = pd.DataFrame(userRecommendations)
df = df.rename(columns={0: 'movieId', 1: 'title', 2: 'score'})
df = df.sort_values('score', ascending=False)[:5]
df = pd.merge(df, links, on='movieId', how='inner')
df = df[['tmdbId', 'title']]
df = df.rename(columns={'tmdbId': 'movieId'})
df = pd.merge(df, trailers, on='movieId', how='inner')
df

Unnamed: 0,movieId,title,trailers
0,42684.0,Skyline (2010),"{'Skyline - Trailer': 'W3NrX5IMoNc', 'Skyline ..."
1,23048.0,Hot Tub Time Machine (2010),"{'Official Trailer': 'u1FnYvk6KP0', 'Official ..."
2,10327.0,"Legally Blonde 2: Red, White & Blonde (2003)","{""Legally Blonde 2 (1/11) Movie CLIP - Elle's ..."
3,10152.0,Dumb and Dumberer: When Harry Met Lloyd (2003),{'Dumb And Dumberer Trailer HD': 'tdjX8Voj5vI'}
4,38575.0,"Karate Kid, The (2010)",{'Watch the Official THE KARATE KID Trailer in...


###### Wrapped Together

In [None]:
count = 0
genres = {}

for i in movies.values:
  for j in i[2].split('|'):
    if j not in genres:
      genres[j] = count
      count += 1

data = []
gen = list(genres.keys())

for i in movies.values:
  d = [i[0]] + [i[1]] + [0] * len(gen)
  for j in i[2].split('|'):
    d[genres[j] + 2] = 1
  data.append(d)

labels = ['movieId', 'title'] + gen
df = pd.DataFrame(columns=labels, data=data)
df['Id'] = df.index

In [None]:
def recommendForUser(userId, num, df=df):

  # Computing user liking based on given ratings
  userLikings = [0 for i in range(corpus)]
  userData = ratings[ratings['userId'] == userId].sort_values('movieId').iloc[:, :-1]

  for i in userData.values:
    if i[2] <= 2 and i[2] > 0:
      userLikings[int(df[df['movieId'] == i[1]].Id)] = -1
    elif i[2] > 2:
      userLikings[int(df[df['movieId'] == i[1]].Id)] = 1
    else:
      userLikings[int(df[df['movieId'] == i[1]].Id)] = 0

  userLikings = np.array(userLikings)

  # Computing user prefered movies by performing dot product with user likings
  userMovProfile = np.concatenate((genreSeg[:, :2], genreSeg[:, 2:] * userLikings[:, np.newaxis]), 1)

  # Computing user profile by summing all values in each genre
  userProfile = np.sum(userMovProfile[:, 2:], axis=0)

  # Calculating user recommendations post dot product of user movie weights, user profile and idf
  userRecommendations = np.sum(userMovProfile[:, 2:] * userProfile * IDF, axis=1)
  userRecommendations = np.concatenate((genreSeg[:, :2], userRecommendations[:, np.newaxis]), 1)

  df = pd.DataFrame(userRecommendations)
  df = df.rename(columns={0: 'movieId', 1: 'title', 2: 'score'})

  # Sorting in descending to get the ones with highest score
  df = df.sort_values('score', ascending=False)[:num]

  df = pd.merge(df, links, on='movieId', how='inner')[['tmdbId']]
  df = df.rename(columns={'tmdbId': 'movieId'})
  df = pd.merge(df, desc_movies, on='movieId', how='inner')
  df = pd.merge(df, trailers, on='movieId', how='inner')

  suggestions = []
  for i in df.values:
    result = {}
    result['Title'] = i[1]
    result['Overview'] = i[3]
    result['Cast'] = ', '.join(ast.literal_eval(i[4]))
    for j in ast.literal_eval(i[5]).items():
      if "trailer" in j[0].lower() or i[1][:-6].lower() in j[0].lower():
        result[j[0]] = 'https://www.youtube.com/watch?v=' + str(j[1])
    suggestions.append(result)
  return suggestions

In [None]:
recommendForUser(1307, 3)

[{'Title': 'X-Men: The Last Stand',
  'Overview': "When a cure is found to treat mutations, lines are drawn amongst the X-Men—led by Professor Charles Xavier—and the Brotherhood, a band of powerful mutants organised under Xavier's former ally, Magneto.",
  'Cast': 'Famke Janssen, Anna Paquin, Halle Berry, Hugh Jackman, Patrick Stewart',
  'X-Men: The Last Stand (2006) Original Trailer [FHD]': 'https://www.youtube.com/watch?v=X8ozc_dQprk'},
 {'Title': 'Toy Story',
  'Overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
  'Cast': 'Tom Hanks, Tim Allen, Wallace Shawn, Annie Potts',
  'Teaser Trailer': 'https://www.youtube.com/watch?v=4j_qfJN0qd4',
  'Trailer #1': 'https://www.youtube.com/watch?v=v-PjgYDrg70',
  'Disney Throwback

> Inference: Based on the given ratings the predictions are matching

In [None]:
pd.merge(ratings[ratings['userId'] == userId].sort_values('movieId').iloc[:, :-1], movies, on='movieId', how='inner').sort_values('rating', ascending = False).head()

Unnamed: 0,userId,movieId,rating,title,genres
628,3,136449,5.0,Ghost in the Shell 2.0 (2008),Action|Animation|Sci-Fi
20,3,745,5.0,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy
479,3,81591,5.0,Black Swan (2010),Drama|Thriller
23,3,858,5.0,"Godfather, The (1972)",Crime|Drama
26,3,924,5.0,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi


### Collaborative Based System

#### Using Surprise

> Take a larger portion of the data for better results in exchange for a larger model complexity

In [None]:
# Top 1% users and movies with the most ratings
minRate = ratings.groupby('movieId')['movieId'].count().reset_index(name='count')['count'].quantile(0.99)
minRatings = ratings.groupby('userId')['userId'].count().reset_index(name='count')['count'].quantile(0.99)

> Taking all the data into consideration by making the minimum 0

In [None]:
tempdb = ratings.groupby('userId')['userId'].count().reset_index(name='count')
userdb = tempdb[tempdb['count'] >= minRatings]
tempdb = ratings.groupby('movieId')['movieId'].count().reset_index(name='count')
moviedb = tempdb[tempdb['count'] >= minRate]
tempdb = pd.merge(userdb, ratings, on='userId', how='inner')[['userId', 'movieId', 'rating']]
tempdb = pd.merge(tempdb, moviedb, on='movieId', how='inner')[['userId', 'movieId', 'rating']]
tempdb.shape

(640707, 3)

In [None]:
reader = Reader(rating_scale = (0, 5))
data = Dataset.load_from_df(tempdb, reader)
trainset = data.build_full_trainset()

##### SVD (Singular Value Decomposition)

In [None]:
from surprise import SVD

svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fae5d5a7340>

In [None]:
dump.dump('/content/drive/MyDrive/Movie Recommendation System/systems/svd.pkl', algo=svd)

In [None]:
_, svd = dump.load('/content/drive/MyDrive/Movie Recommendation System/systems/svd.pkl')

In [None]:
def get_top_n(userId, num):
    
    '''
    output- the top-N recommendation for given user from a set of predictions
    '''
    
    movieIdx = np.unique(tempdb.movieId)
    
    predictions = []
    for i in movieIdx:
        predictions.append(svd.predict(userId, i))

    # First map the predictions for the given user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for the given user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:num]

    uid = list(top_n.items())[0][0]
    user_ratings = list(top_n.items())[0][1]
    movieIds = [iid for (iid, _) in user_ratings]
    ids = pd.DataFrame({'movieId': movieIds})
    res = pd.merge(movies, ids, on='movieId', how='inner')
    res = pd.merge(links, res, on='movieId', how='inner')[['tmdbId', 'genres']].rename(columns={'tmdbId': 'movieId'})
    res = pd.merge(desc_movies, res, on='movieId', how='inner')
    res = pd.merge(trailers, res, on='movieId', how='inner')[['movieId', 'title', 'overview', 'genres', 'casts', 'trailers']]

    suggestions = []
    for i in res.values:
        result = {}
        result['Title'] = i[1]
        result['Overview'] = i[2]
        result['Genres'] = i[3].split('|')
        result['Cast'] = ', '.join(ast.literal_eval(i[4]))
        for j in ast.literal_eval(i[5]).items():
            if "trailer" in j[0].lower():
                result[j[0]] = 'https://www.youtube.com/watch?v=' + str(j[1])
        suggestions.append(result)
            
    return suggestions

# For userId 626
get_top_n(626, 3)

[{'Title': 'The Sixth Sense',
  'Overview': 'Following an unexpected tragedy, a child psychologist named Malcolm Crowe meets an nine year old boy named Cole Sear, who is hiding a dark secret.',
  'Genres': ['Drama', 'Horror', 'Mystery'],
  'Cast': 'Bruce Willis, Haley Joel Osment, Toni Collette',
  'The Sixth Sense Trailer HD': 'https://www.youtube.com/watch?v=VG9AGf66tXM'},
 {'Title': 'The Lord of the Rings: The Fellowship of the Ring',
  'Overview': 'Young hobbit Frodo Baggins, after inheriting a mysterious ring from his uncle Bilbo, must leave his home in order to keep it from falling into the hands of its evil creator. Along the way, a fellowship is formed to protect the ringbearer and make sure that the ring arrives at its final destination: Mt. Doom, the only place where it can be destroyed.',
  'Genres': ['Adventure', 'Fantasy'],
  'Cast': 'Ian McKellen, Viggo Mortensen, Orlando Bloom, Sean Bean, Liv Tyler',
  'Official Trailer': 'https://www.youtube.com/watch?v=_nZdmwHrcnw'},
 

##### KNNWithMeans

In [None]:
from surprise.prediction_algorithms import KNNWithMeans

knnwithmeans = KNNWithMeans()
knnwithmeans.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fae5d846280>

In [None]:
dump.dump('/content/drive/MyDrive/Movie Recommendation System/systems/knnwithmeans.pkl', algo=knnwithmeans)

In [None]:
_, knnwithmeans = dump.load('/content/drive/MyDrive/Movie Recommendation System/systems/knnwithmeans.pkl')

In [None]:
def get_top_n(userId, num):
    
    '''
    output- the top-N recommendation for given user from a set of predictions
    '''
    
    movieIdx = np.unique(tempdb.movieId)
    
    predictions = []
    for i in movieIdx:
        predictions.append(knnwithmeans.predict(userId, i))

    # First map the predictions for the given user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for the given user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:num]

    uid = list(top_n.items())[0][0]
    user_ratings = list(top_n.items())[0][1]
    movieIds = [iid for (iid, _) in user_ratings]
    ids = pd.DataFrame({'movieId': movieIds})
    res = pd.merge(movies, ids, on='movieId', how='inner')
    res = pd.merge(links, res, on='movieId', how='inner')[['tmdbId', 'genres']].rename(columns={'tmdbId': 'movieId'})
    res = pd.merge(desc_movies, res, on='movieId', how='inner')
    res = pd.merge(trailers, res, on='movieId', how='inner')[['movieId', 'title', 'overview', 'genres', 'casts', 'trailers']]

    suggestions = []
    for i in res.values:
        result = {}
        result['Title'] = i[1]
        result['Overview'] = i[2]
        result['Genres'] = i[3].split('|')
        result['Cast'] = ', '.join(ast.literal_eval(i[4]))
        for j in ast.literal_eval(i[5]).items():
            if "trailer" in j[0].lower():
                result[j[0]] = 'https://www.youtube.com/watch?v=' + str(j[1])
        suggestions.append(result)
            
    return suggestions

# For userId 413
get_top_n(413, 3)

[{'Title': 'Toy Story',
  'Overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
  'Genres': ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'],
  'Cast': 'Tom Hanks, Tim Allen, Wallace Shawn, Annie Potts',
  'Teaser Trailer': 'https://www.youtube.com/watch?v=4j_qfJN0qd4',
  'Trailer #1': 'https://www.youtube.com/watch?v=v-PjgYDrg70',
  'Disney Throwback Trailer': 'https://www.youtube.com/watch?v=tN1A2mVnrOM',
  'Original Trailer': 'https://www.youtube.com/watch?v=CxwTLktovTU',
  'TOY STORY Theatrical Trailer [1995] 4K': 'https://www.youtube.com/watch?v=ciPU28QhT1Y'},
 {'Title': 'Jumanji',
  'Overview': "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittin

##### NMF (Non-negative Matrix Factorization)

In [None]:
from surprise import NMF

nmf = NMF()
nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7fae5d92d2b0>

In [None]:
dump.dump('/content/drive/MyDrive/Movie Recommendation System/systems/nmf.pkl', algo=nmf)

In [None]:
_, nmf = dump.load('/content/drive/MyDrive/Movie Recommendation System/systems/nmf.pkl')

In [None]:
def get_top_n(userId, num):
    
    '''
    output- the top-N recommendation for given user from a set of predictions
    '''
    
    movieIdx = np.unique(tempdb.movieId)
    
    predictions = []
    for i in movieIdx:
        predictions.append(nmf.predict(userId, i))

    # First map the predictions for the given user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for the given user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:num]

    uid = list(top_n.items())[0][0]
    user_ratings = list(top_n.items())[0][1]
    movieIds = [iid for (iid, _) in user_ratings]
    ids = pd.DataFrame({'movieId': movieIds})
    res = pd.merge(movies, ids, on='movieId', how='inner')
    res = pd.merge(links, res, on='movieId', how='inner')[['tmdbId', 'genres']].rename(columns={'tmdbId': 'movieId'})
    res = pd.merge(desc_movies, res, on='movieId', how='inner')
    res = pd.merge(trailers, res, on='movieId', how='inner')[['movieId', 'title', 'overview', 'genres', 'casts', 'trailers']]

    suggestions = []
    for i in res.values:
        result = {}
        result['Title'] = i[1]
        result['Overview'] = i[2]
        result['Genres'] = i[3].split('|')
        result['Cast'] = ', '.join(ast.literal_eval(i[4]))
        for j in ast.literal_eval(i[5]).items():
            if "trailer" in j[0].lower():
                result[j[0]] = 'https://www.youtube.com/watch?v=' + str(j[1])
        suggestions.append(result)
            
    return suggestions

# For userId 187
get_top_n(187, 3)

[{'Title': 'Pulp Fiction',
  'Overview': "A burger-loving hit man, his philosophical partner, a drug-addled gangster's moll and a washed-up boxer converge in this sprawling, comedic crime caper. Their adventures unfurl in three stories that ingeniously trip back and forth in time.",
  'Genres': ['Comedy', 'Crime', 'Drama', 'Thriller'],
  'Cast': 'John Travolta, Samuel L. Jackson, Uma Thurman, Bruce Willis, Ving Rhames',
  'Official Trailer': 'https://www.youtube.com/watch?v=tGpTpVyI_OQ'},
 {'Title': 'Raiders of the Lost Ark',
  'Overview': 'When Dr. Indiana Jones – the tweed-suited professor who just happens to be a celebrated archaeologist – is hired by the government to locate the legendary Ark of the Covenant, he finds himself up against the entire Nazi regime.',
  'Genres': ['Action', 'Adventure'],
  'Cast': 'Harrison Ford, Karen Allen, Alfred Molina, Frank Welker',
  'Paramount Movies Trailer': 'https://www.youtube.com/watch?v=0xQSIdSRlAk',
  'Indiana Jones and the Raiders of the 