# Content-based recommendation system for movies

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from  sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Read the movie metadata CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="movies.csv")
# Show the first five rows (head's default count) as the cell output.
df.head(n=5)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [None]:
# List the column labels of the loaded DataFrame.
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [None]:
# Report the (rows, columns) dimensions of the DataFrame.
df.shape

(4803, 24)

### Handling missing values

In [None]:
# Text columns that are later merged into each movie's combined description.
features = ['genres','title','cast','director','keywords','original_language']

# Replace missing values with empty strings in all of those columns at once.
df[features] = df[features].fillna("")

### Combining Selected Features

In [None]:
def combine_features(row):
  """Join the descriptive text fields of one movie into a single string.

  The fields are taken in the order title, genres, cast, director, keywords,
  original_language and separated by ", ". Every field must already be a
  string (missing values have to be filled beforehand).
  """
  columns = ('title', 'genres', 'cast', 'director', 'keywords', 'original_language')
  return ", ".join(row[column] for column in columns)

# Run combine_features over every row and store the result as a new column.
df['combined_features'] = df.apply(combine_features, axis="columns")
# Preview the first five combined strings.
df['combined_features'].head(n=5)

Unnamed: 0,combined_features
0,"Avatar, Action Adventure Fantasy Science Ficti..."
1,"Pirates of the Caribbean: At World's End, Adve..."
2,"Spectre, Action Adventure Crime, Daniel Craig ..."
3,"The Dark Knight Rises, Action Crime Drama Thri..."
4,"John Carter, Action Adventure Science Fiction,..."


### Creating Vector Representations of the combined features

In [None]:
# Learn the TF-IDF vocabulary from the combined text and encode each movie as
# one row of a document-term matrix.
tfidf = TfidfVectorizer()
tf = tfidf.fit_transform(df['combined_features'])
# The matrix is deliberately left sparse: a dense copy would be very large
# (rows x vocabulary floats) and nothing downstream needs it.
tf.shape

(4803, 17503)

### Computing the cosine similarity between every pair of movies (including each movie with itself)


In [None]:
# Pairwise cosine similarity between the TF-IDF rows of all movies.
cm = cosine_similarity(tf)
# Square matrix: one row and one column per movie.
cm.shape

(4803, 4803)

In [None]:
# Display the similarity matrix (NumPy abbreviates the printed array).
cm

array([[1.00000000e+00, 2.48272960e-02, 4.22228416e-02, ...,
        1.12200797e-03, 1.34747833e-03, 1.11917020e-03],
       [2.48272960e-02, 1.00000000e+00, 1.42630716e-02, ...,
        4.07129084e-02, 1.11032424e-03, 9.22197982e-04],
       [4.22228416e-02, 1.42630716e-02, 1.00000000e+00, ...,
        1.15436886e-03, 5.90455012e-02, 1.15144924e-03],
       ...,
       [1.12200797e-03, 4.07129084e-02, 1.15436886e-03, ...,
        1.00000000e+00, 1.17574481e-03, 5.83911241e-02],
       [1.34747833e-03, 1.11032424e-03, 5.90455012e-02, ...,
        1.17574481e-03, 1.00000000e+00, 1.17277112e-03],
       [1.11917020e-03, 9.22197982e-04, 1.15144924e-03, ...,
        5.83911241e-02, 1.17277112e-03, 1.00000000e+00]])

### Taking user input for movie title

In [None]:
def get_index(title, movies=None):
  """Return the row position of the first movie whose title is exactly ``title``.

  The position (not the index label) is returned because the result is used
  to select a row of the cosine-similarity matrix, which is addressed by
  position.

  Parameters
  ----------
  title : str
      Movie title to look up; the comparison is exact and case-sensitive.
  movies : pandas.DataFrame, optional
      Frame with a 'title' column to search. Defaults to the notebook's ``df``.

  Returns
  -------
  numpy integer
      Zero-based row position of the first matching movie.

  Raises
  ------
  IndexError
      If no row has that title.
  """
  frame = df if movies is None else movies
  # Missing titles are treated as non-matching.
  is_match = frame['title'].eq(title).to_numpy(dtype=bool, na_value=False)
  positions = np.flatnonzero(is_match)
  if positions.size == 0:
    raise IndexError(f"No movie titled {title!r} was found in the dataset")
  return positions[0]

# Ask for a title; surrounding whitespace is dropped so that an accidental
# leading or trailing space does not make the exact-match lookup fail.
title = input("Enter a movie title: ").strip()
user_index = get_index(title)
user_index

Enter a movie title: Avatar


np.int64(0)

### Pairing each movie's index with its cosine similarity to the movie entered by the user

In [None]:
# Pair every movie's row position with its similarity to the chosen movie.
sm = [(position, score) for position, score in enumerate(cm[user_index])]
print(sm)

[(0, np.float64(1.0)), (1, np.float64(0.02482729598662068)), (2, np.float64(0.04222284156915126)), (3, np.float64(0.008160607303690772)), (4, np.float64(0.10625865464875474)), (5, np.float64(0.06782484284827672)), (6, np.float64(0.001339525235903688)), (7, np.float64(0.03695123127145235)), (8, np.float64(0.022342422412007634)), (9, np.float64(0.027496115325422912)), (10, np.float64(0.06727359460847544)), (11, np.float64(0.015687532413989798)), (12, np.float64(0.026892800234230893)), (13, np.float64(0.02622582580409013)), (14, np.float64(0.05361238394260148)), (15, np.float64(0.020132121464363568)), (16, np.float64(0.038741260845388134)), (17, np.float64(0.027494314158529484)), (18, np.float64(0.06237344854351263)), (19, np.float64(0.027289742758032448)), (20, np.float64(0.02857546008400283)), (21, np.float64(0.015171695489496222)), (22, np.float64(0.021260491317757695)), (23, np.float64(0.04444890691031616)), (24, np.float64(0.01579353802901616)), (25, np.float64(0.04646807786430684)),

In [None]:
# Order the (position, score) pairs from most to least similar, in place.
sm.sort(key=lambda pair: pair[1], reverse=True)
sm

[(0, np.float64(1.0)),
 (94, np.float64(0.26040478925536387)),
 (2403, np.float64(0.2437656473812525)),
 (3158, np.float64(0.2380362491795586)),
 (1053, np.float64(0.2019474988599292)),
 (47, np.float64(0.198667215098271)),
 (56, np.float64(0.19712445427856704)),
 (2696, np.float64(0.18977958787271806)),
 (1951, np.float64(0.18881613734762132)),
 (838, np.float64(0.18663143698859144)),
 (239, np.float64(0.1849614833931389)),
 (461, np.float64(0.1836843903277265)),
 (661, np.float64(0.18144973578608867)),
 (3730, np.float64(0.17715289892247288)),
 (4593, np.float64(0.1768467770260661)),
 (1354, np.float64(0.1754166772597968)),
 (812, np.float64(0.17150770465155143)),
 (2198, np.float64(0.1670677490181101)),
 (643, np.float64(0.16288809609689386)),
 (2229, np.float64(0.15723227093502493)),
 (278, np.float64(0.153193680156277)),
 (206, np.float64(0.15022386032411833)),
 (4332, np.float64(0.14558380840014096)),
 (1922, np.float64(0.144587146923363)),
 (1531, np.float64(0.14290969941766687)

In [None]:
def get_info(user_index, movies=None):
  """Return "<title>: <cast>" for the movie whose index label is ``user_index``.

  Parameters
  ----------
  user_index : int
      Index label of the movie in the frame.
  movies : pandas.DataFrame, optional
      Frame with 'title' and 'cast' string columns. Defaults to the
      notebook's ``df``.

  Returns
  -------
  str
      The title and the cast joined by ": ".

  Raises
  ------
  IndexError
      If no row carries that index label.
  """
  frame = df if movies is None else movies
  # Filter once and pick both columns together instead of masking twice.
  matches = frame.loc[frame.index == user_index, ['title', 'cast']]
  if matches.empty:
    raise IndexError(f"No movie with index {user_index!r} in the dataset")
  first = matches.iloc[0]
  return first['title'] + ": " + first['cast']

# Print the 10 movies most similar to the user's choice.
TOP_N = 10
# Exclude the chosen movie by its index rather than by assuming it is sorted
# first: another movie with an equal similarity score could precede it.
recommended = [movie_index for movie_index, _ in sm if movie_index != user_index]
for movie_index in recommended[:TOP_N]:
  print(get_info(movie_index))

Guardians of the Galaxy: Chris Pratt Zoe Saldana Dave Bautista Vin Diesel Bradley Cooper
Aliens: Sigourney Weaver Michael Biehn James Remar Paul Reiser Lance Henriksen
Alien: Tom Skerritt Sigourney Weaver Veronica Cartwright Harry Dean Stanton John Hurt
Galaxy Quest: Tim Allen Sigourney Weaver Alan Rickman Tony Shalhoub Sam Rockwell
Star Trek Into Darkness: Chris Pine Zachary Quinto Zoe Saldana Karl Urban Simon Pegg
Star Trek Beyond: Chris Pine Zachary Quinto Karl Urban Simon Pegg Zoe Saldana
Jason X: Kane Hodder Lexa Doig Chuck Campbell Lisa Ryder David Cronenberg
Space Dogs: Anna Bolshova Evgeny Mironov Sergey Garmash Aleksandr Bashirov Elena Yakovleva
Alien³: Sigourney Weaver Charles S. Dutton Charles Dance Pete Postlethwaite Ralph Brown
Gravity: Sandra Bullock George Clooney Ed Harris Orto Ignatiussen Phaldut Sharma
