# Types
    1. Content-Based System
       e.g. item content; for movies: actors, directors, types
    2. User-Based System

### 1. Content-Based System


In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Load the movie metadata. low_memory=False lets pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
df = pd.read_csv("movies_metadata.csv",low_memory=False)

In [3]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Work on the first 1000 movies only. Take an explicit copy so that the column
# assignment below writes to this frame itself rather than to a slice of the
# frame loaded from disk (which can raise pandas' SettingWithCopyWarning).
df = df.iloc[:1000].copy()

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string so every overview can be vectorized
df['overview'] = df['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['overview'])

#Output the shape of tfidf_matrix (rows = movies, columns = vocabulary terms)
tfidf_matrix.shape

(1000, 9397)

In [20]:
# Preview the first rows of the working DataFrame
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
#Array mapping from feature integer indices to feature name.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. list() keeps the plain-list display of the ten terms.
list(tfidf.get_feature_names_out()[5000:5010])



['lisette',
 'list',
 'listener',
 'listening',
 'listens',
 'literacy',
 'literally',
 'literary',
 'literature',
 'little']

In [12]:
# linear_kernel returns the matrix of pairwise dot products between rows
from sklearn.metrics.pairwise import linear_kernel

# Similarity of every overview vector with every other one. With a single
# argument, linear_kernel compares the matrix against itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [15]:
#Construct a reverse map from movie title to row index.
# Series.drop_duplicates() compares the *values* (here the row indices), so it
# never removed repeated titles; a repeated title then made indices[title]
# return a Series instead of a single index. Deduplicate on the title labels
# instead, keeping the first occurrence of each title.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [18]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the matrix that existed when this
        function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Entries of ``df['title']`` for the ``top_n`` most similar movies,
        most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    if not np.isscalar(idx):
        # Several rows share this title: fall back to the first one.
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx])

    # Rank movies from most to least similar. A stable sort keeps the original
    # row order among movies with equal scores.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query movie by index. Dropping "the first result" is wrong
    # whenever another movie ties with it (e.g. identical or empty overviews).
    ranked = ranked[ranked != idx]

    # Keep the top_n most similar movies
    movie_indices = ranked[:top_n]

    # Return the titles of the most similar movies
    return df['title'].iloc[movie_indices]

In [21]:
# Example: the ten movies whose overviews are most similar to that of 'Jumanji'
get_recommendations('Jumanji')

8                  Sudden Death
363                    Maverick
976        D3: The Mighty Ducks
954                      Picnic
96                     Shopping
894            My Favorite Year
839                 Small Faces
591             Window to Paris
717    Getting Away with Murder
309           The Swan Princess
Name: title, dtype: object

Task
1. Shorten the credits data: take rows 1 to 1000 (credit.csv)
2. Shorten the data: take rows 1 to 100 (keywords.csv)

genre
directors
cast (actors)
Crew
overview

-----------------------------

Cast, crew, directors - merge them, build a count vector, and then compute cosine similarity


In [5]:
# Calculate mean of vote average column.
# C is the default mean rating that weighted_rating() blends each movie's own
# average towards.
C = df['vote_average'].mean()
print(C)

5.983399999999998


In [6]:
# Calculate the minimum number of votes required to be in the chart, m.
# The 0.90 quantile of vote_count is used as the cutoff, so only movies at or
# above the 90th percentile of vote counts qualify.
m = df['vote_count'].quantile(0.90)
print(m)

630.2


In [7]:
# Filter out all qualified movies into a new DataFrame.
# Filter first and copy afterwards: only the qualifying rows are duplicated,
# and q_movies is an independent frame that can safely receive new columns.
q_movies = df.loc[df['vote_count'] >= m].copy()
q_movies.shape

(100, 24)

In [8]:
# Size of the full working frame, for comparison with q_movies.shape
df.shape

(1000, 24)

In [9]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own average vote with the overall mean vote.

    ``x`` must support ``x['vote_count']`` and ``x['vote_average']``.
    ``m`` (vote-count threshold) and ``C`` (mean vote) default to the
    module-level values that exist when this function is defined.

    Returns ``v/(v+m) * R + m/(v+m) * C`` where ``v`` is the vote count and
    ``R`` the vote average.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    # IMDB-style weighting: the more votes a movie has relative to m, the more
    # its own rating counts compared with the overall mean C.
    return (votes / total * rating) + (m / total * C)

In [10]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`.
# weighted_rating() only does column lookups and arithmetic, so it can be
# evaluated on the whole frame at once instead of row by row via apply(axis=1).
q_movies['score'] = weighted_rating(q_movies)

In [11]:
#Sort movies based on score calculated above, best first
q_movies = q_movies.sort_values('score', ascending=False)

#Show the top 20 movies (head(20)) with the columns that feed the score
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.323551
834,The Godfather,6024.0,8.5,8.26166
292,Pulp Fiction,8670.0,8.3,8.143023
351,Forrest Gump,8147.0,8.2,8.040849
522,Schindler's List,4436.0,8.3,8.011831
256,Star Wars,6778.0,8.1,7.919945
289,Leon: The Professional,4293.0,8.2,7.916262
46,Se7en,5915.0,8.1,7.896205
586,The Silence of the Lambs,4549.0,8.1,7.842454
359,The Lion King,5520.0,8.0,7.793363
