In [None]:
%matplotlib inline
!pip install surprise
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
from surprise import SVD, Reader
from surprise import Dataset
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise.model_selection import cross_validate


Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/772.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m614.4/772.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163764 sha256=e7d539de70

In [None]:
def load_data():
    """Read the five project CSV files, resolved relative to the working directory.

    Returns:
        tuple: DataFrames for ``credits.csv``, ``keywords.csv``,
        ``links_small.csv``, ``movies_metadata.csv`` and ``ratings_small.csv``,
        in that order.
    """
    csv_names = (
        'credits.csv',
        'keywords.csv',
        'links_small.csv',
        'movies_metadata.csv',
        'ratings_small.csv',
    )
    frames = [pd.read_csv(csv_name) for csv_name in csv_names]
    return tuple(frames)


The load_data() function in Python reads data from five CSV files (‘credits.csv’, ‘keywords.csv’, ‘links_small.csv’, ‘movies_metadata.csv’, ‘ratings_small.csv’) and stores them in respective pandas DataFrames. It then returns these DataFrames. The function assumes the files are in the same directory as the script and requires the pandas library.

In [None]:
def process_genres(movie_metadata_df):
    """Replace the ``genres`` column with plain lists of genre names.

    Each cell is expected to hold the string form of a Python literal. Missing
    cells are treated as ``'[]'``; parsed values that are not lists become
    empty lists.

    Args:
        movie_metadata_df: DataFrame with a ``genres`` column; modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _genre_names(parsed):
        if not isinstance(parsed, list):
            return []
        return [entry['name'] for entry in parsed]

    raw_genres = movie_metadata_df['genres'].fillna('[]')
    movie_metadata_df['genres'] = raw_genres.apply(literal_eval).apply(_genre_names)
    return movie_metadata_df

In [None]:
def calculate_constants(movie_metadata_df, percentile=0.95):
    """Compute the two constants used by the weighted-rating formula.

    Args:
        movie_metadata_df: DataFrame with ``vote_count`` and ``vote_average``
            columns.
        percentile: Quantile of ``vote_count`` used as the minimum-votes
            threshold. Defaults to 0.95, the value that used to be hard-coded.

    Returns:
        tuple: ``(a, b)`` where ``a`` is the mean of the non-null vote averages
        after integer truncation and ``b`` is the ``percentile`` quantile of
        the non-null vote counts.
    """
    vote_count = movie_metadata_df['vote_count'].dropna().astype('int')
    # NOTE: averages are truncated to integers before the mean is taken
    # (7.7 -> 7); the rest of the notebook truncates vote_average the same way.
    vote_average = movie_metadata_df['vote_average'].dropna().astype('int')

    a = vote_average.mean()
    b = vote_count.quantile(percentile)
    return a, b

In [None]:
# Load every CSV, parse the genre lists, then derive the weighted-rating constants.
(
    movie_credits_df,
    movie_keywords_df,
    movie_links_small_df,
    movie_metadata_df,
    movie_ratings_small_df,
) = load_data()
movie_metadata_df = process_genres(movie_metadata_df)
a, b = calculate_constants(movie_metadata_df)

# Display the mean (integer-truncated) vote average and the vote-count threshold.
(a, b)

  movie_metadata_df      = pd.read_csv('movies_metadata.csv')


(5.244896612406511, 434.0)

Average Rating (a):

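The calculated average rating is approximately 5.24. Note that `calculate_constants` truncates every `vote_average` to an integer before averaging (for example, 7.7 becomes 7), so this figure reflects the central tendency of the truncated ratings rather than of the raw averages.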

User Engagement Criterion (b):

The threshold of 434.0 for vote count ensures a focus on movies with a significant level of popularity, contributing to the system's accuracy.

Rating Scale Context:

The discrete rating scale, with 5.24 as the average, provides a context for interpreting user sentiments within the recommendation algorithm.

In [None]:
# Derive the release year; dates that cannot be parsed are coerced to NaT and
# therefore yield a missing year.
movie_metadata_df['year'] = pd.to_datetime(movie_metadata_df['release_date'], errors='coerce').dt.year

selected_columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']

# Keep movies that reach the vote-count threshold `b` and have both vote fields.
# Rows and columns are selected in one .loc call and copied explicitly, so the
# column assignments below write to an independent frame rather than to a
# slice of movie_metadata_df.
filtered_movies = movie_metadata_df.loc[
    (movie_metadata_df['vote_count'] >= b)
    & movie_metadata_df['vote_count'].notnull()
    & movie_metadata_df['vote_average'].notnull(),
    selected_columns,
].copy()

filtered_movies['vote_count'] = filtered_movies['vote_count'].astype('int')
# NOTE: this truncates the average (7.7 -> 7), matching calculate_constants.
filtered_movies['vote_average'] = filtered_movies['vote_average'].astype('int')

filtered_movies.shape


(2274, 6)

Data Size:

The output shape (2274, 6) indicates that the dataset now contains 2274 movies with 6 relevant attributes. This filtered subset likely includes movies with a significant vote count and non-null values for vote average.
Year Extraction:

The 'year' column, derived from the 'release_date,' provides a temporal dimension to the dataset, allowing for analyses based on the release year of movies.
Popularity and Genres:

The selected attributes, including 'popularity' and 'genres,' suggest that the filtered dataset retains information about movie popularity and genre composition. This refined dataset can be valuable for building a recommendation system focused on well-received and genre-specific movies

In [None]:
# Weighted rating: v/(v+b) * R + b/(b+v) * a, with v = vote_count,
# R = vote_average, b = minimum-votes threshold and a = mean vote average.
# Column arithmetic performs the same operations as the former row-wise
# apply(axis=1) without calling a Python function per row.
filtered_movies['weighted_rating'] = (
    filtered_movies['vote_count'] / (filtered_movies['vote_count'] + b) * filtered_movies['vote_average']
    + b / (b + filtered_movies['vote_count']) * a
)

top_250_movies = filtered_movies.sort_values('weighted_rating', ascending=False).head(250)
top_250_movies.head(5)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating
15480,Inception,2010.0,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008.0,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014.0,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999.0,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001.0,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787


Genre Expansion:

The code below expands the 'genres' column of the movie_metadata_df DataFrame, transforming it into individual rows for each genre associated with a movie.
Granular Data Representation:

By using stack() and reset_index(), the code achieves a more detailed representation of movie genres, facilitating genre-specific analyses in the resulting optimized_metadata DataFrame.
Enhanced Data Structure:

The transformation improves the data structure for genre-related information, allowing for easier genre-based exploration and analysis within the movie metadata

In [None]:

# Expand each movie's genre list into one row per (movie, genre) pair.
# Series.explode keeps the movie's original row label on every produced row;
# dropna() removes the placeholder produced for empty lists, mirroring what
# stack() dropped in the previous apply(pd.Series).stack() formulation.
transformed_genres = movie_metadata_df['genres'].explode().dropna()
transformed_genres.name = 'movie_genre'
# Left join on the row label: movies without any genre keep a single row with
# a missing movie_genre.
optimized_metadata = movie_metadata_df.drop('genres', axis=1).join(transformed_genres)
optimized_metadata.head(3).transpose()


  transformed_genres = movie_metadata_df.apply(lambda row: pd.Series(row['genres']), axis=1).stack().reset_index(level=1, drop=True)


Unnamed: 0,0,0.1,0.2
adult,False,False,False
belongs_to_collection,"{'id': 10194, 'name': 'Toy Story Collection', ...","{'id': 10194, 'name': 'Toy Story Collection', ...","{'id': 10194, 'name': 'Toy Story Collection', ..."
budget,30000000,30000000,30000000
homepage,http://toystory.disney.com/toy-story,http://toystory.disney.com/toy-story,http://toystory.disney.com/toy-story
id,862,862,862
imdb_id,tt0114709,tt0114709,tt0114709
original_language,en,en,en
original_title,Toy Story,Toy Story,Toy Story
overview,"Led by Woody, Andy's toys live happily in his ...","Led by Woody, Andy's toys live happily in his ...","Led by Woody, Andy's toys live happily in his ..."
popularity,21.946943,21.946943,21.946943


The code below defines a function that generates movie recommendations for a specified genre.

Genre-Based Filtering:

The function filters movies in the optimized_metadata DataFrame based on a specified genre, creating a subset of data for genre-specific analysis.
Weighted Rating Calculation:

Utilizing a weighted rating formula, the function calculates ratings for the filtered movies, considering both vote count and average. This approach prioritizes movies with higher engagement.
Top Recommendations:

The function sorts and selects the top 250 movies based on the calculated weighted ratings, providing recommendations for the specified genre. The ranking depends only on the genre and the vote statistics, so it is the same for every user.

In [None]:
# List the columns of the exploded frame; 'movie_genre' is the column added by the join.
print(optimized_metadata.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'homepage', 'id', 'imdb_id',
       'original_language', 'original_title', 'overview', 'popularity',
       'poster_path', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
       'tagline', 'title', 'video', 'vote_average', 'vote_count', 'year',
       'movie_genre'],
      dtype='object')


In [None]:
def genre_recommendation(genre, percentile=0.85):
    """Rank the movies of one genre by weighted rating.

    Reads the module-level ``optimized_metadata`` frame.

    Args:
        genre: Value compared for equality with the ``movie_genre`` column.
        percentile: Quantile of the genre's vote counts used as the
            minimum-votes threshold ``m``.

    Returns:
        DataFrame with ``title``, ``year``, ``vote_count``, ``vote_average``,
        ``popularity`` and ``weighted_rating``, sorted best-first and limited
        to 250 rows. The frame is empty when no movie of the genre qualifies.
    """
    data_frames = optimized_metadata[optimized_metadata['movie_genre'] == genre]
    vote_counts = data_frames['vote_count'].dropna().astype('int')
    # Averages are truncated to integers before the mean, as in the rest of
    # the notebook.
    vote_averages = data_frames['vote_average'].dropna().astype('int')

    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    qualifies = (
        (data_frames['vote_count'] >= m)
        & data_frames['vote_count'].notnull()
        & data_frames['vote_average'].notnull()
    )
    columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity']
    # Explicit copy: the assignments below must not write to a slice of
    # optimized_metadata.
    recommendations = data_frames.loc[qualifies, columns].copy()

    recommendations['vote_count'] = recommendations['vote_count'].astype('int')
    recommendations['vote_average'] = recommendations['vote_average'].astype('int')

    # Column arithmetic instead of a row-wise apply: same formula, and it also
    # works when the selection is empty.
    v = recommendations['vote_count']
    R = recommendations['vote_average']
    recommendations['weighted_rating'] = v / (v + m) * R + m / (m + v) * C

    return recommendations.sort_values('weighted_rating', ascending=False).head(250)

# Show the five best-ranked 'Romance' titles (default percentile of 0.85).
genre_recommendation('Romance').head(5)


Unnamed: 0,title,year,vote_count,vote_average,popularity,weighted_rating
10309,Dilwale Dulhania Le Jayenge,1995.0,661,9,34.457024,8.565285
351,Forrest Gump,1994.0,8147,8,48.307194,7.971357
876,Vertigo,1958.0,1162,8,18.20822,7.811667
40251,Your Name.,2016.0,1030,8,34.461252,7.789489
883,Some Like It Hot,1959.0,835,8,11.845107,7.745154


Content based recommendation system

In [None]:
# Inspect the column names of the links table loaded from links_small.csv.
print(movie_links_small_df.columns)

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')


In [None]:
# Print the links table itself.
# NOTE(review): this prints the full frame (pandas elides the middle rows);
# `movie_links_small_df.head()` would give a shorter, richer display.
print(movie_links_small_df)

      movieId   imdbId    tmdbId
0           1   114709     862.0
1           2   113497    8844.0
2           3   113228   15602.0
3           4   114885   31357.0
4           5   113041   11862.0
...       ...      ...       ...
9120   162672  3859980  402672.0
9121   163056  4262980  315011.0
9122   163949  2531318  391698.0
9123   164977    27660  137608.0
9124   164979  3447228  410803.0

[9125 rows x 3 columns]


This code links TMDb IDs from the 'tmdbId' column in movie_links_small_df, maps 'id' values to create a new 'tmdbId' column in movie_metadata_df, and identifies rows with null TMDb IDs. Following this, it cleans movie_metadata_df by removing specific rows and filters the dataset to retain movies with TMDb IDs present in the linked set. The resulting DataFrame, filtered_movies, represents a cleaned subset of movie metadata with linked TMDb IDs, excluding specific rows and ensuring data consistency.

In [None]:
import numpy as np
# TMDb ids present in the links table, as integers (rows without an id are skipped).
linked_tmdb_ids = movie_links_small_df['tmdbId'].dropna().astype('int')

# Parse the metadata `id` strings; any value that is not purely digits becomes NaN.
movie_metadata_df['tmdbId'] = movie_metadata_df['id'].map(
    lambda raw_id: int(raw_id) if raw_id.isdigit() else np.nan
)

# Rows whose id could not be parsed.
null_tmdb_ids = movie_metadata_df.loc[movie_metadata_df['tmdbId'].isna()]

null_tmdb_ids



Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,tmdbId
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[Carousel Productions, Vision View Entertainme...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[Aniplex, GoHands, BROSTA TV, Mardock Scramble...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[Odyssey Media, Pulser Productions, Rogue Stat...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,,


In [None]:
# Drop the malformed rows by condition (their `id` is not numeric, so `tmdbId`
# is NaN) instead of by hard-coded row labels, which silently break if the
# CSV changes.
movie_metadata_cleaned = movie_metadata_df[movie_metadata_df['tmdbId'].notnull()].copy()
movie_metadata_cleaned['id'] = movie_metadata_cleaned['id'].astype('int')

# Keep only movies referenced by the small links table. The explicit copy lets
# later cells add columns without triggering SettingWithCopyWarning.
filtered_movies = movie_metadata_cleaned[
    movie_metadata_cleaned['id'].isin(movie_links_small_df['tmdbId'])
].copy()

filtered_movies.shape


(9099, 26)

Inference :Linked TMDb IDs:

The resulting DataFrame, filtered_movies, has 9,099 rows and 26 columns, indicating successful linking of TMDb IDs from movie_links_small_df to the movie_metadata_df.
Data Consistency:

The code has effectively cleaned and filtered the movie metadata, ensuring data consistency by excluding specific rows and mapping 'id' values to create a new 'tmdbId' column.
Subset Representation:

The output shape of (9099, 26) implies that the resulting DataFrame, filtered_movies, represents a subset of movie metadata with linked TMDb IDs, providing a refined dataset for further analysis.


This code is designed to process movie descriptions and taglines in the DataFrame filtered_movies by combining them into a new 'description' column and filling missing values. It then utilizes TF-IDF vectorization with a word and bigram analyzer to create a matrix representation of the textual data. The resulting cosine similarity matrix, calculated using linear kernel, quantifies the similarity between movie descriptions, serving as a foundation for content-based recommendation systems.

The code is used to enhance movie content analysis, integrating textual information for improved recommendation system capabilities. By leveraging TF-IDF vectorization and cosine similarity, it facilitates the identification of movies with similar descriptions, enriching the dataset for content-based recommendation models.

In [None]:
# Display the cleaned, link-filtered metadata frame.
# NOTE(review): this renders the whole frame; `filtered_movies.head()` would be a lighter preview.
filtered_movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,tmdbId
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995.0,862.0
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995.0,8844.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995.0,15602.0
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995.0,31357.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995.0,11862.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40224,False,,15000000,"[Action, Adventure, Drama, Horror, Science Fic...",,315011,tt4262980,ja,シン・ゴジラ,From the mind behind Evangelion comes a hit la...,...,120.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",Released,A god incarnate. A city doomed.,Shin Godzilla,False,6.6,152.0,2016.0,315011.0
40503,False,,0,"[Documentary, Music]",http://www.thebeatlesliveproject.com/,391698,tt2531318,en,The Beatles: Eight Days a Week - The Touring Y...,"The band stormed Europe in 1963, and, in 1964,...",...,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The band you know. The story you don't.,The Beatles: Eight Days a Week - The Touring Y...,False,7.6,92.0,2016.0,391698.0
44821,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",16000000,"[Adventure, Fantasy, Animation, Action, Family]",http://movies.warnerbros.com/pk3/,10991,tt0235679,ja,Pokémon 3: The Movie,When Molly Hale's sadness of her father's disa...,...,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pokémon: Spell of the Unknown,Pokémon: Spell of the Unknown,False,6.0,144.0,2000.0,10991.0
44826,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",0,"[Adventure, Fantasy, Animation, Science Fictio...",http://www.pokemon.com/us/movies/movie-pokemon...,12600,tt0287635,ja,劇場版ポケットモンスター セレビィ 時を越えた遭遇（であい）,"All your favorite Pokémon characters are back,...",...,75.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Pokémon 4Ever: Celebi - Voice of the Forest,False,5.7,82.0,2001.0,12600.0


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Build one free-text description per movie from its overview and tagline.
# assign() returns a new frame, so no value is written to a slice of another
# DataFrame (the source of the SettingWithCopyWarning messages).
filtered_movies = filtered_movies.assign(tagline=filtered_movies['tagline'].fillna(''))
# Both parts are null-filled before concatenation so a missing overview no
# longer discards the tagline, and they are joined with a space so the last
# overview word and the first tagline word remain separate tokens.
filtered_movies = filtered_movies.assign(
    description=(filtered_movies['overview'].fillna('') + ' ' + filtered_movies['tagline']).str.strip()
)

# Unigrams and bigrams with English stop words removed. min_df=1 keeps every
# term, exactly as the former min_df=0 did; NOTE(review): recent scikit-learn
# releases appear to reject an integer min_df of 0 - confirm against the
# installed version.
tf_vector = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_vect_matrix = tf_vector.fit_transform(filtered_movies['description'])
print(tfidf_vect_matrix.shape)

# Pairwise dot products of the TF-IDF rows.
cosine_sim = linear_kernel(tfidf_vect_matrix, tfidf_vect_matrix)
print(cosine_sim[0])




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_movies['tagline'] = filtered_movies['tagline'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_movies['description'] = filtered_movies['overview'] + filtered_movies['tagline']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_movies['description'] = filtered_movies

(9099, 268124)
[1.         0.00680476 0.         ... 0.         0.00344913 0.        ]


Inference:
Matrix Dimensions:

The TF-IDF matrix has a shape of (9099, 268124), indicating that it consists of 9099 rows (representing movies) and 268,124 columns (representing unique word or word combinations in the movie descriptions).
Cosine Similarity:

The computed cosine similarity values provide a pairwise similarity score for each movie in the dataset. The first value, 1.0, represents the movie's self-similarity, while other values indicate the similarity between the first movie and the rest in the dataset.
Sparse Similarities:

The majority of cosine similarity values are close to zero, suggesting sparse similarities between movies in the TF-IDF space. This is expected, as most movies have distinct descriptions, resulting in low similarity scores.

In [None]:
# Renumber rows 0..n-1 so DataFrame positions line up with the rows of cosine_sim.
# drop=True discards the old labels instead of inserting them as a new column;
# a plain reset_index() adds a column on every run and raises once the
# inserted column name already exists, so the cell could not be re-run.
filtered_movies = filtered_movies.reset_index(drop=True)
movie_titles = filtered_movies['title']
movie_indices = pd.Series(filtered_movies.index, index=filtered_movies['title'])

movie_title = 'Made'
index = movie_indices[movie_title]
# Pair every row position with its similarity to the query and rank best-first.
similarity_scores = sorted(enumerate(cosine_sim[index]), key=lambda x: x[1], reverse=True)
# The first-ranked entry is skipped; adjust the slice for more recommendations.
similar_movie_indices = [i[0] for i in similarity_scores[1:6]]
recommended_movies = movie_titles.iloc[similar_movie_indices]

print(recommended_movies)


4196       Johnny Dangerously
3108       The Way of the Gun
618                   Thinner
8387               The Family
6201    The Constant Gardener
Name: title, dtype: object


In [None]:
# Renumber rows 0..n-1 so DataFrame positions line up with the rows of cosine_sim.
# drop=True keeps this idempotent: a plain reset_index() inserts another column
# each time it runs and eventually raises because the column already exists.
filtered_movies = filtered_movies.reset_index(drop=True)
movie_titles = filtered_movies['title']
movie_indices = pd.Series(filtered_movies.index, index=filtered_movies['title'])

movie_title = 'JFK'
index = movie_indices[movie_title]
# Pair every row position with its similarity to the query and rank best-first.
similarity_scores = sorted(enumerate(cosine_sim[index]), key=lambda x: x[1], reverse=True)
# The first-ranked entry is skipped; adjust the slice for more recommendations.
similar_movie_indices = [i[0] for i in similarity_scores[1:6]]
recommended_movies = movie_titles.iloc[similar_movie_indices]

print(recommended_movies)

7242     The File on Thelma Jordon
5987    A Love Song for Bobby Long
1135      Night Falls on Manhattan
4489                         Q & A
8680             The Young Savages
Name: title, dtype: object


Content based Recommendation System with movie description, taglines, keywords, cast, director and genres

In [None]:
# Inputs: movie_metadata_df, movie_credits_df, movie_keywords_df and movie_links_small_df.
# Work on a copy so the numeric `id` conversion leaves movie_metadata_df untouched.
metadata = movie_metadata_df.copy()
metadata['id'] = pd.to_numeric(metadata['id'], errors='coerce')  # non-numeric ids become NaN

# Give both lookup tables an integer join key (this updates the loaded frames).
movie_credits_df['id'] = movie_credits_df['id'].astype('int')
movie_keywords_df['id'] = movie_keywords_df['id'].astype('int')

# Inner-join credits, then keywords, onto the metadata by `id`.
merged_metadata = pd.merge(metadata, movie_credits_df, on='id')
merged_metadata = pd.merge(merged_metadata, movie_keywords_df, on='id')

# Restrict to movies referenced by the small links table.
sm_movies = merged_metadata.loc[merged_metadata['id'].isin(movie_links_small_df['tmdbId'])]
sm_movies.shape


(9219, 29)

In [None]:
import pandas as pd
from ast import literal_eval

# Copy first so sm_movies keeps its raw string columns.
movies = sm_movies.copy()

# cast, crew and keywords hold Python-literal strings; parse each cell.
movies['cast'] = movies['cast'].map(literal_eval)
movies['crew'] = movies['crew'].map(literal_eval)
movies['keywords'] = movies['keywords'].map(literal_eval)

# Record how many cast and crew entries each movie has.
movies['cast_size'] = movies['cast'].map(len)
movies['crew_size'] = movies['crew'].map(len)

def extract_director(crew_list):
    """Return the name of the first crew entry whose job is ``'Director'``.

    Args:
        crew_list: Iterable of dicts that each provide ``'job'`` and ``'name'``.

    Returns:
        The ``'name'`` of the first matching entry, or ``np.nan`` when no entry
        has that job.
    """
    director_names = (member['name'] for member in crew_list if member['job'] == 'Director')
    return next(director_names, np.nan)

movies['director'] = movies['crew'].apply(extract_director)

# Reduce cast entries to their names and keep at most the first three.
movies['cast'] = movies['cast'].apply(lambda x: [i['name'] for i in x][:3] if isinstance(x, list) else [])

# Reduce keyword entries to their names.
movies['keywords'] = movies['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# Squash every name into a single lower-case token ("Tom Hanks" -> "tomhanks").
movies['cast'] = movies['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
# The director token is listed three times. Movies without a director get no
# token at all: casting the missing value to str used to produce the literal
# token 'nan', which every director-less movie then shared.
movies['director'] = movies['director'].apply(
    lambda x: [str.lower(x.replace(" ", ""))] * 3 if isinstance(x, str) else []
)

# One entry per (movie, keyword) pair; explode() replaces the slower
# apply(pd.Series).stack() and dropna() removes the placeholder for empty lists.
keywords_series = movies['keywords'].explode().dropna()
keywords_series.name = 'keyword'
keywords_count = keywords_series.value_counts()

top_keywords = keywords_count[:5]



  keywords_series = movies.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)


In [None]:
# Display the first five entries of the keyword frequency table.
top_keywords

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [None]:
# Keep only the keywords that occur more than once.
filtered_keywords_count = keywords_count[keywords_count > 1]
def filter_words(x):
    """Return the words of ``x`` that are labels of ``filtered_keywords_count``.

    Order and duplicates of the input are preserved.

    Args:
        x: Iterable of keyword strings.

    Returns:
        list: The words found among the labels of the module-level
        ``filtered_keywords_count`` Series.
    """
    kept = []
    for word in x:
        # `in` on a Series tests its index labels, not its values.
        if word in filtered_keywords_count:
            kept.append(word)
    return kept
# Sanity check of the stemmer on a single word.
stemmed_word = [SnowballStemmer('english').stem('dogs')]

# Show the retained keyword counts, the filter applied to three sample words,
# and the stemmed sample.
print(filtered_keywords_count)
print(filter_words(['dogs', 'cats', 'birds']))
print(stemmed_word)



independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
                       ... 
boarder                   2
social climbing           2
covert operation          2
prisoners                 2
crystal                   2
Name: keyword, Length: 6709, dtype: int64
[]
['dog']


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

# Keep the retained keywords, stem them, then squash each into one lower-case token.
movies['keywords'] = movies['keywords'].apply(filter_words)
movies['keywords'] = movies['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

# The "soup" is the concatenation of the keyword, cast, director and genre
# token lists, joined into one space-separated string per movie.
movies['soup'] = movies['keywords'] + movies['cast'] + movies['director'] + movies['genres']
movies['soup'] = movies['soup'].apply(lambda x: ' '.join(x))

# min_df=1 keeps every term, exactly as the former min_df=0 did;
# NOTE(review): recent scikit-learn releases appear to reject an integer
# min_df of 0 - confirm against the installed version.
cv = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_vect_matrix = cv.fit_transform(movies['soup'])
cosine_similarity_matrix = cosine_similarity(count_vect_matrix, count_vect_matrix)

# Renumber rows so positions in `movies` match rows of cosine_similarity_matrix.
movies = movies.reset_index()
movie_titles = movies['title']
movie_indices = pd.Series(movies.index, index=movies['title'])

def recommends(x):
    """Return the titles of up to 30 movies most similar to the movie titled ``x``.

    Similarity is read from the module-level ``cosine_similarity_matrix``,
    whose rows follow the same order as ``movie_indices`` and ``movie_titles``.
    The function previously read ``cosine_sim``, the description-based matrix
    computed from a different frame, so positions did not correspond.

    Args:
        x: Movie title used as a label of ``movie_indices``.

    Returns:
        pandas.Series: Titles ordered from most to least similar, excluding the
        first-ranked entry.
    """
    index = movie_indices[x]
    scr = sorted(
        enumerate(cosine_similarity_matrix[index]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    mi = [position for position, _ in scr[1:31]]
    return movie_titles.iloc[mi]

# Store the result before printing. The call's result used to be discarded, and
# the print showed the `recommended_movies` value left over from an earlier cell.
recommended_movies = recommends('Inception').head(5)

print(recommended_movies)


7242     The File on Thelma Jordon
5987    A Love Song for Bobby Long
1135      Night Falls on Manhattan
4489                         Q & A
8680             The Young Savages
Name: title, dtype: object


Extending the recommender with popularity and ratings

In [None]:
def weighted_rating(x):
    """Blend a movie's own vote average with the global mean.

    Uses the module-level constants ``a`` (mean vote average) and ``b``
    (vote-count threshold).

    Args:
        x: Mapping or row providing ``'vote_count'`` and ``'vote_average'``.

    Returns:
        ``v/(v+b) * R + b/(b+v) * a`` with ``v`` the vote count and ``R`` the
        vote average.
    """
    votes, rating = x['vote_count'], x['vote_average']
    own_share = votes / (votes + b)
    prior_share = b / (b + votes)
    return (own_share * rating) + (prior_share * a)

def recommends_improve(movie_title):
    """Recommend similar movies, re-ranked by weighted rating.

    Takes the 25 entries ranked after the first in
    ``cosine_similarity_matrix`` for ``movie_title``, keeps those whose vote
    count reaches the 60th percentile of that group, and orders them with
    ``weighted_rating``. Reads the module-level ``movie_indices``,
    ``cosine_similarity_matrix`` and ``movies``.

    Args:
        movie_title: Title used as a label of ``movie_indices``.

    Returns:
        DataFrame with ``title``, ``vote_count``, ``vote_average``, ``year``
        and ``wr``; at most 10 rows, best ``wr`` first.
    """
    movie_index = movie_indices[movie_title]
    similarities = sorted(
        enumerate(cosine_similarity_matrix[movie_index]),
        key=lambda pair: pair[1],
        reverse=True,
    )[1:26]
    indices = [position for position, _ in similarities]

    similar_movies = movies.iloc[indices][['title', 'vote_count', 'vote_average', 'year']]

    # Minimum number of votes a candidate needs to qualify. The group's mean
    # vote average used to be computed here as well but was never used:
    # weighted_rating relies on the notebook-wide constants.
    vote_counts = similar_movies['vote_count'].dropna().astype('int')
    m = vote_counts.quantile(0.60)

    # Explicit copy so the assignments below do not write to a slice of
    # `similar_movies` (the source of the SettingWithCopyWarning messages).
    qualified_movies = similar_movies[
        (similar_movies['vote_count'] >= m)
        & similar_movies['vote_count'].notnull()
        & similar_movies['vote_average'].notnull()
    ].copy()

    qualified_movies['vote_count'] = qualified_movies['vote_count'].astype('int')
    qualified_movies['vote_average'] = qualified_movies['vote_average'].astype('int')

    qualified_movies['wr'] = qualified_movies.apply(weighted_rating, axis=1)
    return qualified_movies.sort_values('wr', ascending=False).head(10)

# Show the re-ranked recommendations for 'Interstellar'.
recommends_improve('Interstellar')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified_movies['vote_count'] = qualified_movies['vote_count'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified_movies['vote_average'] = qualified_movies['vote_average'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified_movies['wr'] = qualified_movies.app

Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010.0,7.917588
6981,The Dark Knight,12269,8,2008.0,7.905871
6623,The Prestige,4510,8,2006.0,7.758148
3381,Memento,4168,8,2000.0,7.740175
8031,The Dark Knight Rises,9263,7,2012.0,6.921448
6218,Batman Begins,7511,7,2005.0,6.904127
8983,The Martian,7442,7,2015.0,6.903287
756,2001: A Space Odyssey,3075,7,1968.0,6.782925
8384,Oblivion,4862,6,2013.0,5.93812
8854,Terminator Genisys,3677,5,2015.0,5.025854


Hybrid recommendation system

In [None]:
ratings_reader = Reader()
ratings_data = Dataset.load_from_df(movie_ratings_small_df[['userId', 'movieId', 'rating']], ratings_reader)
svd_algorithm = SVD()

# 5-fold cross-validation: reports RMSE and MAE per fold.
cv_results = cross_validate(svd_algorithm, ratings_data, measures=['RMSE', 'MAE'], cv=5)

# Cross-validation only trains on fold splits. Fit once on every available
# rating so that later svd_algorithm.predict(...) calls use a model trained
# on the full data set.
svd_algorithm.fit(ratings_data.build_full_trainset())

cv_results


{'test_rmse': array([0.89630755, 0.89464044, 0.89858359, 0.88784593, 0.90333051]),
 'test_mae': array([0.69104556, 0.68949464, 0.68921531, 0.6866257 , 0.6933138 ]),
 'fit_time': (2.7341482639312744,
  0.9824020862579346,
  1.0271315574645996,
  0.9710016250610352,
  0.9583349227905273),
 'test_time': (0.08815264701843262,
  0.08349275588989258,
  0.24646973609924316,
  0.0821378231048584,
  0.0801856517791748)}

In [None]:
def convert_to_int(x):
    """Convert ``x`` to ``int``, returning ``np.nan`` when that is not possible.

    Only conversion failures are mapped to NaN; the former bare ``except:``
    also swallowed unrelated errors such as ``KeyboardInterrupt``.

    Args:
        x: Value to convert (number, numeric string, ``None``, ...).

    Returns:
        ``int(x)``, or ``np.nan`` if ``int`` raises ``TypeError``,
        ``ValueError`` or ``OverflowError``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or unsupported types; ValueError: non-numeric text
        # or NaN; OverflowError: infinities.
        return np.nan

# MovieLens id <-> TMDb id pairs; ids that cannot be converted become NaN.
movies_links = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
movies_links['tmdbId'] = movies_links['tmdbId'].map(convert_to_int)
movies_links = movies_links.rename(columns={'tmdbId': 'id'})

# Attach titles (inner join on `id`) and index the table by title.
movies_links = pd.merge(movies_links, filtered_movies[['title', 'id']], on='id')
movies_links = movies_links.set_index('title')

# Same table indexed by `id`, for id -> movieId lookups.
movies_mapper = movies_links.set_index('id')

def hybrid_recommendation(user_id, movie_name):
    """Recommend movies similar to ``movie_name``, ordered by predicted rating.

    Candidates are the 25 entries ranked after the first in
    ``cosine_similarity_matrix`` for ``movie_name``; each is scored with
    ``svd_algorithm.predict`` for ``user_id``. Reads the module-level
    ``movie_indices``, ``cosine_similarity_matrix``, ``movies``,
    ``movies_mapper`` and ``svd_algorithm``.

    Candidate rows are taken from ``movies``, the frame whose row positions
    match ``cosine_similarity_matrix`` and ``movie_indices``. They were
    previously taken by position from ``filtered_movies``, a different frame,
    so the returned titles did not correspond to the similarity ranking.

    Args:
        user_id: Raw user id passed to ``svd_algorithm.predict``.
        movie_name: Title used as a label of ``movie_indices``.

    Returns:
        DataFrame with ``title``, ``vote_count``, ``vote_average``,
        ``release_date``, ``id`` and ``est``; at most 10 rows, highest ``est``
        first.
    """
    movie_index = movie_indices[movie_name]

    similar_movies_cosine = sorted(
        enumerate(cosine_similarity_matrix[int(movie_index)]),
        key=lambda pair: pair[1],
        reverse=True,
    )[1:26]
    similar_movies_indices = [position for position, _ in similar_movies_cosine]

    # Copy so that adding `est` does not write to a slice of `movies`.
    recommended_movies = movies.iloc[similar_movies_indices][
        ['title', 'vote_count', 'vote_average', 'release_date', 'id']
    ].copy()
    # Translate each `id` to its movieId before asking the model for an estimate.
    recommended_movies['est'] = recommended_movies['id'].apply(
        lambda x: svd_algorithm.predict(user_id, movies_mapper.loc[x]['movieId']).est
    )
    recommended_movies = recommended_movies.sort_values('est', ascending=False)

    return recommended_movies.head(10)

# Recommendations for user 1 based on the movie 'Aliens'.
hybrid_recommendation(1, 'Aliens')


Unnamed: 0,title,vote_count,vote_average,release_date,id,est
6905,21,1406.0,6.5,2008-03-27,8065,3.297367
7939,Mission: Impossible - Ghost Protocol,4026.0,6.8,2011-12-07,56292,3.279022
522,Terminator 2: Judgment Day,4274.0,7.7,1991-07-01,280,3.175958
7498,Aria,11.0,6.2,1987-05-27,25832,3.166275
7828,The Woman,145.0,5.9,2011-10-14,65599,3.131817
7488,Kick-Ass,4747.0,7.1,2010-03-22,23483,3.074131
1011,Evil Dead II,760.0,7.5,1987-03-13,765,3.034895
2967,American Pimp,15.0,6.4,1999-01-01,24587,2.976088
6923,Super High Me,71.0,5.9,2007-10-13,14236,2.965331
7903,J. Edgar,766.0,6.0,2011-11-09,88794,2.959392
