In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import warnings; warnings.simplefilter('ignore')

In [2]:
# Load the movie table from the cleaned CSV; the path is relative to the
# directory the notebook is run from.
movies = pd.read_csv('../Dataset/Clean_Dataset/clean_all.csv')

In [3]:
# (rows, columns) of the loaded table.
movies.shape

(45640, 18)

In [4]:
# These columns are stored in the CSV as the text form of Python lists;
# parse each cell back into a real list.
for list_column in ('genres', 'cast'):
    movies[list_column] = movies[list_column].apply(literal_eval)

# Finding the movie in the table

In [5]:
# movie_input = input("Movie Name: ").title()
# Hard-coded query title. str.title() capitalises every word (e.g. "of" -> "Of"),
# so the result can differ from the capitalisation stored in the dataset.
movie_input = 'The Hunger Games'.title()

In [6]:
# Match case-insensitively: the query went through str.title(), which
# capitalises every word ("Of", "The", "'S"), so an exact comparison would
# miss many titles that are present in the table.
title_key = movies['title'].str.lower()
movie_details = movies.loc[title_key == movie_input.lower()]
if movie_details.empty:
    # Fail here with a clear message instead of an IndexError further down
    # when the first matching row is read.
    raise LookupError('{!r} was not found in the database'.format(movie_input))
# Continue with the dataset's own spelling so that the later exact
# comparisons against `movie_input` keep working.
movie_input = movie_details['title'].iloc[0]
movie_details = movie_details.loc[movie_details['title'] == movie_input]

In [7]:
# Show the row(s) whose title matched the query.
movie_details

Unnamed: 0,belongs_to_collection,genres,id,original_language,original_title,overview,popularity,production_companies,release_date,tagline,title,vote_average,vote_count,keywords,cast,cast_size,crew_size,director
18236,The Hunger Games Collection,"[Science Fiction, Adventure, Fantasy]",70160,en,The Hunger Games,Every year in the ruins of what was once North...,20.031668,"['Lionsgate', 'Color Force']",2012-03-12,May The Odds Be Ever In Your Favor.,The Hunger Games,6.898438,9634,"['hallucination', 'dystopia', 'female protagon...","[Jennifer Lawrence, Josh Hutcherson, Liam Hems...",48,61,Gary Ross


In [8]:
# Collection value of the first matching row.
movie_input_coll = movie_details['belongs_to_collection'].iloc[0]

In [9]:
# Original-language code of the first matching row.
movie_input_lang = movie_details['original_language'].iloc[0]

In [10]:
# Genre list of the first matching row.
genre_list = movie_details['genres'].iloc[0]

In [11]:
# try:
#     movie_details = (movies.loc[movies['title'] == movie_input])
#     movie_input_lang = movie_details['original_language'].to_numpy()[0]
#     genre_list = movie_details['genres'].to_numpy()[0]
# except:
#     print('Sorry, Not found in the database')

# Filtering on the basis of language

In [12]:
# Keep only the movies whose original language equals the query movie's.
same_language = movies['original_language'] == movie_input_lang
movies_lang = movies.loc[same_language]

# Filtering on the basis of collection (sorted by release date)

In [13]:
# Every movie that shares the query movie's collection, oldest release
# first, reduced to the title and id columns.
in_same_collection = movies['belongs_to_collection'] == movie_input_coll
collections = (
    movies.loc[in_same_collection]
    .sort_values(by="release_date")
    .loc[:, ['title', 'id']]
)

In [14]:
# True if no rows matched the query movie's collection.
collections.empty

False

In [15]:
# Row labels of the collection entries whose title differs from the query.
collections[collections['title'] != movie_input].index

Int64Index([22044, 24855, 30373], dtype='int64')

In [16]:
# Remove the other entries of the query movie's collection from the
# similarity candidates.
other_collection_rows = collections[collections['title'] != movie_input].index
# `collections` is built from all of `movies`, so it can hold row labels that
# are absent from `movies_lang` (different language, or this cell was already
# run). errors='ignore' skips those labels instead of raising KeyError.
movies_lang = movies_lang.drop(index=other_collection_rows, errors='ignore')

In [17]:
# Size of the candidate table after the language and collection filtering.
movies_lang.shape

(32431, 18)

In [18]:
# s = movies_lang.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
# s.name = 'genre'
# gen_md = movies_lang.drop('genres', axis=1).join(s)

In [19]:
# filtered_movies = gen_md[gen_md['genre'].isin(genre_list)]

In [20]:
# ids = filtered_movies['id'].drop_duplicates().reset_index(drop=True).to_frame()

In [21]:
# movies_lang = movies_lang.merge(ids,how='inner',on='id')

In [22]:
# Same size as the previous check: the genre-filter cells above are commented out.
movies_lang.shape

(32431, 18)

# Use content based filtering on
- overview
- Tagline
- 3 x Keywords

In [23]:
def filter_keywords(x):
    """Return the items of ``x`` that are contained in the module-level ``s``.

    ``s`` is not a parameter; it is looked up when the function runs, so it
    must already be defined at call time. The order and any duplicates of
    ``x`` are preserved.
    """
    return [keyword for keyword in x if keyword in s]

In [24]:
# Text used for the first similarity pass: overview followed by tagline.
movies_lang['tagline'] = movies_lang['tagline'].fillna('')
# Fill a missing overview before concatenating so the tagline is not lost
# (NaN + str is NaN), and join with a space so the last word of the overview
# and the first word of the tagline stay separate tokens.
movies_lang['description'] = (
    movies_lang['overview'].fillna('') + ' ' + movies_lang['tagline']
).str.strip()

In [25]:
# Bag-of-words counts over unigrams and bigrams, English stop words removed.
# min_df=1 keeps every term (each vocabulary term occurs in at least one
# document); an integer min_df below 1 is outside the documented range.
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(movies_lang['description'])

In [None]:
# Pairwise cosine similarity between all rows of the count matrix
# (with a single argument the matrix is compared against itself).
cosine_sim = cosine_similarity(count_matrix)

In [None]:
# Renumber the rows 0..n-1 so that a row label is also the row's position.
movies_lang = movies_lang.reset_index(drop=True)
titles = movies_lang['title']
# Lookup table: title -> row label.
indices = pd.Series(movies_lang.index, index=titles)

In [None]:
def get_recommendations(title):
    """Return the titles of the 30 movies most similar to ``title``.

    Reads the module-level ``indices`` (title -> row position), ``cosine_sim``
    (pairwise similarity matrix) and ``titles`` (Series of titles).

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.

    Returns
    -------
    pandas.Series
        Up to 30 titles, most similar first, never including the query row.

    Raises
    ------
    KeyError
        If ``title`` is not a label of ``indices``.
    """
    idx = indices[title]
    # Several rows can share one title; the lookup then yields a Series
    # instead of a scalar. Use the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order keeps tied scores in row order, matching
    # sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query row explicitly rather than assuming it is ranked first:
    # another row with an identical score and a smaller position precedes it.
    ranked = ranked[ranked != idx][:30]

    return titles.iloc[ranked]

In [None]:
# Other movies from the same collection as the query (ordered by release date).
collections[collections['title'] != movie_input]

In [None]:
# Titles most similar to the query according to the description-based matrix.
get_recommendations(movie_input)

In [None]:
# def weighted_rating(x):
#     C = vote_averages.mean()
#     m = vote_counts.quantile(0.60)
#     v = x['vote_count']
#     R = x['vote_average']
#     return (v/(v+m) * R) + (m/(m+v) * C)

In [None]:
def improved_recommendations(title):
    """Return the best-rated movies among the 30 most similar to ``title``.

    The 30 nearest rows by ``cosine_sim`` are taken as candidates. Candidates
    whose vote count reaches the 70th percentile of the candidates' vote
    counts are scored with the weighted rating

        wr = v / (v + m) * R + m / (v + m) * C

    where ``v`` is the vote count, ``R`` the vote average, ``m`` the vote-count
    threshold and ``C`` the mean vote average of the candidates.

    Reads the module-level ``indices``, ``cosine_sim`` and ``movies_lang``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.

    Returns
    -------
    pandas.DataFrame
        At most 10 rows with columns ``id``, ``title``, ``vote_count``,
        ``vote_average`` and ``wr``, sorted by ``wr`` descending.

    Raises
    ------
    KeyError
        If ``title`` is not a label of ``indices``.
    """
    idx = indices[title]
    # Several rows can share one title; the lookup then yields a Series
    # instead of a scalar. Use the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order keeps tied scores in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query row explicitly rather than assuming it is ranked first.
    ranked = ranked[ranked != idx][:30]

    # Named `candidates` so the notebook-level `movies` table is not shadowed.
    candidates = movies_lang.iloc[ranked][['id', 'title', 'vote_count', 'vote_average']]
    vote_counts = candidates['vote_count'].dropna().astype('int')
    # Keep the averages as floats: truncating e.g. 6.9 to 6 distorts both the
    # prior mean C and every weighted rating.
    vote_averages = candidates['vote_average'].dropna()
    C = vote_averages.mean()
    m = vote_counts.quantile(0.70)

    # A NaN vote_count compares False against m, so such rows are excluded.
    has_enough_votes = candidates['vote_count'] >= m
    has_average = candidates['vote_average'].notnull()
    # Copy before adding columns so the assignments below do not target a
    # slice of `movies_lang`.
    qualified = candidates[has_enough_votes & has_average].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')

    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = v / (v + m) * R + m / (m + v) * C
    return qualified.sort_values('wr', ascending=False).head(10)

In [None]:
# The similar movies again, this time filtered and ranked by weighted rating.
improved_recommendations(movie_input)

In [None]:
# Peek at the first rows of the working table.
movies_lang.head()

# Use content based filtering on
- cast
- crew

### keywords processing

In [None]:
# The keywords column holds the text form of a Python list; parse each cell
# back into a list.
movies_lang['keywords'] = movies_lang['keywords'].map(literal_eval)

In [None]:
# One entry per (movie, keyword) pair, indexed by the movie's row label.
# `explode` replaces building a pd.Series for every row and stacking the
# result, which is far slower for the same output. Movies with an empty
# keyword list explode to NaN, which is dropped just as `stack()` dropped it.
s = movies_lang['keywords'].explode().dropna()
s.name = 'keyword'

In [None]:
# Rebind `s` to the number of occurrences of each keyword
# (value_counts orders them from most to least frequent).
s = s.value_counts()
# Preview the five most frequent keywords.
s[:5]

In [None]:
# Keep only the keywords that occur more than once.
s = s.loc[s > 1]

In [None]:
# English Snowball stemmer, applied to the keywords further down.
stemmer = SnowballStemmer('english')
# Quick check of the stemmer on a plural noun.
stemmer.stem('dogs')

In [None]:
def filter_keywords(x, vocabulary=None):
    """Return the items of ``x`` that belong to ``vocabulary``.

    Parameters
    ----------
    x : iterable
        Candidate keywords.
    vocabulary : container, optional
        Anything supporting ``in``. For a pandas Series, ``in`` tests the
        index labels. When omitted, the module-level ``s`` is used, which
        keeps existing one-argument calls working.

    Returns
    -------
    list
        The retained keywords, in their original order, duplicates included.
    """
    if vocabulary is None:
        # Backward-compatible default: the notebook-level keyword counts.
        vocabulary = s
    return [keyword for keyword in x if keyword in vocabulary]

In [None]:
# Per movie: keep the keywords accepted by filter_keywords, stem each one,
# then strip spaces and lowercase it.
movies_lang['keywords'] = movies_lang['keywords'].apply(
    lambda kws: [
        str.lower(stemmer.stem(kw).replace(" ", ""))
        for kw in filter_keywords(kws)
    ]
)

In [None]:
# Turn the director name into a single lowercase token without spaces.
# Missing values are blanked first: astype('str') would otherwise turn NaN
# into the literal token 'nan', shared by every movie without a director.
director_token = (
    movies_lang['director']
    .fillna('')
    .astype('str')
    .apply(lambda name: str.lower(name.replace(" ", "")))
)
# The token is listed twice so it counts double in the soup;
# movies without a director contribute no director token.
movies_lang['director'] = director_token.apply(lambda name: [name, name] if name else [])

In [None]:
# Per movie, concatenate the director, cast and keyword token lists.
# (A stray second `+` used to apply unary plus to the keywords column,
# which is not defined for lists.)
movies_lang['soup'] = movies_lang['director'] + movies_lang['cast'] + movies_lang['keywords'] #.apply(lambda x: ' '.join(x*3))

In [None]:
# movies_lang['cast'] = movies_lang['cast'].apply(literal_eval)

In [None]:
# Collapse each token list into one space-separated string for the vectorizer.
movies_lang['soup'] = movies_lang['soup'].map(' '.join)

In [None]:
# Bag-of-words counts over unigrams and bigrams of the soup strings.
# min_df=1 keeps every term (each vocabulary term occurs in at least one
# document); an integer min_df below 1 is outside the documented range.
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(movies_lang['soup'])

In [None]:
# Rebuild the pairwise similarity matrix from the soup-based counts
# (with a single argument the matrix is compared against itself).
cosine_sim = cosine_similarity(count_matrix)

In [None]:
# movies_lang = movies_lang.reset_index(drop)
# Refresh the title Series and the title -> row label lookup table.
titles = movies_lang['title']
indices = pd.Series(movies_lang.index, index=titles)

In [None]:
# Run the recommender again; it reads the `cosine_sim` and `indices` that were
# just rebuilt from the soup column.
improved_recommendations(movie_input)

## To do: optimize the code so that it finds the cosine similarity for a single movie only

# solve dark knight error and 1857 - transformers error