In [None]:
!pip install contractions

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere below the Kaggle
# input directory, then print them one per line.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_files:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Handling Movie Data file

In [None]:
df_movie = pd.read_csv("/kaggle/input/movielens-20m-dataset/movie.csv")

# The release year is carried as a trailing "(YYYY)" on the title. Anchor on
# that suffix instead of slicing by fixed offsets: fixed offsets silently
# mangle any title that has trailing whitespace or no year at all.
_year_suffix = r"\s*\((\d{4})\)\s*$"
# Rows without a "(YYYY)" suffix get a missing year and keep their full title.
df_movie["year"] = df_movie["title"].str.extract(_year_suffix, expand=False)
df_movie["title"] = (
    df_movie["title"].str.replace(_year_suffix, "", regex=True).str.strip()
)

# "|" is a literal separator here, not a regex alternation; say so explicitly
# because the default of `regex` differs between pandas versions.
df_movie["genres"] = df_movie["genres"].str.replace("|", " , ", regex=False)

In [None]:
df_movie

## Handling Tags Data File

In [None]:
# Load the tag rows without their timestamp. Rows with a missing tag are
# dropped before the string cast; otherwise the cast would turn them into the
# literal tag "nan", which would then leak into every joined tag string.
df_tags = (
    pd.read_csv("/kaggle/input/movielens-20m-dataset/tag.csv")
    .drop(columns="timestamp")
    .dropna(subset=["tag"])
    .assign(tag=lambda frame: frame["tag"].astype(str))
)

In [None]:
df_tags

In [None]:
# Collapse the tag rows into a single " , "-separated string per movieId.
# NOTE: this rebinds df_tags from a DataFrame to a Series, so the cell cannot
# be re-run without first re-running the cell that loads df_tags.
tags_by_movie = df_tags.groupby("movieId")["tag"]
df_tags = tags_by_movie.apply(' , '.join)

In [None]:
# Inner join: only movies that have at least one tag survive.
df = pd.merge(left=df_tags, right=df_movie, how="inner", on="movieId")

In [None]:
df

## Handling Ratings Data File (for avg rating and popularity)

In [None]:
# Ratings without their timestamp column.
df_rating = (
    pd.read_csv("/kaggle/input/movielens-20m-dataset/rating.csv")
    .drop(columns="timestamp")
)

In [None]:
# Per-movie mean rating and number of ratings (the count is used as a
# popularity proxy).
movie_avg_rating = (
    df_rating.groupby("movieId")["rating"]
    .agg(Avg_Rating="mean", Popularity="count")
    .reset_index()
)

In [None]:
# Attach the rating statistics; the inner join drops movies without ratings.
df2 = pd.merge(left=df, right=movie_avg_rating, how="inner", on="movieId")

In [None]:
# Most-rated movies first (ties broken by higher average rating), one row per
# movieId; the previous index is kept as an "index" column by reset_index().
df3 = (
    df2.sort_values(by=["Popularity", "Avg_Rating"], ascending=False)
    .drop_duplicates(subset=['movieId'])
    .reset_index()
)

In [None]:
df3

# Getting more tags

#### Using the genome tag scores, keep only the tags with a relevance of 95% or higher

In [None]:
# Minimum genome relevance a tag needs in order to be kept for a movie.
MIN_TAG_RELEVANCE = 0.95

genome_scores = pd.read_csv("/kaggle/input/movielens-20m-dataset/genome_scores.csv")
more_tags_data = pd.read_csv("/kaggle/input/movielens-20m-dataset/genome_tags.csv")

# Filter before joining so the merge only processes rows that survive the
# relevance cut. With a left join the surviving rows and their order are the
# same as merging first and filtering afterwards.
df_more_tags = genome_scores[genome_scores["relevance"] >= MIN_TAG_RELEVANCE]
df_more_tags = pd.merge(df_more_tags, more_tags_data, on="tagId", how="left")

# One " , "-separated string of high-relevance tag names per movie.
more_tags = (
    df_more_tags.groupby("movieId")["tag"].apply(' , '.join).reset_index()
)
more_tags.columns = ["movieId", "tag2"]

In [None]:
more_tags

In [None]:
# Left join keeps every movie; any missing value in the result (for example
# movies without genome tags) becomes a single space so that the later string
# concatenation never meets a NaN.
df4 = pd.merge(left=df3, right=more_tags, how="left", on="movieId").fillna(" ")

In [None]:
df4

In [None]:
# Single text field per movie: user tags, genres, genome tags and year,
# separated by single spaces.
df4["Final_Tags"] = df4["tag"].str.cat(
    [df4["genres"], df4["tag2"], df4["year"]], sep=" "
)

In [None]:
# Take an explicit copy: a column of final_df is reassigned in the next cell,
# and assigning into a column-subset of df4 triggers pandas'
# SettingWithCopyWarning.
final_df = df4[["title", "Final_Tags", "year"]].copy()

## Truncating the tags to 400 characters

In [None]:
# Keep at most the first 400 characters of each combined tag string.
final_df["Final_Tags"] = final_df["Final_Tags"].str.slice(stop=400)


In [None]:
final_df

# Build a Movie Recommender System

In [None]:
import nltk
import re
import numpy as np
import contractions

stop_words = nltk.corpus.stopwords.words('english')
def normalize_document(doc):
    """Normalize one raw tag string into a space-separated string of tokens.

    Steps, in order: remove every character that is not an ASCII letter, a
    digit or ASCII whitespace; lowercase; strip surrounding whitespace; expand
    contractions; tokenize with NLTK; drop English stop words.

    Parameters
    ----------
    doc : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # `flags` has to be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing the flags positionally capped the number
    # of substitutions at their integer value and never applied them.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    doc = contractions.fix(doc)
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

# Element-wise wrapper so the single-document cleaner can be applied to a
# whole collection in one call.
normalize_corpus = np.vectorize(normalize_document)

raw_tag_docs = final_df['Final_Tags'].tolist()
norm_corpus = normalize_corpus(raw_tag_docs)
len(norm_corpus)

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features; a term must appear in at least two
# documents to be kept.
tf = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
tfidf_matrix = tf.fit_transform(norm_corpus)
tfidf_matrix.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all TF-IDF rows, wrapped in a DataFrame
# so rows can be picked by position with .iloc.
doc_sim = cosine_similarity(tfidf_matrix)
doc_sim_df = pd.DataFrame(data=doc_sim)
doc_sim_df.head()

In [None]:
# Titles as a NumPy array, in the same row order as final_df.
movies_list = final_df['title'].to_numpy()
(movies_list, movies_list.shape)

In [None]:
movies_list[0:20]

# Find Top Similar Movies for a Sample Movie
#### Let's take a popular movie, "Thor", from the dataframe above and try to find the most similar movies that can be recommended



In [None]:
# Position of the first entry whose title is exactly 'Thor'.
movie_idx = np.flatnonzero(movies_list == 'Thor')[0]
movie_idx

In [None]:
doc_sim_df

In [None]:
# Similarity of the selected movie to every movie, as a plain array.
movie_similarities = doc_sim_df.iloc[movie_idx].to_numpy()
movie_similarities


In [None]:
# Rank by descending similarity and keep ranks 1..10, skipping rank 0.
similar_movie_idxs = (-movie_similarities).argsort()[1:11]
similar_movie_idxs

In [None]:
# Map the ranked positions back to their titles.
similar_movies = np.take(movies_list, similar_movie_idxs)
similar_movies

In [None]:
def movie_recommender(movie_title, movies=movies_list, doc_sims=doc_sim_df,num=11):
    """Return the titles most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``movies``; the first match is used.
    movies : numpy.ndarray
        Titles, positionally aligned with the rows and columns of
        ``doc_sims``.
    doc_sims : pandas.DataFrame
        Pairwise similarity matrix, indexed by position.
    num : int
        One more than the number of recommendations wanted. The default of
        11 yields 10 titles; this off-by-one convention is kept for backward
        compatibility.

    Returns
    -------
    numpy.ndarray
        Up to ``num - 1`` titles, most similar first. The queried row itself
        is never included.

    Raises
    ------
    IndexError
        If ``movie_title`` does not occur in ``movies``.
    """
    matches = np.flatnonzero(movies == movie_title)
    if matches.size == 0:
        # Same exception type as the bare [0][0] lookup used to raise, but
        # with a message that names the missing title.
        raise IndexError('movie title not found: {!r}'.format(movie_title))
    movie_idx = matches[0]

    movie_similarities = doc_sims.iloc[movie_idx].values
    # Stable sort keeps the ranking deterministic when similarities tie.
    ranked_idxs = np.argsort(-movie_similarities, kind="stable")
    # Drop the query row explicitly: when another row ties with it, it is not
    # guaranteed to sit at rank 0, so slicing from 1 could return it.
    ranked_idxs = ranked_idxs[ranked_idxs != movie_idx]
    return movies[ranked_idxs[:max(num - 1, 0)]]

# Get popular Movie Recommendations


In [None]:
# First 20 rows whose year is "2011" (used to pick titles to query).
final_df.loc[final_df["year"] == "2011"].head(20)

In [None]:
# Titles to query; each must match an entry of movies_list exactly.
popular_movies = [
    'How to Train Your Dragon 2',
    'Man of Steel',
    'Lucy',
    'Thor',
    'Hunger Games: Catching Fire, The',
    'Super 8',
    'Wolf of Wall Street, The',
    'World War Z',
    'Frozen',
    'Now You See Me',
]

In [None]:
# Print a ranked recommendation list for every selected title, with a blank
# line between titles.
for movie in popular_movies:
    print('Movie:', movie)
    recommends = movie_recommender(
        movie_title=movie, movies=movies_list, doc_sims=doc_sim_df
    )
    for rank, recommended_title in enumerate(recommends, start=1):
        print('Top', str(rank), 'recommended Movies : ' + str(recommended_title))
    print()

# Movie Recommendation with Embeddings (Better Results)

In [None]:
from gensim.models import FastText
# Whitespace-split every normalized document into its list of tokens.
tokenized_docs = list(map(str.split, norm_corpus))

In [None]:
import gensim

# gensim 4 renamed `size` to `vector_size` and `iter` to `epochs`; the old
# spellings raise a TypeError there. Pick the spelling that matches the
# installed version so the cell runs on both.
_ft_common = dict(window=10, min_count=2, workers=1, sg=1)
if int(gensim.__version__.split(".")[0]) >= 4:
    ft_model = FastText(tokenized_docs, vector_size=100, epochs=50, **_ft_common)
else:
    ft_model = FastText(tokenized_docs, size=100, iter=50, **_ft_common)

In [None]:
def averaged_word2vec_vectorizer(corpus, model, num_features):
    """Embed each tokenized document as the mean of its known word vectors.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenized documents.
    model : object
        Trained embedding model exposing ``model.wv``, which must support
        ``wv[word]`` and list its vocabulary as ``index_to_key`` (gensim 4)
        or ``index2word`` (gensim 3).
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(corpus), num_features)``. A document
        with no in-vocabulary token maps to the zero vector.
    """
    keyed_vectors = model.wv
    # gensim 4 renamed `index2word` to `index_to_key`, and touching the old
    # name raises there; try the new name first and fall back to the old one.
    vocab_words = getattr(keyed_vectors, "index_to_key", None)
    if vocab_words is None:
        vocab_words = keyed_vectors.index2word
    vocabulary = set(vocab_words)

    def average_word_vectors(words):
        # Mean of the vectors of the in-vocabulary words, or zeros if none.
        total = np.zeros((num_features,), dtype="float64")
        hits = 0
        for word in words:
            if word in vocabulary:
                total += keyed_vectors[word]
                hits += 1
        return total / hits if hits else total

    features = [average_word_vectors(tokenized_sentence)
                for tokenized_sentence in corpus]
    if not features:
        # Keep the result two-dimensional even for an empty corpus.
        return np.zeros((0, num_features), dtype="float64")
    return np.array(features)

In [None]:
# One averaged 100-dimensional FastText vector per document.
doc_vecs_ft = averaged_word2vec_vectorizer(
    corpus=tokenized_docs, model=ft_model, num_features=100
)
doc_vecs_ft.shape

## Get Movie Recommendations

In [None]:
# Pairwise cosine similarity of the averaged FastText vectors.
# NOTE: this rebinds doc_sim and doc_sim_df, replacing the TF-IDF similarities
# computed earlier in the notebook.
doc_sim = cosine_similarity(doc_vecs_ft)
doc_sim_df = pd.DataFrame(data=doc_sim)
doc_sim_df.head()

In [None]:
# Same report as for TF-IDF, now driven by the embedding-based similarities
# currently bound to doc_sim_df.
for movie in popular_movies:
    print('Movie:', movie)
    recommends = movie_recommender(
        movie_title=movie, movies=movies_list, doc_sims=doc_sim_df
    )
    for rank, recommended_title in enumerate(recommends, start=1):
        print('Top', str(rank), 'recommended Movies : ' + str(recommended_title))
    print()